sys/kern/subr_trap.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-4-Clause
   3  *
   4  * Copyright (C) 1994, David Greenman
   5  * Copyright (c) 1990, 1993
   6  *      The Regents of the University of California.  All rights reserved.
   7  * Copyright (c) 2007, 2022 The FreeBSD Foundation
   8  *
   9  * This code is derived from software contributed to Berkeley by
  10  * the University of Utah, and William Jolitz.
  11  *
  12  * Portions of this software were developed by A. Joseph Koshy under
  13  * sponsorship from the FreeBSD Foundation and Google, Inc.
  14  *
  15  * Redistribution and use in source and binary forms, with or without
  16  * modification, are permitted provided that the following conditions
  17  * are met:
  18  * 1. Redistributions of source code must retain the above copyright
  19  *    notice, this list of conditions and the following disclaimer.
  20  * 2. Redistributions in binary form must reproduce the above copyright
  21  *    notice, this list of conditions and the following disclaimer in the
  22  *    documentation and/or other materials provided with the distribution.
  23  * 3. All advertising materials mentioning features or use of this software
  24  *    must display the following acknowledgement:
  25  *      This product includes software developed by the University of
  26  *      California, Berkeley and its contributors.
  27  * 4. Neither the name of the University nor the names of its contributors
  28  *    may be used to endorse or promote products derived from this software
  29  *    without specific prior written permission.
  30  *
  31  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  32  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  33  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  34  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  35  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  37  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  38  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  39  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  40  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  41  * SUCH DAMAGE.
  42  */
  43
  44 #include <sys/cdefs.h>
  45 #include "opt_hwpmc_hooks.h"
  46
  47 #include <sys/param.h>
  48 #include <sys/kernel.h>
  49 #include <sys/limits.h>
  50 #include <sys/lock.h>
  51 #include <sys/msan.h>
  52 #include <sys/mutex.h>
  53 #include <sys/proc.h>
  54 #include <sys/ktr.h>
  55 #include <sys/resourcevar.h>
  56 #include <sys/sched.h>
  57 #include <sys/syscall.h>
  58 #include <sys/syscallsubr.h>
  59 #include <sys/sysent.h>
  60 #include <sys/systm.h>
  61 #include <sys/vmmeter.h>
  62
  63 #include <machine/cpu.h>
  64
  65 #ifdef VIMAGE
  66 #include <net/vnet.h>
  67 #endif
  68
  69 #ifdef  HWPMC_HOOKS
  70 #include <sys/pmckern.h>
  71 #endif
  72
  73 #ifdef EPOCH_TRACE
  74 #include <sys/epoch.h>
  75 #endif
  76
  77 volatile uint32_t __read_frequently hpts_that_need_softclock = 0;
  78
  79 void    (*tcp_hpts_softclock)(void);
  80
  81 /*
  82  * Define the code needed before returning to user mode, for trap and
  83  * syscall.
  84  */
  85 void
  86 userret(struct thread *td, struct trapframe *frame)
  87 {
  88         struct proc *p = td->td_proc;
  89
  90         CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid,
  91             td->td_name);
  92         KASSERT((p->p_flag & P_WEXIT) == 0,
  93             ("Exiting process returns to usermode"));
  94 #ifdef DIAGNOSTIC
  95         /*
  96          * Check that we called signotify() enough.  For
  97          * multi-threaded processes, where signal distribution might
  98          * change due to other threads changing sigmask, the check is
  99          * racy and cannot be performed reliably.
 100          * If current process is vfork child, indicated by P_PPWAIT, then
 101          * issignal() ignores stops, so we block the check to avoid
 102          * classifying pending signals.
 103          */
 104         if (p->p_numthreads == 1) {
 105                 PROC_LOCK(p);
 106                 thread_lock(td);
 107                 if ((p->p_flag & P_PPWAIT) == 0 &&
 108                     (td->td_pflags & TDP_SIGFASTBLOCK) == 0 &&
 109                     SIGPENDING(td) && !td_ast_pending(td, TDA_AST) &&
 110                     !td_ast_pending(td, TDA_SIG)) {
 111                         thread_unlock(td);
 112                         panic(
 113                             "failed to set signal flags for ast p %p "
 114                             "td %p td_ast %#x fl %#x",
 115                             p, td, td->td_ast, td->td_flags);
 116                 }
 117                 thread_unlock(td);
 118                 PROC_UNLOCK(p);
 119         }
 120 #endif
 121
 122         /*
 123          * Charge system time if profiling.
 124          */
 125         if (__predict_false(p->p_flag & P_PROFIL))
 126                 addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio);
 127
 128 #ifdef HWPMC_HOOKS
 129         if (PMC_THREAD_HAS_SAMPLES(td))
 130                 PMC_CALL_HOOK(td, PMC_FN_THR_USERRET, NULL);
 131 #endif
 132         /*
 133          * Calling tcp_hpts_softclock() here allows us to avoid frequent,
 134          * expensive callouts that trash the cache and lead to a much higher
 135          * number of interrupts and context switches.  Testing on busy web
 136          * servers at Netflix has shown that this improves CPU use by 7% over
 137          * relying only on callouts to drive HPTS, and also results in idle
 138          * power savings on mostly idle servers.
 139          * This was inspired by the paper "Soft Timers: Efficient Microsecond
 140          * Software Timer Support for Network Processing"
 141          * by Mohit Aron and Peter Druschel.
 142          */
 143         tcp_hpts_softclock();
 144         /*
 145          * Let the scheduler adjust our priority etc.
 146          */
 147         sched_userret(td);
 148
 149         /*
 150          * Check for misbehavior.
 151          *
 152          * In case there is a callchain tracing ongoing because of
 153          * hwpmc(4), skip the scheduler pinning check.
 154          * hwpmc(4) subsystem, infact, will collect callchain informations
 155          * at ast() checkpoint, which is past userret().
 156          */
 157         WITNESS_WARN(WARN_PANIC, NULL, "userret: returning");
 158         KASSERT(td->td_critnest == 0,
 159             ("userret: Returning in a critical section"));
 160         KASSERT(td->td_locks == 0,
 161             ("userret: Returning with %d locks held", td->td_locks));
 162         KASSERT(td->td_rw_rlocks == 0,
 163             ("userret: Returning with %d rwlocks held in read mode",
 164             td->td_rw_rlocks));
 165         KASSERT(td->td_sx_slocks == 0,
 166             ("userret: Returning with %d sx locks held in shared mode",
 167             td->td_sx_slocks));
 168         KASSERT(td->td_lk_slocks == 0,
 169             ("userret: Returning with %d lockmanager locks held in shared mode",
 170             td->td_lk_slocks));
 171         KASSERT((td->td_pflags & TDP_NOFAULTING) == 0,
 172             ("userret: Returning with pagefaults disabled"));
 173         if (__predict_false(!THREAD_CAN_SLEEP())) {
 174 #ifdef EPOCH_TRACE
 175                 epoch_trace_list(curthread);
 176 #endif
 177                 KASSERT(0, ("userret: Returning with sleep disabled"));
 178         }
 179         KASSERT(td->td_pinned == 0 || (td->td_pflags & TDP_CALLCHAIN) != 0,
 180             ("userret: Returning with pinned thread"));
 181         KASSERT(td->td_vp_reserved == NULL,
 182             ("userret: Returning with preallocated vnode"));
 183         KASSERT((td->td_flags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0,
 184             ("userret: Returning with stop signals deferred"));
 185         KASSERT(td->td_vslock_sz == 0,
 186             ("userret: Returning with vslock-wired space"));
 187 #ifdef VIMAGE
 188         /* Unfortunately td_vnet_lpush needs VNET_DEBUG. */
 189         VNET_ASSERT(curvnet == NULL,
 190             ("%s: Returning on td %p (pid %d, %s) with vnet %p set in %s",
 191             __func__, td, p->p_pid, td->td_name, curvnet,
 192             (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
 193 #endif
 194 }
 195
 196 static void
 197 ast_prep(struct thread *td, int tda __unused)
 198 {
 199         VM_CNT_INC(v_trap);
 200         td->td_pticks = 0;
 201         if (td->td_cowgen != atomic_load_int(&td->td_proc->p_cowgen))
 202                 thread_cow_update(td);
 203
 204 }
 205
 206 struct ast_entry {
 207         int     ae_flags;
 208         int     ae_tdp;
 209         void    (*ae_f)(struct thread *td, int ast);
 210 };
 211
 212 _Static_assert(TDAI(TDA_MAX) <= UINT_MAX, "Too many ASTs");
 213
 214 static struct ast_entry ast_entries[TDA_MAX] __read_mostly = {
 215         [TDA_AST] = { .ae_f = ast_prep, .ae_flags = ASTR_UNCOND},
 216 };
 217
 218 void
 219 ast_register(int ast, int flags, int tdp,
 220     void (*f)(struct thread *, int asts))
 221 {
 222         struct ast_entry *ae;
 223
 224         MPASS(ast < TDA_MAX);
 225         MPASS((flags & ASTR_TDP) == 0 || ((flags & ASTR_ASTF_REQUIRED) != 0
 226             && __bitcount(tdp) == 1));
 227         ae = &ast_entries[ast];
 228         MPASS(ae->ae_f == NULL);
 229         ae->ae_flags = flags;
 230         ae->ae_tdp = tdp;
 231         atomic_interrupt_fence();
 232         ae->ae_f = f;
 233 }
 234
 235 /*
 236  * XXXKIB Note that the deregistration of an AST handler does not
 237  * drain threads possibly executing it, which affects unloadable
 238  * modules.  The issue is either handled by the subsystem using
 239  * handlers, or simply ignored.  Fixing the problem is considered not
 240  * worth the overhead.
 241  */
 242 void
 243 ast_deregister(int ast)
 244 {
 245         struct ast_entry *ae;
 246
 247         MPASS(ast < TDA_MAX);
 248         ae = &ast_entries[ast];
 249         MPASS(ae->ae_f != NULL);
 250         ae->ae_f = NULL;
 251         atomic_interrupt_fence();
 252         ae->ae_flags = 0;
 253         ae->ae_tdp = 0;
 254 }
 255
 256 void
 257 ast_sched_locked(struct thread *td, int tda)
 258 {
 259         THREAD_LOCK_ASSERT(td, MA_OWNED);
 260         MPASS(tda < TDA_MAX);
 261
 262         td->td_ast |= TDAI(tda);
 263 }
 264
 265 void
 266 ast_unsched_locked(struct thread *td, int tda)
 267 {
 268         THREAD_LOCK_ASSERT(td, MA_OWNED);
 269         MPASS(tda < TDA_MAX);
 270
 271         td->td_ast &= ~TDAI(tda);
 272 }
 273
 274 void
 275 ast_sched(struct thread *td, int tda)
 276 {
 277         thread_lock(td);
 278         ast_sched_locked(td, tda);
 279         thread_unlock(td);
 280 }
 281
 282 void
 283 ast_sched_mask(struct thread *td, int ast)
 284 {
 285         thread_lock(td);
 286         td->td_ast |= ast;
 287         thread_unlock(td);
 288 }
 289
 290 static bool
 291 ast_handler_calc_tdp_run(struct thread *td, const struct ast_entry *ae)
 292 {
 293         return ((ae->ae_flags & ASTR_TDP) == 0 ||
 294             (td->td_pflags & ae->ae_tdp) != 0);
 295 }
 296
 297 /*
 298  * Process an asynchronous software trap.
 299  */
 300 static void
 301 ast_handler(struct thread *td, struct trapframe *framep, bool dtor)
 302 {
 303         struct ast_entry *ae;
 304         void (*f)(struct thread *td, int asts);
 305         int a, td_ast;
 306         bool run;
 307
 308         if (framep != NULL) {
 309                 kmsan_mark(framep, sizeof(*framep), KMSAN_STATE_INITED);
 310                 td->td_frame = framep;
 311         }
 312
 313         if (__predict_true(!dtor)) {
 314                 WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode");
 315                 mtx_assert(&Giant, MA_NOTOWNED);
 316                 THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
 317
 318                 /*
 319                  * This updates the td_ast for the checks below in one
 320                  * atomic operation with turning off all scheduled AST's.
 321                  * If another AST is triggered while we are handling the
 322                  * AST's saved in td_ast, the td_ast is again non-zero and
 323                  * ast() will be called again.
 324                  */
 325                 thread_lock(td);
 326                 td_ast = td->td_ast;
 327                 td->td_ast = 0;
 328                 thread_unlock(td);
 329         } else {
 330                 /*
 331                  * The td thread's td_lock is not guaranteed to exist,
 332                  * the thread might be not initialized enough when it's
 333                  * destructor is called.  It is safe to read and
 334                  * update td_ast without locking since the thread is
 335                  * not runnable or visible to other threads.
 336                  */
 337                 td_ast = td->td_ast;
 338                 td->td_ast = 0;
 339         }
 340
 341         CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, td->td_proc->p_pid,
 342             td->td_proc->p_comm);
 343         KASSERT(framep == NULL || TRAPF_USERMODE(framep),
 344             ("ast in kernel mode"));
 345
 346         for (a = 0; a < nitems(ast_entries); a++) {
 347                 ae = &ast_entries[a];
 348                 f = ae->ae_f;
 349                 if (f == NULL)
 350                         continue;
 351                 atomic_interrupt_fence();
 352
 353                 run = false;
 354                 if (__predict_false(framep == NULL)) {
 355                         if ((ae->ae_flags & ASTR_KCLEAR) != 0)
 356                                 run = ast_handler_calc_tdp_run(td, ae);
 357                 } else {
 358                         if ((ae->ae_flags & ASTR_UNCOND) != 0)
 359                                 run = true;
 360                         else if ((ae->ae_flags & ASTR_ASTF_REQUIRED) != 0 &&
 361                             (td_ast & TDAI(a)) != 0)
 362                                 run = ast_handler_calc_tdp_run(td, ae);
 363                 }
 364                 if (run)
 365                         f(td, td_ast);
 366         }
 367 }
 368
 369 void
 370 ast(struct trapframe *framep)
 371 {
 372         struct thread *td;
 373
 374         td = curthread;
 375         ast_handler(td, framep, false);
 376         userret(td, framep);
 377 }
 378
 379 void
 380 ast_kclear(struct thread *td)
 381 {
 382         ast_handler(td, NULL, td != curthread);
 383 }
 384
 385 const char *
 386 syscallname(struct proc *p, u_int code)
 387 {
 388         static const char unknown[] = "unknown";
 389         struct sysentvec *sv;
 390
 391         sv = p->p_sysent;
 392         if (sv->sv_syscallnames == NULL || code >= sv->sv_size)
 393                 return (unknown);
 394         return (sv->sv_syscallnames[code]);
 395 }