sys/compat/linux/linux_misc.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 2002 Doug Rabson
   5  * Copyright (c) 1994-1995 Søren Schmidt
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer
  13  *    in this position and unchanged.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  * 3. The name of the author may not be used to endorse or promote products
  18  *    derived from this software without specific prior written permission
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __FBSDID("$FreeBSD$");
  34
  35 #include "opt_compat.h"
  36
  37 #include <sys/param.h>
  38 #include <sys/blist.h>
  39 #include <sys/fcntl.h>
  40 #if defined(__i386__)
  41 #include <sys/imgact_aout.h>
  42 #endif
  43 #include <sys/jail.h>
  44 #include <sys/kernel.h>
  45 #include <sys/limits.h>
  46 #include <sys/lock.h>
  47 #include <sys/malloc.h>
  48 #include <sys/mman.h>
  49 #include <sys/mount.h>
  50 #include <sys/msgbuf.h>
  51 #include <sys/mutex.h>
  52 #include <sys/namei.h>
  53 #include <sys/poll.h>
  54 #include <sys/priv.h>
  55 #include <sys/proc.h>
  56 #include <sys/procctl.h>
  57 #include <sys/reboot.h>
  58 #include <sys/racct.h>
  59 #include <sys/random.h>
  60 #include <sys/resourcevar.h>
  61 #include <sys/sched.h>
  62 #include <sys/sdt.h>
  63 #include <sys/signalvar.h>
  64 #include <sys/stat.h>
  65 #include <sys/syscallsubr.h>
  66 #include <sys/sysctl.h>
  67 #include <sys/sysproto.h>
  68 #include <sys/systm.h>
  69 #include <sys/time.h>
  70 #include <sys/vmmeter.h>
  71 #include <sys/vnode.h>
  72 #include <sys/wait.h>
  73 #include <sys/cpuset.h>
  74 #include <sys/uio.h>
  75
  76 #include <security/mac/mac_framework.h>
  77
  78 #include <vm/vm.h>
  79 #include <vm/pmap.h>
  80 #include <vm/vm_kern.h>
  81 #include <vm/vm_map.h>
  82 #include <vm/vm_extern.h>
  83 #include <vm/swap_pager.h>
  84
  85 #ifdef COMPAT_LINUX32
  86 #include <machine/../linux32/linux.h>
  87 #include <machine/../linux32/linux32_proto.h>
  88 #else
  89 #include <machine/../linux/linux.h>
  90 #include <machine/../linux/linux_proto.h>
  91 #endif
  92
  93 #include <compat/linux/linux_common.h>
  94 #include <compat/linux/linux_dtrace.h>
  95 #include <compat/linux/linux_file.h>
  96 #include <compat/linux/linux_mib.h>
  97 #include <compat/linux/linux_signal.h>
  98 #include <compat/linux/linux_timer.h>
  99 #include <compat/linux/linux_util.h>
 100 #include <compat/linux/linux_sysproto.h>
 101 #include <compat/linux/linux_emul.h>
 102 #include <compat/linux/linux_misc.h>
 103
 104 int stclohz;                            /* Statistics clock frequency */
 105
 106 static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
 107         RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
 108         RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
 109         RLIMIT_MEMLOCK, RLIMIT_AS
 110 };
 111
 112 struct l_sysinfo {
 113         l_long          uptime;         /* Seconds since boot */
 114         l_ulong         loads[3];       /* 1, 5, and 15 minute load averages */
 115 #define LINUX_SYSINFO_LOADS_SCALE 65536
 116         l_ulong         totalram;       /* Total usable main memory size */
 117         l_ulong         freeram;        /* Available memory size */
 118         l_ulong         sharedram;      /* Amount of shared memory */
 119         l_ulong         bufferram;      /* Memory used by buffers */
 120         l_ulong         totalswap;      /* Total swap space size */
 121         l_ulong         freeswap;       /* swap space still available */
 122         l_ushort        procs;          /* Number of current processes */
 123         l_ushort        pads;
 124         l_ulong         totalhigh;
 125         l_ulong         freehigh;
 126         l_uint          mem_unit;
 127         char            _f[20-2*sizeof(l_long)-sizeof(l_int)];  /* padding */
 128 };
 129
 130 struct l_pselect6arg {
 131         l_uintptr_t     ss;
 132         l_size_t        ss_len;
 133 };
 134
 135 static int      linux_utimensat_lts_to_ts(struct l_timespec *,
 136                         struct timespec *);
 137 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 138 static int      linux_utimensat_lts64_to_ts(struct l_timespec64 *,
 139                         struct timespec *);
 140 #endif
 141 static int      linux_common_utimensat(struct thread *, int,
 142                         const char *, struct timespec *, int);
 143 static int      linux_common_pselect6(struct thread *, l_int,
 144                         l_fd_set *, l_fd_set *, l_fd_set *,
 145                         struct timespec *, l_uintptr_t *);
 146 static int      linux_common_ppoll(struct thread *, struct pollfd *,
 147                         uint32_t, struct timespec *, l_sigset_t *,
 148                         l_size_t);
 149 static int      linux_pollin(struct thread *, struct pollfd *,
 150                         struct pollfd *, u_int);
 151 static int      linux_pollout(struct thread *, struct pollfd *,
 152                         struct pollfd *, u_int);
 153
 154 int
 155 linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
 156 {
 157         struct l_sysinfo sysinfo;
 158         int i, j;
 159         struct timespec ts;
 160
 161         bzero(&sysinfo, sizeof(sysinfo));
 162         getnanouptime(&ts);
 163         if (ts.tv_nsec != 0)
 164                 ts.tv_sec++;
 165         sysinfo.uptime = ts.tv_sec;
 166
 167         /* Use the information from the mib to get our load averages */
 168         for (i = 0; i < 3; i++)
 169                 sysinfo.loads[i] = averunnable.ldavg[i] *
 170                     LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
 171
 172         sysinfo.totalram = physmem * PAGE_SIZE;
 173         sysinfo.freeram = (u_long)vm_free_count() * PAGE_SIZE;
 174
 175         /*
 176          * sharedram counts pages allocated to named, swap-backed objects such
 177          * as shared memory segments and tmpfs files.  There is no cheap way to
 178          * compute this, so just leave the field unpopulated.  Linux itself only
 179          * started setting this field in the 3.x timeframe.
 180          */
 181         sysinfo.sharedram = 0;
 182         sysinfo.bufferram = 0;
 183
 184         swap_pager_status(&i, &j);
 185         sysinfo.totalswap = i * PAGE_SIZE;
 186         sysinfo.freeswap = (i - j) * PAGE_SIZE;
 187
 188         sysinfo.procs = nprocs;
 189
 190         /*
 191          * Platforms supported by the emulation layer do not have a notion of
 192          * high memory.
 193          */
 194         sysinfo.totalhigh = 0;
 195         sysinfo.freehigh = 0;
 196
 197         sysinfo.mem_unit = 1;
 198
 199         return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
 200 }
 201
 202 #ifdef LINUX_LEGACY_SYSCALLS
 203 int
 204 linux_alarm(struct thread *td, struct linux_alarm_args *args)
 205 {
 206         struct itimerval it, old_it;
 207         u_int secs;
 208         int error __diagused;
 209
 210         secs = args->secs;
 211         /*
 212          * Linux alarm() is always successful. Limit secs to INT32_MAX / 2
 213          * to match kern_setitimer()'s limit to avoid error from it.
 214          *
 215          * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit
 216          * platforms.
 217          */
 218         if (secs > INT32_MAX / 2)
 219                 secs = INT32_MAX / 2;
 220
 221         it.it_value.tv_sec = secs;
 222         it.it_value.tv_usec = 0;
 223         timevalclear(&it.it_interval);
 224         error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
 225         KASSERT(error == 0, ("kern_setitimer returns %d", error));
 226
 227         if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) ||
 228             old_it.it_value.tv_usec >= 500000)
 229                 old_it.it_value.tv_sec++;
 230         td->td_retval[0] = old_it.it_value.tv_sec;
 231         return (0);
 232 }
 233 #endif
 234
 235 int
 236 linux_brk(struct thread *td, struct linux_brk_args *args)
 237 {
 238         struct vmspace *vm = td->td_proc->p_vmspace;
 239         uintptr_t new, old;
 240
 241         old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize);
 242         new = (uintptr_t)args->dsend;
 243         if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new))
 244                 td->td_retval[0] = (register_t)new;
 245         else
 246                 td->td_retval[0] = (register_t)old;
 247
 248         return (0);
 249 }
 250
 251 #if defined(__i386__)
 252 /* XXX: what about amd64/linux32? */
 253
 254 int
 255 linux_uselib(struct thread *td, struct linux_uselib_args *args)
 256 {
 257         struct nameidata ni;
 258         struct vnode *vp;
 259         struct exec *a_out;
 260         vm_map_t map;
 261         vm_map_entry_t entry;
 262         struct vattr attr;
 263         vm_offset_t vmaddr;
 264         unsigned long file_offset;
 265         unsigned long bss_size;
 266         char *library;
 267         ssize_t aresid;
 268         int error;
 269         bool locked, opened, textset;
 270
 271         a_out = NULL;
 272         vp = NULL;
 273         locked = false;
 274         textset = false;
 275         opened = false;
 276
 277         if (!LUSECONVPATH(td)) {
 278                 NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
 279                     UIO_USERSPACE, args->library);
 280                 error = namei(&ni);
 281         } else {
 282                 LCONVPATHEXIST(args->library, &library);
 283                 NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
 284                     UIO_SYSSPACE, library);
 285                 error = namei(&ni);
 286                 LFREEPATH(library);
 287         }
 288         if (error)
 289                 goto cleanup;
 290
 291         vp = ni.ni_vp;
 292         NDFREE_PNBUF(&ni);
 293
 294         /*
 295          * From here on down, we have a locked vnode that must be unlocked.
 296          * XXX: The code below largely duplicates exec_check_permissions().
 297          */
 298         locked = true;
 299
 300         /* Executable? */
 301         error = VOP_GETATTR(vp, &attr, td->td_ucred);
 302         if (error)
 303                 goto cleanup;
 304
 305         if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 306             ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) {
 307                 /* EACCESS is what exec(2) returns. */
 308                 error = ENOEXEC;
 309                 goto cleanup;
 310         }
 311
 312         /* Sensible size? */
 313         if (attr.va_size == 0) {
 314                 error = ENOEXEC;
 315                 goto cleanup;
 316         }
 317
 318         /* Can we access it? */
 319         error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 320         if (error)
 321                 goto cleanup;
 322
 323         /*
 324          * XXX: This should use vn_open() so that it is properly authorized,
 325          * and to reduce code redundancy all over the place here.
 326          * XXX: Not really, it duplicates far more of exec_check_permissions()
 327          * than vn_open().
 328          */
 329 #ifdef MAC
 330         error = mac_vnode_check_open(td->td_ucred, vp, VREAD);
 331         if (error)
 332                 goto cleanup;
 333 #endif
 334         error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 335         if (error)
 336                 goto cleanup;
 337         opened = true;
 338
 339         /* Pull in executable header into exec_map */
 340         error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE,
 341             VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
 342         if (error)
 343                 goto cleanup;
 344
 345         /* Is it a Linux binary ? */
 346         if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
 347                 error = ENOEXEC;
 348                 goto cleanup;
 349         }
 350
 351         /*
 352          * While we are here, we should REALLY do some more checks
 353          */
 354
 355         /* Set file/virtual offset based on a.out variant. */
 356         switch ((int)(a_out->a_magic & 0xffff)) {
 357         case 0413:                      /* ZMAGIC */
 358                 file_offset = 1024;
 359                 break;
 360         case 0314:                      /* QMAGIC */
 361                 file_offset = 0;
 362                 break;
 363         default:
 364                 error = ENOEXEC;
 365                 goto cleanup;
 366         }
 367
 368         bss_size = round_page(a_out->a_bss);
 369
 370         /* Check various fields in header for validity/bounds. */
 371         if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
 372                 error = ENOEXEC;
 373                 goto cleanup;
 374         }
 375
 376         /* text + data can't exceed file size */
 377         if (a_out->a_data + a_out->a_text > attr.va_size) {
 378                 error = EFAULT;
 379                 goto cleanup;
 380         }
 381
 382         /*
 383          * text/data/bss must not exceed limits
 384          * XXX - this is not complete. it should check current usage PLUS
 385          * the resources needed by this library.
 386          */
 387         PROC_LOCK(td->td_proc);
 388         if (a_out->a_text > maxtsiz ||
 389             a_out->a_data + bss_size > lim_cur_proc(td->td_proc, RLIMIT_DATA) ||
 390             racct_set(td->td_proc, RACCT_DATA, a_out->a_data +
 391             bss_size) != 0) {
 392                 PROC_UNLOCK(td->td_proc);
 393                 error = ENOMEM;
 394                 goto cleanup;
 395         }
 396         PROC_UNLOCK(td->td_proc);
 397
 398         /*
 399          * Prevent more writers.
 400          */
 401         error = VOP_SET_TEXT(vp);
 402         if (error != 0)
 403                 goto cleanup;
 404         textset = true;
 405
 406         /*
 407          * Lock no longer needed
 408          */
 409         locked = false;
 410         VOP_UNLOCK(vp);
 411
 412         /*
 413          * Check if file_offset page aligned. Currently we cannot handle
 414          * misalinged file offsets, and so we read in the entire image
 415          * (what a waste).
 416          */
 417         if (file_offset & PAGE_MASK) {
 418                 /* Map text+data read/write/execute */
 419
 420                 /* a_entry is the load address and is page aligned */
 421                 vmaddr = trunc_page(a_out->a_entry);
 422
 423                 /* get anon user mapping, read+write+execute */
 424                 error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 425                     &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE,
 426                     VM_PROT_ALL, VM_PROT_ALL, 0);
 427                 if (error)
 428                         goto cleanup;
 429
 430                 error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset,
 431                     a_out->a_text + a_out->a_data, UIO_USERSPACE, 0,
 432                     td->td_ucred, NOCRED, &aresid, td);
 433                 if (error != 0)
 434                         goto cleanup;
 435                 if (aresid != 0) {
 436                         error = ENOEXEC;
 437                         goto cleanup;
 438                 }
 439         } else {
 440                 /*
 441                  * for QMAGIC, a_entry is 20 bytes beyond the load address
 442                  * to skip the executable header
 443                  */
 444                 vmaddr = trunc_page(a_out->a_entry);
 445
 446                 /*
 447                  * Map it all into the process's space as a single
 448                  * copy-on-write "data" segment.
 449                  */
 450                 map = &td->td_proc->p_vmspace->vm_map;
 451                 error = vm_mmap(map, &vmaddr,
 452                     a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
 453                     MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
 454                 if (error)
 455                         goto cleanup;
 456                 vm_map_lock(map);
 457                 if (!vm_map_lookup_entry(map, vmaddr, &entry)) {
 458                         vm_map_unlock(map);
 459                         error = EDOOFUS;
 460                         goto cleanup;
 461                 }
 462                 entry->eflags |= MAP_ENTRY_VN_EXEC;
 463                 vm_map_unlock(map);
 464                 textset = false;
 465         }
 466
 467         if (bss_size != 0) {
 468                 /* Calculate BSS start address */
 469                 vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
 470                     a_out->a_data;
 471
 472                 /* allocate some 'anon' space */
 473                 error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 474                     &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL,
 475                     VM_PROT_ALL, 0);
 476                 if (error)
 477                         goto cleanup;
 478         }
 479
 480 cleanup:
 481         if (opened) {
 482                 if (locked)
 483                         VOP_UNLOCK(vp);
 484                 locked = false;
 485                 VOP_CLOSE(vp, FREAD, td->td_ucred, td);
 486         }
 487         if (textset) {
 488                 if (!locked) {
 489                         locked = true;
 490                         VOP_LOCK(vp, LK_SHARED | LK_RETRY);
 491                 }
 492                 VOP_UNSET_TEXT_CHECKED(vp);
 493         }
 494         if (locked)
 495                 VOP_UNLOCK(vp);
 496
 497         /* Release the temporary mapping. */
 498         if (a_out)
 499                 kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE);
 500
 501         return (error);
 502 }
 503
 504 #endif  /* __i386__ */
 505
 506 #ifdef LINUX_LEGACY_SYSCALLS
 507 int
 508 linux_select(struct thread *td, struct linux_select_args *args)
 509 {
 510         l_timeval ltv;
 511         struct timeval tv0, tv1, utv, *tvp;
 512         int error;
 513
 514         /*
 515          * Store current time for computation of the amount of
 516          * time left.
 517          */
 518         if (args->timeout) {
 519                 if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
 520                         goto select_out;
 521                 utv.tv_sec = ltv.tv_sec;
 522                 utv.tv_usec = ltv.tv_usec;
 523
 524                 if (itimerfix(&utv)) {
 525                         /*
 526                          * The timeval was invalid.  Convert it to something
 527                          * valid that will act as it does under Linux.
 528                          */
 529                         utv.tv_sec += utv.tv_usec / 1000000;
 530                         utv.tv_usec %= 1000000;
 531                         if (utv.tv_usec < 0) {
 532                                 utv.tv_sec -= 1;
 533                                 utv.tv_usec += 1000000;
 534                         }
 535                         if (utv.tv_sec < 0)
 536                                 timevalclear(&utv);
 537                 }
 538                 microtime(&tv0);
 539                 tvp = &utv;
 540         } else
 541                 tvp = NULL;
 542
 543         error = kern_select(td, args->nfds, args->readfds, args->writefds,
 544             args->exceptfds, tvp, LINUX_NFDBITS);
 545         if (error)
 546                 goto select_out;
 547
 548         if (args->timeout) {
 549                 if (td->td_retval[0]) {
 550                         /*
 551                          * Compute how much time was left of the timeout,
 552                          * by subtracting the current time and the time
 553                          * before we started the call, and subtracting
 554                          * that result from the user-supplied value.
 555                          */
 556                         microtime(&tv1);
 557                         timevalsub(&tv1, &tv0);
 558                         timevalsub(&utv, &tv1);
 559                         if (utv.tv_sec < 0)
 560                                 timevalclear(&utv);
 561                 } else
 562                         timevalclear(&utv);
 563                 ltv.tv_sec = utv.tv_sec;
 564                 ltv.tv_usec = utv.tv_usec;
 565                 if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
 566                         goto select_out;
 567         }
 568
 569 select_out:
 570         return (error);
 571 }
 572 #endif
 573
 574 int
 575 linux_mremap(struct thread *td, struct linux_mremap_args *args)
 576 {
 577         uintptr_t addr;
 578         size_t len;
 579         int error = 0;
 580
 581         if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
 582                 td->td_retval[0] = 0;
 583                 return (EINVAL);
 584         }
 585
 586         /*
 587          * Check for the page alignment.
 588          * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
 589          */
 590         if (args->addr & PAGE_MASK) {
 591                 td->td_retval[0] = 0;
 592                 return (EINVAL);
 593         }
 594
 595         args->new_len = round_page(args->new_len);
 596         args->old_len = round_page(args->old_len);
 597
 598         if (args->new_len > args->old_len) {
 599                 td->td_retval[0] = 0;
 600                 return (ENOMEM);
 601         }
 602
 603         if (args->new_len < args->old_len) {
 604                 addr = args->addr + args->new_len;
 605                 len = args->old_len - args->new_len;
 606                 error = kern_munmap(td, addr, len);
 607         }
 608
 609         td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
 610         return (error);
 611 }
 612
 613 #define LINUX_MS_ASYNC       0x0001
 614 #define LINUX_MS_INVALIDATE  0x0002
 615 #define LINUX_MS_SYNC        0x0004
 616
 617 int
 618 linux_msync(struct thread *td, struct linux_msync_args *args)
 619 {
 620
 621         return (kern_msync(td, args->addr, args->len,
 622             args->fl & ~LINUX_MS_SYNC));
 623 }
 624
 625 #ifdef LINUX_LEGACY_SYSCALLS
 626 int
 627 linux_time(struct thread *td, struct linux_time_args *args)
 628 {
 629         struct timeval tv;
 630         l_time_t tm;
 631         int error;
 632
 633         microtime(&tv);
 634         tm = tv.tv_sec;
 635         if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
 636                 return (error);
 637         td->td_retval[0] = tm;
 638         return (0);
 639 }
 640 #endif
 641
 642 struct l_times_argv {
 643         l_clock_t       tms_utime;
 644         l_clock_t       tms_stime;
 645         l_clock_t       tms_cutime;
 646         l_clock_t       tms_cstime;
 647 };
 648
 649 /*
 650  * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value.
 651  * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK
 652  * auxiliary vector entry.
 653  */
 654 #define CLK_TCK         100
 655
 656 #define CONVOTCK(r)     (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK))
 657 #define CONVNTCK(r)     (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz))
 658
 659 #define CONVTCK(r)      (linux_kernver(td) >= LINUX_KERNVER_2004000 ?           \
 660                             CONVNTCK(r) : CONVOTCK(r))
 661
 662 int
 663 linux_times(struct thread *td, struct linux_times_args *args)
 664 {
 665         struct timeval tv, utime, stime, cutime, cstime;
 666         struct l_times_argv tms;
 667         struct proc *p;
 668         int error;
 669
 670         if (args->buf != NULL) {
 671                 p = td->td_proc;
 672                 PROC_LOCK(p);
 673                 PROC_STATLOCK(p);
 674                 calcru(p, &utime, &stime);
 675                 PROC_STATUNLOCK(p);
 676                 calccru(p, &cutime, &cstime);
 677                 PROC_UNLOCK(p);
 678
 679                 tms.tms_utime = CONVTCK(utime);
 680                 tms.tms_stime = CONVTCK(stime);
 681
 682                 tms.tms_cutime = CONVTCK(cutime);
 683                 tms.tms_cstime = CONVTCK(cstime);
 684
 685                 if ((error = copyout(&tms, args->buf, sizeof(tms))))
 686                         return (error);
 687         }
 688
 689         microuptime(&tv);
 690         td->td_retval[0] = (int)CONVTCK(tv);
 691         return (0);
 692 }
 693
 694 int
 695 linux_newuname(struct thread *td, struct linux_newuname_args *args)
 696 {
 697         struct l_new_utsname utsname;
 698         char osname[LINUX_MAX_UTSNAME];
 699         char osrelease[LINUX_MAX_UTSNAME];
 700         char *p;
 701
 702         linux_get_osname(td, osname);
 703         linux_get_osrelease(td, osrelease);
 704
 705         bzero(&utsname, sizeof(utsname));
 706         strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
 707         getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
 708         getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
 709         strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
 710         strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
 711         for (p = utsname.version; *p != '\0'; ++p)
 712                 if (*p == '\n') {
 713                         *p = '\0';
 714                         break;
 715                 }
 716 #if defined(__amd64__)
 717         /*
 718          * On amd64, Linux uname(2) needs to return "x86_64"
 719          * for both 64-bit and 32-bit applications.  On 32-bit,
 720          * the string returned by getauxval(AT_PLATFORM) needs
 721          * to remain "i686", though.
 722          */
 723         strlcpy(utsname.machine, "x86_64", LINUX_MAX_UTSNAME);
 724 #elif defined(__aarch64__)
 725         strlcpy(utsname.machine, "aarch64", LINUX_MAX_UTSNAME);
 726 #elif defined(__i386__)
 727         strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME);
 728 #endif
 729
 730         return (copyout(&utsname, args->buf, sizeof(utsname)));
 731 }
 732
 733 struct l_utimbuf {
 734         l_time_t l_actime;
 735         l_time_t l_modtime;
 736 };
 737
 738 #ifdef LINUX_LEGACY_SYSCALLS
 739 int
 740 linux_utime(struct thread *td, struct linux_utime_args *args)
 741 {
 742         struct timeval tv[2], *tvp;
 743         struct l_utimbuf lut;
 744         char *fname;
 745         int error;
 746
 747         if (args->times) {
 748                 if ((error = copyin(args->times, &lut, sizeof lut)) != 0)
 749                         return (error);
 750                 tv[0].tv_sec = lut.l_actime;
 751                 tv[0].tv_usec = 0;
 752                 tv[1].tv_sec = lut.l_modtime;
 753                 tv[1].tv_usec = 0;
 754                 tvp = tv;
 755         } else
 756                 tvp = NULL;
 757
 758         if (!LUSECONVPATH(td)) {
 759                 error = kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE,
 760                     tvp, UIO_SYSSPACE);
 761         } else {
 762                 LCONVPATHEXIST(args->fname, &fname);
 763                 error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp,
 764                     UIO_SYSSPACE);
 765                 LFREEPATH(fname);
 766         }
 767         return (error);
 768 }
 769 #endif
 770
 771 #ifdef LINUX_LEGACY_SYSCALLS
 772 int
 773 linux_utimes(struct thread *td, struct linux_utimes_args *args)
 774 {
 775         l_timeval ltv[2];
 776         struct timeval tv[2], *tvp = NULL;
 777         char *fname;
 778         int error;
 779
 780         if (args->tptr != NULL) {
 781                 if ((error = copyin(args->tptr, ltv, sizeof ltv)) != 0)
 782                         return (error);
 783                 tv[0].tv_sec = ltv[0].tv_sec;
 784                 tv[0].tv_usec = ltv[0].tv_usec;
 785                 tv[1].tv_sec = ltv[1].tv_sec;
 786                 tv[1].tv_usec = ltv[1].tv_usec;
 787                 tvp = tv;
 788         }
 789
 790         if (!LUSECONVPATH(td)) {
 791                 error = kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE,
 792                     tvp, UIO_SYSSPACE);
 793         } else {
 794                 LCONVPATHEXIST(args->fname, &fname);
 795                 error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE,
 796                     tvp, UIO_SYSSPACE);
 797                 LFREEPATH(fname);
 798         }
 799         return (error);
 800 }
 801 #endif
 802
 803 static int
 804 linux_utimensat_lts_to_ts(struct l_timespec *l_times, struct timespec *times)
 805 {
 806
 807         if (l_times->tv_nsec != LINUX_UTIME_OMIT &&
 808             l_times->tv_nsec != LINUX_UTIME_NOW &&
 809             (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999))
 810                 return (EINVAL);
 811
 812         times->tv_sec = l_times->tv_sec;
 813         switch (l_times->tv_nsec)
 814         {
 815         case LINUX_UTIME_OMIT:
 816                 times->tv_nsec = UTIME_OMIT;
 817                 break;
 818         case LINUX_UTIME_NOW:
 819                 times->tv_nsec = UTIME_NOW;
 820                 break;
 821         default:
 822                 times->tv_nsec = l_times->tv_nsec;
 823         }
 824
 825         return (0);
 826 }
 827
 828 static int
 829 linux_common_utimensat(struct thread *td, int ldfd, const char *pathname,
 830     struct timespec *timesp, int lflags)
 831 {
 832         char *path = NULL;
 833         int error, dfd, flags = 0;
 834
 835         dfd = (ldfd == LINUX_AT_FDCWD) ? AT_FDCWD : ldfd;
 836
 837         if (lflags & ~(LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH))
 838                 return (EINVAL);
 839
 840         if (timesp != NULL) {
 841                 /* This breaks POSIX, but is what the Linux kernel does
 842                  * _on purpose_ (documented in the man page for utimensat(2)),
 843                  * so we must follow that behaviour. */
 844                 if (timesp[0].tv_nsec == UTIME_OMIT &&
 845                     timesp[1].tv_nsec == UTIME_OMIT)
 846                         return (0);
 847         }
 848
 849         if (lflags & LINUX_AT_SYMLINK_NOFOLLOW)
 850                 flags |= AT_SYMLINK_NOFOLLOW;
 851         if (lflags & LINUX_AT_EMPTY_PATH)
 852                 flags |= AT_EMPTY_PATH;
 853
 854         if (!LUSECONVPATH(td)) {
 855                 if (pathname != NULL) {
 856                         return (kern_utimensat(td, dfd, pathname,
 857                             UIO_USERSPACE, timesp, UIO_SYSSPACE, flags));
 858                 }
 859         }
 860
 861         if (pathname != NULL)
 862                 LCONVPATHEXIST_AT(pathname, &path, dfd);
 863         else if (lflags != 0)
 864                 return (EINVAL);
 865
 866         if (path == NULL)
 867                 error = kern_futimens(td, dfd, timesp, UIO_SYSSPACE);
 868         else {
 869                 error = kern_utimensat(td, dfd, path, UIO_SYSSPACE, timesp,
 870                         UIO_SYSSPACE, flags);
 871                 LFREEPATH(path);
 872         }
 873
 874         return (error);
 875 }
 876
 877 int
 878 linux_utimensat(struct thread *td, struct linux_utimensat_args *args)
 879 {
 880         struct l_timespec l_times[2];
 881         struct timespec times[2], *timesp;
 882         int error;
 883
 884         if (args->times != NULL) {
 885                 error = copyin(args->times, l_times, sizeof(l_times));
 886                 if (error != 0)
 887                         return (error);
 888
 889                 error = linux_utimensat_lts_to_ts(&l_times[0], &times[0]);
 890                 if (error != 0)
 891                         return (error);
 892                 error = linux_utimensat_lts_to_ts(&l_times[1], &times[1]);
 893                 if (error != 0)
 894                         return (error);
 895                 timesp = times;
 896         } else
 897                 timesp = NULL;
 898
 899         return (linux_common_utimensat(td, args->dfd, args->pathname,
 900             timesp, args->flags));
 901 }
 902
 903 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 904 static int
 905 linux_utimensat_lts64_to_ts(struct l_timespec64 *l_times, struct timespec *times)
 906 {
 907
 908         if (l_times->tv_nsec != LINUX_UTIME_OMIT &&
 909             l_times->tv_nsec != LINUX_UTIME_NOW &&
 910             (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999))
 911                 return (EINVAL);
 912
 913         times->tv_sec = l_times->tv_sec;
 914         switch (l_times->tv_nsec)
 915         {
 916         case LINUX_UTIME_OMIT:
 917                 times->tv_nsec = UTIME_OMIT;
 918                 break;
 919         case LINUX_UTIME_NOW:
 920                 times->tv_nsec = UTIME_NOW;
 921                 break;
 922         default:
 923                 times->tv_nsec = l_times->tv_nsec;
 924         }
 925
 926         return (0);
 927 }
 928
 929 int
 930 linux_utimensat_time64(struct thread *td, struct linux_utimensat_time64_args *args)
 931 {
 932         struct l_timespec64 l_times[2];
 933         struct timespec times[2], *timesp;
 934         int error;
 935
 936         if (args->times64 != NULL) {
 937                 error = copyin(args->times64, l_times, sizeof(l_times));
 938                 if (error != 0)
 939                         return (error);
 940
 941                 error = linux_utimensat_lts64_to_ts(&l_times[0], &times[0]);
 942                 if (error != 0)
 943                         return (error);
 944                 error = linux_utimensat_lts64_to_ts(&l_times[1], &times[1]);
 945                 if (error != 0)
 946                         return (error);
 947                 timesp = times;
 948         } else
 949                 timesp = NULL;
 950
 951         return (linux_common_utimensat(td, args->dfd, args->pathname,
 952             timesp, args->flags));
 953 }
 954 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 955
 956 #ifdef LINUX_LEGACY_SYSCALLS
 957 int
 958 linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
 959 {
 960         l_timeval ltv[2];
 961         struct timeval tv[2], *tvp = NULL;
 962         char *fname;
 963         int error, dfd;
 964
 965         dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 966
 967         if (args->utimes != NULL) {
 968                 if ((error = copyin(args->utimes, ltv, sizeof ltv)) != 0)
 969                         return (error);
 970                 tv[0].tv_sec = ltv[0].tv_sec;
 971                 tv[0].tv_usec = ltv[0].tv_usec;
 972                 tv[1].tv_sec = ltv[1].tv_sec;
 973                 tv[1].tv_usec = ltv[1].tv_usec;
 974                 tvp = tv;
 975         }
 976
 977         if (!LUSECONVPATH(td)) {
 978                 error = kern_utimesat(td, dfd, args->filename, UIO_USERSPACE,
 979                     tvp, UIO_SYSSPACE);
 980         } else {
 981                 LCONVPATHEXIST_AT(args->filename, &fname, dfd);
 982                 error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE,
 983                     tvp, UIO_SYSSPACE);
 984                 LFREEPATH(fname);
 985         }
 986         return (error);
 987 }
 988 #endif
 989
 990 static int
 991 linux_common_wait(struct thread *td, idtype_t idtype, int id, int *statusp,
 992     int options, void *rup, l_siginfo_t *infop)
 993 {
 994         l_siginfo_t lsi;
 995         siginfo_t siginfo;
 996         struct __wrusage wru;
 997         int error, status, tmpstat, sig;
 998
 999         error = kern_wait6(td, idtype, id, &status, options,
1000             rup != NULL ? &wru : NULL, &siginfo);
1001
1002         if (error == 0 && statusp) {
1003                 tmpstat = status & 0xffff;
1004                 if (WIFSIGNALED(tmpstat)) {
1005                         tmpstat = (tmpstat & 0xffffff80) |
1006                             bsd_to_linux_signal(WTERMSIG(tmpstat));
1007                 } else if (WIFSTOPPED(tmpstat)) {
1008                         tmpstat = (tmpstat & 0xffff00ff) |
1009                             (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8);
1010 #if defined(__aarch64__) || (defined(__amd64__) && !defined(COMPAT_LINUX32))
1011                         if (WSTOPSIG(status) == SIGTRAP) {
1012                                 tmpstat = linux_ptrace_status(td,
1013                                     siginfo.si_pid, tmpstat);
1014                         }
1015 #endif
1016                 } else if (WIFCONTINUED(tmpstat)) {
1017                         tmpstat = 0xffff;
1018                 }
1019                 error = copyout(&tmpstat, statusp, sizeof(int));
1020         }
1021         if (error == 0 && rup != NULL)
1022                 error = linux_copyout_rusage(&wru.wru_self, rup);
1023         if (error == 0 && infop != NULL && td->td_retval[0] != 0) {
1024                 sig = bsd_to_linux_signal(siginfo.si_signo);
1025                 siginfo_to_lsiginfo(&siginfo, &lsi, sig);
1026                 error = copyout(&lsi, infop, sizeof(lsi));
1027         }
1028
1029         return (error);
1030 }
1031
1032 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1033 int
1034 linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
1035 {
1036         struct linux_wait4_args wait4_args;
1037
1038         wait4_args.pid = args->pid;
1039         wait4_args.status = args->status;
1040         wait4_args.options = args->options;
1041         wait4_args.rusage = NULL;
1042
1043         return (linux_wait4(td, &wait4_args));
1044 }
1045 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1046
1047 int
1048 linux_wait4(struct thread *td, struct linux_wait4_args *args)
1049 {
1050         struct proc *p;
1051         int options, id, idtype;
1052
1053         if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG |
1054             LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
1055                 return (EINVAL);
1056
1057         options = 0;
1058         linux_to_bsd_waitopts(args->options, &options);
1059
1060         /*
1061          * For backward compatibility we implicitly add flags WEXITED
1062          * and WTRAPPED here.
1063          */
1064         options |= WEXITED | WTRAPPED;
1065
1066         if (args->pid == WAIT_ANY) {
1067                 idtype = P_ALL;
1068                 id = 0;
1069         } else if (args->pid < 0) {
1070                 idtype = P_PGID;
1071                 id = (id_t)-args->pid;
1072         } else if (args->pid == 0) {
1073                 idtype = P_PGID;
1074                 p = td->td_proc;
1075                 PROC_LOCK(p);
1076                 id = p->p_pgid;
1077                 PROC_UNLOCK(p);
1078         } else {
1079                 idtype = P_PID;
1080                 id = (id_t)args->pid;
1081         }
1082
1083         return (linux_common_wait(td, idtype, id, args->status, options,
1084             args->rusage, NULL));
1085 }
1086
1087 int
1088 linux_waitid(struct thread *td, struct linux_waitid_args *args)
1089 {
1090         idtype_t idtype;
1091         int error, options;
1092
1093         if (args->options & ~(LINUX_WNOHANG | LINUX_WNOWAIT | LINUX_WEXITED |
1094             LINUX_WSTOPPED | LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
1095                 return (EINVAL);
1096
1097         options = 0;
1098         linux_to_bsd_waitopts(args->options, &options);
1099
1100         switch (args->idtype) {
1101         case LINUX_P_ALL:
1102                 idtype = P_ALL;
1103                 break;
1104         case LINUX_P_PID:
1105                 if (args->id <= 0)
1106                         return (EINVAL);
1107                 idtype = P_PID;
1108                 break;
1109         case LINUX_P_PGID:
1110                 if (args->id <= 0)
1111                         return (EINVAL);
1112                 idtype = P_PGID;
1113                 break;
1114         default:
1115                 return (EINVAL);
1116         }
1117
1118         error = linux_common_wait(td, idtype, args->id, NULL, options,
1119             args->rusage, args->info);
1120         td->td_retval[0] = 0;
1121
1122         return (error);
1123 }
1124
1125 #ifdef LINUX_LEGACY_SYSCALLS
1126 int
1127 linux_mknod(struct thread *td, struct linux_mknod_args *args)
1128 {
1129         char *path;
1130         int error;
1131         enum uio_seg seg;
1132         bool convpath;
1133
1134         convpath = LUSECONVPATH(td);
1135         if (!convpath) {
1136                 path = args->path;
1137                 seg = UIO_USERSPACE;
1138         } else {
1139                 LCONVPATHCREAT(args->path, &path);
1140                 seg = UIO_SYSSPACE;
1141         }
1142
1143         switch (args->mode & S_IFMT) {
1144         case S_IFIFO:
1145         case S_IFSOCK:
1146                 error = kern_mkfifoat(td, AT_FDCWD, path, seg,
1147                     args->mode);
1148                 break;
1149
1150         case S_IFCHR:
1151         case S_IFBLK:
1152                 error = kern_mknodat(td, AT_FDCWD, path, seg,
1153                     args->mode, args->dev);
1154                 break;
1155
1156         case S_IFDIR:
1157                 error = EPERM;
1158                 break;
1159
1160         case 0:
1161                 args->mode |= S_IFREG;
1162                 /* FALLTHROUGH */
1163         case S_IFREG:
1164                 error = kern_openat(td, AT_FDCWD, path, seg,
1165                     O_WRONLY | O_CREAT | O_TRUNC, args->mode);
1166                 if (error == 0)
1167                         kern_close(td, td->td_retval[0]);
1168                 break;
1169
1170         default:
1171                 error = EINVAL;
1172                 break;
1173         }
1174         if (convpath)
1175                 LFREEPATH(path);
1176         return (error);
1177 }
1178 #endif
1179
1180 int
1181 linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
1182 {
1183         char *path;
1184         int error, dfd;
1185         enum uio_seg seg;
1186         bool convpath;
1187
1188         dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
1189
1190         convpath = LUSECONVPATH(td);
1191         if (!convpath) {
1192                 path = __DECONST(char *, args->filename);
1193                 seg = UIO_USERSPACE;
1194         } else {
1195                 LCONVPATHCREAT_AT(args->filename, &path, dfd);
1196                 seg = UIO_SYSSPACE;
1197         }
1198
1199         switch (args->mode & S_IFMT) {
1200         case S_IFIFO:
1201         case S_IFSOCK:
1202                 error = kern_mkfifoat(td, dfd, path, seg, args->mode);
1203                 break;
1204
1205         case S_IFCHR:
1206         case S_IFBLK:
1207                 error = kern_mknodat(td, dfd, path, seg, args->mode,
1208                     args->dev);
1209                 break;
1210
1211         case S_IFDIR:
1212                 error = EPERM;
1213                 break;
1214
1215         case 0:
1216                 args->mode |= S_IFREG;
1217                 /* FALLTHROUGH */
1218         case S_IFREG:
1219                 error = kern_openat(td, dfd, path, seg,
1220                     O_WRONLY | O_CREAT | O_TRUNC, args->mode);
1221                 if (error == 0)
1222                         kern_close(td, td->td_retval[0]);
1223                 break;
1224
1225         default:
1226                 error = EINVAL;
1227                 break;
1228         }
1229         if (convpath)
1230                 LFREEPATH(path);
1231         return (error);
1232 }
1233
1234 /*
1235  * UGH! This is just about the dumbest idea I've ever heard!!
1236  */
1237 int
1238 linux_personality(struct thread *td, struct linux_personality_args *args)
1239 {
1240         struct linux_pemuldata *pem;
1241         struct proc *p = td->td_proc;
1242         uint32_t old;
1243
1244         PROC_LOCK(p);
1245         pem = pem_find(p);
1246         old = pem->persona;
1247         if (args->per != 0xffffffff)
1248                 pem->persona = args->per;
1249         PROC_UNLOCK(p);
1250
1251         td->td_retval[0] = old;
1252         return (0);
1253 }
1254
1255 struct l_itimerval {
1256         l_timeval it_interval;
1257         l_timeval it_value;
1258 };
1259
/*
 * Copy the four time fields of the itimerval-shaped structure pointed to
 * by lip into the one pointed to by bip (destination first).  It works in
 * either direction between struct itimerval and struct l_itimerval since
 * only the member names matter.
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement
 * and stays correct under an unbraced if/else.  Each argument is
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define	B2L_ITIMERVAL(bip, lip) do {					\
	(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec;		\
	(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec;	\
	(bip)->it_value.tv_sec = (lip)->it_value.tv_sec;		\
	(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;		\
} while (0)
1265
1266 int
1267 linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
1268 {
1269         int error;
1270         struct l_itimerval ls;
1271         struct itimerval aitv, oitv;
1272
1273         if (uap->itv == NULL) {
1274                 uap->itv = uap->oitv;
1275                 return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
1276         }
1277
1278         error = copyin(uap->itv, &ls, sizeof(ls));
1279         if (error != 0)
1280                 return (error);
1281         B2L_ITIMERVAL(&aitv, &ls);
1282         error = kern_setitimer(td, uap->which, &aitv, &oitv);
1283         if (error != 0 || uap->oitv == NULL)
1284                 return (error);
1285         B2L_ITIMERVAL(&ls, &oitv);
1286
1287         return (copyout(&ls, uap->oitv, sizeof(ls)));
1288 }
1289
1290 int
1291 linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
1292 {
1293         int error;
1294         struct l_itimerval ls;
1295         struct itimerval aitv;
1296
1297         error = kern_getitimer(td, uap->which, &aitv);
1298         if (error != 0)
1299                 return (error);
1300         B2L_ITIMERVAL(&ls, &aitv);
1301         return (copyout(&ls, uap->itv, sizeof(ls)));
1302 }
1303
1304 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1305 int
1306 linux_nice(struct thread *td, struct linux_nice_args *args)
1307 {
1308
1309         return (kern_setpriority(td, PRIO_PROCESS, 0, args->inc));
1310 }
1311 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1312
1313 int
1314 linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
1315 {
1316         struct ucred *newcred, *oldcred;
1317         l_gid_t *linux_gidset;
1318         gid_t *bsd_gidset;
1319         int ngrp, error;
1320         struct proc *p;
1321
1322         ngrp = args->gidsetsize;
1323         if (ngrp < 0 || ngrp >= ngroups_max + 1)
1324                 return (EINVAL);
1325         linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK);
1326         error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
1327         if (error)
1328                 goto out;
1329         newcred = crget();
1330         crextend(newcred, ngrp + 1);
1331         p = td->td_proc;
1332         PROC_LOCK(p);
1333         oldcred = p->p_ucred;
1334         crcopy(newcred, oldcred);
1335
1336         /*
1337          * cr_groups[0] holds egid. Setting the whole set from
1338          * the supplied set will cause egid to be changed too.
1339          * Keep cr_groups[0] unchanged to prevent that.
1340          */
1341
1342         if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS)) != 0) {
1343                 PROC_UNLOCK(p);
1344                 crfree(newcred);
1345                 goto out;
1346         }
1347
1348         if (ngrp > 0) {
1349                 newcred->cr_ngroups = ngrp + 1;
1350
1351                 bsd_gidset = newcred->cr_groups;
1352                 ngrp--;
1353                 while (ngrp >= 0) {
1354                         bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
1355                         ngrp--;
1356                 }
1357         } else
1358                 newcred->cr_ngroups = 1;
1359
1360         setsugid(p);
1361         proc_set_cred(p, newcred);
1362         PROC_UNLOCK(p);
1363         crfree(oldcred);
1364         error = 0;
1365 out:
1366         free(linux_gidset, M_LINUX);
1367         return (error);
1368 }
1369
1370 int
1371 linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
1372 {
1373         struct ucred *cred;
1374         l_gid_t *linux_gidset;
1375         gid_t *bsd_gidset;
1376         int bsd_gidsetsz, ngrp, error;
1377
1378         cred = td->td_ucred;
1379         bsd_gidset = cred->cr_groups;
1380         bsd_gidsetsz = cred->cr_ngroups - 1;
1381
1382         /*
1383          * cr_groups[0] holds egid. Returning the whole set
1384          * here will cause a duplicate. Exclude cr_groups[0]
1385          * to prevent that.
1386          */
1387
1388         if ((ngrp = args->gidsetsize) == 0) {
1389                 td->td_retval[0] = bsd_gidsetsz;
1390                 return (0);
1391         }
1392
1393         if (ngrp < bsd_gidsetsz)
1394                 return (EINVAL);
1395
1396         ngrp = 0;
1397         linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
1398             M_LINUX, M_WAITOK);
1399         while (ngrp < bsd_gidsetsz) {
1400                 linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
1401                 ngrp++;
1402         }
1403
1404         error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
1405         free(linux_gidset, M_LINUX);
1406         if (error)
1407                 return (error);
1408
1409         td->td_retval[0] = ngrp;
1410         return (0);
1411 }
1412
1413 static bool
1414 linux_get_dummy_limit(l_uint resource, struct rlimit *rlim)
1415 {
1416
1417         if (linux_dummy_rlimits == 0)
1418                 return (false);
1419
1420         switch (resource) {
1421         case LINUX_RLIMIT_LOCKS:
1422         case LINUX_RLIMIT_SIGPENDING:
1423         case LINUX_RLIMIT_MSGQUEUE:
1424         case LINUX_RLIMIT_RTTIME:
1425                 rlim->rlim_cur = LINUX_RLIM_INFINITY;
1426                 rlim->rlim_max = LINUX_RLIM_INFINITY;
1427                 return (true);
1428         case LINUX_RLIMIT_NICE:
1429         case LINUX_RLIMIT_RTPRIO:
1430                 rlim->rlim_cur = 0;
1431                 rlim->rlim_max = 0;
1432                 return (true);
1433         default:
1434                 return (false);
1435         }
1436 }
1437
1438 int
1439 linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
1440 {
1441         struct rlimit bsd_rlim;
1442         struct l_rlimit rlim;
1443         u_int which;
1444         int error;
1445
1446         if (args->resource >= LINUX_RLIM_NLIMITS)
1447                 return (EINVAL);
1448
1449         which = linux_to_bsd_resource[args->resource];
1450         if (which == -1)
1451                 return (EINVAL);
1452
1453         error = copyin(args->rlim, &rlim, sizeof(rlim));
1454         if (error)
1455                 return (error);
1456
1457         bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
1458         bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
1459         return (kern_setrlimit(td, which, &bsd_rlim));
1460 }
1461
1462 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1463 int
1464 linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
1465 {
1466         struct l_rlimit rlim;
1467         struct rlimit bsd_rlim;
1468         u_int which;
1469
1470         if (linux_get_dummy_limit(args->resource, &bsd_rlim)) {
1471                 rlim.rlim_cur = bsd_rlim.rlim_cur;
1472                 rlim.rlim_max = bsd_rlim.rlim_max;
1473                 return (copyout(&rlim, args->rlim, sizeof(rlim)));
1474         }
1475
1476         if (args->resource >= LINUX_RLIM_NLIMITS)
1477                 return (EINVAL);
1478
1479         which = linux_to_bsd_resource[args->resource];
1480         if (which == -1)
1481                 return (EINVAL);
1482
1483         lim_rlimit(td, which, &bsd_rlim);
1484
1485 #ifdef COMPAT_LINUX32
1486         rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
1487         if (rlim.rlim_cur == UINT_MAX)
1488                 rlim.rlim_cur = INT_MAX;
1489         rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
1490         if (rlim.rlim_max == UINT_MAX)
1491                 rlim.rlim_max = INT_MAX;
1492 #else
1493         rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
1494         if (rlim.rlim_cur == ULONG_MAX)
1495                 rlim.rlim_cur = LONG_MAX;
1496         rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
1497         if (rlim.rlim_max == ULONG_MAX)
1498                 rlim.rlim_max = LONG_MAX;
1499 #endif
1500         return (copyout(&rlim, args->rlim, sizeof(rlim)));
1501 }
1502 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1503
1504 int
1505 linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
1506 {
1507         struct l_rlimit rlim;
1508         struct rlimit bsd_rlim;
1509         u_int which;
1510
1511         if (linux_get_dummy_limit(args->resource, &bsd_rlim)) {
1512                 rlim.rlim_cur = bsd_rlim.rlim_cur;
1513                 rlim.rlim_max = bsd_rlim.rlim_max;
1514                 return (copyout(&rlim, args->rlim, sizeof(rlim)));
1515         }
1516
1517         if (args->resource >= LINUX_RLIM_NLIMITS)
1518                 return (EINVAL);
1519
1520         which = linux_to_bsd_resource[args->resource];
1521         if (which == -1)
1522                 return (EINVAL);
1523
1524         lim_rlimit(td, which, &bsd_rlim);
1525
1526         rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
1527         rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
1528         return (copyout(&rlim, args->rlim, sizeof(rlim)));
1529 }
1530
1531 int
1532 linux_sched_setscheduler(struct thread *td,
1533     struct linux_sched_setscheduler_args *args)
1534 {
1535         struct sched_param sched_param;
1536         struct thread *tdt;
1537         int error, policy;
1538
1539         switch (args->policy) {
1540         case LINUX_SCHED_OTHER:
1541                 policy = SCHED_OTHER;
1542                 break;
1543         case LINUX_SCHED_FIFO:
1544                 policy = SCHED_FIFO;
1545                 break;
1546         case LINUX_SCHED_RR:
1547                 policy = SCHED_RR;
1548                 break;
1549         default:
1550                 return (EINVAL);
1551         }
1552
1553         error = copyin(args->param, &sched_param, sizeof(sched_param));
1554         if (error)
1555                 return (error);
1556
1557         if (linux_map_sched_prio) {
1558                 switch (policy) {
1559                 case SCHED_OTHER:
1560                         if (sched_param.sched_priority != 0)
1561                                 return (EINVAL);
1562
1563                         sched_param.sched_priority =
1564                             PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
1565                         break;
1566                 case SCHED_FIFO:
1567                 case SCHED_RR:
1568                         if (sched_param.sched_priority < 1 ||
1569                             sched_param.sched_priority >= LINUX_MAX_RT_PRIO)
1570                                 return (EINVAL);
1571
1572                         /*
1573                          * Map [1, LINUX_MAX_RT_PRIO - 1] to
1574                          * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
1575                          */
1576                         sched_param.sched_priority =
1577                             (sched_param.sched_priority - 1) *
1578                             (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
1579                             (LINUX_MAX_RT_PRIO - 1);
1580                         break;
1581                 }
1582         }
1583
1584         tdt = linux_tdfind(td, args->pid, -1);
1585         if (tdt == NULL)
1586                 return (ESRCH);
1587
1588         error = kern_sched_setscheduler(td, tdt, policy, &sched_param);
1589         PROC_UNLOCK(tdt->td_proc);
1590         return (error);
1591 }
1592
1593 int
1594 linux_sched_getscheduler(struct thread *td,
1595     struct linux_sched_getscheduler_args *args)
1596 {
1597         struct thread *tdt;
1598         int error, policy;
1599
1600         tdt = linux_tdfind(td, args->pid, -1);
1601         if (tdt == NULL)
1602                 return (ESRCH);
1603
1604         error = kern_sched_getscheduler(td, tdt, &policy);
1605         PROC_UNLOCK(tdt->td_proc);
1606
1607         switch (policy) {
1608         case SCHED_OTHER:
1609                 td->td_retval[0] = LINUX_SCHED_OTHER;
1610                 break;
1611         case SCHED_FIFO:
1612                 td->td_retval[0] = LINUX_SCHED_FIFO;
1613                 break;
1614         case SCHED_RR:
1615                 td->td_retval[0] = LINUX_SCHED_RR;
1616                 break;
1617         }
1618         return (error);
1619 }
1620
1621 int
1622 linux_sched_get_priority_max(struct thread *td,
1623     struct linux_sched_get_priority_max_args *args)
1624 {
1625         struct sched_get_priority_max_args bsd;
1626
1627         if (linux_map_sched_prio) {
1628                 switch (args->policy) {
1629                 case LINUX_SCHED_OTHER:
1630                         td->td_retval[0] = 0;
1631                         return (0);
1632                 case LINUX_SCHED_FIFO:
1633                 case LINUX_SCHED_RR:
1634                         td->td_retval[0] = LINUX_MAX_RT_PRIO - 1;
1635                         return (0);
1636                 default:
1637                         return (EINVAL);
1638                 }
1639         }
1640
1641         switch (args->policy) {
1642         case LINUX_SCHED_OTHER:
1643                 bsd.policy = SCHED_OTHER;
1644                 break;
1645         case LINUX_SCHED_FIFO:
1646                 bsd.policy = SCHED_FIFO;
1647                 break;
1648         case LINUX_SCHED_RR:
1649                 bsd.policy = SCHED_RR;
1650                 break;
1651         default:
1652                 return (EINVAL);
1653         }
1654         return (sys_sched_get_priority_max(td, &bsd));
1655 }
1656
1657 int
1658 linux_sched_get_priority_min(struct thread *td,
1659     struct linux_sched_get_priority_min_args *args)
1660 {
1661         struct sched_get_priority_min_args bsd;
1662
1663         if (linux_map_sched_prio) {
1664                 switch (args->policy) {
1665                 case LINUX_SCHED_OTHER:
1666                         td->td_retval[0] = 0;
1667                         return (0);
1668                 case LINUX_SCHED_FIFO:
1669                 case LINUX_SCHED_RR:
1670                         td->td_retval[0] = 1;
1671                         return (0);
1672                 default:
1673                         return (EINVAL);
1674                 }
1675         }
1676
1677         switch (args->policy) {
1678         case LINUX_SCHED_OTHER:
1679                 bsd.policy = SCHED_OTHER;
1680                 break;
1681         case LINUX_SCHED_FIFO:
1682                 bsd.policy = SCHED_FIFO;
1683                 break;
1684         case LINUX_SCHED_RR:
1685                 bsd.policy = SCHED_RR;
1686                 break;
1687         default:
1688                 return (EINVAL);
1689         }
1690         return (sys_sched_get_priority_min(td, &bsd));
1691 }
1692
/*
 * Constants used by linux_reboot().
 *
 * REBOOT_MAGIC1 must match the first magic argument and one of the
 * REBOOT_MAGIC2* values the second; the remaining constants are the
 * command codes.
 */
#define	REBOOT_MAGIC1	0xfee1dead
#define	REBOOT_MAGIC2	0x28121969
#define	REBOOT_MAGIC2A	0x05121996
#define	REBOOT_MAGIC2B	0x16041998

#define	REBOOT_CAD_ON	0x89abcdef
#define	REBOOT_CAD_OFF	0
#define	REBOOT_HALT	0xcdef0123
#define	REBOOT_RESTART	0x01234567
#define	REBOOT_RESTART2	0xA1B2C3D4
#define	REBOOT_POWEROFF	0x4321FEDC
1703
1704 int
1705 linux_reboot(struct thread *td, struct linux_reboot_args *args)
1706 {
1707         struct reboot_args bsd_args;
1708
1709         if (args->magic1 != REBOOT_MAGIC1)
1710                 return (EINVAL);
1711
1712         switch (args->magic2) {
1713         case REBOOT_MAGIC2:
1714         case REBOOT_MAGIC2A:
1715         case REBOOT_MAGIC2B:
1716                 break;
1717         default:
1718                 return (EINVAL);
1719         }
1720
1721         switch (args->cmd) {
1722         case REBOOT_CAD_ON:
1723         case REBOOT_CAD_OFF:
1724                 return (priv_check(td, PRIV_REBOOT));
1725         case REBOOT_HALT:
1726                 bsd_args.opt = RB_HALT;
1727                 break;
1728         case REBOOT_RESTART:
1729         case REBOOT_RESTART2:
1730                 bsd_args.opt = 0;
1731                 break;
1732         case REBOOT_POWEROFF:
1733                 bsd_args.opt = RB_POWEROFF;
1734                 break;
1735         default:
1736                 return (EINVAL);
1737         }
1738         return (sys_reboot(td, &bsd_args));
1739 }
1740
1741 int
1742 linux_getpid(struct thread *td, struct linux_getpid_args *args)
1743 {
1744
1745         td->td_retval[0] = td->td_proc->p_pid;
1746
1747         return (0);
1748 }
1749
1750 int
1751 linux_gettid(struct thread *td, struct linux_gettid_args *args)
1752 {
1753         struct linux_emuldata *em;
1754
1755         em = em_find(td);
1756         KASSERT(em != NULL, ("gettid: emuldata not found.\n"));
1757
1758         td->td_retval[0] = em->em_tid;
1759
1760         return (0);
1761 }
1762
1763 int
1764 linux_getppid(struct thread *td, struct linux_getppid_args *args)
1765 {
1766
1767         td->td_retval[0] = kern_getppid(td);
1768         return (0);
1769 }
1770
1771 int
1772 linux_getgid(struct thread *td, struct linux_getgid_args *args)
1773 {
1774
1775         td->td_retval[0] = td->td_ucred->cr_rgid;
1776         return (0);
1777 }
1778
1779 int
1780 linux_getuid(struct thread *td, struct linux_getuid_args *args)
1781 {
1782
1783         td->td_retval[0] = td->td_ucred->cr_ruid;
1784         return (0);
1785 }
1786
1787 int
1788 linux_getsid(struct thread *td, struct linux_getsid_args *args)
1789 {
1790
1791         return (kern_getsid(td, args->pid));
1792 }
1793
/*
 * Handler that ignores its arguments and always fails with ENOSYS.
 */
int
linux_nosys(struct thread *td, struct nosys_args *ignore)
{
	const int rv = ENOSYS;

	return (rv);
}
1800
1801 int
1802 linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
1803 {
1804         int error;
1805
1806         error = kern_getpriority(td, args->which, args->who);
1807         td->td_retval[0] = 20 - td->td_retval[0];
1808         return (error);
1809 }
1810
1811 int
1812 linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
1813 {
1814         int name[2];
1815
1816         name[0] = CTL_KERN;
1817         name[1] = KERN_HOSTNAME;
1818         return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
1819             args->len, 0, 0));
1820 }
1821
1822 int
1823 linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
1824 {
1825         int name[2];
1826
1827         name[0] = CTL_KERN;
1828         name[1] = KERN_NISDOMAINNAME;
1829         return (userland_sysctl(td, name, 2, 0, 0, 0, args->name,
1830             args->len, 0, 0));
1831 }
1832
1833 int
1834 linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
1835 {
1836
1837         LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid,
1838             args->error_code);
1839
1840         /*
1841          * XXX: we should send a signal to the parent if
1842          * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
1843          * as it doesnt occur often.
1844          */
1845         exit1(td, args->error_code, 0);
1846                 /* NOTREACHED */
1847 }
1848
/*
 * Version magics recognized in l_user_cap_header.version by linux_capget()
 * and linux_capset().  Version 1 is handled with a single l_user_cap_data
 * element, versions 2 and 3 with two.
 */
#define _LINUX_CAPABILITY_VERSION_1  0x19980330
#define _LINUX_CAPABILITY_VERSION_2  0x20071026
#define _LINUX_CAPABILITY_VERSION_3  0x20080522

/*
 * Header copied in from (and, for an unrecognized version, back out to)
 * the user's hdrp pointer.
 */
struct l_user_cap_header {
        l_int   version;        /* one of the version magics above */
        l_int   pid;            /* target; only 0 is accepted by the handlers */
};

/*
 * One element of the capability data array exchanged through the user's
 * datap pointer.
 */
struct l_user_cap_data {
        l_int   effective;
        l_int   permitted;
        l_int   inheritable;
};
1863
1864 int
1865 linux_capget(struct thread *td, struct linux_capget_args *uap)
1866 {
1867         struct l_user_cap_header luch;
1868         struct l_user_cap_data lucd[2];
1869         int error, u32s;
1870
1871         if (uap->hdrp == NULL)
1872                 return (EFAULT);
1873
1874         error = copyin(uap->hdrp, &luch, sizeof(luch));
1875         if (error != 0)
1876                 return (error);
1877
1878         switch (luch.version) {
1879         case _LINUX_CAPABILITY_VERSION_1:
1880                 u32s = 1;
1881                 break;
1882         case _LINUX_CAPABILITY_VERSION_2:
1883         case _LINUX_CAPABILITY_VERSION_3:
1884                 u32s = 2;
1885                 break;
1886         default:
1887                 luch.version = _LINUX_CAPABILITY_VERSION_1;
1888                 error = copyout(&luch, uap->hdrp, sizeof(luch));
1889                 if (error)
1890                         return (error);
1891                 return (EINVAL);
1892         }
1893
1894         if (luch.pid)
1895                 return (EPERM);
1896
1897         if (uap->datap) {
1898                 /*
1899                  * The current implementation doesn't support setting
1900                  * a capability (it's essentially a stub) so indicate
1901                  * that no capabilities are currently set or available
1902                  * to request.
1903                  */
1904                 memset(&lucd, 0, u32s * sizeof(lucd[0]));
1905                 error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0]));
1906         }
1907
1908         return (error);
1909 }
1910
1911 int
1912 linux_capset(struct thread *td, struct linux_capset_args *uap)
1913 {
1914         struct l_user_cap_header luch;
1915         struct l_user_cap_data lucd[2];
1916         int error, i, u32s;
1917
1918         if (uap->hdrp == NULL || uap->datap == NULL)
1919                 return (EFAULT);
1920
1921         error = copyin(uap->hdrp, &luch, sizeof(luch));
1922         if (error != 0)
1923                 return (error);
1924
1925         switch (luch.version) {
1926         case _LINUX_CAPABILITY_VERSION_1:
1927                 u32s = 1;
1928                 break;
1929         case _LINUX_CAPABILITY_VERSION_2:
1930         case _LINUX_CAPABILITY_VERSION_3:
1931                 u32s = 2;
1932                 break;
1933         default:
1934                 luch.version = _LINUX_CAPABILITY_VERSION_1;
1935                 error = copyout(&luch, uap->hdrp, sizeof(luch));
1936                 if (error)
1937                         return (error);
1938                 return (EINVAL);
1939         }
1940
1941         if (luch.pid)
1942                 return (EPERM);
1943
1944         error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0]));
1945         if (error != 0)
1946                 return (error);
1947
1948         /* We currently don't support setting any capabilities. */
1949         for (i = 0; i < u32s; i++) {
1950                 if (lucd[i].effective || lucd[i].permitted ||
1951                     lucd[i].inheritable) {
1952                         linux_msg(td,
1953                             "capset[%d] effective=0x%x, permitted=0x%x, "
1954                             "inheritable=0x%x is not implemented", i,
1955                             (int)lucd[i].effective, (int)lucd[i].permitted,
1956                             (int)lucd[i].inheritable);
1957                         return (EPERM);
1958                 }
1959         }
1960
1961         return (0);
1962 }
1963
/*
 * Emulate Linux prctl() for the calling process.
 *
 * The request is selected by args->option and its operand is args->arg2,
 * used either as a plain value or as a user-space address depending on
 * the option.  Several options return directly from inside the switch;
 * the rest leave their status in 'error' and fall out to the common
 * return at the bottom.  Options without a case, and a few that are
 * recognized but unsupported, fail with EINVAL.
 */
int
linux_prctl(struct thread *td, struct linux_prctl_args *args)
{
        int error = 0, max_size, arg;
        struct proc *p = td->td_proc;
        char comm[LINUX_MAX_COMM_LEN];
        int pdeath_signal, trace_state;

        switch (args->option) {
        case LINUX_PR_SET_PDEATHSIG:
                /* arg2 is a Linux signal number; translate before use. */
                if (!LINUX_SIG_VALID(args->arg2))
                        return (EINVAL);
                pdeath_signal = linux_to_bsd_signal(args->arg2);
                return (kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_CTL,
                    &pdeath_signal));
        case LINUX_PR_GET_PDEATHSIG:
                /* arg2 is the user address that receives the signal number. */
                error = kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_STATUS,
                    &pdeath_signal);
                if (error != 0)
                        return (error);
                pdeath_signal = bsd_to_linux_signal(pdeath_signal);
                return (copyout(&pdeath_signal,
                    (void *)(register_t)args->arg2,
                    sizeof(pdeath_signal)));
        /*
         * In Linux, this flag controls if set[gu]id processes can coredump.
         * There are additional semantics imposed on processes that cannot
         * coredump:
         * - Such processes can not be ptraced.
         * - There are some semantics around ownership of process-related files
         *   in the /proc namespace.
         *
         * In FreeBSD, we can (and by default, do) disable setuid coredump
         * system-wide with 'sugid_coredump.'  We control tracability on a
         * per-process basis with the procctl PROC_TRACE (=> P2_NOTRACE flag).
         * By happy coincidence, P2_NOTRACE also prevents coredumping.  So the
         * procctl is roughly analogous to Linux's DUMPABLE.
         *
         * So, proxy these knobs to the corresponding PROC_TRACE setting.
         */
        case LINUX_PR_GET_DUMPABLE:
                error = kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_STATUS,
                    &trace_state);
                if (error != 0)
                        return (error);
                /* Any status other than -1 is reported as dumpable (1). */
                td->td_retval[0] = (trace_state != -1);
                return (0);
        case LINUX_PR_SET_DUMPABLE:
                /*
                 * It is only valid for userspace to set one of these two
                 * flags, and only one at a time.
                 */
                switch (args->arg2) {
                case LINUX_SUID_DUMP_DISABLE:
                        trace_state = PROC_TRACE_CTL_DISABLE_EXEC;
                        break;
                case LINUX_SUID_DUMP_USER:
                        trace_state = PROC_TRACE_CTL_ENABLE;
                        break;
                default:
                        return (EINVAL);
                }
                return (kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_CTL,
                    &trace_state));
        case LINUX_PR_GET_KEEPCAPS:
                /*
                 * Indicate that we always clear the effective and
                 * permitted capability sets when the user id becomes
                 * non-zero (actually the capability sets are simply
                 * always zero in the current implementation).
                 */
                td->td_retval[0] = 0;
                break;
        case LINUX_PR_SET_KEEPCAPS:
                /*
                 * Ignore requests to keep the effective and permitted
                 * capability sets when the user id becomes non-zero.
                 */
                break;
        case LINUX_PR_SET_NAME:
                /*
                 * To be on the safe side we need to make sure to not
                 * overflow the size a Linux program expects. We already
                 * do this here in the copyin, so that we don't need to
                 * check on copyout.
                 */
                max_size = MIN(sizeof(comm), sizeof(p->p_comm));
                error = copyinstr((void *)(register_t)args->arg2, comm,
                    max_size, NULL);

                /* Linux silently truncates the name if it is too long. */
                if (error == ENAMETOOLONG) {
                        /*
                         * XXX: copyinstr() isn't documented to populate the
                         * array completely, so do a copyin() to be on the
                         * safe side. This should be changed in case
                         * copyinstr() is changed to guarantee this.
                         */
                        error = copyin((void *)(register_t)args->arg2, comm,
                            max_size - 1);
                        comm[max_size - 1] = '\0';
                }
                if (error)
                        return (error);

                /* Publish the new name under the process lock. */
                PROC_LOCK(p);
                strlcpy(p->p_comm, comm, sizeof(p->p_comm));
                PROC_UNLOCK(p);
                break;
        case LINUX_PR_GET_NAME:
                /*
                 * Take a private copy under the process lock, then copy
                 * the string and its terminator out with the lock dropped.
                 */
                PROC_LOCK(p);
                strlcpy(comm, p->p_comm, sizeof(comm));
                PROC_UNLOCK(p);
                error = copyout(comm, (void *)(register_t)args->arg2,
                    strlen(comm) + 1);
                break;
        case LINUX_PR_GET_SECCOMP:
        case LINUX_PR_SET_SECCOMP:
                /*
                 * Same as returned by Linux without CONFIG_SECCOMP enabled.
                 */
                error = EINVAL;
                break;
        case LINUX_PR_CAPBSET_READ:
#if 0
                /*
                 * This makes too much noise with Ubuntu Focal.
                 */
                linux_msg(td, "unsupported prctl PR_CAPBSET_READ %d",
                    (int)args->arg2);
#endif
                error = EINVAL;
                break;
        case LINUX_PR_SET_NO_NEW_PRIVS:
                /*
                 * Only arg2 == 1 requests the enable operation; every
                 * other value is forwarded as a disable request and the
                 * outcome is whatever kern_procctl() returns for it.
                 */
                arg = args->arg2 == 1 ?
                    PROC_NO_NEW_PRIVS_ENABLE : PROC_NO_NEW_PRIVS_DISABLE;
                error = kern_procctl(td, P_PID, p->p_pid,
                    PROC_NO_NEW_PRIVS_CTL, &arg);
                break;
        case LINUX_PR_SET_PTRACER:
                linux_msg(td, "unsupported prctl PR_SET_PTRACER");
                error = EINVAL;
                break;
        default:
                /* Unknown option: log it and reject the request. */
                linux_msg(td, "unsupported prctl option %d", args->option);
                error = EINVAL;
                break;
        }

        return (error);
}
2115
2116 int
2117 linux_sched_setparam(struct thread *td,
2118     struct linux_sched_setparam_args *uap)
2119 {
2120         struct sched_param sched_param;
2121         struct thread *tdt;
2122         int error, policy;
2123
2124         error = copyin(uap->param, &sched_param, sizeof(sched_param));
2125         if (error)
2126                 return (error);
2127
2128         tdt = linux_tdfind(td, uap->pid, -1);
2129         if (tdt == NULL)
2130                 return (ESRCH);
2131
2132         if (linux_map_sched_prio) {
2133                 error = kern_sched_getscheduler(td, tdt, &policy);
2134                 if (error)
2135                         goto out;
2136
2137                 switch (policy) {
2138                 case SCHED_OTHER:
2139                         if (sched_param.sched_priority != 0) {
2140                                 error = EINVAL;
2141                                 goto out;
2142                         }
2143                         sched_param.sched_priority =
2144                             PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
2145                         break;
2146                 case SCHED_FIFO:
2147                 case SCHED_RR:
2148                         if (sched_param.sched_priority < 1 ||
2149                             sched_param.sched_priority >= LINUX_MAX_RT_PRIO) {
2150                                 error = EINVAL;
2151                                 goto out;
2152                         }
2153                         /*
2154                          * Map [1, LINUX_MAX_RT_PRIO - 1] to
2155                          * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
2156                          */
2157                         sched_param.sched_priority =
2158                             (sched_param.sched_priority - 1) *
2159                             (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
2160                             (LINUX_MAX_RT_PRIO - 1);
2161                         break;
2162                 }
2163         }
2164
2165         error = kern_sched_setparam(td, tdt, &sched_param);
2166 out:    PROC_UNLOCK(tdt->td_proc);
2167         return (error);
2168 }
2169
2170 int
2171 linux_sched_getparam(struct thread *td,
2172     struct linux_sched_getparam_args *uap)
2173 {
2174         struct sched_param sched_param;
2175         struct thread *tdt;
2176         int error, policy;
2177
2178         tdt = linux_tdfind(td, uap->pid, -1);
2179         if (tdt == NULL)
2180                 return (ESRCH);
2181
2182         error = kern_sched_getparam(td, tdt, &sched_param);
2183         if (error) {
2184                 PROC_UNLOCK(tdt->td_proc);
2185                 return (error);
2186         }
2187
2188         if (linux_map_sched_prio) {
2189                 error = kern_sched_getscheduler(td, tdt, &policy);
2190                 PROC_UNLOCK(tdt->td_proc);
2191                 if (error)
2192                         return (error);
2193
2194                 switch (policy) {
2195                 case SCHED_OTHER:
2196                         sched_param.sched_priority = 0;
2197                         break;
2198                 case SCHED_FIFO:
2199                 case SCHED_RR:
2200                         /*
2201                          * Map [0, RTP_PRIO_MAX - RTP_PRIO_MIN] to
2202                          * [1, LINUX_MAX_RT_PRIO - 1] (rounding up).
2203                          */
2204                         sched_param.sched_priority =
2205                             (sched_param.sched_priority *
2206                             (LINUX_MAX_RT_PRIO - 1) +
2207                             (RTP_PRIO_MAX - RTP_PRIO_MIN - 1)) /
2208                             (RTP_PRIO_MAX - RTP_PRIO_MIN) + 1;
2209                         break;
2210                 }
2211         } else
2212                 PROC_UNLOCK(tdt->td_proc);
2213
2214         error = copyout(&sched_param, uap->param, sizeof(sched_param));
2215         return (error);
2216 }
2217
2218 /*
2219  * Get affinity of a process.
2220  */
2221 int
2222 linux_sched_getaffinity(struct thread *td,
2223     struct linux_sched_getaffinity_args *args)
2224 {
2225         int error;
2226         struct thread *tdt;
2227
2228         if (args->len < sizeof(cpuset_t))
2229                 return (EINVAL);
2230
2231         tdt = linux_tdfind(td, args->pid, -1);
2232         if (tdt == NULL)
2233                 return (ESRCH);
2234
2235         PROC_UNLOCK(tdt->td_proc);
2236
2237         error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
2238             tdt->td_tid, sizeof(cpuset_t), (cpuset_t *)args->user_mask_ptr);
2239         if (error == 0)
2240                 td->td_retval[0] = sizeof(cpuset_t);
2241
2242         return (error);
2243 }
2244
2245 /*
2246  *  Set affinity of a process.
2247  */
2248 int
2249 linux_sched_setaffinity(struct thread *td,
2250     struct linux_sched_setaffinity_args *args)
2251 {
2252         struct thread *tdt;
2253
2254         if (args->len < sizeof(cpuset_t))
2255                 return (EINVAL);
2256
2257         tdt = linux_tdfind(td, args->pid, -1);
2258         if (tdt == NULL)
2259                 return (ESRCH);
2260
2261         PROC_UNLOCK(tdt->td_proc);
2262
2263         return (kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
2264             tdt->td_tid, sizeof(cpuset_t), (cpuset_t *) args->user_mask_ptr));
2265 }
2266
/*
 * Resource limit pair in the layout linux_prlimit64() copies out to the
 * user's 'old' pointer: both values are unsigned 64-bit.
 */
struct linux_rlimit64 {
        uint64_t        rlim_cur;       /* current limit */
        uint64_t        rlim_max;       /* maximum limit */
};
2271
2272 int
2273 linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args)
2274 {
2275         struct rlimit rlim, nrlim;
2276         struct linux_rlimit64 lrlim;
2277         struct proc *p;
2278         u_int which;
2279         int flags;
2280         int error;
2281
2282         if (args->new == NULL && args->old != NULL) {
2283                 if (linux_get_dummy_limit(args->resource, &rlim)) {
2284                         lrlim.rlim_cur = rlim.rlim_cur;
2285                         lrlim.rlim_max = rlim.rlim_max;
2286                         return (copyout(&lrlim, args->old, sizeof(lrlim)));
2287                 }
2288         }
2289
2290         if (args->resource >= LINUX_RLIM_NLIMITS)
2291                 return (EINVAL);
2292
2293         which = linux_to_bsd_resource[args->resource];
2294         if (which == -1)
2295                 return (EINVAL);
2296
2297         if (args->new != NULL) {
2298                 /*
2299                  * Note. Unlike FreeBSD where rlim is signed 64-bit Linux
2300                  * rlim is unsigned 64-bit. FreeBSD treats negative limits
2301                  * as INFINITY so we do not need a conversion even.
2302                  */
2303                 error = copyin(args->new, &nrlim, sizeof(nrlim));
2304                 if (error != 0)
2305                         return (error);
2306         }
2307
2308         flags = PGET_HOLD | PGET_NOTWEXIT;
2309         if (args->new != NULL)
2310                 flags |= PGET_CANDEBUG;
2311         else
2312                 flags |= PGET_CANSEE;
2313         if (args->pid == 0) {
2314                 p = td->td_proc;
2315                 PHOLD(p);
2316         } else {
2317                 error = pget(args->pid, flags, &p);
2318                 if (error != 0)
2319                         return (error);
2320         }
2321         if (args->old != NULL) {
2322                 PROC_LOCK(p);
2323                 lim_rlimit_proc(p, which, &rlim);
2324                 PROC_UNLOCK(p);
2325                 if (rlim.rlim_cur == RLIM_INFINITY)
2326                         lrlim.rlim_cur = LINUX_RLIM_INFINITY;
2327                 else
2328                         lrlim.rlim_cur = rlim.rlim_cur;
2329                 if (rlim.rlim_max == RLIM_INFINITY)
2330                         lrlim.rlim_max = LINUX_RLIM_INFINITY;
2331                 else
2332                         lrlim.rlim_max = rlim.rlim_max;
2333                 error = copyout(&lrlim, args->old, sizeof(lrlim));
2334                 if (error != 0)
2335                         goto out;
2336         }
2337
2338         if (args->new != NULL)
2339                 error = kern_proc_setrlimit(td, p, which, &nrlim);
2340
2341  out:
2342         PRELE(p);
2343         return (error);
2344 }
2345
2346 int
2347 linux_pselect6(struct thread *td, struct linux_pselect6_args *args)
2348 {
2349         struct l_timespec lts;
2350         struct timespec ts, *tsp;
2351         int error;
2352
2353         if (args->tsp != NULL) {
2354                 error = copyin(args->tsp, &lts, sizeof(lts));
2355                 if (error != 0)
2356                         return (error);
2357                 error = linux_to_native_timespec(&ts, &lts);
2358                 if (error != 0)
2359                         return (error);
2360                 tsp = &ts;
2361         } else
2362                 tsp = NULL;
2363
2364         error = linux_common_pselect6(td, args->nfds, args->readfds,
2365             args->writefds, args->exceptfds, tsp, args->sig);
2366         if (error != 0)
2367                 return (error);
2368
2369         if (args->tsp != NULL) {
2370                 error = native_to_linux_timespec(&lts, tsp);
2371                 if (error == 0)
2372                         error = copyout(&lts, args->tsp, sizeof(lts));
2373         }
2374         return (error);
2375 }
2376
2377 static int
2378 linux_common_pselect6(struct thread *td, l_int nfds, l_fd_set *readfds,
2379     l_fd_set *writefds, l_fd_set *exceptfds, struct timespec *tsp,
2380     l_uintptr_t *sig)
2381 {
2382         struct timeval utv, tv0, tv1, *tvp;
2383         struct l_pselect6arg lpse6;
2384         l_sigset_t l_ss;
2385         sigset_t *ssp;
2386         sigset_t ss;
2387         int error;
2388
2389         ssp = NULL;
2390         if (sig != NULL) {
2391                 error = copyin(sig, &lpse6, sizeof(lpse6));
2392                 if (error != 0)
2393                         return (error);
2394                 if (lpse6.ss_len != sizeof(l_ss))
2395                         return (EINVAL);
2396                 if (lpse6.ss != 0) {
2397                         error = copyin(PTRIN(lpse6.ss), &l_ss,
2398                             sizeof(l_ss));
2399                         if (error != 0)
2400                                 return (error);
2401                         linux_to_bsd_sigset(&l_ss, &ss);
2402                         ssp = &ss;
2403                 }
2404         } else
2405                 ssp = NULL;
2406
2407         /*
2408          * Currently glibc changes nanosecond number to microsecond.
2409          * This mean losing precision but for now it is hardly seen.
2410          */
2411         if (tsp != NULL) {
2412                 TIMESPEC_TO_TIMEVAL(&utv, tsp);
2413                 if (itimerfix(&utv))
2414                         return (EINVAL);
2415
2416                 microtime(&tv0);
2417                 tvp = &utv;
2418         } else
2419                 tvp = NULL;
2420
2421         error = kern_pselect(td, nfds, readfds, writefds,
2422             exceptfds, tvp, ssp, LINUX_NFDBITS);
2423
2424         if (error == 0 && tsp != NULL) {
2425                 if (td->td_retval[0] != 0) {
2426                         /*
2427                          * Compute how much time was left of the timeout,
2428                          * by subtracting the current time and the time
2429                          * before we started the call, and subtracting
2430                          * that result from the user-supplied value.
2431                          */
2432
2433                         microtime(&tv1);
2434                         timevalsub(&tv1, &tv0);
2435                         timevalsub(&utv, &tv1);
2436                         if (utv.tv_sec < 0)
2437                                 timevalclear(&utv);
2438                 } else
2439                         timevalclear(&utv);
2440                 TIMEVAL_TO_TIMESPEC(&utv, tsp);
2441         }
2442         return (error);
2443 }
2444
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * pselect6 variant whose timeout is a struct l_timespec64.  Apart from the
 * conversion routines used it follows the same steps as linux_pselect6():
 * copy in and convert the timeout, run the shared implementation, then
 * convert and copy the updated timeout back out on success.
 */
int
linux_pselect6_time64(struct thread *td,
    struct linux_pselect6_time64_args *args)
{
        struct timespec timeout, *timeoutp;
        struct l_timespec64 ltimeout;
        int error;

        timeoutp = NULL;
        if (args->tsp != NULL) {
                error = copyin(args->tsp, &ltimeout, sizeof(ltimeout));
                if (error != 0)
                        return (error);
                error = linux_to_native_timespec64(&timeout, &ltimeout);
                if (error != 0)
                        return (error);
                timeoutp = &timeout;
        }

        error = linux_common_pselect6(td, args->nfds, args->readfds,
            args->writefds, args->exceptfds, timeoutp, args->sig);
        if (error != 0)
                return (error);

        /* No timeout was supplied, so there is nothing to write back. */
        if (args->tsp == NULL)
                return (0);

        error = native_to_linux_timespec64(&ltimeout, timeoutp);
        if (error != 0)
                return (error);
        return (copyout(&ltimeout, args->tsp, sizeof(ltimeout)));
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
2478
2479 int
2480 linux_ppoll(struct thread *td, struct linux_ppoll_args *args)
2481 {
2482         struct timespec uts, *tsp;
2483         struct l_timespec lts;
2484         int error;
2485
2486         if (args->tsp != NULL) {
2487                 error = copyin(args->tsp, &lts, sizeof(lts));
2488                 if (error)
2489                         return (error);
2490                 error = linux_to_native_timespec(&uts, &lts);
2491                 if (error != 0)
2492                         return (error);
2493                 tsp = &uts;
2494         } else
2495                 tsp = NULL;
2496
2497         error = linux_common_ppoll(td, args->fds, args->nfds, tsp,
2498             args->sset, args->ssize);
2499         if (error != 0)
2500                 return (error);
2501         if (tsp != NULL) {
2502                 error = native_to_linux_timespec(&lts, tsp);
2503                 if (error == 0)
2504                         error = copyout(&lts, args->tsp, sizeof(lts));
2505         }
2506         return (error);
2507 }
2508
2509 static int
2510 linux_common_ppoll(struct thread *td, struct pollfd *fds, uint32_t nfds,
2511     struct timespec *tsp, l_sigset_t *sset, l_size_t ssize)
2512 {
2513         struct timespec ts0, ts1;
2514         struct pollfd stackfds[32];
2515         struct pollfd *kfds;
2516         l_sigset_t l_ss;
2517         sigset_t *ssp;
2518         sigset_t ss;
2519         int error;
2520
2521         if (kern_poll_maxfds(nfds))
2522                 return (EINVAL);
2523         if (sset != NULL) {
2524                 if (ssize != sizeof(l_ss))
2525                         return (EINVAL);
2526                 error = copyin(sset, &l_ss, sizeof(l_ss));
2527                 if (error)
2528                         return (error);
2529                 linux_to_bsd_sigset(&l_ss, &ss);
2530                 ssp = &ss;
2531         } else
2532                 ssp = NULL;
2533         if (tsp != NULL)
2534                 nanotime(&ts0);
2535
2536         if (nfds > nitems(stackfds))
2537                 kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK);
2538         else
2539                 kfds = stackfds;
2540         error = linux_pollin(td, kfds, fds, nfds);
2541         if (error != 0)
2542                 goto out;
2543
2544         error = kern_poll_kfds(td, kfds, nfds, tsp, ssp);
2545         if (error == 0)
2546                 error = linux_pollout(td, kfds, fds, nfds);
2547
2548         if (error == 0 && tsp != NULL) {
2549                 if (td->td_retval[0]) {
2550                         nanotime(&ts1);
2551                         timespecsub(&ts1, &ts0, &ts1);
2552                         timespecsub(tsp, &ts1, tsp);
2553                         if (tsp->tv_sec < 0)
2554                                 timespecclear(tsp);
2555                 } else
2556                         timespecclear(tsp);
2557         }
2558
2559 out:
2560         if (nfds > nitems(stackfds))
2561                 free(kfds, M_TEMP);
2562         return (error);
2563 }
2564
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * ppoll variant whose timeout is a struct l_timespec64.  Apart from the
 * conversion routines used it follows the same steps as linux_ppoll():
 * copy in and convert the timeout, run the shared implementation, then
 * convert and copy the updated timeout back out on success.
 */
int
linux_ppoll_time64(struct thread *td, struct linux_ppoll_time64_args *args)
{
        struct timespec timeout, *timeoutp;
        struct l_timespec64 ltimeout;
        int error;

        timeoutp = NULL;
        if (args->tsp != NULL) {
                error = copyin(args->tsp, &ltimeout, sizeof(ltimeout));
                if (error != 0)
                        return (error);
                error = linux_to_native_timespec64(&timeout, &ltimeout);
                if (error != 0)
                        return (error);
                timeoutp = &timeout;
        }

        error = linux_common_ppoll(td, args->fds, args->nfds, timeoutp,
            args->sset, args->ssize);
        if (error != 0)
                return (error);

        /* No timeout was supplied, so there is nothing to write back. */
        if (timeoutp == NULL)
                return (0);

        error = native_to_linux_timespec64(&ltimeout, timeoutp);
        if (error != 0)
                return (error);
        return (copyout(&ltimeout, args->tsp, sizeof(ltimeout)));
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
2595
2596 static int
2597 linux_pollin(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
2598 {
2599         int error;
2600         u_int i;
2601
2602         error = copyin(ufds, fds, nfd * sizeof(*fds));
2603         if (error != 0)
2604                 return (error);
2605
2606         for (i = 0; i < nfd; i++) {
2607                 if (fds->events != 0)
2608                         linux_to_bsd_poll_events(td, fds->fd,
2609                             fds->events, &fds->events);
2610                 fds++;
2611         }
2612         return (0);
2613 }
2614
2615 static int
2616 linux_pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
2617 {
2618         int error = 0;
2619         u_int i, n = 0;
2620
2621         for (i = 0; i < nfd; i++) {
2622                 if (fds->revents != 0) {
2623                         bsd_to_linux_poll_events(fds->revents,
2624                             &fds->revents);
2625                         n++;
2626                 }
2627                 error = copyout(&fds->revents, &ufds->revents,
2628                     sizeof(ufds->revents));
2629                 if (error)
2630                         return (error);
2631                 fds++;
2632                 ufds++;
2633         }
2634         td->td_retval[0] = n;
2635         return (0);
2636 }
2637
2638 int
2639 linux_sched_rr_get_interval(struct thread *td,
2640     struct linux_sched_rr_get_interval_args *uap)
2641 {
2642         struct timespec ts;
2643         struct l_timespec lts;
2644         struct thread *tdt;
2645         int error;
2646
2647         /*
2648          * According to man in case the invalid pid specified
2649          * EINVAL should be returned.
2650          */
2651         if (uap->pid < 0)
2652                 return (EINVAL);
2653
2654         tdt = linux_tdfind(td, uap->pid, -1);
2655         if (tdt == NULL)
2656                 return (ESRCH);
2657
2658         error = kern_sched_rr_get_interval_td(td, tdt, &ts);
2659         PROC_UNLOCK(tdt->td_proc);
2660         if (error != 0)
2661                 return (error);
2662         error = native_to_linux_timespec(&lts, &ts);
2663         if (error != 0)
2664                 return (error);
2665         return (copyout(&lts, uap->interval, sizeof(lts)));
2666 }
2667
2668 /*
2669  * In case when the Linux thread is the initial thread in
2670  * the thread group thread id is equal to the process id.
2671  * Glibc depends on this magic (assert in pthread_getattr_np.c).
2672  */
2673 struct thread *
2674 linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid)
2675 {
2676         struct linux_emuldata *em;
2677         struct thread *tdt;
2678         struct proc *p;
2679
2680         tdt = NULL;
2681         if (tid == 0 || tid == td->td_tid) {
2682                 tdt = td;
2683                 PROC_LOCK(tdt->td_proc);
2684         } else if (tid > PID_MAX)
2685                 tdt = tdfind(tid, pid);
2686         else {
2687                 /*
2688                  * Initial thread where the tid equal to the pid.
2689                  */
2690                 p = pfind(tid);
2691                 if (p != NULL) {
2692                         if (SV_PROC_ABI(p) != SV_ABI_LINUX) {
2693                                 /*
2694                                  * p is not a Linuxulator process.
2695                                  */
2696                                 PROC_UNLOCK(p);
2697                                 return (NULL);
2698                         }
2699                         FOREACH_THREAD_IN_PROC(p, tdt) {
2700                                 em = em_find(tdt);
2701                                 if (tid == em->em_tid)
2702                                         return (tdt);
2703                         }
2704                         PROC_UNLOCK(p);
2705                 }
2706                 return (NULL);
2707         }
2708
2709         return (tdt);
2710 }
2711
2712 void
2713 linux_to_bsd_waitopts(int options, int *bsdopts)
2714 {
2715
2716         if (options & LINUX_WNOHANG)
2717                 *bsdopts |= WNOHANG;
2718         if (options & LINUX_WUNTRACED)
2719                 *bsdopts |= WUNTRACED;
2720         if (options & LINUX_WEXITED)
2721                 *bsdopts |= WEXITED;
2722         if (options & LINUX_WCONTINUED)
2723                 *bsdopts |= WCONTINUED;
2724         if (options & LINUX_WNOWAIT)
2725                 *bsdopts |= WNOWAIT;
2726
2727         if (options & __WCLONE)
2728                 *bsdopts |= WLINUXCLONE;
2729 }
2730
2731 int
2732 linux_getrandom(struct thread *td, struct linux_getrandom_args *args)
2733 {
2734         struct uio uio;
2735         struct iovec iov;
2736         int error;
2737
2738         if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM))
2739                 return (EINVAL);
2740         if (args->count > INT_MAX)
2741                 args->count = INT_MAX;
2742
2743         iov.iov_base = args->buf;
2744         iov.iov_len = args->count;
2745
2746         uio.uio_iov = &iov;
2747         uio.uio_iovcnt = 1;
2748         uio.uio_resid = iov.iov_len;
2749         uio.uio_segflg = UIO_USERSPACE;
2750         uio.uio_rw = UIO_READ;
2751         uio.uio_td = td;
2752
2753         error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK);
2754         if (error == 0)
2755                 td->td_retval[0] = args->count - uio.uio_resid;
2756         return (error);
2757 }
2758
2759 int
2760 linux_mincore(struct thread *td, struct linux_mincore_args *args)
2761 {
2762
2763         /* Needs to be page-aligned */
2764         if (args->start & PAGE_MASK)
2765                 return (EINVAL);
2766         return (kern_mincore(td, args->start, args->len, args->vec));
2767 }
2768
2769 #define SYSLOG_TAG      "<6>"
2770
2771 int
2772 linux_syslog(struct thread *td, struct linux_syslog_args *args)
2773 {
2774         char buf[128], *src, *dst;
2775         u_int seq;
2776         int buflen, error;
2777
2778         if (args->type != LINUX_SYSLOG_ACTION_READ_ALL) {
2779                 linux_msg(td, "syslog unsupported type 0x%x", args->type);
2780                 return (EINVAL);
2781         }
2782
2783         if (args->len < 6) {
2784                 td->td_retval[0] = 0;
2785                 return (0);
2786         }
2787
2788         error = priv_check(td, PRIV_MSGBUF);
2789         if (error)
2790                 return (error);
2791
2792         mtx_lock(&msgbuf_lock);
2793         msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
2794         mtx_unlock(&msgbuf_lock);
2795
2796         dst = args->buf;
2797         error = copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG));
2798         /* The -1 is to skip the trailing '\0'. */
2799         dst += sizeof(SYSLOG_TAG) - 1;
2800
2801         while (error == 0) {
2802                 mtx_lock(&msgbuf_lock);
2803                 buflen = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
2804                 mtx_unlock(&msgbuf_lock);
2805
2806                 if (buflen == 0)
2807                         break;
2808
2809                 for (src = buf; src < buf + buflen && error == 0; src++) {
2810                         if (*src == '\0')
2811                                 continue;
2812
2813                         if (dst >= args->buf + args->len)
2814                                 goto out;
2815
2816                         error = copyout(src, dst, 1);
2817                         dst++;
2818
2819                         if (*src == '\n' && *(src + 1) != '<' &&
2820                             dst + sizeof(SYSLOG_TAG) < args->buf + args->len) {
2821                                 error = copyout(&SYSLOG_TAG,
2822                                     dst, sizeof(SYSLOG_TAG));
2823                                 dst += sizeof(SYSLOG_TAG) - 1;
2824                         }
2825                 }
2826         }
2827 out:
2828         td->td_retval[0] = dst - args->buf;
2829         return (error);
2830 }
2831
2832 int
2833 linux_getcpu(struct thread *td, struct linux_getcpu_args *args)
2834 {
2835         int cpu, error, node;
2836
2837         cpu = td->td_oncpu; /* Make sure it doesn't change during copyout(9) */
2838         error = 0;
2839         node = cpuid_to_pcpu[cpu]->pc_domain;
2840
2841         if (args->cpu != NULL)
2842                 error = copyout(&cpu, args->cpu, sizeof(l_int));
2843         if (args->node != NULL)
2844                 error = copyout(&node, args->node, sizeof(l_int));
2845         return (error);
2846 }
2847
2848 #if defined(__i386__) || defined(__amd64__)
2849 int
2850 linux_poll(struct thread *td, struct linux_poll_args *args)
2851 {
2852         struct timespec ts, *tsp;
2853
2854         if (args->timeout != INFTIM) {
2855                 if (args->timeout < 0)
2856                         return (EINVAL);
2857                 ts.tv_sec = args->timeout / 1000;
2858                 ts.tv_nsec = (args->timeout % 1000) * 1000000;
2859                 tsp = &ts;
2860         } else
2861                 tsp = NULL;
2862
2863         return (linux_common_ppoll(td, args->fds, args->nfds,
2864             tsp, NULL, 0));
2865 }
2866 #endif /* __i386__ || __amd64__ */
2867
2868 int
2869 linux_seccomp(struct thread *td, struct linux_seccomp_args *args)
2870 {
2871
2872         switch (args->op) {
2873         case LINUX_SECCOMP_GET_ACTION_AVAIL:
2874                 return (EOPNOTSUPP);
2875         default:
2876                 /*
2877                  * Ignore unknown operations, just like Linux kernel built
2878                  * without CONFIG_SECCOMP.
2879                  */
2880                 return (EINVAL);
2881         }
2882 }