sys/compat/linux/linux_misc.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 2002 Doug Rabson
   5  * Copyright (c) 1994-1995 Søren Schmidt
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer
  13  *    in this position and unchanged.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  * 3. The name of the author may not be used to endorse or promote products
  18  *    derived from this software without specific prior written permission
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __FBSDID("$FreeBSD$");
  34
  35 #include "opt_compat.h"
  36
  37 #include <sys/param.h>
  38 #include <sys/blist.h>
  39 #include <sys/fcntl.h>
  40 #if defined(__i386__)
  41 #include <sys/imgact_aout.h>
  42 #endif
  43 #include <sys/jail.h>
  44 #include <sys/kernel.h>
  45 #include <sys/limits.h>
  46 #include <sys/lock.h>
  47 #include <sys/malloc.h>
  48 #include <sys/mman.h>
  49 #include <sys/mount.h>
  50 #include <sys/msgbuf.h>
  51 #include <sys/mutex.h>
  52 #include <sys/namei.h>
  53 #include <sys/priv.h>
  54 #include <sys/proc.h>
  55 #include <sys/procctl.h>
  56 #include <sys/reboot.h>
  57 #include <sys/racct.h>
  58 #include <sys/random.h>
  59 #include <sys/resourcevar.h>
  60 #include <sys/sched.h>
  61 #include <sys/sdt.h>
  62 #include <sys/signalvar.h>
  63 #include <sys/stat.h>
  64 #include <sys/syscallsubr.h>
  65 #include <sys/sysctl.h>
  66 #include <sys/sysproto.h>
  67 #include <sys/systm.h>
  68 #include <sys/time.h>
  69 #include <sys/vmmeter.h>
  70 #include <sys/vnode.h>
  71 #include <sys/wait.h>
  72 #include <sys/cpuset.h>
  73 #include <sys/uio.h>
  74
  75 #include <security/mac/mac_framework.h>
  76
  77 #include <vm/vm.h>
  78 #include <vm/pmap.h>
  79 #include <vm/vm_kern.h>
  80 #include <vm/vm_map.h>
  81 #include <vm/vm_extern.h>
  82 #include <vm/swap_pager.h>
  83
  84 #ifdef COMPAT_LINUX32
  85 #include <machine/../linux32/linux.h>
  86 #include <machine/../linux32/linux32_proto.h>
  87 #else
  88 #include <machine/../linux/linux.h>
  89 #include <machine/../linux/linux_proto.h>
  90 #endif
  91
  92 #include <compat/linux/linux_dtrace.h>
  93 #include <compat/linux/linux_file.h>
  94 #include <compat/linux/linux_mib.h>
  95 #include <compat/linux/linux_signal.h>
  96 #include <compat/linux/linux_timer.h>
  97 #include <compat/linux/linux_util.h>
  98 #include <compat/linux/linux_sysproto.h>
  99 #include <compat/linux/linux_emul.h>
 100 #include <compat/linux/linux_misc.h>
 101
 102 /**
 103  * Special DTrace provider for the linuxulator.
 104  *
 105  * In this file we define the provider for the entire linuxulator. All
 106  * modules (= files of the linuxulator) use it.
 107  *
 108  * We define a different name depending on the emulated bitsize, see
 109  * ../../<ARCH>/linux{,32}/linux.h, e.g.:
 110  *      native bitsize          = linuxulator
 111  *      amd64, 32bit emulation  = linuxulator32
 112  */
 113 LIN_SDT_PROVIDER_DEFINE(LINUX_DTRACE);
 114
 115 int stclohz;                            /* Statistics clock frequency */
 116
 117 static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
 118         RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
 119         RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
 120         RLIMIT_MEMLOCK, RLIMIT_AS
 121 };
 122
 123 struct l_sysinfo {
 124         l_long          uptime;         /* Seconds since boot */
 125         l_ulong         loads[3];       /* 1, 5, and 15 minute load averages */
 126 #define LINUX_SYSINFO_LOADS_SCALE 65536
 127         l_ulong         totalram;       /* Total usable main memory size */
 128         l_ulong         freeram;        /* Available memory size */
 129         l_ulong         sharedram;      /* Amount of shared memory */
 130         l_ulong         bufferram;      /* Memory used by buffers */
 131         l_ulong         totalswap;      /* Total swap space size */
 132         l_ulong         freeswap;       /* swap space still available */
 133         l_ushort        procs;          /* Number of current processes */
 134         l_ushort        pads;
 135         l_ulong         totalhigh;
 136         l_ulong         freehigh;
 137         l_uint          mem_unit;
 138         char            _f[20-2*sizeof(l_long)-sizeof(l_int)];  /* padding */
 139 };
 140
 141 struct l_pselect6arg {
 142         l_uintptr_t     ss;
 143         l_size_t        ss_len;
 144 };
 145
 146 static int      linux_utimensat_nsec_valid(l_long);
 147
 148 int
 149 linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
 150 {
 151         struct l_sysinfo sysinfo;
 152         int i, j;
 153         struct timespec ts;
 154
 155         bzero(&sysinfo, sizeof(sysinfo));
 156         getnanouptime(&ts);
 157         if (ts.tv_nsec != 0)
 158                 ts.tv_sec++;
 159         sysinfo.uptime = ts.tv_sec;
 160
 161         /* Use the information from the mib to get our load averages */
 162         for (i = 0; i < 3; i++)
 163                 sysinfo.loads[i] = averunnable.ldavg[i] *
 164                     LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
 165
 166         sysinfo.totalram = physmem * PAGE_SIZE;
 167         sysinfo.freeram = (u_long)vm_free_count() * PAGE_SIZE;
 168
 169         /*
 170          * sharedram counts pages allocated to named, swap-backed objects such
 171          * as shared memory segments and tmpfs files.  There is no cheap way to
 172          * compute this, so just leave the field unpopulated.  Linux itself only
 173          * started setting this field in the 3.x timeframe.
 174          */
 175         sysinfo.sharedram = 0;
 176         sysinfo.bufferram = 0;
 177
 178         swap_pager_status(&i, &j);
 179         sysinfo.totalswap = i * PAGE_SIZE;
 180         sysinfo.freeswap = (i - j) * PAGE_SIZE;
 181
 182         sysinfo.procs = nprocs;
 183
 184         /*
 185          * Platforms supported by the emulation layer do not have a notion of
 186          * high memory.
 187          */
 188         sysinfo.totalhigh = 0;
 189         sysinfo.freehigh = 0;
 190
 191         sysinfo.mem_unit = 1;
 192
 193         return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
 194 }
 195
 196 #ifdef LINUX_LEGACY_SYSCALLS
 197 int
 198 linux_alarm(struct thread *td, struct linux_alarm_args *args)
 199 {
 200         struct itimerval it, old_it;
 201         u_int secs;
 202         int error;
 203
 204         secs = args->secs;
 205         /*
 206          * Linux alarm() is always successful. Limit secs to INT32_MAX / 2
 207          * to match kern_setitimer()'s limit to avoid error from it.
 208          *
 209          * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit
 210          * platforms.
 211          */
 212         if (secs > INT32_MAX / 2)
 213                 secs = INT32_MAX / 2;
 214
 215         it.it_value.tv_sec = secs;
 216         it.it_value.tv_usec = 0;
 217         timevalclear(&it.it_interval);
 218         error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
 219         KASSERT(error == 0, ("kern_setitimer returns %d", error));
 220
 221         if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) ||
 222             old_it.it_value.tv_usec >= 500000)
 223                 old_it.it_value.tv_sec++;
 224         td->td_retval[0] = old_it.it_value.tv_sec;
 225         return (0);
 226 }
 227 #endif
 228
 229 int
 230 linux_brk(struct thread *td, struct linux_brk_args *args)
 231 {
 232         struct vmspace *vm = td->td_proc->p_vmspace;
 233         uintptr_t new, old;
 234
 235         old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize);
 236         new = (uintptr_t)args->dsend;
 237         if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new))
 238                 td->td_retval[0] = (register_t)new;
 239         else
 240                 td->td_retval[0] = (register_t)old;
 241
 242         return (0);
 243 }
 244
 245 #if defined(__i386__)
 246 /* XXX: what about amd64/linux32? */
 247
 248 int
 249 linux_uselib(struct thread *td, struct linux_uselib_args *args)
 250 {
 251         struct nameidata ni;
 252         struct vnode *vp;
 253         struct exec *a_out;
 254         vm_map_t map;
 255         vm_map_entry_t entry;
 256         struct vattr attr;
 257         vm_offset_t vmaddr;
 258         unsigned long file_offset;
 259         unsigned long bss_size;
 260         char *library;
 261         ssize_t aresid;
 262         int error;
 263         bool locked, opened, textset;
 264
 265         a_out = NULL;
 266         vp = NULL;
 267         locked = false;
 268         textset = false;
 269         opened = false;
 270
 271         if (!LUSECONVPATH(td)) {
 272                 NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
 273                     UIO_USERSPACE, args->library, td);
 274                 error = namei(&ni);
 275         } else {
 276                 LCONVPATHEXIST(td, args->library, &library);
 277                 NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
 278                     UIO_SYSSPACE, library, td);
 279                 error = namei(&ni);
 280                 LFREEPATH(library);
 281         }
 282         if (error)
 283                 goto cleanup;
 284
 285         vp = ni.ni_vp;
 286         NDFREE(&ni, NDF_ONLY_PNBUF);
 287
 288         /*
 289          * From here on down, we have a locked vnode that must be unlocked.
 290          * XXX: The code below largely duplicates exec_check_permissions().
 291          */
 292         locked = true;
 293
 294         /* Executable? */
 295         error = VOP_GETATTR(vp, &attr, td->td_ucred);
 296         if (error)
 297                 goto cleanup;
 298
 299         if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 300             ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) {
 301                 /* EACCESS is what exec(2) returns. */
 302                 error = ENOEXEC;
 303                 goto cleanup;
 304         }
 305
 306         /* Sensible size? */
 307         if (attr.va_size == 0) {
 308                 error = ENOEXEC;
 309                 goto cleanup;
 310         }
 311
 312         /* Can we access it? */
 313         error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 314         if (error)
 315                 goto cleanup;
 316
 317         /*
 318          * XXX: This should use vn_open() so that it is properly authorized,
 319          * and to reduce code redundancy all over the place here.
 320          * XXX: Not really, it duplicates far more of exec_check_permissions()
 321          * than vn_open().
 322          */
 323 #ifdef MAC
 324         error = mac_vnode_check_open(td->td_ucred, vp, VREAD);
 325         if (error)
 326                 goto cleanup;
 327 #endif
 328         error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 329         if (error)
 330                 goto cleanup;
 331         opened = true;
 332
 333         /* Pull in executable header into exec_map */
 334         error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE,
 335             VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
 336         if (error)
 337                 goto cleanup;
 338
 339         /* Is it a Linux binary ? */
 340         if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
 341                 error = ENOEXEC;
 342                 goto cleanup;
 343         }
 344
 345         /*
 346          * While we are here, we should REALLY do some more checks
 347          */
 348
 349         /* Set file/virtual offset based on a.out variant. */
 350         switch ((int)(a_out->a_magic & 0xffff)) {
 351         case 0413:                      /* ZMAGIC */
 352                 file_offset = 1024;
 353                 break;
 354         case 0314:                      /* QMAGIC */
 355                 file_offset = 0;
 356                 break;
 357         default:
 358                 error = ENOEXEC;
 359                 goto cleanup;
 360         }
 361
 362         bss_size = round_page(a_out->a_bss);
 363
 364         /* Check various fields in header for validity/bounds. */
 365         if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
 366                 error = ENOEXEC;
 367                 goto cleanup;
 368         }
 369
 370         /* text + data can't exceed file size */
 371         if (a_out->a_data + a_out->a_text > attr.va_size) {
 372                 error = EFAULT;
 373                 goto cleanup;
 374         }
 375
 376         /*
 377          * text/data/bss must not exceed limits
 378          * XXX - this is not complete. it should check current usage PLUS
 379          * the resources needed by this library.
 380          */
 381         PROC_LOCK(td->td_proc);
 382         if (a_out->a_text > maxtsiz ||
 383             a_out->a_data + bss_size > lim_cur_proc(td->td_proc, RLIMIT_DATA) ||
 384             racct_set(td->td_proc, RACCT_DATA, a_out->a_data +
 385             bss_size) != 0) {
 386                 PROC_UNLOCK(td->td_proc);
 387                 error = ENOMEM;
 388                 goto cleanup;
 389         }
 390         PROC_UNLOCK(td->td_proc);
 391
 392         /*
 393          * Prevent more writers.
 394          */
 395         error = VOP_SET_TEXT(vp);
 396         if (error != 0)
 397                 goto cleanup;
 398         textset = true;
 399
 400         /*
 401          * Lock no longer needed
 402          */
 403         locked = false;
 404         VOP_UNLOCK(vp);
 405
 406         /*
 407          * Check if file_offset page aligned. Currently we cannot handle
 408          * misalinged file offsets, and so we read in the entire image
 409          * (what a waste).
 410          */
 411         if (file_offset & PAGE_MASK) {
 412                 /* Map text+data read/write/execute */
 413
 414                 /* a_entry is the load address and is page aligned */
 415                 vmaddr = trunc_page(a_out->a_entry);
 416
 417                 /* get anon user mapping, read+write+execute */
 418                 error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 419                     &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE,
 420                     VM_PROT_ALL, VM_PROT_ALL, 0);
 421                 if (error)
 422                         goto cleanup;
 423
 424                 error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset,
 425                     a_out->a_text + a_out->a_data, UIO_USERSPACE, 0,
 426                     td->td_ucred, NOCRED, &aresid, td);
 427                 if (error != 0)
 428                         goto cleanup;
 429                 if (aresid != 0) {
 430                         error = ENOEXEC;
 431                         goto cleanup;
 432                 }
 433         } else {
 434                 /*
 435                  * for QMAGIC, a_entry is 20 bytes beyond the load address
 436                  * to skip the executable header
 437                  */
 438                 vmaddr = trunc_page(a_out->a_entry);
 439
 440                 /*
 441                  * Map it all into the process's space as a single
 442                  * copy-on-write "data" segment.
 443                  */
 444                 map = &td->td_proc->p_vmspace->vm_map;
 445                 error = vm_mmap(map, &vmaddr,
 446                     a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
 447                     MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
 448                 if (error)
 449                         goto cleanup;
 450                 vm_map_lock(map);
 451                 if (!vm_map_lookup_entry(map, vmaddr, &entry)) {
 452                         vm_map_unlock(map);
 453                         error = EDOOFUS;
 454                         goto cleanup;
 455                 }
 456                 entry->eflags |= MAP_ENTRY_VN_EXEC;
 457                 vm_map_unlock(map);
 458                 textset = false;
 459         }
 460
 461         if (bss_size != 0) {
 462                 /* Calculate BSS start address */
 463                 vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
 464                     a_out->a_data;
 465
 466                 /* allocate some 'anon' space */
 467                 error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 468                     &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL,
 469                     VM_PROT_ALL, 0);
 470                 if (error)
 471                         goto cleanup;
 472         }
 473
 474 cleanup:
 475         if (opened) {
 476                 if (locked)
 477                         VOP_UNLOCK(vp);
 478                 locked = false;
 479                 VOP_CLOSE(vp, FREAD, td->td_ucred, td);
 480         }
 481         if (textset) {
 482                 if (!locked) {
 483                         locked = true;
 484                         VOP_LOCK(vp, LK_SHARED | LK_RETRY);
 485                 }
 486                 VOP_UNSET_TEXT_CHECKED(vp);
 487         }
 488         if (locked)
 489                 VOP_UNLOCK(vp);
 490
 491         /* Release the temporary mapping. */
 492         if (a_out)
 493                 kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE);
 494
 495         return (error);
 496 }
 497
 498 #endif  /* __i386__ */
 499
 500 #ifdef LINUX_LEGACY_SYSCALLS
 501 int
 502 linux_select(struct thread *td, struct linux_select_args *args)
 503 {
 504         l_timeval ltv;
 505         struct timeval tv0, tv1, utv, *tvp;
 506         int error;
 507
 508         /*
 509          * Store current time for computation of the amount of
 510          * time left.
 511          */
 512         if (args->timeout) {
 513                 if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
 514                         goto select_out;
 515                 utv.tv_sec = ltv.tv_sec;
 516                 utv.tv_usec = ltv.tv_usec;
 517
 518                 if (itimerfix(&utv)) {
 519                         /*
 520                          * The timeval was invalid.  Convert it to something
 521                          * valid that will act as it does under Linux.
 522                          */
 523                         utv.tv_sec += utv.tv_usec / 1000000;
 524                         utv.tv_usec %= 1000000;
 525                         if (utv.tv_usec < 0) {
 526                                 utv.tv_sec -= 1;
 527                                 utv.tv_usec += 1000000;
 528                         }
 529                         if (utv.tv_sec < 0)
 530                                 timevalclear(&utv);
 531                 }
 532                 microtime(&tv0);
 533                 tvp = &utv;
 534         } else
 535                 tvp = NULL;
 536
 537         error = kern_select(td, args->nfds, args->readfds, args->writefds,
 538             args->exceptfds, tvp, LINUX_NFDBITS);
 539         if (error)
 540                 goto select_out;
 541
 542         if (args->timeout) {
 543                 if (td->td_retval[0]) {
 544                         /*
 545                          * Compute how much time was left of the timeout,
 546                          * by subtracting the current time and the time
 547                          * before we started the call, and subtracting
 548                          * that result from the user-supplied value.
 549                          */
 550                         microtime(&tv1);
 551                         timevalsub(&tv1, &tv0);
 552                         timevalsub(&utv, &tv1);
 553                         if (utv.tv_sec < 0)
 554                                 timevalclear(&utv);
 555                 } else
 556                         timevalclear(&utv);
 557                 ltv.tv_sec = utv.tv_sec;
 558                 ltv.tv_usec = utv.tv_usec;
 559                 if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
 560                         goto select_out;
 561         }
 562
 563 select_out:
 564         return (error);
 565 }
 566 #endif
 567
 568 int
 569 linux_mremap(struct thread *td, struct linux_mremap_args *args)
 570 {
 571         uintptr_t addr;
 572         size_t len;
 573         int error = 0;
 574
 575         if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
 576                 td->td_retval[0] = 0;
 577                 return (EINVAL);
 578         }
 579
 580         /*
 581          * Check for the page alignment.
 582          * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
 583          */
 584         if (args->addr & PAGE_MASK) {
 585                 td->td_retval[0] = 0;
 586                 return (EINVAL);
 587         }
 588
 589         args->new_len = round_page(args->new_len);
 590         args->old_len = round_page(args->old_len);
 591
 592         if (args->new_len > args->old_len) {
 593                 td->td_retval[0] = 0;
 594                 return (ENOMEM);
 595         }
 596
 597         if (args->new_len < args->old_len) {
 598                 addr = args->addr + args->new_len;
 599                 len = args->old_len - args->new_len;
 600                 error = kern_munmap(td, addr, len);
 601         }
 602
 603         td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
 604         return (error);
 605 }
 606
 607 #define LINUX_MS_ASYNC       0x0001
 608 #define LINUX_MS_INVALIDATE  0x0002
 609 #define LINUX_MS_SYNC        0x0004
 610
 611 int
 612 linux_msync(struct thread *td, struct linux_msync_args *args)
 613 {
 614
 615         return (kern_msync(td, args->addr, args->len,
 616             args->fl & ~LINUX_MS_SYNC));
 617 }
 618
 619 #ifdef LINUX_LEGACY_SYSCALLS
 620 int
 621 linux_time(struct thread *td, struct linux_time_args *args)
 622 {
 623         struct timeval tv;
 624         l_time_t tm;
 625         int error;
 626
 627         microtime(&tv);
 628         tm = tv.tv_sec;
 629         if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
 630                 return (error);
 631         td->td_retval[0] = tm;
 632         return (0);
 633 }
 634 #endif
 635
 636 struct l_times_argv {
 637         l_clock_t       tms_utime;
 638         l_clock_t       tms_stime;
 639         l_clock_t       tms_cutime;
 640         l_clock_t       tms_cstime;
 641 };
 642
 643 /*
 644  * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value.
 645  * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK
 646  * auxiliary vector entry.
 647  */
 648 #define CLK_TCK         100
 649
 650 #define CONVOTCK(r)     (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK))
 651 #define CONVNTCK(r)     (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz))
 652
 653 #define CONVTCK(r)      (linux_kernver(td) >= LINUX_KERNVER_2004000 ?           \
 654                             CONVNTCK(r) : CONVOTCK(r))
 655
 656 int
 657 linux_times(struct thread *td, struct linux_times_args *args)
 658 {
 659         struct timeval tv, utime, stime, cutime, cstime;
 660         struct l_times_argv tms;
 661         struct proc *p;
 662         int error;
 663
 664         if (args->buf != NULL) {
 665                 p = td->td_proc;
 666                 PROC_LOCK(p);
 667                 PROC_STATLOCK(p);
 668                 calcru(p, &utime, &stime);
 669                 PROC_STATUNLOCK(p);
 670                 calccru(p, &cutime, &cstime);
 671                 PROC_UNLOCK(p);
 672
 673                 tms.tms_utime = CONVTCK(utime);
 674                 tms.tms_stime = CONVTCK(stime);
 675
 676                 tms.tms_cutime = CONVTCK(cutime);
 677                 tms.tms_cstime = CONVTCK(cstime);
 678
 679                 if ((error = copyout(&tms, args->buf, sizeof(tms))))
 680                         return (error);
 681         }
 682
 683         microuptime(&tv);
 684         td->td_retval[0] = (int)CONVTCK(tv);
 685         return (0);
 686 }
 687
 688 int
 689 linux_newuname(struct thread *td, struct linux_newuname_args *args)
 690 {
 691         struct l_new_utsname utsname;
 692         char osname[LINUX_MAX_UTSNAME];
 693         char osrelease[LINUX_MAX_UTSNAME];
 694         char *p;
 695
 696         linux_get_osname(td, osname);
 697         linux_get_osrelease(td, osrelease);
 698
 699         bzero(&utsname, sizeof(utsname));
 700         strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
 701         getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
 702         getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
 703         strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
 704         strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
 705         for (p = utsname.version; *p != '\0'; ++p)
 706                 if (*p == '\n') {
 707                         *p = '\0';
 708                         break;
 709                 }
 710 #if defined(__amd64__)
 711         /*
 712          * On amd64, Linux uname(2) needs to return "x86_64"
 713          * for both 64-bit and 32-bit applications.  On 32-bit,
 714          * the string returned by getauxval(AT_PLATFORM) needs
 715          * to remain "i686", though.
 716          */
 717         strlcpy(utsname.machine, "x86_64", LINUX_MAX_UTSNAME);
 718 #else
 719         strlcpy(utsname.machine, linux_kplatform, LINUX_MAX_UTSNAME);
 720 #endif
 721
 722         return (copyout(&utsname, args->buf, sizeof(utsname)));
 723 }
 724
 725 struct l_utimbuf {
 726         l_time_t l_actime;
 727         l_time_t l_modtime;
 728 };
 729
 730 #ifdef LINUX_LEGACY_SYSCALLS
 731 int
 732 linux_utime(struct thread *td, struct linux_utime_args *args)
 733 {
 734         struct timeval tv[2], *tvp;
 735         struct l_utimbuf lut;
 736         char *fname;
 737         int error;
 738         bool convpath;
 739
 740         convpath = LUSECONVPATH(td);
 741         if (convpath)
 742                 LCONVPATHEXIST(td, args->fname, &fname);
 743
 744         if (args->times) {
 745                 if ((error = copyin(args->times, &lut, sizeof lut))) {
 746                         if (convpath)
 747                                 LFREEPATH(fname);
 748                         return (error);
 749                 }
 750                 tv[0].tv_sec = lut.l_actime;
 751                 tv[0].tv_usec = 0;
 752                 tv[1].tv_sec = lut.l_modtime;
 753                 tv[1].tv_usec = 0;
 754                 tvp = tv;
 755         } else
 756                 tvp = NULL;
 757
 758         if (!convpath) {
 759                 error = kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE,
 760                     tvp, UIO_SYSSPACE);
 761         } else {
 762                 error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp,
 763                     UIO_SYSSPACE);
 764                 LFREEPATH(fname);
 765         }
 766         return (error);
 767 }
 768 #endif
 769
 770 #ifdef LINUX_LEGACY_SYSCALLS
 771 int
 772 linux_utimes(struct thread *td, struct linux_utimes_args *args)
 773 {
 774         l_timeval ltv[2];
 775         struct timeval tv[2], *tvp = NULL;
 776         char *fname;
 777         int error;
 778         bool convpath;
 779
 780         convpath = LUSECONVPATH(td);
 781         if (convpath)
 782                 LCONVPATHEXIST(td, args->fname, &fname);
 783
 784         if (args->tptr != NULL) {
 785                 if ((error = copyin(args->tptr, ltv, sizeof ltv))) {
 786                         LFREEPATH(fname);
 787                         return (error);
 788                 }
 789                 tv[0].tv_sec = ltv[0].tv_sec;
 790                 tv[0].tv_usec = ltv[0].tv_usec;
 791                 tv[1].tv_sec = ltv[1].tv_sec;
 792                 tv[1].tv_usec = ltv[1].tv_usec;
 793                 tvp = tv;
 794         }
 795
 796         if (!convpath) {
 797                 error = kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE,
 798                     tvp, UIO_SYSSPACE);
 799         } else {
 800                 error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE,
 801                     tvp, UIO_SYSSPACE);
 802                 LFREEPATH(fname);
 803         }
 804         return (error);
 805 }
 806 #endif
 807
 808 static int
 809 linux_utimensat_nsec_valid(l_long nsec)
 810 {
 811
 812         if (nsec == LINUX_UTIME_OMIT || nsec == LINUX_UTIME_NOW)
 813                 return (0);
 814         if (nsec >= 0 && nsec <= 999999999)
 815                 return (0);
 816         return (1);
 817 }
 818
 819 int
 820 linux_utimensat(struct thread *td, struct linux_utimensat_args *args)
 821 {
 822         struct l_timespec l_times[2];
 823         struct timespec times[2], *timesp = NULL;
 824         char *path = NULL;
 825         int error, dfd, flags = 0;
 826
 827         dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 828
 829         if (args->flags & ~LINUX_AT_SYMLINK_NOFOLLOW)
 830                 return (EINVAL);
 831
 832         if (args->times != NULL) {
 833                 error = copyin(args->times, l_times, sizeof(l_times));
 834                 if (error != 0)
 835                         return (error);
 836
 837                 if (linux_utimensat_nsec_valid(l_times[0].tv_nsec) != 0 ||
 838                     linux_utimensat_nsec_valid(l_times[1].tv_nsec) != 0)
 839                         return (EINVAL);
 840
 841                 times[0].tv_sec = l_times[0].tv_sec;
 842                 switch (l_times[0].tv_nsec)
 843                 {
 844                 case LINUX_UTIME_OMIT:
 845                         times[0].tv_nsec = UTIME_OMIT;
 846                         break;
 847                 case LINUX_UTIME_NOW:
 848                         times[0].tv_nsec = UTIME_NOW;
 849                         break;
 850                 default:
 851                         times[0].tv_nsec = l_times[0].tv_nsec;
 852                 }
 853
 854                 times[1].tv_sec = l_times[1].tv_sec;
 855                 switch (l_times[1].tv_nsec)
 856                 {
 857                 case LINUX_UTIME_OMIT:
 858                         times[1].tv_nsec = UTIME_OMIT;
 859                         break;
 860                 case LINUX_UTIME_NOW:
 861                         times[1].tv_nsec = UTIME_NOW;
 862                         break;
 863                 default:
 864                         times[1].tv_nsec = l_times[1].tv_nsec;
 865                         break;
 866                 }
 867                 timesp = times;
 868
 869                 /* This breaks POSIX, but is what the Linux kernel does
 870                  * _on purpose_ (documented in the man page for utimensat(2)),
 871                  * so we must follow that behaviour. */
 872                 if (times[0].tv_nsec == UTIME_OMIT &&
 873                     times[1].tv_nsec == UTIME_OMIT)
 874                         return (0);
 875         }
 876
 877         if (!LUSECONVPATH(td)) {
 878                 if (args->pathname != NULL) {
 879                         return (kern_utimensat(td, dfd, args->pathname,
 880                             UIO_USERSPACE, timesp, UIO_SYSSPACE, flags));
 881                 }
 882         }
 883
 884         if (args->pathname != NULL)
 885                 LCONVPATHEXIST_AT(td, args->pathname, &path, dfd);
 886         else if (args->flags != 0)
 887                 return (EINVAL);
 888
 889         if (args->flags & LINUX_AT_SYMLINK_NOFOLLOW)
 890                 flags |= AT_SYMLINK_NOFOLLOW;
 891
 892         if (path == NULL)
 893                 error = kern_futimens(td, dfd, timesp, UIO_SYSSPACE);
 894         else {
 895                 error = kern_utimensat(td, dfd, path, UIO_SYSSPACE, timesp,
 896                         UIO_SYSSPACE, flags);
 897                 LFREEPATH(path);
 898         }
 899
 900         return (error);
 901 }
 902
 903 #ifdef LINUX_LEGACY_SYSCALLS
 904 int
 905 linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
 906 {
 907         l_timeval ltv[2];
 908         struct timeval tv[2], *tvp = NULL;
 909         char *fname;
 910         int error, dfd;
 911         bool convpath;
 912
 913         convpath = LUSECONVPATH(td);
 914         dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 915         if (convpath)
 916                 LCONVPATHEXIST_AT(td, args->filename, &fname, dfd);
 917
 918         if (args->utimes != NULL) {
 919                 if ((error = copyin(args->utimes, ltv, sizeof ltv))) {
 920                         if (convpath)
 921                                 LFREEPATH(fname);
 922                         return (error);
 923                 }
 924                 tv[0].tv_sec = ltv[0].tv_sec;
 925                 tv[0].tv_usec = ltv[0].tv_usec;
 926                 tv[1].tv_sec = ltv[1].tv_sec;
 927                 tv[1].tv_usec = ltv[1].tv_usec;
 928                 tvp = tv;
 929         }
 930
 931         if (!convpath) {
 932                 error = kern_utimesat(td, dfd, args->filename, UIO_USERSPACE,
 933                     tvp, UIO_SYSSPACE);
 934         } else {
 935                 error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
 936                 LFREEPATH(fname);
 937         }
 938         return (error);
 939 }
 940 #endif
 941
 942 static int
 943 linux_common_wait(struct thread *td, int pid, int *statusp,
 944     int options, struct __wrusage *wrup)
 945 {
 946         siginfo_t siginfo;
 947         idtype_t idtype;
 948         id_t id;
 949         int error, status, tmpstat;
 950
 951         if (pid == WAIT_ANY) {
 952                 idtype = P_ALL;
 953                 id = 0;
 954         } else if (pid < 0) {
 955                 idtype = P_PGID;
 956                 id = (id_t)-pid;
 957         } else {
 958                 idtype = P_PID;
 959                 id = (id_t)pid;
 960         }
 961
 962         /*
 963          * For backward compatibility we implicitly add flags WEXITED
 964          * and WTRAPPED here.
 965          */
 966         options |= WEXITED | WTRAPPED;
 967         error = kern_wait6(td, idtype, id, &status, options, wrup, &siginfo);
 968         if (error)
 969                 return (error);
 970
 971         if (statusp) {
 972                 tmpstat = status & 0xffff;
 973                 if (WIFSIGNALED(tmpstat)) {
 974                         tmpstat = (tmpstat & 0xffffff80) |
 975                             bsd_to_linux_signal(WTERMSIG(tmpstat));
 976                 } else if (WIFSTOPPED(tmpstat)) {
 977                         tmpstat = (tmpstat & 0xffff00ff) |
 978                             (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8);
 979 #if defined(__amd64__) && !defined(COMPAT_LINUX32)
 980                         if (WSTOPSIG(status) == SIGTRAP) {
 981                                 tmpstat = linux_ptrace_status(td,
 982                                     siginfo.si_pid, tmpstat);
 983                         }
 984 #endif
 985                 } else if (WIFCONTINUED(tmpstat)) {
 986                         tmpstat = 0xffff;
 987                 }
 988                 error = copyout(&tmpstat, statusp, sizeof(int));
 989         }
 990
 991         return (error);
 992 }
 993
 994 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 995 int
 996 linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
 997 {
 998         struct linux_wait4_args wait4_args;
 999
1000         wait4_args.pid = args->pid;
1001         wait4_args.status = args->status;
1002         wait4_args.options = args->options;
1003         wait4_args.rusage = NULL;
1004
1005         return (linux_wait4(td, &wait4_args));
1006 }
1007 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1008
1009 int
1010 linux_wait4(struct thread *td, struct linux_wait4_args *args)
1011 {
1012         int error, options;
1013         struct __wrusage wru, *wrup;
1014
1015         if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG |
1016             LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
1017                 return (EINVAL);
1018
1019         options = WEXITED;
1020         linux_to_bsd_waitopts(args->options, &options);
1021
1022         if (args->rusage != NULL)
1023                 wrup = &wru;
1024         else
1025                 wrup = NULL;
1026         error = linux_common_wait(td, args->pid, args->status, options, wrup);
1027         if (error != 0)
1028                 return (error);
1029         if (args->rusage != NULL)
1030                 error = linux_copyout_rusage(&wru.wru_self, args->rusage);
1031         return (error);
1032 }
1033
1034 int
1035 linux_waitid(struct thread *td, struct linux_waitid_args *args)
1036 {
1037         int status, options, sig;
1038         struct __wrusage wru;
1039         siginfo_t siginfo;
1040         l_siginfo_t lsi;
1041         idtype_t idtype;
1042         struct proc *p;
1043         int error;
1044
1045         options = 0;
1046         linux_to_bsd_waitopts(args->options, &options);
1047
1048         if (options & ~(WNOHANG | WNOWAIT | WEXITED | WUNTRACED | WCONTINUED))
1049                 return (EINVAL);
1050         if (!(options & (WEXITED | WUNTRACED | WCONTINUED)))
1051                 return (EINVAL);
1052
1053         switch (args->idtype) {
1054         case LINUX_P_ALL:
1055                 idtype = P_ALL;
1056                 break;
1057         case LINUX_P_PID:
1058                 if (args->id <= 0)
1059                         return (EINVAL);
1060                 idtype = P_PID;
1061                 break;
1062         case LINUX_P_PGID:
1063                 if (args->id <= 0)
1064                         return (EINVAL);
1065                 idtype = P_PGID;
1066                 break;
1067         default:
1068                 return (EINVAL);
1069         }
1070
1071         error = kern_wait6(td, idtype, args->id, &status, options,
1072             &wru, &siginfo);
1073         if (error != 0)
1074                 return (error);
1075         if (args->rusage != NULL) {
1076                 error = linux_copyout_rusage(&wru.wru_children,
1077                     args->rusage);
1078                 if (error != 0)
1079                         return (error);
1080         }
1081         if (args->info != NULL) {
1082                 p = td->td_proc;
1083                 bzero(&lsi, sizeof(lsi));
1084                 if (td->td_retval[0] != 0) {
1085                         sig = bsd_to_linux_signal(siginfo.si_signo);
1086                         siginfo_to_lsiginfo(&siginfo, &lsi, sig);
1087                 }
1088                 error = copyout(&lsi, args->info, sizeof(lsi));
1089         }
1090         td->td_retval[0] = 0;
1091
1092         return (error);
1093 }
1094
1095 #ifdef LINUX_LEGACY_SYSCALLS
1096 int
1097 linux_mknod(struct thread *td, struct linux_mknod_args *args)
1098 {
1099         char *path;
1100         int error;
1101         enum uio_seg seg;
1102         bool convpath;
1103
1104         convpath = LUSECONVPATH(td);
1105         if (!convpath) {
1106                 path = args->path;
1107                 seg = UIO_USERSPACE;
1108         } else {
1109                 LCONVPATHCREAT(td, args->path, &path);
1110                 seg = UIO_SYSSPACE;
1111         }
1112
1113         switch (args->mode & S_IFMT) {
1114         case S_IFIFO:
1115         case S_IFSOCK:
1116                 error = kern_mkfifoat(td, AT_FDCWD, path, seg,
1117                     args->mode);
1118                 break;
1119
1120         case S_IFCHR:
1121         case S_IFBLK:
1122                 error = kern_mknodat(td, AT_FDCWD, path, seg,
1123                     args->mode, args->dev);
1124                 break;
1125
1126         case S_IFDIR:
1127                 error = EPERM;
1128                 break;
1129
1130         case 0:
1131                 args->mode |= S_IFREG;
1132                 /* FALLTHROUGH */
1133         case S_IFREG:
1134                 error = kern_openat(td, AT_FDCWD, path, seg,
1135                     O_WRONLY | O_CREAT | O_TRUNC, args->mode);
1136                 if (error == 0)
1137                         kern_close(td, td->td_retval[0]);
1138                 break;
1139
1140         default:
1141                 error = EINVAL;
1142                 break;
1143         }
1144         if (convpath)
1145                 LFREEPATH(path);
1146         return (error);
1147 }
1148 #endif
1149
1150 int
1151 linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
1152 {
1153         char *path;
1154         int error, dfd;
1155         enum uio_seg seg;
1156         bool convpath;
1157
1158         dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
1159
1160         convpath = LUSECONVPATH(td);
1161         if (!convpath) {
1162                 path = __DECONST(char *, args->filename);
1163                 seg = UIO_USERSPACE;
1164         } else {
1165                 LCONVPATHCREAT_AT(td, args->filename, &path, dfd);
1166                 seg = UIO_SYSSPACE;
1167         }
1168
1169         switch (args->mode & S_IFMT) {
1170         case S_IFIFO:
1171         case S_IFSOCK:
1172                 error = kern_mkfifoat(td, dfd, path, seg, args->mode);
1173                 break;
1174
1175         case S_IFCHR:
1176         case S_IFBLK:
1177                 error = kern_mknodat(td, dfd, path, seg, args->mode,
1178                     args->dev);
1179                 break;
1180
1181         case S_IFDIR:
1182                 error = EPERM;
1183                 break;
1184
1185         case 0:
1186                 args->mode |= S_IFREG;
1187                 /* FALLTHROUGH */
1188         case S_IFREG:
1189                 error = kern_openat(td, dfd, path, seg,
1190                     O_WRONLY | O_CREAT | O_TRUNC, args->mode);
1191                 if (error == 0)
1192                         kern_close(td, td->td_retval[0]);
1193                 break;
1194
1195         default:
1196                 error = EINVAL;
1197                 break;
1198         }
1199         if (convpath)
1200                 LFREEPATH(path);
1201         return (error);
1202 }
1203
1204 /*
1205  * UGH! This is just about the dumbest idea I've ever heard!!
1206  */
1207 int
1208 linux_personality(struct thread *td, struct linux_personality_args *args)
1209 {
1210         struct linux_pemuldata *pem;
1211         struct proc *p = td->td_proc;
1212         uint32_t old;
1213
1214         PROC_LOCK(p);
1215         pem = pem_find(p);
1216         old = pem->persona;
1217         if (args->per != 0xffffffff)
1218                 pem->persona = args->per;
1219         PROC_UNLOCK(p);
1220
1221         td->td_retval[0] = old;
1222         return (0);
1223 }
1224
/*
 * Interval timer value as exchanged with Linux userland by
 * linux_setitimer() and linux_getitimer(); both members use the
 * Linux timeval type.
 */
struct l_itimerval {
        l_timeval it_interval;
        l_timeval it_value;
};
1229
/*
 * Copy an interval timer value member by member from *(lip) into
 * *(bip).  Despite its name the macro is direction-agnostic and is
 * used both for Linux -> native and native -> Linux conversions.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * a single statement and is safe in unbraced if/else bodies; invoke it
 * with a trailing semicolon.
 */
#define B2L_ITIMERVAL(bip, lip) do {                                    \
        (bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec;          \
        (bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec;        \
        (bip)->it_value.tv_sec = (lip)->it_value.tv_sec;                \
        (bip)->it_value.tv_usec = (lip)->it_value.tv_usec;              \
} while (0)
1235
1236 int
1237 linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
1238 {
1239         int error;
1240         struct l_itimerval ls;
1241         struct itimerval aitv, oitv;
1242
1243         if (uap->itv == NULL) {
1244                 uap->itv = uap->oitv;
1245                 return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
1246         }
1247
1248         error = copyin(uap->itv, &ls, sizeof(ls));
1249         if (error != 0)
1250                 return (error);
1251         B2L_ITIMERVAL(&aitv, &ls);
1252         error = kern_setitimer(td, uap->which, &aitv, &oitv);
1253         if (error != 0 || uap->oitv == NULL)
1254                 return (error);
1255         B2L_ITIMERVAL(&ls, &oitv);
1256
1257         return (copyout(&ls, uap->oitv, sizeof(ls)));
1258 }
1259
1260 int
1261 linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
1262 {
1263         int error;
1264         struct l_itimerval ls;
1265         struct itimerval aitv;
1266
1267         error = kern_getitimer(td, uap->which, &aitv);
1268         if (error != 0)
1269                 return (error);
1270         B2L_ITIMERVAL(&ls, &aitv);
1271         return (copyout(&ls, uap->itv, sizeof(ls)));
1272 }
1273
1274 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1275 int
1276 linux_nice(struct thread *td, struct linux_nice_args *args)
1277 {
1278
1279         return (kern_setpriority(td, PRIO_PROCESS, 0, args->inc));
1280 }
1281 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1282
1283 int
1284 linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
1285 {
1286         struct ucred *newcred, *oldcred;
1287         l_gid_t *linux_gidset;
1288         gid_t *bsd_gidset;
1289         int ngrp, error;
1290         struct proc *p;
1291
1292         ngrp = args->gidsetsize;
1293         if (ngrp < 0 || ngrp >= ngroups_max + 1)
1294                 return (EINVAL);
1295         linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK);
1296         error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
1297         if (error)
1298                 goto out;
1299         newcred = crget();
1300         crextend(newcred, ngrp + 1);
1301         p = td->td_proc;
1302         PROC_LOCK(p);
1303         oldcred = p->p_ucred;
1304         crcopy(newcred, oldcred);
1305
1306         /*
1307          * cr_groups[0] holds egid. Setting the whole set from
1308          * the supplied set will cause egid to be changed too.
1309          * Keep cr_groups[0] unchanged to prevent that.
1310          */
1311
1312         if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS)) != 0) {
1313                 PROC_UNLOCK(p);
1314                 crfree(newcred);
1315                 goto out;
1316         }
1317
1318         if (ngrp > 0) {
1319                 newcred->cr_ngroups = ngrp + 1;
1320
1321                 bsd_gidset = newcred->cr_groups;
1322                 ngrp--;
1323                 while (ngrp >= 0) {
1324                         bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
1325                         ngrp--;
1326                 }
1327         } else
1328                 newcred->cr_ngroups = 1;
1329
1330         setsugid(p);
1331         proc_set_cred(p, newcred);
1332         PROC_UNLOCK(p);
1333         crfree(oldcred);
1334         error = 0;
1335 out:
1336         free(linux_gidset, M_LINUX);
1337         return (error);
1338 }
1339
1340 int
1341 linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
1342 {
1343         struct ucred *cred;
1344         l_gid_t *linux_gidset;
1345         gid_t *bsd_gidset;
1346         int bsd_gidsetsz, ngrp, error;
1347
1348         cred = td->td_ucred;
1349         bsd_gidset = cred->cr_groups;
1350         bsd_gidsetsz = cred->cr_ngroups - 1;
1351
1352         /*
1353          * cr_groups[0] holds egid. Returning the whole set
1354          * here will cause a duplicate. Exclude cr_groups[0]
1355          * to prevent that.
1356          */
1357
1358         if ((ngrp = args->gidsetsize) == 0) {
1359                 td->td_retval[0] = bsd_gidsetsz;
1360                 return (0);
1361         }
1362
1363         if (ngrp < bsd_gidsetsz)
1364                 return (EINVAL);
1365
1366         ngrp = 0;
1367         linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
1368             M_LINUX, M_WAITOK);
1369         while (ngrp < bsd_gidsetsz) {
1370                 linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
1371                 ngrp++;
1372         }
1373
1374         error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
1375         free(linux_gidset, M_LINUX);
1376         if (error)
1377                 return (error);
1378
1379         td->td_retval[0] = ngrp;
1380         return (0);
1381 }
1382
1383 static bool
1384 linux_get_dummy_limit(l_uint resource, struct rlimit *rlim)
1385 {
1386
1387         if (linux_dummy_rlimits == 0)
1388                 return (false);
1389
1390         switch (resource) {
1391         case LINUX_RLIMIT_LOCKS:
1392         case LINUX_RLIMIT_SIGPENDING:
1393         case LINUX_RLIMIT_MSGQUEUE:
1394         case LINUX_RLIMIT_RTTIME:
1395                 rlim->rlim_cur = LINUX_RLIM_INFINITY;
1396                 rlim->rlim_max = LINUX_RLIM_INFINITY;
1397                 return (true);
1398         case LINUX_RLIMIT_NICE:
1399         case LINUX_RLIMIT_RTPRIO:
1400                 rlim->rlim_cur = 0;
1401                 rlim->rlim_max = 0;
1402                 return (true);
1403         default:
1404                 return (false);
1405         }
1406 }
1407
1408 int
1409 linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
1410 {
1411         struct rlimit bsd_rlim;
1412         struct l_rlimit rlim;
1413         u_int which;
1414         int error;
1415
1416         if (args->resource >= LINUX_RLIM_NLIMITS)
1417                 return (EINVAL);
1418
1419         which = linux_to_bsd_resource[args->resource];
1420         if (which == -1)
1421                 return (EINVAL);
1422
1423         error = copyin(args->rlim, &rlim, sizeof(rlim));
1424         if (error)
1425                 return (error);
1426
1427         bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
1428         bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
1429         return (kern_setrlimit(td, which, &bsd_rlim));
1430 }
1431
1432 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1433 int
1434 linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
1435 {
1436         struct l_rlimit rlim;
1437         struct rlimit bsd_rlim;
1438         u_int which;
1439
1440         if (linux_get_dummy_limit(args->resource, &bsd_rlim)) {
1441                 rlim.rlim_cur = bsd_rlim.rlim_cur;
1442                 rlim.rlim_max = bsd_rlim.rlim_max;
1443                 return (copyout(&rlim, args->rlim, sizeof(rlim)));
1444         }
1445
1446         if (args->resource >= LINUX_RLIM_NLIMITS)
1447                 return (EINVAL);
1448
1449         which = linux_to_bsd_resource[args->resource];
1450         if (which == -1)
1451                 return (EINVAL);
1452
1453         lim_rlimit(td, which, &bsd_rlim);
1454
1455 #ifdef COMPAT_LINUX32
1456         rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
1457         if (rlim.rlim_cur == UINT_MAX)
1458                 rlim.rlim_cur = INT_MAX;
1459         rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
1460         if (rlim.rlim_max == UINT_MAX)
1461                 rlim.rlim_max = INT_MAX;
1462 #else
1463         rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
1464         if (rlim.rlim_cur == ULONG_MAX)
1465                 rlim.rlim_cur = LONG_MAX;
1466         rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
1467         if (rlim.rlim_max == ULONG_MAX)
1468                 rlim.rlim_max = LONG_MAX;
1469 #endif
1470         return (copyout(&rlim, args->rlim, sizeof(rlim)));
1471 }
1472 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1473
1474 int
1475 linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
1476 {
1477         struct l_rlimit rlim;
1478         struct rlimit bsd_rlim;
1479         u_int which;
1480
1481         if (linux_get_dummy_limit(args->resource, &bsd_rlim)) {
1482                 rlim.rlim_cur = bsd_rlim.rlim_cur;
1483                 rlim.rlim_max = bsd_rlim.rlim_max;
1484                 return (copyout(&rlim, args->rlim, sizeof(rlim)));
1485         }
1486
1487         if (args->resource >= LINUX_RLIM_NLIMITS)
1488                 return (EINVAL);
1489
1490         which = linux_to_bsd_resource[args->resource];
1491         if (which == -1)
1492                 return (EINVAL);
1493
1494         lim_rlimit(td, which, &bsd_rlim);
1495
1496         rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
1497         rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
1498         return (copyout(&rlim, args->rlim, sizeof(rlim)));
1499 }
1500
1501 int
1502 linux_sched_setscheduler(struct thread *td,
1503     struct linux_sched_setscheduler_args *args)
1504 {
1505         struct sched_param sched_param;
1506         struct thread *tdt;
1507         int error, policy;
1508
1509         switch (args->policy) {
1510         case LINUX_SCHED_OTHER:
1511                 policy = SCHED_OTHER;
1512                 break;
1513         case LINUX_SCHED_FIFO:
1514                 policy = SCHED_FIFO;
1515                 break;
1516         case LINUX_SCHED_RR:
1517                 policy = SCHED_RR;
1518                 break;
1519         default:
1520                 return (EINVAL);
1521         }
1522
1523         error = copyin(args->param, &sched_param, sizeof(sched_param));
1524         if (error)
1525                 return (error);
1526
1527         if (linux_map_sched_prio) {
1528                 switch (policy) {
1529                 case SCHED_OTHER:
1530                         if (sched_param.sched_priority != 0)
1531                                 return (EINVAL);
1532
1533                         sched_param.sched_priority =
1534                             PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
1535                         break;
1536                 case SCHED_FIFO:
1537                 case SCHED_RR:
1538                         if (sched_param.sched_priority < 1 ||
1539                             sched_param.sched_priority >= LINUX_MAX_RT_PRIO)
1540                                 return (EINVAL);
1541
1542                         /*
1543                          * Map [1, LINUX_MAX_RT_PRIO - 1] to
1544                          * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
1545                          */
1546                         sched_param.sched_priority =
1547                             (sched_param.sched_priority - 1) *
1548                             (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
1549                             (LINUX_MAX_RT_PRIO - 1);
1550                         break;
1551                 }
1552         }
1553
1554         tdt = linux_tdfind(td, args->pid, -1);
1555         if (tdt == NULL)
1556                 return (ESRCH);
1557
1558         error = kern_sched_setscheduler(td, tdt, policy, &sched_param);
1559         PROC_UNLOCK(tdt->td_proc);
1560         return (error);
1561 }
1562
1563 int
1564 linux_sched_getscheduler(struct thread *td,
1565     struct linux_sched_getscheduler_args *args)
1566 {
1567         struct thread *tdt;
1568         int error, policy;
1569
1570         tdt = linux_tdfind(td, args->pid, -1);
1571         if (tdt == NULL)
1572                 return (ESRCH);
1573
1574         error = kern_sched_getscheduler(td, tdt, &policy);
1575         PROC_UNLOCK(tdt->td_proc);
1576
1577         switch (policy) {
1578         case SCHED_OTHER:
1579                 td->td_retval[0] = LINUX_SCHED_OTHER;
1580                 break;
1581         case SCHED_FIFO:
1582                 td->td_retval[0] = LINUX_SCHED_FIFO;
1583                 break;
1584         case SCHED_RR:
1585                 td->td_retval[0] = LINUX_SCHED_RR;
1586                 break;
1587         }
1588         return (error);
1589 }
1590
1591 int
1592 linux_sched_get_priority_max(struct thread *td,
1593     struct linux_sched_get_priority_max_args *args)
1594 {
1595         struct sched_get_priority_max_args bsd;
1596
1597         if (linux_map_sched_prio) {
1598                 switch (args->policy) {
1599                 case LINUX_SCHED_OTHER:
1600                         td->td_retval[0] = 0;
1601                         return (0);
1602                 case LINUX_SCHED_FIFO:
1603                 case LINUX_SCHED_RR:
1604                         td->td_retval[0] = LINUX_MAX_RT_PRIO - 1;
1605                         return (0);
1606                 default:
1607                         return (EINVAL);
1608                 }
1609         }
1610
1611         switch (args->policy) {
1612         case LINUX_SCHED_OTHER:
1613                 bsd.policy = SCHED_OTHER;
1614                 break;
1615         case LINUX_SCHED_FIFO:
1616                 bsd.policy = SCHED_FIFO;
1617                 break;
1618         case LINUX_SCHED_RR:
1619                 bsd.policy = SCHED_RR;
1620                 break;
1621         default:
1622                 return (EINVAL);
1623         }
1624         return (sys_sched_get_priority_max(td, &bsd));
1625 }
1626
1627 int
1628 linux_sched_get_priority_min(struct thread *td,
1629     struct linux_sched_get_priority_min_args *args)
1630 {
1631         struct sched_get_priority_min_args bsd;
1632
1633         if (linux_map_sched_prio) {
1634                 switch (args->policy) {
1635                 case LINUX_SCHED_OTHER:
1636                         td->td_retval[0] = 0;
1637                         return (0);
1638                 case LINUX_SCHED_FIFO:
1639                 case LINUX_SCHED_RR:
1640                         td->td_retval[0] = 1;
1641                         return (0);
1642                 default:
1643                         return (EINVAL);
1644                 }
1645         }
1646
1647         switch (args->policy) {
1648         case LINUX_SCHED_OTHER:
1649                 bsd.policy = SCHED_OTHER;
1650                 break;
1651         case LINUX_SCHED_FIFO:
1652                 bsd.policy = SCHED_FIFO;
1653                 break;
1654         case LINUX_SCHED_RR:
1655                 bsd.policy = SCHED_RR;
1656                 break;
1657         default:
1658                 return (EINVAL);
1659         }
1660         return (sys_sched_get_priority_min(td, &bsd));
1661 }
1662
/*
 * Constants of the Linux reboot(2) interface as checked by
 * linux_reboot(): the REBOOT_MAGIC* values are compared against the
 * magic1/magic2 arguments, the remaining ones are the accepted values
 * of the cmd argument.
 */
#define REBOOT_CAD_ON   0x89abcdef
#define REBOOT_CAD_OFF  0
#define REBOOT_HALT     0xcdef0123
#define REBOOT_RESTART  0x01234567
#define REBOOT_RESTART2 0xA1B2C3D4
#define REBOOT_POWEROFF 0x4321FEDC
#define REBOOT_MAGIC1   0xfee1dead
#define REBOOT_MAGIC2   0x28121969
#define REBOOT_MAGIC2A  0x05121996
#define REBOOT_MAGIC2B  0x16041998
1673
1674 int
1675 linux_reboot(struct thread *td, struct linux_reboot_args *args)
1676 {
1677         struct reboot_args bsd_args;
1678
1679         if (args->magic1 != REBOOT_MAGIC1)
1680                 return (EINVAL);
1681
1682         switch (args->magic2) {
1683         case REBOOT_MAGIC2:
1684         case REBOOT_MAGIC2A:
1685         case REBOOT_MAGIC2B:
1686                 break;
1687         default:
1688                 return (EINVAL);
1689         }
1690
1691         switch (args->cmd) {
1692         case REBOOT_CAD_ON:
1693         case REBOOT_CAD_OFF:
1694                 return (priv_check(td, PRIV_REBOOT));
1695         case REBOOT_HALT:
1696                 bsd_args.opt = RB_HALT;
1697                 break;
1698         case REBOOT_RESTART:
1699         case REBOOT_RESTART2:
1700                 bsd_args.opt = 0;
1701                 break;
1702         case REBOOT_POWEROFF:
1703                 bsd_args.opt = RB_POWEROFF;
1704                 break;
1705         default:
1706                 return (EINVAL);
1707         }
1708         return (sys_reboot(td, &bsd_args));
1709 }
1710
1711 int
1712 linux_getpid(struct thread *td, struct linux_getpid_args *args)
1713 {
1714
1715         td->td_retval[0] = td->td_proc->p_pid;
1716
1717         return (0);
1718 }
1719
1720 int
1721 linux_gettid(struct thread *td, struct linux_gettid_args *args)
1722 {
1723         struct linux_emuldata *em;
1724
1725         em = em_find(td);
1726         KASSERT(em != NULL, ("gettid: emuldata not found.\n"));
1727
1728         td->td_retval[0] = em->em_tid;
1729
1730         return (0);
1731 }
1732
1733 int
1734 linux_getppid(struct thread *td, struct linux_getppid_args *args)
1735 {
1736
1737         td->td_retval[0] = kern_getppid(td);
1738         return (0);
1739 }
1740
1741 int
1742 linux_getgid(struct thread *td, struct linux_getgid_args *args)
1743 {
1744
1745         td->td_retval[0] = td->td_ucred->cr_rgid;
1746         return (0);
1747 }
1748
1749 int
1750 linux_getuid(struct thread *td, struct linux_getuid_args *args)
1751 {
1752
1753         td->td_retval[0] = td->td_ucred->cr_ruid;
1754         return (0);
1755 }
1756
1757 int
1758 linux_getsid(struct thread *td, struct linux_getsid_args *args)
1759 {
1760
1761         return (kern_getsid(td, args->pid));
1762 }
1763
1764 int
1765 linux_nosys(struct thread *td, struct nosys_args *ignore)
1766 {
1767
1768         return (ENOSYS);
1769 }
1770
1771 int
1772 linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
1773 {
1774         int error;
1775
1776         error = kern_getpriority(td, args->which, args->who);
1777         td->td_retval[0] = 20 - td->td_retval[0];
1778         return (error);
1779 }
1780
1781 int
1782 linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
1783 {
1784         int name[2];
1785
1786         name[0] = CTL_KERN;
1787         name[1] = KERN_HOSTNAME;
1788         return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
1789             args->len, 0, 0));
1790 }
1791
1792 int
1793 linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
1794 {
1795         int name[2];
1796
1797         name[0] = CTL_KERN;
1798         name[1] = KERN_NISDOMAINNAME;
1799         return (userland_sysctl(td, name, 2, 0, 0, 0, args->name,
1800             args->len, 0, 0));
1801 }
1802
1803 int
1804 linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
1805 {
1806
1807         LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid,
1808             args->error_code);
1809
1810         /*
1811          * XXX: we should send a signal to the parent if
1812          * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
1813          * as it doesnt occur often.
1814          */
1815         exit1(td, args->error_code, 0);
1816                 /* NOTREACHED */
1817 }
1818
/*
 * Capability interface versions accepted in l_user_cap_header.version.
 * linux_capget() uses one l_user_cap_data element for version 1 and
 * two for versions 2 and 3.
 */
#define _LINUX_CAPABILITY_VERSION_1  0x19980330
#define _LINUX_CAPABILITY_VERSION_2  0x20071026
#define _LINUX_CAPABILITY_VERSION_3  0x20080522

/* Header copied in from userland: interface version and target pid. */
struct l_user_cap_header {
        l_int   version;
        l_int   pid;
};

/* One element of the capability sets exchanged with userland. */
struct l_user_cap_data {
        l_int   effective;
        l_int   permitted;
        l_int   inheritable;
};
1833
1834 int
1835 linux_capget(struct thread *td, struct linux_capget_args *uap)
1836 {
1837         struct l_user_cap_header luch;
1838         struct l_user_cap_data lucd[2];
1839         int error, u32s;
1840
1841         if (uap->hdrp == NULL)
1842                 return (EFAULT);
1843
1844         error = copyin(uap->hdrp, &luch, sizeof(luch));
1845         if (error != 0)
1846                 return (error);
1847
1848         switch (luch.version) {
1849         case _LINUX_CAPABILITY_VERSION_1:
1850                 u32s = 1;
1851                 break;
1852         case _LINUX_CAPABILITY_VERSION_2:
1853         case _LINUX_CAPABILITY_VERSION_3:
1854                 u32s = 2;
1855                 break;
1856         default:
1857                 luch.version = _LINUX_CAPABILITY_VERSION_1;
1858                 error = copyout(&luch, uap->hdrp, sizeof(luch));
1859                 if (error)
1860                         return (error);
1861                 return (EINVAL);
1862         }
1863
1864         if (luch.pid)
1865                 return (EPERM);
1866
1867         if (uap->datap) {
1868                 /*
1869                  * The current implementation doesn't support setting
1870                  * a capability (it's essentially a stub) so indicate
1871                  * that no capabilities are currently set or available
1872                  * to request.
1873                  */
1874                 memset(&lucd, 0, u32s * sizeof(lucd[0]));
1875                 error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0]));
1876         }
1877
1878         return (error);
1879 }
1880
1881 int
1882 linux_capset(struct thread *td, struct linux_capset_args *uap)
1883 {
1884         struct l_user_cap_header luch;
1885         struct l_user_cap_data lucd[2];
1886         int error, i, u32s;
1887
1888         if (uap->hdrp == NULL || uap->datap == NULL)
1889                 return (EFAULT);
1890
1891         error = copyin(uap->hdrp, &luch, sizeof(luch));
1892         if (error != 0)
1893                 return (error);
1894
1895         switch (luch.version) {
1896         case _LINUX_CAPABILITY_VERSION_1:
1897                 u32s = 1;
1898                 break;
1899         case _LINUX_CAPABILITY_VERSION_2:
1900         case _LINUX_CAPABILITY_VERSION_3:
1901                 u32s = 2;
1902                 break;
1903         default:
1904                 luch.version = _LINUX_CAPABILITY_VERSION_1;
1905                 error = copyout(&luch, uap->hdrp, sizeof(luch));
1906                 if (error)
1907                         return (error);
1908                 return (EINVAL);
1909         }
1910
1911         if (luch.pid)
1912                 return (EPERM);
1913
1914         error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0]));
1915         if (error != 0)
1916                 return (error);
1917
1918         /* We currently don't support setting any capabilities. */
1919         for (i = 0; i < u32s; i++) {
1920                 if (lucd[i].effective || lucd[i].permitted ||
1921                     lucd[i].inheritable) {
1922                         linux_msg(td,
1923                             "capset[%d] effective=0x%x, permitted=0x%x, "
1924                             "inheritable=0x%x is not implemented", i,
1925                             (int)lucd[i].effective, (int)lucd[i].permitted,
1926                             (int)lucd[i].inheritable);
1927                         return (EPERM);
1928                 }
1929         }
1930
1931         return (0);
1932 }
1933
1934 int
1935 linux_prctl(struct thread *td, struct linux_prctl_args *args)
1936 {
1937         int error = 0, max_size;
1938         struct proc *p = td->td_proc;
1939         char comm[LINUX_MAX_COMM_LEN];
1940         int pdeath_signal;
1941
1942         switch (args->option) {
1943         case LINUX_PR_SET_PDEATHSIG:
1944                 if (!LINUX_SIG_VALID(args->arg2))
1945                         return (EINVAL);
1946                 pdeath_signal = linux_to_bsd_signal(args->arg2);
1947                 return (kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_CTL,
1948                     &pdeath_signal));
1949         case LINUX_PR_GET_PDEATHSIG:
1950                 error = kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_STATUS,
1951                     &pdeath_signal);
1952                 if (error != 0)
1953                         return (error);
1954                 pdeath_signal = bsd_to_linux_signal(pdeath_signal);
1955                 return (copyout(&pdeath_signal,
1956                     (void *)(register_t)args->arg2,
1957                     sizeof(pdeath_signal)));
1958                 break;
1959         case LINUX_PR_SET_DUMPABLE:
1960                 linux_msg(td, "unsupported prctl PR_SET_DUMPABLE");
1961                 error = EINVAL;
1962                 break;
1963         case LINUX_PR_GET_KEEPCAPS:
1964                 /*
1965                  * Indicate that we always clear the effective and
1966                  * permitted capability sets when the user id becomes
1967                  * non-zero (actually the capability sets are simply
1968                  * always zero in the current implementation).
1969                  */
1970                 td->td_retval[0] = 0;
1971                 break;
1972         case LINUX_PR_SET_KEEPCAPS:
1973                 /*
1974                  * Ignore requests to keep the effective and permitted
1975                  * capability sets when the user id becomes non-zero.
1976                  */
1977                 break;
1978         case LINUX_PR_SET_NAME:
1979                 /*
1980                  * To be on the safe side we need to make sure to not
1981                  * overflow the size a Linux program expects. We already
1982                  * do this here in the copyin, so that we don't need to
1983                  * check on copyout.
1984                  */
1985                 max_size = MIN(sizeof(comm), sizeof(p->p_comm));
1986                 error = copyinstr((void *)(register_t)args->arg2, comm,
1987                     max_size, NULL);
1988
1989                 /* Linux silently truncates the name if it is too long. */
1990                 if (error == ENAMETOOLONG) {
1991                         /*
1992                          * XXX: copyinstr() isn't documented to populate the
1993                          * array completely, so do a copyin() to be on the
1994                          * safe side. This should be changed in case
1995                          * copyinstr() is changed to guarantee this.
1996                          */
1997                         error = copyin((void *)(register_t)args->arg2, comm,
1998                             max_size - 1);
1999                         comm[max_size - 1] = '\0';
2000                 }
2001                 if (error)
2002                         return (error);
2003
2004                 PROC_LOCK(p);
2005                 strlcpy(p->p_comm, comm, sizeof(p->p_comm));
2006                 PROC_UNLOCK(p);
2007                 break;
2008         case LINUX_PR_GET_NAME:
2009                 PROC_LOCK(p);
2010                 strlcpy(comm, p->p_comm, sizeof(comm));
2011                 PROC_UNLOCK(p);
2012                 error = copyout(comm, (void *)(register_t)args->arg2,
2013                     strlen(comm) + 1);
2014                 break;
2015         case LINUX_PR_GET_SECCOMP:
2016         case LINUX_PR_SET_SECCOMP:
2017                 /*
2018                  * Same as returned by Linux without CONFIG_SECCOMP enabled.
2019                  */
2020                 error = EINVAL;
2021                 break;
2022         case LINUX_PR_SET_NO_NEW_PRIVS:
2023                 linux_msg(td, "unsupported prctl PR_SET_NO_NEW_PRIVS");
2024                 error = EINVAL;
2025                 break;
2026         case LINUX_PR_SET_PTRACER:
2027                 linux_msg(td, "unsupported prctl PR_SET_PTRACER");
2028                 error = EINVAL;
2029                 break;
2030         default:
2031                 linux_msg(td, "unsupported prctl option %d", args->option);
2032                 error = EINVAL;
2033                 break;
2034         }
2035
2036         return (error);
2037 }
2038
2039 int
2040 linux_sched_setparam(struct thread *td,
2041     struct linux_sched_setparam_args *uap)
2042 {
2043         struct sched_param sched_param;
2044         struct thread *tdt;
2045         int error, policy;
2046
2047         error = copyin(uap->param, &sched_param, sizeof(sched_param));
2048         if (error)
2049                 return (error);
2050
2051         tdt = linux_tdfind(td, uap->pid, -1);
2052         if (tdt == NULL)
2053                 return (ESRCH);
2054
2055         if (linux_map_sched_prio) {
2056                 error = kern_sched_getscheduler(td, tdt, &policy);
2057                 if (error)
2058                         goto out;
2059
2060                 switch (policy) {
2061                 case SCHED_OTHER:
2062                         if (sched_param.sched_priority != 0) {
2063                                 error = EINVAL;
2064                                 goto out;
2065                         }
2066                         sched_param.sched_priority =
2067                             PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
2068                         break;
2069                 case SCHED_FIFO:
2070                 case SCHED_RR:
2071                         if (sched_param.sched_priority < 1 ||
2072                             sched_param.sched_priority >= LINUX_MAX_RT_PRIO) {
2073                                 error = EINVAL;
2074                                 goto out;
2075                         }
2076                         /*
2077                          * Map [1, LINUX_MAX_RT_PRIO - 1] to
2078                          * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
2079                          */
2080                         sched_param.sched_priority =
2081                             (sched_param.sched_priority - 1) *
2082                             (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
2083                             (LINUX_MAX_RT_PRIO - 1);
2084                         break;
2085                 }
2086         }
2087
2088         error = kern_sched_setparam(td, tdt, &sched_param);
2089 out:    PROC_UNLOCK(tdt->td_proc);
2090         return (error);
2091 }
2092
2093 int
2094 linux_sched_getparam(struct thread *td,
2095     struct linux_sched_getparam_args *uap)
2096 {
2097         struct sched_param sched_param;
2098         struct thread *tdt;
2099         int error, policy;
2100
2101         tdt = linux_tdfind(td, uap->pid, -1);
2102         if (tdt == NULL)
2103                 return (ESRCH);
2104
2105         error = kern_sched_getparam(td, tdt, &sched_param);
2106         if (error) {
2107                 PROC_UNLOCK(tdt->td_proc);
2108                 return (error);
2109         }
2110
2111         if (linux_map_sched_prio) {
2112                 error = kern_sched_getscheduler(td, tdt, &policy);
2113                 PROC_UNLOCK(tdt->td_proc);
2114                 if (error)
2115                         return (error);
2116
2117                 switch (policy) {
2118                 case SCHED_OTHER:
2119                         sched_param.sched_priority = 0;
2120                         break;
2121                 case SCHED_FIFO:
2122                 case SCHED_RR:
2123                         /*
2124                          * Map [0, RTP_PRIO_MAX - RTP_PRIO_MIN] to
2125                          * [1, LINUX_MAX_RT_PRIO - 1] (rounding up).
2126                          */
2127                         sched_param.sched_priority =
2128                             (sched_param.sched_priority *
2129                             (LINUX_MAX_RT_PRIO - 1) +
2130                             (RTP_PRIO_MAX - RTP_PRIO_MIN - 1)) /
2131                             (RTP_PRIO_MAX - RTP_PRIO_MIN) + 1;
2132                         break;
2133                 }
2134         } else
2135                 PROC_UNLOCK(tdt->td_proc);
2136
2137         error = copyout(&sched_param, uap->param, sizeof(sched_param));
2138         return (error);
2139 }
2140
2141 /*
2142  * Get affinity of a process.
2143  */
2144 int
2145 linux_sched_getaffinity(struct thread *td,
2146     struct linux_sched_getaffinity_args *args)
2147 {
2148         int error;
2149         struct thread *tdt;
2150
2151         if (args->len < sizeof(cpuset_t))
2152                 return (EINVAL);
2153
2154         tdt = linux_tdfind(td, args->pid, -1);
2155         if (tdt == NULL)
2156                 return (ESRCH);
2157
2158         PROC_UNLOCK(tdt->td_proc);
2159
2160         error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
2161             tdt->td_tid, sizeof(cpuset_t), (cpuset_t *)args->user_mask_ptr);
2162         if (error == 0)
2163                 td->td_retval[0] = sizeof(cpuset_t);
2164
2165         return (error);
2166 }
2167
2168 /*
2169  *  Set affinity of a process.
2170  */
2171 int
2172 linux_sched_setaffinity(struct thread *td,
2173     struct linux_sched_setaffinity_args *args)
2174 {
2175         struct thread *tdt;
2176
2177         if (args->len < sizeof(cpuset_t))
2178                 return (EINVAL);
2179
2180         tdt = linux_tdfind(td, args->pid, -1);
2181         if (tdt == NULL)
2182                 return (ESRCH);
2183
2184         PROC_UNLOCK(tdt->td_proc);
2185
2186         return (kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
2187             tdt->td_tid, sizeof(cpuset_t), (cpuset_t *) args->user_mask_ptr));
2188 }
2189
/*
 * Resource limit pair as copied out to the user by linux_prlimit64():
 * both values are unsigned 64-bit, unlike the native struct rlimit
 * (see the note inside linux_prlimit64()).
 */
struct linux_rlimit64 {
        uint64_t        rlim_cur;       /* current limit */
        uint64_t        rlim_max;       /* maximum limit */
};
2194
2195 int
2196 linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args)
2197 {
2198         struct rlimit rlim, nrlim;
2199         struct linux_rlimit64 lrlim;
2200         struct proc *p;
2201         u_int which;
2202         int flags;
2203         int error;
2204
2205         if (args->new == NULL && args->old != NULL) {
2206                 if (linux_get_dummy_limit(args->resource, &rlim)) {
2207                         lrlim.rlim_cur = rlim.rlim_cur;
2208                         lrlim.rlim_max = rlim.rlim_max;
2209                         return (copyout(&lrlim, args->old, sizeof(lrlim)));
2210                 }
2211         }
2212
2213         if (args->resource >= LINUX_RLIM_NLIMITS)
2214                 return (EINVAL);
2215
2216         which = linux_to_bsd_resource[args->resource];
2217         if (which == -1)
2218                 return (EINVAL);
2219
2220         if (args->new != NULL) {
2221                 /*
2222                  * Note. Unlike FreeBSD where rlim is signed 64-bit Linux
2223                  * rlim is unsigned 64-bit. FreeBSD treats negative limits
2224                  * as INFINITY so we do not need a conversion even.
2225                  */
2226                 error = copyin(args->new, &nrlim, sizeof(nrlim));
2227                 if (error != 0)
2228                         return (error);
2229         }
2230
2231         flags = PGET_HOLD | PGET_NOTWEXIT;
2232         if (args->new != NULL)
2233                 flags |= PGET_CANDEBUG;
2234         else
2235                 flags |= PGET_CANSEE;
2236         if (args->pid == 0) {
2237                 p = td->td_proc;
2238                 PHOLD(p);
2239         } else {
2240                 error = pget(args->pid, flags, &p);
2241                 if (error != 0)
2242                         return (error);
2243         }
2244         if (args->old != NULL) {
2245                 PROC_LOCK(p);
2246                 lim_rlimit_proc(p, which, &rlim);
2247                 PROC_UNLOCK(p);
2248                 if (rlim.rlim_cur == RLIM_INFINITY)
2249                         lrlim.rlim_cur = LINUX_RLIM_INFINITY;
2250                 else
2251                         lrlim.rlim_cur = rlim.rlim_cur;
2252                 if (rlim.rlim_max == RLIM_INFINITY)
2253                         lrlim.rlim_max = LINUX_RLIM_INFINITY;
2254                 else
2255                         lrlim.rlim_max = rlim.rlim_max;
2256                 error = copyout(&lrlim, args->old, sizeof(lrlim));
2257                 if (error != 0)
2258                         goto out;
2259         }
2260
2261         if (args->new != NULL)
2262                 error = kern_proc_setrlimit(td, p, which, &nrlim);
2263
2264  out:
2265         PRELE(p);
2266         return (error);
2267 }
2268
2269 int
2270 linux_pselect6(struct thread *td, struct linux_pselect6_args *args)
2271 {
2272         struct timeval utv, tv0, tv1, *tvp;
2273         struct l_pselect6arg lpse6;
2274         struct l_timespec lts;
2275         struct timespec uts;
2276         l_sigset_t l_ss;
2277         sigset_t *ssp;
2278         sigset_t ss;
2279         int error;
2280
2281         ssp = NULL;
2282         if (args->sig != NULL) {
2283                 error = copyin(args->sig, &lpse6, sizeof(lpse6));
2284                 if (error != 0)
2285                         return (error);
2286                 if (lpse6.ss_len != sizeof(l_ss))
2287                         return (EINVAL);
2288                 if (lpse6.ss != 0) {
2289                         error = copyin(PTRIN(lpse6.ss), &l_ss,
2290                             sizeof(l_ss));
2291                         if (error != 0)
2292                                 return (error);
2293                         linux_to_bsd_sigset(&l_ss, &ss);
2294                         ssp = &ss;
2295                 }
2296         }
2297
2298         /*
2299          * Currently glibc changes nanosecond number to microsecond.
2300          * This mean losing precision but for now it is hardly seen.
2301          */
2302         if (args->tsp != NULL) {
2303                 error = copyin(args->tsp, &lts, sizeof(lts));
2304                 if (error != 0)
2305                         return (error);
2306                 error = linux_to_native_timespec(&uts, &lts);
2307                 if (error != 0)
2308                         return (error);
2309
2310                 TIMESPEC_TO_TIMEVAL(&utv, &uts);
2311                 if (itimerfix(&utv))
2312                         return (EINVAL);
2313
2314                 microtime(&tv0);
2315                 tvp = &utv;
2316         } else
2317                 tvp = NULL;
2318
2319         error = kern_pselect(td, args->nfds, args->readfds, args->writefds,
2320             args->exceptfds, tvp, ssp, LINUX_NFDBITS);
2321
2322         if (error == 0 && args->tsp != NULL) {
2323                 if (td->td_retval[0] != 0) {
2324                         /*
2325                          * Compute how much time was left of the timeout,
2326                          * by subtracting the current time and the time
2327                          * before we started the call, and subtracting
2328                          * that result from the user-supplied value.
2329                          */
2330
2331                         microtime(&tv1);
2332                         timevalsub(&tv1, &tv0);
2333                         timevalsub(&utv, &tv1);
2334                         if (utv.tv_sec < 0)
2335                                 timevalclear(&utv);
2336                 } else
2337                         timevalclear(&utv);
2338
2339                 TIMEVAL_TO_TIMESPEC(&utv, &uts);
2340
2341                 error = native_to_linux_timespec(&lts, &uts);
2342                 if (error == 0)
2343                         error = copyout(&lts, args->tsp, sizeof(lts));
2344         }
2345
2346         return (error);
2347 }
2348
2349 int
2350 linux_ppoll(struct thread *td, struct linux_ppoll_args *args)
2351 {
2352         struct timespec ts0, ts1;
2353         struct l_timespec lts;
2354         struct timespec uts, *tsp;
2355         l_sigset_t l_ss;
2356         sigset_t *ssp;
2357         sigset_t ss;
2358         int error;
2359
2360         if (args->sset != NULL) {
2361                 if (args->ssize != sizeof(l_ss))
2362                         return (EINVAL);
2363                 error = copyin(args->sset, &l_ss, sizeof(l_ss));
2364                 if (error)
2365                         return (error);
2366                 linux_to_bsd_sigset(&l_ss, &ss);
2367                 ssp = &ss;
2368         } else
2369                 ssp = NULL;
2370         if (args->tsp != NULL) {
2371                 error = copyin(args->tsp, &lts, sizeof(lts));
2372                 if (error)
2373                         return (error);
2374                 error = linux_to_native_timespec(&uts, &lts);
2375                 if (error != 0)
2376                         return (error);
2377
2378                 nanotime(&ts0);
2379                 tsp = &uts;
2380         } else
2381                 tsp = NULL;
2382
2383         error = kern_poll(td, args->fds, args->nfds, tsp, ssp);
2384
2385         if (error == 0 && args->tsp != NULL) {
2386                 if (td->td_retval[0]) {
2387                         nanotime(&ts1);
2388                         timespecsub(&ts1, &ts0, &ts1);
2389                         timespecsub(&uts, &ts1, &uts);
2390                         if (uts.tv_sec < 0)
2391                                 timespecclear(&uts);
2392                 } else
2393                         timespecclear(&uts);
2394
2395                 error = native_to_linux_timespec(&lts, &uts);
2396                 if (error == 0)
2397                         error = copyout(&lts, args->tsp, sizeof(lts));
2398         }
2399
2400         return (error);
2401 }
2402
2403 int
2404 linux_sched_rr_get_interval(struct thread *td,
2405     struct linux_sched_rr_get_interval_args *uap)
2406 {
2407         struct timespec ts;
2408         struct l_timespec lts;
2409         struct thread *tdt;
2410         int error;
2411
2412         /*
2413          * According to man in case the invalid pid specified
2414          * EINVAL should be returned.
2415          */
2416         if (uap->pid < 0)
2417                 return (EINVAL);
2418
2419         tdt = linux_tdfind(td, uap->pid, -1);
2420         if (tdt == NULL)
2421                 return (ESRCH);
2422
2423         error = kern_sched_rr_get_interval_td(td, tdt, &ts);
2424         PROC_UNLOCK(tdt->td_proc);
2425         if (error != 0)
2426                 return (error);
2427         error = native_to_linux_timespec(&lts, &ts);
2428         if (error != 0)
2429                 return (error);
2430         return (copyout(&lts, uap->interval, sizeof(lts)));
2431 }
2432
2433 /*
2434  * In case when the Linux thread is the initial thread in
2435  * the thread group thread id is equal to the process id.
2436  * Glibc depends on this magic (assert in pthread_getattr_np.c).
2437  */
2438 struct thread *
2439 linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid)
2440 {
2441         struct linux_emuldata *em;
2442         struct thread *tdt;
2443         struct proc *p;
2444
2445         tdt = NULL;
2446         if (tid == 0 || tid == td->td_tid) {
2447                 tdt = td;
2448                 PROC_LOCK(tdt->td_proc);
2449         } else if (tid > PID_MAX)
2450                 tdt = tdfind(tid, pid);
2451         else {
2452                 /*
2453                  * Initial thread where the tid equal to the pid.
2454                  */
2455                 p = pfind(tid);
2456                 if (p != NULL) {
2457                         if (SV_PROC_ABI(p) != SV_ABI_LINUX) {
2458                                 /*
2459                                  * p is not a Linuxulator process.
2460                                  */
2461                                 PROC_UNLOCK(p);
2462                                 return (NULL);
2463                         }
2464                         FOREACH_THREAD_IN_PROC(p, tdt) {
2465                                 em = em_find(tdt);
2466                                 if (tid == em->em_tid)
2467                                         return (tdt);
2468                         }
2469                         PROC_UNLOCK(p);
2470                 }
2471                 return (NULL);
2472         }
2473
2474         return (tdt);
2475 }
2476
2477 void
2478 linux_to_bsd_waitopts(int options, int *bsdopts)
2479 {
2480
2481         if (options & LINUX_WNOHANG)
2482                 *bsdopts |= WNOHANG;
2483         if (options & LINUX_WUNTRACED)
2484                 *bsdopts |= WUNTRACED;
2485         if (options & LINUX_WEXITED)
2486                 *bsdopts |= WEXITED;
2487         if (options & LINUX_WCONTINUED)
2488                 *bsdopts |= WCONTINUED;
2489         if (options & LINUX_WNOWAIT)
2490                 *bsdopts |= WNOWAIT;
2491
2492         if (options & __WCLONE)
2493                 *bsdopts |= WLINUXCLONE;
2494 }
2495
2496 int
2497 linux_getrandom(struct thread *td, struct linux_getrandom_args *args)
2498 {
2499         struct uio uio;
2500         struct iovec iov;
2501         int error;
2502
2503         if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM))
2504                 return (EINVAL);
2505         if (args->count > INT_MAX)
2506                 args->count = INT_MAX;
2507
2508         iov.iov_base = args->buf;
2509         iov.iov_len = args->count;
2510
2511         uio.uio_iov = &iov;
2512         uio.uio_iovcnt = 1;
2513         uio.uio_resid = iov.iov_len;
2514         uio.uio_segflg = UIO_USERSPACE;
2515         uio.uio_rw = UIO_READ;
2516         uio.uio_td = td;
2517
2518         error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK);
2519         if (error == 0)
2520                 td->td_retval[0] = args->count - uio.uio_resid;
2521         return (error);
2522 }
2523
2524 int
2525 linux_mincore(struct thread *td, struct linux_mincore_args *args)
2526 {
2527
2528         /* Needs to be page-aligned */
2529         if (args->start & PAGE_MASK)
2530                 return (EINVAL);
2531         return (kern_mincore(td, args->start, args->len, args->vec));
2532 }
2533
2534 #define SYSLOG_TAG      "<6>"
2535
2536 int
2537 linux_syslog(struct thread *td, struct linux_syslog_args *args)
2538 {
2539         char buf[128], *src, *dst;
2540         u_int seq;
2541         int buflen, error;
2542
2543         if (args->type != LINUX_SYSLOG_ACTION_READ_ALL) {
2544                 linux_msg(td, "syslog unsupported type 0x%x", args->type);
2545                 return (EINVAL);
2546         }
2547
2548         if (args->len < 6) {
2549                 td->td_retval[0] = 0;
2550                 return (0);
2551         }
2552
2553         error = priv_check(td, PRIV_MSGBUF);
2554         if (error)
2555                 return (error);
2556
2557         mtx_lock(&msgbuf_lock);
2558         msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
2559         mtx_unlock(&msgbuf_lock);
2560
2561         dst = args->buf;
2562         error = copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG));
2563         /* The -1 is to skip the trailing '\0'. */
2564         dst += sizeof(SYSLOG_TAG) - 1;
2565
2566         while (error == 0) {
2567                 mtx_lock(&msgbuf_lock);
2568                 buflen = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
2569                 mtx_unlock(&msgbuf_lock);
2570
2571                 if (buflen == 0)
2572                         break;
2573
2574                 for (src = buf; src < buf + buflen && error == 0; src++) {
2575                         if (*src == '\0')
2576                                 continue;
2577
2578                         if (dst >= args->buf + args->len)
2579                                 goto out;
2580
2581                         error = copyout(src, dst, 1);
2582                         dst++;
2583
2584                         if (*src == '\n' && *(src + 1) != '<' &&
2585                             dst + sizeof(SYSLOG_TAG) < args->buf + args->len) {
2586                                 error = copyout(&SYSLOG_TAG,
2587                                     dst, sizeof(SYSLOG_TAG));
2588                                 dst += sizeof(SYSLOG_TAG) - 1;
2589                         }
2590                 }
2591         }
2592 out:
2593         td->td_retval[0] = dst - args->buf;
2594         return (error);
2595 }
2596
2597 int
2598 linux_getcpu(struct thread *td, struct linux_getcpu_args *args)
2599 {
2600         int cpu, error, node;
2601
2602         cpu = td->td_oncpu; /* Make sure it doesn't change during copyout(9) */
2603         error = 0;
2604         node = cpuid_to_pcpu[cpu]->pc_domain;
2605
2606         if (args->cpu != NULL)
2607                 error = copyout(&cpu, args->cpu, sizeof(l_int));
2608         if (args->node != NULL)
2609                 error = copyout(&node, args->node, sizeof(l_int));
2610         return (error);
2611 }