sys/compat/linux/linux_misc.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 2002 Doug Rabson
   5  * Copyright (c) 1994-1995 Søren Schmidt
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer
  13  *    in this position and unchanged.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  * 3. The name of the author may not be used to endorse or promote products
  18  *    derived from this software without specific prior written permission
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __FBSDID("$FreeBSD$");
  34
  35 #include "opt_compat.h"
  36
  37 #include <sys/param.h>
  38 #include <sys/blist.h>
  39 #include <sys/fcntl.h>
  40 #if defined(__i386__)
  41 #include <sys/imgact_aout.h>
  42 #endif
  43 #include <sys/jail.h>
  44 #include <sys/kernel.h>
  45 #include <sys/limits.h>
  46 #include <sys/lock.h>
  47 #include <sys/malloc.h>
  48 #include <sys/mman.h>
  49 #include <sys/mount.h>
  50 #include <sys/msgbuf.h>
  51 #include <sys/mutex.h>
  52 #include <sys/namei.h>
  53 #include <sys/priv.h>
  54 #include <sys/proc.h>
  55 #include <sys/procctl.h>
  56 #include <sys/reboot.h>
  57 #include <sys/racct.h>
  58 #include <sys/random.h>
  59 #include <sys/resourcevar.h>
  60 #include <sys/sched.h>
  61 #include <sys/sdt.h>
  62 #include <sys/signalvar.h>
  63 #include <sys/stat.h>
  64 #include <sys/syscallsubr.h>
  65 #include <sys/sysctl.h>
  66 #include <sys/sysproto.h>
  67 #include <sys/systm.h>
  68 #include <sys/time.h>
  69 #include <sys/vmmeter.h>
  70 #include <sys/vnode.h>
  71 #include <sys/wait.h>
  72 #include <sys/cpuset.h>
  73 #include <sys/uio.h>
  74
  75 #include <security/mac/mac_framework.h>
  76
  77 #include <vm/vm.h>
  78 #include <vm/pmap.h>
  79 #include <vm/vm_kern.h>
  80 #include <vm/vm_map.h>
  81 #include <vm/vm_extern.h>
  82 #include <vm/vm_object.h>
  83 #include <vm/swap_pager.h>
  84
  85 #ifdef COMPAT_LINUX32
  86 #include <machine/../linux32/linux.h>
  87 #include <machine/../linux32/linux32_proto.h>
  88 #else
  89 #include <machine/../linux/linux.h>
  90 #include <machine/../linux/linux_proto.h>
  91 #endif
  92
  93 #include <compat/linux/linux_dtrace.h>
  94 #include <compat/linux/linux_file.h>
  95 #include <compat/linux/linux_mib.h>
  96 #include <compat/linux/linux_signal.h>
  97 #include <compat/linux/linux_timer.h>
  98 #include <compat/linux/linux_util.h>
  99 #include <compat/linux/linux_sysproto.h>
 100 #include <compat/linux/linux_emul.h>
 101 #include <compat/linux/linux_misc.h>
 102
 103 /**
 104  * Special DTrace provider for the linuxulator.
 105  *
 106  * In this file we define the provider for the entire linuxulator. All
 107  * modules (= files of the linuxulator) use it.
 108  *
 109  * We define a different name depending on the emulated bitsize, see
 110  * ../../<ARCH>/linux{,32}/linux.h, e.g.:
 111  *      native bitsize          = linuxulator
 112  *      amd64, 32bit emulation  = linuxulator32
 113  */
 114 LIN_SDT_PROVIDER_DEFINE(LINUX_DTRACE);
 115
 116 int stclohz;                            /* Statistics clock frequency */
 117
      /*
       * Table of native RLIMIT_* identifiers with LINUX_RLIM_NLIMITS slots.
       * NOTE(review): presumably indexed by the Linux resource number to obtain
       * the matching native resource; the users are not visible in this chunk.
       */
  118 static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
  119         RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
  120         RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
  121         RLIMIT_MEMLOCK, RLIMIT_AS
  122 };
 123
      /*
       * Result buffer filled in by linux_sysinfo() and copied out verbatim to
       * the caller, so member order and sizes are part of the user-visible
       * layout and must not be changed.
       */
  124 struct l_sysinfo {
  125         l_long          uptime;         /* Seconds since boot */
  126         l_ulong         loads[3];       /* 1, 5, and 15 minute load averages */
  127 #define LINUX_SYSINFO_LOADS_SCALE 65536
  128         l_ulong         totalram;       /* Total usable main memory size */
  129         l_ulong         freeram;        /* Available memory size */
  130         l_ulong         sharedram;      /* Amount of shared memory */
  131         l_ulong         bufferram;      /* Memory used by buffers */
  132         l_ulong         totalswap;      /* Total swap space size */
  133         l_ulong         freeswap;       /* swap space still available */
  134         l_ushort        procs;          /* Number of current processes */
  135         l_ushort        pads;
  136         l_ulong         totalbig;
  137         l_ulong         freebig;
  138         l_uint          mem_unit;
  139         char            _f[20-2*sizeof(l_long)-sizeof(l_int)];  /* padding */
  140 };
 141
      /*
       * Pointer/length pair in Linux ABI types.
       * NOTE(review): presumably the signal-set descriptor passed as the last
       * pselect6 argument; its consumer is not visible in this chunk.
       */
  142 struct l_pselect6arg {
  143         l_uintptr_t     ss;
  144         l_size_t        ss_len;
  145 };
 146
      /*
       * Knob (default: enabled) registered under the _compat_linux sysctl
       * node with CTLFLAG_RDTUN; see the description string below for its
       * meaning.
       */
  147 static bool map_sched_prio = true;
  148 SYSCTL_BOOL(_compat_linux, OID_AUTO, map_sched_prio, CTLFLAG_RDTUN,
  149     &map_sched_prio, 0, "Map scheduler priorities to Linux priorities "
  150     "(not POSIX compliant)");
 151
 152 static int      linux_utimensat_nsec_valid(l_long);
 153
 154
 155 int
 156 linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
 157 {
 158         struct l_sysinfo sysinfo;
 159         vm_object_t object;
 160         int i, j;
 161         struct timespec ts;
 162
 163         bzero(&sysinfo, sizeof(sysinfo));
 164         getnanouptime(&ts);
 165         if (ts.tv_nsec != 0)
 166                 ts.tv_sec++;
 167         sysinfo.uptime = ts.tv_sec;
 168
 169         /* Use the information from the mib to get our load averages */
 170         for (i = 0; i < 3; i++)
 171                 sysinfo.loads[i] = averunnable.ldavg[i] *
 172                     LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
 173
 174         sysinfo.totalram = physmem * PAGE_SIZE;
 175         sysinfo.freeram = sysinfo.totalram - vm_wire_count() * PAGE_SIZE;
 176
 177         sysinfo.sharedram = 0;
 178         mtx_lock(&vm_object_list_mtx);
 179         TAILQ_FOREACH(object, &vm_object_list, object_list)
 180                 if (object->shadow_count > 1)
 181                         sysinfo.sharedram += object->resident_page_count;
 182         mtx_unlock(&vm_object_list_mtx);
 183
 184         sysinfo.sharedram *= PAGE_SIZE;
 185         sysinfo.bufferram = 0;
 186
 187         swap_pager_status(&i, &j);
 188         sysinfo.totalswap = i * PAGE_SIZE;
 189         sysinfo.freeswap = (i - j) * PAGE_SIZE;
 190
 191         sysinfo.procs = nprocs;
 192
 193         /* The following are only present in newer Linux kernels. */
 194         sysinfo.totalbig = 0;
 195         sysinfo.freebig = 0;
 196         sysinfo.mem_unit = 1;
 197
 198         return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
 199 }
 200
 201 #ifdef LINUX_LEGACY_SYSCALLS
 202 int
 203 linux_alarm(struct thread *td, struct linux_alarm_args *args)
 204 {
 205         struct itimerval it, old_it;
 206         u_int secs;
 207         int error;
 208
 209         secs = args->secs;
 210         /*
 211          * Linux alarm() is always successful. Limit secs to INT32_MAX / 2
 212          * to match kern_setitimer()'s limit to avoid error from it.
 213          *
 214          * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit
 215          * platforms.
 216          */
 217         if (secs > INT32_MAX / 2)
 218                 secs = INT32_MAX / 2;
 219
 220         it.it_value.tv_sec = secs;
 221         it.it_value.tv_usec = 0;
 222         timevalclear(&it.it_interval);
 223         error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
 224         KASSERT(error == 0, ("kern_setitimer returns %d", error));
 225
 226         if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) ||
 227             old_it.it_value.tv_usec >= 500000)
 228                 old_it.it_value.tv_sec++;
 229         td->td_retval[0] = old_it.it_value.tv_sec;
 230         return (0);
 231 }
 232 #endif
 233
 234 int
 235 linux_brk(struct thread *td, struct linux_brk_args *args)
 236 {
 237         struct vmspace *vm = td->td_proc->p_vmspace;
 238         uintptr_t new, old;
 239
 240         old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize);
 241         new = (uintptr_t)args->dsend;
 242         if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new))
 243                 td->td_retval[0] = (register_t)new;
 244         else
 245                 td->td_retval[0] = (register_t)old;
 246
 247         return (0);
 248 }
 249
 250 #if defined(__i386__)
 251 /* XXX: what about amd64/linux32? */
 252
      /*
       * Load the Linux a.out library named by args->library into the address
       * space of the calling process.
       *
       * The file is looked up with its vnode locked, checked (regular file,
       * execute bits, non-empty, not on a noexec mount, VEXEC access), opened
       * for reading, and its first page is mapped into exec_map so that the
       * a.out header can be inspected.  Text and data are then either read into
       * a fresh anonymous mapping (file offset not page aligned) or mapped
       * privately from the vnode (page-aligned offset); anonymous space is
       * added for the bss.  All placements are at fixed addresses derived from
       * a_entry.
       *
       * Returns 0 on success or an errno value; every exit goes through the
       * cleanup label, which undoes whatever the state flags say is pending.
       */
  253 int
  254 linux_uselib(struct thread *td, struct linux_uselib_args *args)
  255 {
  256         struct nameidata ni;
  257         struct vnode *vp;
  258         struct exec *a_out;
  259         vm_map_t map;
  260         vm_map_entry_t entry;
  261         struct vattr attr;
  262         vm_offset_t vmaddr;
  263         unsigned long file_offset;
  264         unsigned long bss_size;
  265         char *library;
  266         ssize_t aresid;
  267         int error;
  268         bool locked, opened, textset;
  269
              /*
               * Produce a kernel copy of the path in 'library'.
               * NOTE(review): the macro is defined elsewhere; presumably it
               * returns from this function by itself on failure -- confirm.
               */
  270         LCONVPATHEXIST(td, args->library, &library);
  271
              /*
               * Cleanup state: 'locked' = vnode lock held, 'opened' = VOP_OPEN()
               * succeeded, 'textset' = VOP_SET_TEXT() succeeded and still has
               * to be undone, 'a_out' != NULL = header page mapped in exec_map.
               */
  272         a_out = NULL;
  273         vp = NULL;
  274         locked = false;
  275         textset = false;
  276         opened = false;
  277
  278         NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
  279             UIO_SYSSPACE, library, td);
  280         error = namei(&ni);
  281         LFREEPATH(library);
  282         if (error)
  283                 goto cleanup;
  284
  285         vp = ni.ni_vp;
  286         NDFREE(&ni, NDF_ONLY_PNBUF);
  287
  288         /*
  289          * From here on down, we have a locked vnode that must be unlocked.
  290          * XXX: The code below largely duplicates exec_check_permissions().
  291          */
  292         locked = true;
  293
  294         /* Executable? */
  295         error = VOP_GETATTR(vp, &attr, td->td_ucred);
  296         if (error)
  297                 goto cleanup;
  298
  299         if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
  300             ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) {
  301                 /* XXX: exec(2) is said to return EACCES here; this returns ENOEXEC. */
  302                 error = ENOEXEC;
  303                 goto cleanup;
  304         }
  305
  306         /* Sensible size? */
  307         if (attr.va_size == 0) {
  308                 error = ENOEXEC;
  309                 goto cleanup;
  310         }
  311
  312         /* Can we access it? */
  313         error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
  314         if (error)
  315                 goto cleanup;
  316
  317         /*
  318          * XXX: This should use vn_open() so that it is properly authorized,
  319          * and to reduce code redundancy all over the place here.
  320          * XXX: Not really, it duplicates far more of exec_check_permissions()
  321          * than vn_open().
  322          */
  323 #ifdef MAC
  324         error = mac_vnode_check_open(td->td_ucred, vp, VREAD);
  325         if (error)
  326                 goto cleanup;
  327 #endif
  328         error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
  329         if (error)
  330                 goto cleanup;
  331         opened = true;
  332
  333         /* Pull in executable header into exec_map */
  334         error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE,
  335             VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
  336         if (error)
  337                 goto cleanup;
  338
  339         /* Is it a Linux binary ? */
  340         if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
  341                 error = ENOEXEC;
  342                 goto cleanup;
  343         }
  344
  345         /*
  346          * While we are here, we should REALLY do some more checks
  347          */
  348
  349         /* Set file/virtual offset based on a.out variant. */
  350         switch ((int)(a_out->a_magic & 0xffff)) {
  351         case 0413:                      /* ZMAGIC */
  352                 file_offset = 1024;
  353                 break;
  354         case 0314:                      /* QMAGIC */
  355                 file_offset = 0;
  356                 break;
  357         default:
  358                 error = ENOEXEC;
  359                 goto cleanup;
  360         }
  361
  362         bss_size = round_page(a_out->a_bss);
  363
  364         /* Check various fields in header for validity/bounds. */
  365         if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
  366                 error = ENOEXEC;
  367                 goto cleanup;
  368         }
  369
  370         /* text + data can't exceed file size */
  371         if (a_out->a_data + a_out->a_text > attr.va_size) {
  372                 error = EFAULT;
  373                 goto cleanup;
  374         }
  375
  376         /*
  377          * text/data/bss must not exceed limits
  378          * XXX - this is not complete. it should check current usage PLUS
  379          * the resources needed by this library.
  380          */
  381         PROC_LOCK(td->td_proc);
  382         if (a_out->a_text > maxtsiz ||
  383             a_out->a_data + bss_size > lim_cur_proc(td->td_proc, RLIMIT_DATA) ||
  384             racct_set(td->td_proc, RACCT_DATA, a_out->a_data +
  385             bss_size) != 0) {
  386                 PROC_UNLOCK(td->td_proc);
  387                 error = ENOMEM;
  388                 goto cleanup;
  389         }
  390         PROC_UNLOCK(td->td_proc);
  391
  392         /*
  393          * Prevent more writers.
  394          */
  395         error = VOP_SET_TEXT(vp);
  396         if (error != 0)
  397                 goto cleanup;
  398         textset = true;
  399
  400         /*
  401          * Lock no longer needed
  402          */
  403         locked = false;
  404         VOP_UNLOCK(vp);
  405
  406         /*
  407          * Check whether file_offset is page aligned. Currently we cannot
  408          * handle misaligned file offsets, and so we read in the entire image
  409          * (what a waste).
  410          */
  411         if (file_offset & PAGE_MASK) {
  412                 /* Map text+data read/write/execute */
  413
  414                 /* a_entry is the load address and is page aligned */
  415                 vmaddr = trunc_page(a_out->a_entry);
  416
  417                 /* get anon user mapping, read+write+execute */
  418                 error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
  419                     &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE,
  420                     VM_PROT_ALL, VM_PROT_ALL, 0);
  421                 if (error)
  422                         goto cleanup;
  423
                      /* Fill the new user mapping from the file; a short read fails. */
  424                 error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset,
  425                     a_out->a_text + a_out->a_data, UIO_USERSPACE, 0,
  426                     td->td_ucred, NOCRED, &aresid, td);
  427                 if (error != 0)
  428                         goto cleanup;
  429                 if (aresid != 0) {
  430                         error = ENOEXEC;
  431                         goto cleanup;
  432                 }
  433         } else {
  434                 /*
  435                  * for QMAGIC, a_entry is 20 bytes beyond the load address
  436                  * to skip the executable header
  437                  */
  438                 vmaddr = trunc_page(a_out->a_entry);
  439
  440                 /*
  441                  * Map it all into the process's space as a single
  442                  * copy-on-write "data" segment.
  443                  */
  444                 map = &td->td_proc->p_vmspace->vm_map;
  445                 error = vm_mmap(map, &vmaddr,
  446                     a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
  447                     MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
  448                 if (error)
  449                         goto cleanup;
  450                 vm_map_lock(map);
  451                 if (!vm_map_lookup_entry(map, vmaddr, &entry)) {
  452                         vm_map_unlock(map);
  453                         error = EDOOFUS;
  454                         goto cleanup;
  455                 }
  456                 entry->eflags |= MAP_ENTRY_VN_EXEC;
  457                 vm_map_unlock(map);
                      /*
                       * NOTE(review): presumably the map entry flagged above now
                       * accounts for the text reference, so cleanup must not
                       * drop it -- confirm against the vm_map code.
                       */
  458                 textset = false;
  459         }
  460
  461         if (bss_size != 0) {
  462                 /* Calculate BSS start address */
  463                 vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
  464                     a_out->a_data;
  465
  466                 /* allocate some 'anon' space */
  467                 error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
  468                     &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL,
  469                     VM_PROT_ALL, 0);
  470                 if (error)
  471                         goto cleanup;
  472         }
  473
              /*
               * Common exit for success and failure.  The vnode is unlocked
               * before VOP_CLOSE() and, if a text reference is still pending,
               * relocked (shared) to drop it.
               */
  474 cleanup:
  475         if (opened) {
  476                 if (locked)
  477                         VOP_UNLOCK(vp);
  478                 locked = false;
  479                 VOP_CLOSE(vp, FREAD, td->td_ucred, td);
  480         }
  481         if (textset) {
  482                 if (!locked) {
  483                         locked = true;
  484                         VOP_LOCK(vp, LK_SHARED | LK_RETRY);
  485                 }
  486                 VOP_UNSET_TEXT_CHECKED(vp);
  487         }
  488         if (locked)
  489                 VOP_UNLOCK(vp);
  490
  491         /* Release the temporary mapping. */
  492         if (a_out)
  493                 kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE);
  494
  495         return (error);
  496 }
 497
 498 #endif  /* __i386__ */
 499
 500 #ifdef LINUX_LEGACY_SYSCALLS
 501 int
 502 linux_select(struct thread *td, struct linux_select_args *args)
 503 {
 504         l_timeval ltv;
 505         struct timeval tv0, tv1, utv, *tvp;
 506         int error;
 507
 508         /*
 509          * Store current time for computation of the amount of
 510          * time left.
 511          */
 512         if (args->timeout) {
 513                 if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
 514                         goto select_out;
 515                 utv.tv_sec = ltv.tv_sec;
 516                 utv.tv_usec = ltv.tv_usec;
 517
 518                 if (itimerfix(&utv)) {
 519                         /*
 520                          * The timeval was invalid.  Convert it to something
 521                          * valid that will act as it does under Linux.
 522                          */
 523                         utv.tv_sec += utv.tv_usec / 1000000;
 524                         utv.tv_usec %= 1000000;
 525                         if (utv.tv_usec < 0) {
 526                                 utv.tv_sec -= 1;
 527                                 utv.tv_usec += 1000000;
 528                         }
 529                         if (utv.tv_sec < 0)
 530                                 timevalclear(&utv);
 531                 }
 532                 microtime(&tv0);
 533                 tvp = &utv;
 534         } else
 535                 tvp = NULL;
 536
 537         error = kern_select(td, args->nfds, args->readfds, args->writefds,
 538             args->exceptfds, tvp, LINUX_NFDBITS);
 539         if (error)
 540                 goto select_out;
 541
 542         if (args->timeout) {
 543                 if (td->td_retval[0]) {
 544                         /*
 545                          * Compute how much time was left of the timeout,
 546                          * by subtracting the current time and the time
 547                          * before we started the call, and subtracting
 548                          * that result from the user-supplied value.
 549                          */
 550                         microtime(&tv1);
 551                         timevalsub(&tv1, &tv0);
 552                         timevalsub(&utv, &tv1);
 553                         if (utv.tv_sec < 0)
 554                                 timevalclear(&utv);
 555                 } else
 556                         timevalclear(&utv);
 557                 ltv.tv_sec = utv.tv_sec;
 558                 ltv.tv_usec = utv.tv_usec;
 559                 if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
 560                         goto select_out;
 561         }
 562
 563 select_out:
 564         return (error);
 565 }
 566 #endif
 567
 568 int
 569 linux_mremap(struct thread *td, struct linux_mremap_args *args)
 570 {
 571         uintptr_t addr;
 572         size_t len;
 573         int error = 0;
 574
 575         if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
 576                 td->td_retval[0] = 0;
 577                 return (EINVAL);
 578         }
 579
 580         /*
 581          * Check for the page alignment.
 582          * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
 583          */
 584         if (args->addr & PAGE_MASK) {
 585                 td->td_retval[0] = 0;
 586                 return (EINVAL);
 587         }
 588
 589         args->new_len = round_page(args->new_len);
 590         args->old_len = round_page(args->old_len);
 591
 592         if (args->new_len > args->old_len) {
 593                 td->td_retval[0] = 0;
 594                 return (ENOMEM);
 595         }
 596
 597         if (args->new_len < args->old_len) {
 598                 addr = args->addr + args->new_len;
 599                 len = args->old_len - args->new_len;
 600                 error = kern_munmap(td, addr, len);
 601         }
 602
 603         td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
 604         return (error);
 605 }
 606
 607 #define LINUX_MS_ASYNC       0x0001
 608 #define LINUX_MS_INVALIDATE  0x0002
 609 #define LINUX_MS_SYNC        0x0004
 610
 611 int
 612 linux_msync(struct thread *td, struct linux_msync_args *args)
 613 {
 614
 615         return (kern_msync(td, args->addr, args->len,
 616             args->fl & ~LINUX_MS_SYNC));
 617 }
 618
 619 #ifdef LINUX_LEGACY_SYSCALLS
 620 int
 621 linux_time(struct thread *td, struct linux_time_args *args)
 622 {
 623         struct timeval tv;
 624         l_time_t tm;
 625         int error;
 626
 627         microtime(&tv);
 628         tm = tv.tv_sec;
 629         if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
 630                 return (error);
 631         td->td_retval[0] = tm;
 632         return (0);
 633 }
 634 #endif
 635
      /*
       * Buffer layout copied out to the caller by linux_times(); each member
       * holds a tick count produced by CONVTCK().
       */
  636 struct l_times_argv {
  637         l_clock_t       tms_utime;
  638         l_clock_t       tms_stime;
  639         l_clock_t       tms_cutime;
  640         l_clock_t       tms_cstime;
  641 };
 642
 643
 644 /*
 645  * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value.
 646  * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK
 647  * auxiliary vector entry.
 648  */
 649 #define CLK_TCK         100
 650
 651 #define CONVOTCK(r)     (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK))
 652 #define CONVNTCK(r)     (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz))
 653
 654 #define CONVTCK(r)      (linux_kernver(td) >= LINUX_KERNVER_2004000 ?           \
 655                             CONVNTCK(r) : CONVOTCK(r))
 656
 657 int
 658 linux_times(struct thread *td, struct linux_times_args *args)
 659 {
 660         struct timeval tv, utime, stime, cutime, cstime;
 661         struct l_times_argv tms;
 662         struct proc *p;
 663         int error;
 664
 665         if (args->buf != NULL) {
 666                 p = td->td_proc;
 667                 PROC_LOCK(p);
 668                 PROC_STATLOCK(p);
 669                 calcru(p, &utime, &stime);
 670                 PROC_STATUNLOCK(p);
 671                 calccru(p, &cutime, &cstime);
 672                 PROC_UNLOCK(p);
 673
 674                 tms.tms_utime = CONVTCK(utime);
 675                 tms.tms_stime = CONVTCK(stime);
 676
 677                 tms.tms_cutime = CONVTCK(cutime);
 678                 tms.tms_cstime = CONVTCK(cstime);
 679
 680                 if ((error = copyout(&tms, args->buf, sizeof(tms))))
 681                         return (error);
 682         }
 683
 684         microuptime(&tv);
 685         td->td_retval[0] = (int)CONVTCK(tv);
 686         return (0);
 687 }
 688
 689 int
 690 linux_newuname(struct thread *td, struct linux_newuname_args *args)
 691 {
 692         struct l_new_utsname utsname;
 693         char osname[LINUX_MAX_UTSNAME];
 694         char osrelease[LINUX_MAX_UTSNAME];
 695         char *p;
 696
 697         linux_get_osname(td, osname);
 698         linux_get_osrelease(td, osrelease);
 699
 700         bzero(&utsname, sizeof(utsname));
 701         strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
 702         getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
 703         getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
 704         strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
 705         strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
 706         for (p = utsname.version; *p != '\0'; ++p)
 707                 if (*p == '\n') {
 708                         *p = '\0';
 709                         break;
 710                 }
 711         strlcpy(utsname.machine, linux_kplatform, LINUX_MAX_UTSNAME);
 712
 713         return (copyout(&utsname, args->buf, sizeof(utsname)));
 714 }
 715
      /*
       * Access/modification time pair, in seconds, as read from user space by
       * linux_utime().
       */
  716 struct l_utimbuf {
  717         l_time_t l_actime;
  718         l_time_t l_modtime;
  719 };
 720
 721 #ifdef LINUX_LEGACY_SYSCALLS
 722 int
 723 linux_utime(struct thread *td, struct linux_utime_args *args)
 724 {
 725         struct timeval tv[2], *tvp;
 726         struct l_utimbuf lut;
 727         char *fname;
 728         int error;
 729
 730         LCONVPATHEXIST(td, args->fname, &fname);
 731
 732         if (args->times) {
 733                 if ((error = copyin(args->times, &lut, sizeof lut))) {
 734                         LFREEPATH(fname);
 735                         return (error);
 736                 }
 737                 tv[0].tv_sec = lut.l_actime;
 738                 tv[0].tv_usec = 0;
 739                 tv[1].tv_sec = lut.l_modtime;
 740                 tv[1].tv_usec = 0;
 741                 tvp = tv;
 742         } else
 743                 tvp = NULL;
 744
 745         error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp,
 746             UIO_SYSSPACE);
 747         LFREEPATH(fname);
 748         return (error);
 749 }
 750 #endif
 751
 752 #ifdef LINUX_LEGACY_SYSCALLS
 753 int
 754 linux_utimes(struct thread *td, struct linux_utimes_args *args)
 755 {
 756         l_timeval ltv[2];
 757         struct timeval tv[2], *tvp = NULL;
 758         char *fname;
 759         int error;
 760
 761         LCONVPATHEXIST(td, args->fname, &fname);
 762
 763         if (args->tptr != NULL) {
 764                 if ((error = copyin(args->tptr, ltv, sizeof ltv))) {
 765                         LFREEPATH(fname);
 766                         return (error);
 767                 }
 768                 tv[0].tv_sec = ltv[0].tv_sec;
 769                 tv[0].tv_usec = ltv[0].tv_usec;
 770                 tv[1].tv_sec = ltv[1].tv_sec;
 771                 tv[1].tv_usec = ltv[1].tv_usec;
 772                 tvp = tv;
 773         }
 774
 775         error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE,
 776             tvp, UIO_SYSSPACE);
 777         LFREEPATH(fname);
 778         return (error);
 779 }
 780 #endif
 781
 782 static int
 783 linux_utimensat_nsec_valid(l_long nsec)
 784 {
 785
 786         if (nsec == LINUX_UTIME_OMIT || nsec == LINUX_UTIME_NOW)
 787                 return (0);
 788         if (nsec >= 0 && nsec <= 999999999)
 789                 return (0);
 790         return (1);
 791 }
 792
 793 int
 794 linux_utimensat(struct thread *td, struct linux_utimensat_args *args)
 795 {
 796         struct l_timespec l_times[2];
 797         struct timespec times[2], *timesp = NULL;
 798         char *path = NULL;
 799         int error, dfd, flags = 0;
 800
 801         dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 802
 803         if (args->flags & ~LINUX_AT_SYMLINK_NOFOLLOW)
 804                 return (EINVAL);
 805
 806         if (args->times != NULL) {
 807                 error = copyin(args->times, l_times, sizeof(l_times));
 808                 if (error != 0)
 809                         return (error);
 810
 811                 if (linux_utimensat_nsec_valid(l_times[0].tv_nsec) != 0 ||
 812                     linux_utimensat_nsec_valid(l_times[1].tv_nsec) != 0)
 813                         return (EINVAL);
 814
 815                 times[0].tv_sec = l_times[0].tv_sec;
 816                 switch (l_times[0].tv_nsec)
 817                 {
 818                 case LINUX_UTIME_OMIT:
 819                         times[0].tv_nsec = UTIME_OMIT;
 820                         break;
 821                 case LINUX_UTIME_NOW:
 822                         times[0].tv_nsec = UTIME_NOW;
 823                         break;
 824                 default:
 825                         times[0].tv_nsec = l_times[0].tv_nsec;
 826                 }
 827
 828                 times[1].tv_sec = l_times[1].tv_sec;
 829                 switch (l_times[1].tv_nsec)
 830                 {
 831                 case LINUX_UTIME_OMIT:
 832                         times[1].tv_nsec = UTIME_OMIT;
 833                         break;
 834                 case LINUX_UTIME_NOW:
 835                         times[1].tv_nsec = UTIME_NOW;
 836                         break;
 837                 default:
 838                         times[1].tv_nsec = l_times[1].tv_nsec;
 839                         break;
 840                 }
 841                 timesp = times;
 842
 843                 /* This breaks POSIX, but is what the Linux kernel does
 844                  * _on purpose_ (documented in the man page for utimensat(2)),
 845                  * so we must follow that behaviour. */
 846                 if (times[0].tv_nsec == UTIME_OMIT &&
 847                     times[1].tv_nsec == UTIME_OMIT)
 848                         return (0);
 849         }
 850
 851         if (args->pathname != NULL)
 852                 LCONVPATHEXIST_AT(td, args->pathname, &path, dfd);
 853         else if (args->flags != 0)
 854                 return (EINVAL);
 855
 856         if (args->flags & LINUX_AT_SYMLINK_NOFOLLOW)
 857                 flags |= AT_SYMLINK_NOFOLLOW;
 858
 859         if (path == NULL)
 860                 error = kern_futimens(td, dfd, timesp, UIO_SYSSPACE);
 861         else {
 862                 error = kern_utimensat(td, dfd, path, UIO_SYSSPACE, timesp,
 863                         UIO_SYSSPACE, flags);
 864                 LFREEPATH(path);
 865         }
 866
 867         return (error);
 868 }
 869
 870 #ifdef LINUX_LEGACY_SYSCALLS
 871 int
 872 linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
 873 {
 874         l_timeval ltv[2];
 875         struct timeval tv[2], *tvp = NULL;
 876         char *fname;
 877         int error, dfd;
 878
 879         dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 880         LCONVPATHEXIST_AT(td, args->filename, &fname, dfd);
 881
 882         if (args->utimes != NULL) {
 883                 if ((error = copyin(args->utimes, ltv, sizeof ltv))) {
 884                         LFREEPATH(fname);
 885                         return (error);
 886                 }
 887                 tv[0].tv_sec = ltv[0].tv_sec;
 888                 tv[0].tv_usec = ltv[0].tv_usec;
 889                 tv[1].tv_sec = ltv[1].tv_sec;
 890                 tv[1].tv_usec = ltv[1].tv_usec;
 891                 tvp = tv;
 892         }
 893
 894         error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
 895         LFREEPATH(fname);
 896         return (error);
 897 }
 898 #endif
 899
 900 static int
 901 linux_common_wait(struct thread *td, int pid, int *statusp,
 902     int options, struct __wrusage *wrup)
 903 {
 904         siginfo_t siginfo;
 905         idtype_t idtype;
 906         id_t id;
 907         int error, status, tmpstat;
 908
 909         if (pid == WAIT_ANY) {
 910                 idtype = P_ALL;
 911                 id = 0;
 912         } else if (pid < 0) {
 913                 idtype = P_PGID;
 914                 id = (id_t)-pid;
 915         } else {
 916                 idtype = P_PID;
 917                 id = (id_t)pid;
 918         }
 919
 920         /*
 921          * For backward compatibility we implicitly add flags WEXITED
 922          * and WTRAPPED here.
 923          */
 924         options |= WEXITED | WTRAPPED;
 925         error = kern_wait6(td, idtype, id, &status, options, wrup, &siginfo);
 926         if (error)
 927                 return (error);
 928
 929         if (statusp) {
 930                 tmpstat = status & 0xffff;
 931                 if (WIFSIGNALED(tmpstat)) {
 932                         tmpstat = (tmpstat & 0xffffff80) |
 933                             bsd_to_linux_signal(WTERMSIG(tmpstat));
 934                 } else if (WIFSTOPPED(tmpstat)) {
 935                         tmpstat = (tmpstat & 0xffff00ff) |
 936                             (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8);
 937 #if defined(__amd64__) && !defined(COMPAT_LINUX32)
 938                         if (WSTOPSIG(status) == SIGTRAP) {
 939                                 tmpstat = linux_ptrace_status(td,
 940                                     siginfo.si_pid, tmpstat);
 941                         }
 942 #endif
 943                 } else if (WIFCONTINUED(tmpstat)) {
 944                         tmpstat = 0xffff;
 945                 }
 946                 error = copyout(&tmpstat, statusp, sizeof(int));
 947         }
 948
 949         return (error);
 950 }
 951
 952 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 953 int
 954 linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
 955 {
 956         struct linux_wait4_args wait4_args;
 957
 958         wait4_args.pid = args->pid;
 959         wait4_args.status = args->status;
 960         wait4_args.options = args->options;
 961         wait4_args.rusage = NULL;
 962
 963         return (linux_wait4(td, &wait4_args));
 964 }
 965 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 966
 967 int
 968 linux_wait4(struct thread *td, struct linux_wait4_args *args)
 969 {
 970         int error, options;
 971         struct __wrusage wru, *wrup;
 972
 973         if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG |
 974             LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
 975                 return (EINVAL);
 976
 977         options = WEXITED;
 978         linux_to_bsd_waitopts(args->options, &options);
 979
 980         if (args->rusage != NULL)
 981                 wrup = &wru;
 982         else
 983                 wrup = NULL;
 984         error = linux_common_wait(td, args->pid, args->status, options, wrup);
 985         if (error != 0)
 986                 return (error);
 987         if (args->rusage != NULL)
 988                 error = linux_copyout_rusage(&wru.wru_self, args->rusage);
 989         return (error);
 990 }
 991
 992 int
 993 linux_waitid(struct thread *td, struct linux_waitid_args *args)
 994 {
 995         int status, options, sig;
 996         struct __wrusage wru;
 997         siginfo_t siginfo;
 998         l_siginfo_t lsi;
 999         idtype_t idtype;
1000         struct proc *p;
1001         int error;
1002
1003         options = 0;
1004         linux_to_bsd_waitopts(args->options, &options);
1005
1006         if (options & ~(WNOHANG | WNOWAIT | WEXITED | WUNTRACED | WCONTINUED))
1007                 return (EINVAL);
1008         if (!(options & (WEXITED | WUNTRACED | WCONTINUED)))
1009                 return (EINVAL);
1010
1011         switch (args->idtype) {
1012         case LINUX_P_ALL:
1013                 idtype = P_ALL;
1014                 break;
1015         case LINUX_P_PID:
1016                 if (args->id <= 0)
1017                         return (EINVAL);
1018                 idtype = P_PID;
1019                 break;
1020         case LINUX_P_PGID:
1021                 if (args->id <= 0)
1022                         return (EINVAL);
1023                 idtype = P_PGID;
1024                 break;
1025         default:
1026                 return (EINVAL);
1027         }
1028
1029         error = kern_wait6(td, idtype, args->id, &status, options,
1030             &wru, &siginfo);
1031         if (error != 0)
1032                 return (error);
1033         if (args->rusage != NULL) {
1034                 error = linux_copyout_rusage(&wru.wru_children,
1035                     args->rusage);
1036                 if (error != 0)
1037                         return (error);
1038         }
1039         if (args->info != NULL) {
1040                 p = td->td_proc;
1041                 bzero(&lsi, sizeof(lsi));
1042                 if (td->td_retval[0] != 0) {
1043                         sig = bsd_to_linux_signal(siginfo.si_signo);
1044                         siginfo_to_lsiginfo(&siginfo, &lsi, sig);
1045                 }
1046                 error = copyout(&lsi, args->info, sizeof(lsi));
1047         }
1048         td->td_retval[0] = 0;
1049
1050         return (error);
1051 }
1052
1053 #ifdef LINUX_LEGACY_SYSCALLS
1054 int
1055 linux_mknod(struct thread *td, struct linux_mknod_args *args)
1056 {
1057         char *path;
1058         int error;
1059
1060         LCONVPATHCREAT(td, args->path, &path);
1061
1062         switch (args->mode & S_IFMT) {
1063         case S_IFIFO:
1064         case S_IFSOCK:
1065                 error = kern_mkfifoat(td, AT_FDCWD, path, UIO_SYSSPACE,
1066                     args->mode);
1067                 break;
1068
1069         case S_IFCHR:
1070         case S_IFBLK:
1071                 error = kern_mknodat(td, AT_FDCWD, path, UIO_SYSSPACE,
1072                     args->mode, args->dev);
1073                 break;
1074
1075         case S_IFDIR:
1076                 error = EPERM;
1077                 break;
1078
1079         case 0:
1080                 args->mode |= S_IFREG;
1081                 /* FALLTHROUGH */
1082         case S_IFREG:
1083                 error = kern_openat(td, AT_FDCWD, path, UIO_SYSSPACE,
1084                     O_WRONLY | O_CREAT | O_TRUNC, args->mode);
1085                 if (error == 0)
1086                         kern_close(td, td->td_retval[0]);
1087                 break;
1088
1089         default:
1090                 error = EINVAL;
1091                 break;
1092         }
1093         LFREEPATH(path);
1094         return (error);
1095 }
1096 #endif
1097
1098 int
1099 linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
1100 {
1101         char *path;
1102         int error, dfd;
1103
1104         dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
1105         LCONVPATHCREAT_AT(td, args->filename, &path, dfd);
1106
1107         switch (args->mode & S_IFMT) {
1108         case S_IFIFO:
1109         case S_IFSOCK:
1110                 error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode);
1111                 break;
1112
1113         case S_IFCHR:
1114         case S_IFBLK:
1115                 error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode,
1116                     args->dev);
1117                 break;
1118
1119         case S_IFDIR:
1120                 error = EPERM;
1121                 break;
1122
1123         case 0:
1124                 args->mode |= S_IFREG;
1125                 /* FALLTHROUGH */
1126         case S_IFREG:
1127                 error = kern_openat(td, dfd, path, UIO_SYSSPACE,
1128                     O_WRONLY | O_CREAT | O_TRUNC, args->mode);
1129                 if (error == 0)
1130                         kern_close(td, td->td_retval[0]);
1131                 break;
1132
1133         default:
1134                 error = EINVAL;
1135                 break;
1136         }
1137         LFREEPATH(path);
1138         return (error);
1139 }
1140
1141 /*
1142  * UGH! This is just about the dumbest idea I've ever heard!!
1143  */
1144 int
1145 linux_personality(struct thread *td, struct linux_personality_args *args)
1146 {
1147         struct linux_pemuldata *pem;
1148         struct proc *p = td->td_proc;
1149         uint32_t old;
1150
1151         PROC_LOCK(p);
1152         pem = pem_find(p);
1153         old = pem->persona;
1154         if (args->per != 0xffffffff)
1155                 pem->persona = args->per;
1156         PROC_UNLOCK(p);
1157
1158         td->td_retval[0] = old;
1159         return (0);
1160 }
1161
1162 struct l_itimerval {
1163         l_timeval it_interval;
1164         l_timeval it_value;
1165 };
1166
/*
 * Copy an interval timer value member by member from *lip (source) to
 * *bip (destination).  The two may be of different structure types as
 * long as both provide it_interval and it_value with tv_sec/tv_usec, so
 * the macro serves for both conversion directions.
 *
 * Wrapped in do { } while (0) so that an invocation followed by a
 * semicolon is exactly one statement, e.g. in an unbraced if/else.
 */
#define	B2L_ITIMERVAL(bip, lip) do {					\
	(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec;		\
	(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec;	\
	(bip)->it_value.tv_sec = (lip)->it_value.tv_sec;		\
	(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;		\
} while (0)
1172
1173 int
1174 linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
1175 {
1176         int error;
1177         struct l_itimerval ls;
1178         struct itimerval aitv, oitv;
1179
1180         if (uap->itv == NULL) {
1181                 uap->itv = uap->oitv;
1182                 return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
1183         }
1184
1185         error = copyin(uap->itv, &ls, sizeof(ls));
1186         if (error != 0)
1187                 return (error);
1188         B2L_ITIMERVAL(&aitv, &ls);
1189         error = kern_setitimer(td, uap->which, &aitv, &oitv);
1190         if (error != 0 || uap->oitv == NULL)
1191                 return (error);
1192         B2L_ITIMERVAL(&ls, &oitv);
1193
1194         return (copyout(&ls, uap->oitv, sizeof(ls)));
1195 }
1196
1197 int
1198 linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
1199 {
1200         int error;
1201         struct l_itimerval ls;
1202         struct itimerval aitv;
1203
1204         error = kern_getitimer(td, uap->which, &aitv);
1205         if (error != 0)
1206                 return (error);
1207         B2L_ITIMERVAL(&ls, &aitv);
1208         return (copyout(&ls, uap->itv, sizeof(ls)));
1209 }
1210
1211 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1212 int
1213 linux_nice(struct thread *td, struct linux_nice_args *args)
1214 {
1215
1216         return (kern_setpriority(td, PRIO_PROCESS, 0, args->inc));
1217 }
1218 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1219
1220 int
1221 linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
1222 {
1223         struct ucred *newcred, *oldcred;
1224         l_gid_t *linux_gidset;
1225         gid_t *bsd_gidset;
1226         int ngrp, error;
1227         struct proc *p;
1228
1229         ngrp = args->gidsetsize;
1230         if (ngrp < 0 || ngrp >= ngroups_max + 1)
1231                 return (EINVAL);
1232         linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK);
1233         error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
1234         if (error)
1235                 goto out;
1236         newcred = crget();
1237         crextend(newcred, ngrp + 1);
1238         p = td->td_proc;
1239         PROC_LOCK(p);
1240         oldcred = p->p_ucred;
1241         crcopy(newcred, oldcred);
1242
1243         /*
1244          * cr_groups[0] holds egid. Setting the whole set from
1245          * the supplied set will cause egid to be changed too.
1246          * Keep cr_groups[0] unchanged to prevent that.
1247          */
1248
1249         if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS)) != 0) {
1250                 PROC_UNLOCK(p);
1251                 crfree(newcred);
1252                 goto out;
1253         }
1254
1255         if (ngrp > 0) {
1256                 newcred->cr_ngroups = ngrp + 1;
1257
1258                 bsd_gidset = newcred->cr_groups;
1259                 ngrp--;
1260                 while (ngrp >= 0) {
1261                         bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
1262                         ngrp--;
1263                 }
1264         } else
1265                 newcred->cr_ngroups = 1;
1266
1267         setsugid(p);
1268         proc_set_cred(p, newcred);
1269         PROC_UNLOCK(p);
1270         crfree(oldcred);
1271         error = 0;
1272 out:
1273         free(linux_gidset, M_LINUX);
1274         return (error);
1275 }
1276
1277 int
1278 linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
1279 {
1280         struct ucred *cred;
1281         l_gid_t *linux_gidset;
1282         gid_t *bsd_gidset;
1283         int bsd_gidsetsz, ngrp, error;
1284
1285         cred = td->td_ucred;
1286         bsd_gidset = cred->cr_groups;
1287         bsd_gidsetsz = cred->cr_ngroups - 1;
1288
1289         /*
1290          * cr_groups[0] holds egid. Returning the whole set
1291          * here will cause a duplicate. Exclude cr_groups[0]
1292          * to prevent that.
1293          */
1294
1295         if ((ngrp = args->gidsetsize) == 0) {
1296                 td->td_retval[0] = bsd_gidsetsz;
1297                 return (0);
1298         }
1299
1300         if (ngrp < bsd_gidsetsz)
1301                 return (EINVAL);
1302
1303         ngrp = 0;
1304         linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
1305             M_LINUX, M_WAITOK);
1306         while (ngrp < bsd_gidsetsz) {
1307                 linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
1308                 ngrp++;
1309         }
1310
1311         error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
1312         free(linux_gidset, M_LINUX);
1313         if (error)
1314                 return (error);
1315
1316         td->td_retval[0] = ngrp;
1317         return (0);
1318 }
1319
1320 int
1321 linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
1322 {
1323         struct rlimit bsd_rlim;
1324         struct l_rlimit rlim;
1325         u_int which;
1326         int error;
1327
1328         if (args->resource >= LINUX_RLIM_NLIMITS)
1329                 return (EINVAL);
1330
1331         which = linux_to_bsd_resource[args->resource];
1332         if (which == -1)
1333                 return (EINVAL);
1334
1335         error = copyin(args->rlim, &rlim, sizeof(rlim));
1336         if (error)
1337                 return (error);
1338
1339         bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
1340         bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
1341         return (kern_setrlimit(td, which, &bsd_rlim));
1342 }
1343
1344 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1345 int
1346 linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
1347 {
1348         struct l_rlimit rlim;
1349         struct rlimit bsd_rlim;
1350         u_int which;
1351
1352         if (args->resource >= LINUX_RLIM_NLIMITS)
1353                 return (EINVAL);
1354
1355         which = linux_to_bsd_resource[args->resource];
1356         if (which == -1)
1357                 return (EINVAL);
1358
1359         lim_rlimit(td, which, &bsd_rlim);
1360
1361 #ifdef COMPAT_LINUX32
1362         rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
1363         if (rlim.rlim_cur == UINT_MAX)
1364                 rlim.rlim_cur = INT_MAX;
1365         rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
1366         if (rlim.rlim_max == UINT_MAX)
1367                 rlim.rlim_max = INT_MAX;
1368 #else
1369         rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
1370         if (rlim.rlim_cur == ULONG_MAX)
1371                 rlim.rlim_cur = LONG_MAX;
1372         rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
1373         if (rlim.rlim_max == ULONG_MAX)
1374                 rlim.rlim_max = LONG_MAX;
1375 #endif
1376         return (copyout(&rlim, args->rlim, sizeof(rlim)));
1377 }
1378 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1379
1380 int
1381 linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
1382 {
1383         struct l_rlimit rlim;
1384         struct rlimit bsd_rlim;
1385         u_int which;
1386
1387         if (args->resource >= LINUX_RLIM_NLIMITS)
1388                 return (EINVAL);
1389
1390         which = linux_to_bsd_resource[args->resource];
1391         if (which == -1)
1392                 return (EINVAL);
1393
1394         lim_rlimit(td, which, &bsd_rlim);
1395
1396         rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
1397         rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
1398         return (copyout(&rlim, args->rlim, sizeof(rlim)));
1399 }
1400
1401 int
1402 linux_sched_setscheduler(struct thread *td,
1403     struct linux_sched_setscheduler_args *args)
1404 {
1405         struct sched_param sched_param;
1406         struct thread *tdt;
1407         int error, policy;
1408
1409         switch (args->policy) {
1410         case LINUX_SCHED_OTHER:
1411                 policy = SCHED_OTHER;
1412                 break;
1413         case LINUX_SCHED_FIFO:
1414                 policy = SCHED_FIFO;
1415                 break;
1416         case LINUX_SCHED_RR:
1417                 policy = SCHED_RR;
1418                 break;
1419         default:
1420                 return (EINVAL);
1421         }
1422
1423         error = copyin(args->param, &sched_param, sizeof(sched_param));
1424         if (error)
1425                 return (error);
1426
1427         if (map_sched_prio) {
1428                 switch (policy) {
1429                 case SCHED_OTHER:
1430                         if (sched_param.sched_priority != 0)
1431                                 return (EINVAL);
1432
1433                         sched_param.sched_priority =
1434                             PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
1435                         break;
1436                 case SCHED_FIFO:
1437                 case SCHED_RR:
1438                         if (sched_param.sched_priority < 1 ||
1439                             sched_param.sched_priority >= LINUX_MAX_RT_PRIO)
1440                                 return (EINVAL);
1441
1442                         /*
1443                          * Map [1, LINUX_MAX_RT_PRIO - 1] to
1444                          * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
1445                          */
1446                         sched_param.sched_priority =
1447                             (sched_param.sched_priority - 1) *
1448                             (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
1449                             (LINUX_MAX_RT_PRIO - 1);
1450                         break;
1451                 }
1452         }
1453
1454         tdt = linux_tdfind(td, args->pid, -1);
1455         if (tdt == NULL)
1456                 return (ESRCH);
1457
1458         error = kern_sched_setscheduler(td, tdt, policy, &sched_param);
1459         PROC_UNLOCK(tdt->td_proc);
1460         return (error);
1461 }
1462
1463 int
1464 linux_sched_getscheduler(struct thread *td,
1465     struct linux_sched_getscheduler_args *args)
1466 {
1467         struct thread *tdt;
1468         int error, policy;
1469
1470         tdt = linux_tdfind(td, args->pid, -1);
1471         if (tdt == NULL)
1472                 return (ESRCH);
1473
1474         error = kern_sched_getscheduler(td, tdt, &policy);
1475         PROC_UNLOCK(tdt->td_proc);
1476
1477         switch (policy) {
1478         case SCHED_OTHER:
1479                 td->td_retval[0] = LINUX_SCHED_OTHER;
1480                 break;
1481         case SCHED_FIFO:
1482                 td->td_retval[0] = LINUX_SCHED_FIFO;
1483                 break;
1484         case SCHED_RR:
1485                 td->td_retval[0] = LINUX_SCHED_RR;
1486                 break;
1487         }
1488         return (error);
1489 }
1490
1491 int
1492 linux_sched_get_priority_max(struct thread *td,
1493     struct linux_sched_get_priority_max_args *args)
1494 {
1495         struct sched_get_priority_max_args bsd;
1496
1497         if (map_sched_prio) {
1498                 switch (args->policy) {
1499                 case LINUX_SCHED_OTHER:
1500                         td->td_retval[0] = 0;
1501                         return (0);
1502                 case LINUX_SCHED_FIFO:
1503                 case LINUX_SCHED_RR:
1504                         td->td_retval[0] = LINUX_MAX_RT_PRIO - 1;
1505                         return (0);
1506                 default:
1507                         return (EINVAL);
1508                 }
1509         }
1510
1511         switch (args->policy) {
1512         case LINUX_SCHED_OTHER:
1513                 bsd.policy = SCHED_OTHER;
1514                 break;
1515         case LINUX_SCHED_FIFO:
1516                 bsd.policy = SCHED_FIFO;
1517                 break;
1518         case LINUX_SCHED_RR:
1519                 bsd.policy = SCHED_RR;
1520                 break;
1521         default:
1522                 return (EINVAL);
1523         }
1524         return (sys_sched_get_priority_max(td, &bsd));
1525 }
1526
1527 int
1528 linux_sched_get_priority_min(struct thread *td,
1529     struct linux_sched_get_priority_min_args *args)
1530 {
1531         struct sched_get_priority_min_args bsd;
1532
1533         if (map_sched_prio) {
1534                 switch (args->policy) {
1535                 case LINUX_SCHED_OTHER:
1536                         td->td_retval[0] = 0;
1537                         return (0);
1538                 case LINUX_SCHED_FIFO:
1539                 case LINUX_SCHED_RR:
1540                         td->td_retval[0] = 1;
1541                         return (0);
1542                 default:
1543                         return (EINVAL);
1544                 }
1545         }
1546
1547         switch (args->policy) {
1548         case LINUX_SCHED_OTHER:
1549                 bsd.policy = SCHED_OTHER;
1550                 break;
1551         case LINUX_SCHED_FIFO:
1552                 bsd.policy = SCHED_FIFO;
1553                 break;
1554         case LINUX_SCHED_RR:
1555                 bsd.policy = SCHED_RR;
1556                 break;
1557         default:
1558                 return (EINVAL);
1559         }
1560         return (sys_sched_get_priority_min(td, &bsd));
1561 }
1562
/* Command values accepted in the cmd argument of linux_reboot(). */
#define	REBOOT_CAD_ON		0x89abcdef
#define	REBOOT_CAD_OFF		0
#define	REBOOT_HALT		0xcdef0123
#define	REBOOT_RESTART		0x01234567
#define	REBOOT_RESTART2		0xA1B2C3D4
#define	REBOOT_POWEROFF		0x4321FEDC

/* Magic numbers linux_reboot() requires in magic1 and magic2. */
#define	REBOOT_MAGIC1		0xfee1dead
#define	REBOOT_MAGIC2		0x28121969
#define	REBOOT_MAGIC2A		0x05121996
#define	REBOOT_MAGIC2B		0x16041998
1573
1574 int
1575 linux_reboot(struct thread *td, struct linux_reboot_args *args)
1576 {
1577         struct reboot_args bsd_args;
1578
1579         if (args->magic1 != REBOOT_MAGIC1)
1580                 return (EINVAL);
1581
1582         switch (args->magic2) {
1583         case REBOOT_MAGIC2:
1584         case REBOOT_MAGIC2A:
1585         case REBOOT_MAGIC2B:
1586                 break;
1587         default:
1588                 return (EINVAL);
1589         }
1590
1591         switch (args->cmd) {
1592         case REBOOT_CAD_ON:
1593         case REBOOT_CAD_OFF:
1594                 return (priv_check(td, PRIV_REBOOT));
1595         case REBOOT_HALT:
1596                 bsd_args.opt = RB_HALT;
1597                 break;
1598         case REBOOT_RESTART:
1599         case REBOOT_RESTART2:
1600                 bsd_args.opt = 0;
1601                 break;
1602         case REBOOT_POWEROFF:
1603                 bsd_args.opt = RB_POWEROFF;
1604                 break;
1605         default:
1606                 return (EINVAL);
1607         }
1608         return (sys_reboot(td, &bsd_args));
1609 }
1610
1611
1612 int
1613 linux_getpid(struct thread *td, struct linux_getpid_args *args)
1614 {
1615
1616         td->td_retval[0] = td->td_proc->p_pid;
1617
1618         return (0);
1619 }
1620
1621 int
1622 linux_gettid(struct thread *td, struct linux_gettid_args *args)
1623 {
1624         struct linux_emuldata *em;
1625
1626         em = em_find(td);
1627         KASSERT(em != NULL, ("gettid: emuldata not found.\n"));
1628
1629         td->td_retval[0] = em->em_tid;
1630
1631         return (0);
1632 }
1633
1634
1635 int
1636 linux_getppid(struct thread *td, struct linux_getppid_args *args)
1637 {
1638
1639         td->td_retval[0] = kern_getppid(td);
1640         return (0);
1641 }
1642
1643 int
1644 linux_getgid(struct thread *td, struct linux_getgid_args *args)
1645 {
1646
1647         td->td_retval[0] = td->td_ucred->cr_rgid;
1648         return (0);
1649 }
1650
1651 int
1652 linux_getuid(struct thread *td, struct linux_getuid_args *args)
1653 {
1654
1655         td->td_retval[0] = td->td_ucred->cr_ruid;
1656         return (0);
1657 }
1658
1659 int
1660 linux_getsid(struct thread *td, struct linux_getsid_args *args)
1661 {
1662
1663         return (kern_getsid(td, args->pid));
1664 }
1665
/*
 * Handler that rejects a system call: both arguments are ignored and
 * ENOSYS is returned unconditionally.
 */
int
linux_nosys(struct thread *td, struct nosys_args *ignore)
{
	const int error = ENOSYS;

	return (error);
}
1672
1673 int
1674 linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
1675 {
1676         int error;
1677
1678         error = kern_getpriority(td, args->which, args->who);
1679         td->td_retval[0] = 20 - td->td_retval[0];
1680         return (error);
1681 }
1682
1683 int
1684 linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
1685 {
1686         int name[2];
1687
1688         name[0] = CTL_KERN;
1689         name[1] = KERN_HOSTNAME;
1690         return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
1691             args->len, 0, 0));
1692 }
1693
1694 int
1695 linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
1696 {
1697         int name[2];
1698
1699         name[0] = CTL_KERN;
1700         name[1] = KERN_NISDOMAINNAME;
1701         return (userland_sysctl(td, name, 2, 0, 0, 0, args->name,
1702             args->len, 0, 0));
1703 }
1704
1705 int
1706 linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
1707 {
1708
1709         LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid,
1710             args->error_code);
1711
1712         /*
1713          * XXX: we should send a signal to the parent if
1714          * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
1715          * as it doesnt occur often.
1716          */
1717         exit1(td, args->error_code, 0);
1718                 /* NOTREACHED */
1719 }
1720
/*
 * Capability ABI versions accepted in l_user_cap_header.version by
 * linux_capget() and linux_capset().
 */
#define	_LINUX_CAPABILITY_VERSION_1	0x19980330
#define	_LINUX_CAPABILITY_VERSION_2	0x20071026
#define	_LINUX_CAPABILITY_VERSION_3	0x20080522
1724
1725 struct l_user_cap_header {
1726         l_int   version;
1727         l_int   pid;
1728 };
1729
1730 struct l_user_cap_data {
1731         l_int   effective;
1732         l_int   permitted;
1733         l_int   inheritable;
1734 };
1735
1736 int
1737 linux_capget(struct thread *td, struct linux_capget_args *uap)
1738 {
1739         struct l_user_cap_header luch;
1740         struct l_user_cap_data lucd[2];
1741         int error, u32s;
1742
1743         if (uap->hdrp == NULL)
1744                 return (EFAULT);
1745
1746         error = copyin(uap->hdrp, &luch, sizeof(luch));
1747         if (error != 0)
1748                 return (error);
1749
1750         switch (luch.version) {
1751         case _LINUX_CAPABILITY_VERSION_1:
1752                 u32s = 1;
1753                 break;
1754         case _LINUX_CAPABILITY_VERSION_2:
1755         case _LINUX_CAPABILITY_VERSION_3:
1756                 u32s = 2;
1757                 break;
1758         default:
1759                 luch.version = _LINUX_CAPABILITY_VERSION_1;
1760                 error = copyout(&luch, uap->hdrp, sizeof(luch));
1761                 if (error)
1762                         return (error);
1763                 return (EINVAL);
1764         }
1765
1766         if (luch.pid)
1767                 return (EPERM);
1768
1769         if (uap->datap) {
1770                 /*
1771                  * The current implementation doesn't support setting
1772                  * a capability (it's essentially a stub) so indicate
1773                  * that no capabilities are currently set or available
1774                  * to request.
1775                  */
1776                 memset(&lucd, 0, u32s * sizeof(lucd[0]));
1777                 error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0]));
1778         }
1779
1780         return (error);
1781 }
1782
1783 int
1784 linux_capset(struct thread *td, struct linux_capset_args *uap)
1785 {
1786         struct l_user_cap_header luch;
1787         struct l_user_cap_data lucd[2];
1788         int error, i, u32s;
1789
1790         if (uap->hdrp == NULL || uap->datap == NULL)
1791                 return (EFAULT);
1792
1793         error = copyin(uap->hdrp, &luch, sizeof(luch));
1794         if (error != 0)
1795                 return (error);
1796
1797         switch (luch.version) {
1798         case _LINUX_CAPABILITY_VERSION_1:
1799                 u32s = 1;
1800                 break;
1801         case _LINUX_CAPABILITY_VERSION_2:
1802         case _LINUX_CAPABILITY_VERSION_3:
1803                 u32s = 2;
1804                 break;
1805         default:
1806                 luch.version = _LINUX_CAPABILITY_VERSION_1;
1807                 error = copyout(&luch, uap->hdrp, sizeof(luch));
1808                 if (error)
1809                         return (error);
1810                 return (EINVAL);
1811         }
1812
1813         if (luch.pid)
1814                 return (EPERM);
1815
1816         error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0]));
1817         if (error != 0)
1818                 return (error);
1819
1820         /* We currently don't support setting any capabilities. */
1821         for (i = 0; i < u32s; i++) {
1822                 if (lucd[i].effective || lucd[i].permitted ||
1823                     lucd[i].inheritable) {
1824                         linux_msg(td,
1825                             "capset[%d] effective=0x%x, permitted=0x%x, "
1826                             "inheritable=0x%x is not implemented", i,
1827                             (int)lucd[i].effective, (int)lucd[i].permitted,
1828                             (int)lucd[i].inheritable);
1829                         return (EPERM);
1830                 }
1831         }
1832
1833         return (0);
1834 }
1835
1836 int
1837 linux_prctl(struct thread *td, struct linux_prctl_args *args)
1838 {
1839         int error = 0, max_size;
1840         struct proc *p = td->td_proc;
1841         char comm[LINUX_MAX_COMM_LEN];
1842         int pdeath_signal;
1843
1844         switch (args->option) {
1845         case LINUX_PR_SET_PDEATHSIG:
1846                 if (!LINUX_SIG_VALID(args->arg2))
1847                         return (EINVAL);
1848                 pdeath_signal = linux_to_bsd_signal(args->arg2);
1849                 return (kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_CTL,
1850                     &pdeath_signal));
1851         case LINUX_PR_GET_PDEATHSIG:
1852                 error = kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_STATUS,
1853                     &pdeath_signal);
1854                 if (error != 0)
1855                         return (error);
1856                 pdeath_signal = bsd_to_linux_signal(pdeath_signal);
1857                 return (copyout(&pdeath_signal,
1858                     (void *)(register_t)args->arg2,
1859                     sizeof(pdeath_signal)));
1860                 break;
1861         case LINUX_PR_GET_KEEPCAPS:
1862                 /*
1863                  * Indicate that we always clear the effective and
1864                  * permitted capability sets when the user id becomes
1865                  * non-zero (actually the capability sets are simply
1866                  * always zero in the current implementation).
1867                  */
1868                 td->td_retval[0] = 0;
1869                 break;
1870         case LINUX_PR_SET_KEEPCAPS:
1871                 /*
1872                  * Ignore requests to keep the effective and permitted
1873                  * capability sets when the user id becomes non-zero.
1874                  */
1875                 break;
1876         case LINUX_PR_SET_NAME:
1877                 /*
1878                  * To be on the safe side we need to make sure to not
1879                  * overflow the size a Linux program expects. We already
1880                  * do this here in the copyin, so that we don't need to
1881                  * check on copyout.
1882                  */
1883                 max_size = MIN(sizeof(comm), sizeof(p->p_comm));
1884                 error = copyinstr((void *)(register_t)args->arg2, comm,
1885                     max_size, NULL);
1886
1887                 /* Linux silently truncates the name if it is too long. */
1888                 if (error == ENAMETOOLONG) {
1889                         /*
1890                          * XXX: copyinstr() isn't documented to populate the
1891                          * array completely, so do a copyin() to be on the
1892                          * safe side. This should be changed in case
1893                          * copyinstr() is changed to guarantee this.
1894                          */
1895                         error = copyin((void *)(register_t)args->arg2, comm,
1896                             max_size - 1);
1897                         comm[max_size - 1] = '\0';
1898                 }
1899                 if (error)
1900                         return (error);
1901
1902                 PROC_LOCK(p);
1903                 strlcpy(p->p_comm, comm, sizeof(p->p_comm));
1904                 PROC_UNLOCK(p);
1905                 break;
1906         case LINUX_PR_GET_NAME:
1907                 PROC_LOCK(p);
1908                 strlcpy(comm, p->p_comm, sizeof(comm));
1909                 PROC_UNLOCK(p);
1910                 error = copyout(comm, (void *)(register_t)args->arg2,
1911                     strlen(comm) + 1);
1912                 break;
1913         default:
1914                 error = EINVAL;
1915                 break;
1916         }
1917
1918         return (error);
1919 }
1920
1921 int
1922 linux_sched_setparam(struct thread *td,
1923     struct linux_sched_setparam_args *uap)
1924 {
1925         struct sched_param sched_param;
1926         struct thread *tdt;
1927         int error, policy;
1928
1929         error = copyin(uap->param, &sched_param, sizeof(sched_param));
1930         if (error)
1931                 return (error);
1932
1933         tdt = linux_tdfind(td, uap->pid, -1);
1934         if (tdt == NULL)
1935                 return (ESRCH);
1936
1937         if( map_sched_prio ) {
1938                 error = kern_sched_getscheduler(td, tdt, &policy);
1939                 if (error)
1940                         goto out;
1941
1942                 switch (policy) {
1943                 case SCHED_OTHER:
1944                         if (sched_param.sched_priority != 0) {
1945                                 error = EINVAL;
1946                                 goto out;
1947                         }
1948                         sched_param.sched_priority =
1949                             PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
1950                         break;
1951                 case SCHED_FIFO:
1952                 case SCHED_RR:
1953                         if (sched_param.sched_priority < 1 ||
1954                             sched_param.sched_priority >= LINUX_MAX_RT_PRIO) {
1955                                 error = EINVAL;
1956                                 goto out;
1957                         }
1958                         /*
1959                          * Map [1, LINUX_MAX_RT_PRIO - 1] to
1960                          * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
1961                          */
1962                         sched_param.sched_priority =
1963                             (sched_param.sched_priority - 1) *
1964                             (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
1965                             (LINUX_MAX_RT_PRIO - 1);
1966                         break;
1967                 }
1968         }
1969
1970         error = kern_sched_setparam(td, tdt, &sched_param);
1971 out:    PROC_UNLOCK(tdt->td_proc);
1972         return (error);
1973 }
1974
1975 int
1976 linux_sched_getparam(struct thread *td,
1977     struct linux_sched_getparam_args *uap)
1978 {
1979         struct sched_param sched_param;
1980         struct thread *tdt;
1981         int error, policy;
1982
1983         tdt = linux_tdfind(td, uap->pid, -1);
1984         if (tdt == NULL)
1985                 return (ESRCH);
1986
1987         error = kern_sched_getparam(td, tdt, &sched_param);
1988         if (error) {
1989                 PROC_UNLOCK(tdt->td_proc);
1990                 return (error);
1991         }
1992
1993         if (map_sched_prio) {
1994                 error = kern_sched_getscheduler(td, tdt, &policy);
1995                 PROC_UNLOCK(tdt->td_proc);
1996                 if (error)
1997                         return (error);
1998
1999                 switch (policy) {
2000                 case SCHED_OTHER:
2001                         sched_param.sched_priority = 0;
2002                         break;
2003                 case SCHED_FIFO:
2004                 case SCHED_RR:
2005                         /*
2006                          * Map [0, RTP_PRIO_MAX - RTP_PRIO_MIN] to
2007                          * [1, LINUX_MAX_RT_PRIO - 1] (rounding up).
2008                          */
2009                         sched_param.sched_priority =
2010                             (sched_param.sched_priority *
2011                             (LINUX_MAX_RT_PRIO - 1) +
2012                             (RTP_PRIO_MAX - RTP_PRIO_MIN - 1)) /
2013                             (RTP_PRIO_MAX - RTP_PRIO_MIN) + 1;
2014                         break;
2015                 }
2016         } else
2017                 PROC_UNLOCK(tdt->td_proc);
2018
2019         error = copyout(&sched_param, uap->param, sizeof(sched_param));
2020         return (error);
2021 }
2022
2023 /*
2024  * Get affinity of a process.
2025  */
2026 int
2027 linux_sched_getaffinity(struct thread *td,
2028     struct linux_sched_getaffinity_args *args)
2029 {
2030         int error;
2031         struct thread *tdt;
2032
2033         if (args->len < sizeof(cpuset_t))
2034                 return (EINVAL);
2035
2036         tdt = linux_tdfind(td, args->pid, -1);
2037         if (tdt == NULL)
2038                 return (ESRCH);
2039
2040         PROC_UNLOCK(tdt->td_proc);
2041
2042         error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
2043             tdt->td_tid, sizeof(cpuset_t), (cpuset_t *)args->user_mask_ptr);
2044         if (error == 0)
2045                 td->td_retval[0] = sizeof(cpuset_t);
2046
2047         return (error);
2048 }
2049
2050 /*
2051  *  Set affinity of a process.
2052  */
2053 int
2054 linux_sched_setaffinity(struct thread *td,
2055     struct linux_sched_setaffinity_args *args)
2056 {
2057         struct thread *tdt;
2058
2059         if (args->len < sizeof(cpuset_t))
2060                 return (EINVAL);
2061
2062         tdt = linux_tdfind(td, args->pid, -1);
2063         if (tdt == NULL)
2064                 return (ESRCH);
2065
2066         PROC_UNLOCK(tdt->td_proc);
2067
2068         return (kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
2069             tdt->td_tid, sizeof(cpuset_t), (cpuset_t *) args->user_mask_ptr));
2070 }
2071
/*
 * Resource limit pair exchanged with userland by linux_prlimit64().
 * Both members are unsigned 64-bit values.
 */
struct linux_rlimit64 {
        uint64_t        rlim_cur;       /* current limit */
        uint64_t        rlim_max;       /* maximum limit */
};
2076
2077 int
2078 linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args)
2079 {
2080         struct rlimit rlim, nrlim;
2081         struct linux_rlimit64 lrlim;
2082         struct proc *p;
2083         u_int which;
2084         int flags;
2085         int error;
2086
2087         if (args->resource >= LINUX_RLIM_NLIMITS)
2088                 return (EINVAL);
2089
2090         which = linux_to_bsd_resource[args->resource];
2091         if (which == -1)
2092                 return (EINVAL);
2093
2094         if (args->new != NULL) {
2095                 /*
2096                  * Note. Unlike FreeBSD where rlim is signed 64-bit Linux
2097                  * rlim is unsigned 64-bit. FreeBSD treats negative limits
2098                  * as INFINITY so we do not need a conversion even.
2099                  */
2100                 error = copyin(args->new, &nrlim, sizeof(nrlim));
2101                 if (error != 0)
2102                         return (error);
2103         }
2104
2105         flags = PGET_HOLD | PGET_NOTWEXIT;
2106         if (args->new != NULL)
2107                 flags |= PGET_CANDEBUG;
2108         else
2109                 flags |= PGET_CANSEE;
2110         if (args->pid == 0) {
2111                 p = td->td_proc;
2112                 PHOLD(p);
2113         } else {
2114                 error = pget(args->pid, flags, &p);
2115                 if (error != 0)
2116                         return (error);
2117         }
2118         if (args->old != NULL) {
2119                 PROC_LOCK(p);
2120                 lim_rlimit_proc(p, which, &rlim);
2121                 PROC_UNLOCK(p);
2122                 if (rlim.rlim_cur == RLIM_INFINITY)
2123                         lrlim.rlim_cur = LINUX_RLIM_INFINITY;
2124                 else
2125                         lrlim.rlim_cur = rlim.rlim_cur;
2126                 if (rlim.rlim_max == RLIM_INFINITY)
2127                         lrlim.rlim_max = LINUX_RLIM_INFINITY;
2128                 else
2129                         lrlim.rlim_max = rlim.rlim_max;
2130                 error = copyout(&lrlim, args->old, sizeof(lrlim));
2131                 if (error != 0)
2132                         goto out;
2133         }
2134
2135         if (args->new != NULL)
2136                 error = kern_proc_setrlimit(td, p, which, &nrlim);
2137
2138  out:
2139         PRELE(p);
2140         return (error);
2141 }
2142
2143 int
2144 linux_pselect6(struct thread *td, struct linux_pselect6_args *args)
2145 {
2146         struct timeval utv, tv0, tv1, *tvp;
2147         struct l_pselect6arg lpse6;
2148         struct l_timespec lts;
2149         struct timespec uts;
2150         l_sigset_t l_ss;
2151         sigset_t *ssp;
2152         sigset_t ss;
2153         int error;
2154
2155         ssp = NULL;
2156         if (args->sig != NULL) {
2157                 error = copyin(args->sig, &lpse6, sizeof(lpse6));
2158                 if (error != 0)
2159                         return (error);
2160                 if (lpse6.ss_len != sizeof(l_ss))
2161                         return (EINVAL);
2162                 if (lpse6.ss != 0) {
2163                         error = copyin(PTRIN(lpse6.ss), &l_ss,
2164                             sizeof(l_ss));
2165                         if (error != 0)
2166                                 return (error);
2167                         linux_to_bsd_sigset(&l_ss, &ss);
2168                         ssp = &ss;
2169                 }
2170         }
2171
2172         /*
2173          * Currently glibc changes nanosecond number to microsecond.
2174          * This mean losing precision but for now it is hardly seen.
2175          */
2176         if (args->tsp != NULL) {
2177                 error = copyin(args->tsp, &lts, sizeof(lts));
2178                 if (error != 0)
2179                         return (error);
2180                 error = linux_to_native_timespec(&uts, &lts);
2181                 if (error != 0)
2182                         return (error);
2183
2184                 TIMESPEC_TO_TIMEVAL(&utv, &uts);
2185                 if (itimerfix(&utv))
2186                         return (EINVAL);
2187
2188                 microtime(&tv0);
2189                 tvp = &utv;
2190         } else
2191                 tvp = NULL;
2192
2193         error = kern_pselect(td, args->nfds, args->readfds, args->writefds,
2194             args->exceptfds, tvp, ssp, LINUX_NFDBITS);
2195
2196         if (error == 0 && args->tsp != NULL) {
2197                 if (td->td_retval[0] != 0) {
2198                         /*
2199                          * Compute how much time was left of the timeout,
2200                          * by subtracting the current time and the time
2201                          * before we started the call, and subtracting
2202                          * that result from the user-supplied value.
2203                          */
2204
2205                         microtime(&tv1);
2206                         timevalsub(&tv1, &tv0);
2207                         timevalsub(&utv, &tv1);
2208                         if (utv.tv_sec < 0)
2209                                 timevalclear(&utv);
2210                 } else
2211                         timevalclear(&utv);
2212
2213                 TIMEVAL_TO_TIMESPEC(&utv, &uts);
2214
2215                 error = native_to_linux_timespec(&lts, &uts);
2216                 if (error == 0)
2217                         error = copyout(&lts, args->tsp, sizeof(lts));
2218         }
2219
2220         return (error);
2221 }
2222
2223 int
2224 linux_ppoll(struct thread *td, struct linux_ppoll_args *args)
2225 {
2226         struct timespec ts0, ts1;
2227         struct l_timespec lts;
2228         struct timespec uts, *tsp;
2229         l_sigset_t l_ss;
2230         sigset_t *ssp;
2231         sigset_t ss;
2232         int error;
2233
2234         if (args->sset != NULL) {
2235                 if (args->ssize != sizeof(l_ss))
2236                         return (EINVAL);
2237                 error = copyin(args->sset, &l_ss, sizeof(l_ss));
2238                 if (error)
2239                         return (error);
2240                 linux_to_bsd_sigset(&l_ss, &ss);
2241                 ssp = &ss;
2242         } else
2243                 ssp = NULL;
2244         if (args->tsp != NULL) {
2245                 error = copyin(args->tsp, &lts, sizeof(lts));
2246                 if (error)
2247                         return (error);
2248                 error = linux_to_native_timespec(&uts, &lts);
2249                 if (error != 0)
2250                         return (error);
2251
2252                 nanotime(&ts0);
2253                 tsp = &uts;
2254         } else
2255                 tsp = NULL;
2256
2257         error = kern_poll(td, args->fds, args->nfds, tsp, ssp);
2258
2259         if (error == 0 && args->tsp != NULL) {
2260                 if (td->td_retval[0]) {
2261                         nanotime(&ts1);
2262                         timespecsub(&ts1, &ts0, &ts1);
2263                         timespecsub(&uts, &ts1, &uts);
2264                         if (uts.tv_sec < 0)
2265                                 timespecclear(&uts);
2266                 } else
2267                         timespecclear(&uts);
2268
2269                 error = native_to_linux_timespec(&lts, &uts);
2270                 if (error == 0)
2271                         error = copyout(&lts, args->tsp, sizeof(lts));
2272         }
2273
2274         return (error);
2275 }
2276
2277 int
2278 linux_sched_rr_get_interval(struct thread *td,
2279     struct linux_sched_rr_get_interval_args *uap)
2280 {
2281         struct timespec ts;
2282         struct l_timespec lts;
2283         struct thread *tdt;
2284         int error;
2285
2286         /*
2287          * According to man in case the invalid pid specified
2288          * EINVAL should be returned.
2289          */
2290         if (uap->pid < 0)
2291                 return (EINVAL);
2292
2293         tdt = linux_tdfind(td, uap->pid, -1);
2294         if (tdt == NULL)
2295                 return (ESRCH);
2296
2297         error = kern_sched_rr_get_interval_td(td, tdt, &ts);
2298         PROC_UNLOCK(tdt->td_proc);
2299         if (error != 0)
2300                 return (error);
2301         error = native_to_linux_timespec(&lts, &ts);
2302         if (error != 0)
2303                 return (error);
2304         return (copyout(&lts, uap->interval, sizeof(lts)));
2305 }
2306
2307 /*
2308  * In case when the Linux thread is the initial thread in
2309  * the thread group thread id is equal to the process id.
2310  * Glibc depends on this magic (assert in pthread_getattr_np.c).
2311  */
2312 struct thread *
2313 linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid)
2314 {
2315         struct linux_emuldata *em;
2316         struct thread *tdt;
2317         struct proc *p;
2318
2319         tdt = NULL;
2320         if (tid == 0 || tid == td->td_tid) {
2321                 tdt = td;
2322                 PROC_LOCK(tdt->td_proc);
2323         } else if (tid > PID_MAX)
2324                 tdt = tdfind(tid, pid);
2325         else {
2326                 /*
2327                  * Initial thread where the tid equal to the pid.
2328                  */
2329                 p = pfind(tid);
2330                 if (p != NULL) {
2331                         if (SV_PROC_ABI(p) != SV_ABI_LINUX) {
2332                                 /*
2333                                  * p is not a Linuxulator process.
2334                                  */
2335                                 PROC_UNLOCK(p);
2336                                 return (NULL);
2337                         }
2338                         FOREACH_THREAD_IN_PROC(p, tdt) {
2339                                 em = em_find(tdt);
2340                                 if (tid == em->em_tid)
2341                                         return (tdt);
2342                         }
2343                         PROC_UNLOCK(p);
2344                 }
2345                 return (NULL);
2346         }
2347
2348         return (tdt);
2349 }
2350
2351 void
2352 linux_to_bsd_waitopts(int options, int *bsdopts)
2353 {
2354
2355         if (options & LINUX_WNOHANG)
2356                 *bsdopts |= WNOHANG;
2357         if (options & LINUX_WUNTRACED)
2358                 *bsdopts |= WUNTRACED;
2359         if (options & LINUX_WEXITED)
2360                 *bsdopts |= WEXITED;
2361         if (options & LINUX_WCONTINUED)
2362                 *bsdopts |= WCONTINUED;
2363         if (options & LINUX_WNOWAIT)
2364                 *bsdopts |= WNOWAIT;
2365
2366         if (options & __WCLONE)
2367                 *bsdopts |= WLINUXCLONE;
2368 }
2369
2370 int
2371 linux_getrandom(struct thread *td, struct linux_getrandom_args *args)
2372 {
2373         struct uio uio;
2374         struct iovec iov;
2375         int error;
2376
2377         if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM))
2378                 return (EINVAL);
2379         if (args->count > INT_MAX)
2380                 args->count = INT_MAX;
2381
2382         iov.iov_base = args->buf;
2383         iov.iov_len = args->count;
2384
2385         uio.uio_iov = &iov;
2386         uio.uio_iovcnt = 1;
2387         uio.uio_resid = iov.iov_len;
2388         uio.uio_segflg = UIO_USERSPACE;
2389         uio.uio_rw = UIO_READ;
2390         uio.uio_td = td;
2391
2392         error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK);
2393         if (error == 0)
2394                 td->td_retval[0] = args->count - uio.uio_resid;
2395         return (error);
2396 }
2397
2398 int
2399 linux_mincore(struct thread *td, struct linux_mincore_args *args)
2400 {
2401
2402         /* Needs to be page-aligned */
2403         if (args->start & PAGE_MASK)
2404                 return (EINVAL);
2405         return (kern_mincore(td, args->start, args->len, args->vec));
2406 }
2407
2408 #define SYSLOG_TAG      "<6>"
2409
2410 int
2411 linux_syslog(struct thread *td, struct linux_syslog_args *args)
2412 {
2413         char buf[128], *src, *dst;
2414         u_int seq;
2415         int buflen, error;
2416
2417         if (args->type != LINUX_SYSLOG_ACTION_READ_ALL) {
2418                 linux_msg(td, "syslog unsupported type 0x%x", args->type);
2419                 return (EINVAL);
2420         }
2421
2422         if (args->len < 6) {
2423                 td->td_retval[0] = 0;
2424                 return (0);
2425         }
2426
2427         error = priv_check(td, PRIV_MSGBUF);
2428         if (error)
2429                 return (error);
2430
2431         mtx_lock(&msgbuf_lock);
2432         msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
2433         mtx_unlock(&msgbuf_lock);
2434
2435         dst = args->buf;
2436         error = copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG));
2437         /* The -1 is to skip the trailing '\0'. */
2438         dst += sizeof(SYSLOG_TAG) - 1;
2439
2440         while (error == 0) {
2441                 mtx_lock(&msgbuf_lock);
2442                 buflen = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
2443                 mtx_unlock(&msgbuf_lock);
2444
2445                 if (buflen == 0)
2446                         break;
2447
2448                 for (src = buf; src < buf + buflen && error == 0; src++) {
2449                         if (*src == '\0')
2450                                 continue;
2451
2452                         if (dst >= args->buf + args->len)
2453                                 goto out;
2454
2455                         error = copyout(src, dst, 1);
2456                         dst++;
2457
2458                         if (*src == '\n' && *(src + 1) != '<' &&
2459                             dst + sizeof(SYSLOG_TAG) < args->buf + args->len) {
2460                                 error = copyout(&SYSLOG_TAG,
2461                                     dst, sizeof(SYSLOG_TAG));
2462                                 dst += sizeof(SYSLOG_TAG) - 1;
2463                         }
2464                 }
2465         }
2466 out:
2467         td->td_retval[0] = dst - args->buf;
2468         return (error);
2469 }
2470
2471 int
2472 linux_getcpu(struct thread *td, struct linux_getcpu_args *args)
2473 {
2474         int cpu, error, node;
2475
2476         cpu = td->td_oncpu; /* Make sure it doesn't change during copyout(9) */
2477         error = 0;
2478         node = cpuid_to_pcpu[cpu]->pc_domain;
2479
2480         if (args->cpu != NULL)
2481                 error = copyout(&cpu, args->cpu, sizeof(l_int));
2482         if (args->node != NULL)
2483                 error = copyout(&node, args->node, sizeof(l_int));
2484         return (error);
2485 }