sys/compat/linux/linux_misc.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 2002 Doug Rabson
   5  * Copyright (c) 1994-1995 Søren Schmidt
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer
  13  *    in this position and unchanged.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  * 3. The name of the author may not be used to endorse or promote products
  18  *    derived from this software without specific prior written permission
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __FBSDID("$FreeBSD$");
  34
  35 #include "opt_compat.h"
  36
  37 #include <sys/param.h>
  38 #include <sys/blist.h>
  39 #include <sys/fcntl.h>
  40 #if defined(__i386__)
  41 #include <sys/imgact_aout.h>
  42 #endif
  43 #include <sys/jail.h>
  44 #include <sys/kernel.h>
  45 #include <sys/limits.h>
  46 #include <sys/lock.h>
  47 #include <sys/malloc.h>
  48 #include <sys/mman.h>
  49 #include <sys/mount.h>
  50 #include <sys/msgbuf.h>
  51 #include <sys/mutex.h>
  52 #include <sys/namei.h>
  53 #include <sys/priv.h>
  54 #include <sys/proc.h>
  55 #include <sys/procctl.h>
  56 #include <sys/reboot.h>
  57 #include <sys/racct.h>
  58 #include <sys/random.h>
  59 #include <sys/resourcevar.h>
  60 #include <sys/sched.h>
  61 #include <sys/sdt.h>
  62 #include <sys/signalvar.h>
  63 #include <sys/stat.h>
  64 #include <sys/syscallsubr.h>
  65 #include <sys/sysctl.h>
  66 #include <sys/sysproto.h>
  67 #include <sys/systm.h>
  68 #include <sys/time.h>
  69 #include <sys/vmmeter.h>
  70 #include <sys/vnode.h>
  71 #include <sys/wait.h>
  72 #include <sys/cpuset.h>
  73 #include <sys/uio.h>
  74
  75 #include <security/mac/mac_framework.h>
  76
  77 #include <vm/vm.h>
  78 #include <vm/pmap.h>
  79 #include <vm/vm_kern.h>
  80 #include <vm/vm_map.h>
  81 #include <vm/vm_extern.h>
  82 #include <vm/vm_object.h>
  83 #include <vm/swap_pager.h>
  84
  85 #ifdef COMPAT_LINUX32
  86 #include <machine/../linux32/linux.h>
  87 #include <machine/../linux32/linux32_proto.h>
  88 #else
  89 #include <machine/../linux/linux.h>
  90 #include <machine/../linux/linux_proto.h>
  91 #endif
  92
  93 #include <compat/linux/linux_dtrace.h>
  94 #include <compat/linux/linux_file.h>
  95 #include <compat/linux/linux_mib.h>
  96 #include <compat/linux/linux_signal.h>
  97 #include <compat/linux/linux_timer.h>
  98 #include <compat/linux/linux_util.h>
  99 #include <compat/linux/linux_sysproto.h>
 100 #include <compat/linux/linux_emul.h>
 101 #include <compat/linux/linux_misc.h>
 102
 103 /**
 104  * Special DTrace provider for the linuxulator.
 105  *
 106  * In this file we define the provider for the entire linuxulator. All
 107  * modules (= files of the linuxulator) use it.
 108  *
 109  * We define a different name depending on the emulated bitsize, see
 110  * ../../<ARCH>/linux{,32}/linux.h, e.g.:
 111  *      native bitsize          = linuxulator
 112  *      amd64, 32bit emulation  = linuxulator32
 113  */
 114 LIN_SDT_PROVIDER_DEFINE(LINUX_DTRACE);
 115
 116 int stclohz;                            /* Statistics clock frequency */
 117
 118 static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
 119         RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
 120         RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
 121         RLIMIT_MEMLOCK, RLIMIT_AS
 122 };
 123
 124 struct l_sysinfo {
 125         l_long          uptime;         /* Seconds since boot */
 126         l_ulong         loads[3];       /* 1, 5, and 15 minute load averages */
 127 #define LINUX_SYSINFO_LOADS_SCALE 65536
 128         l_ulong         totalram;       /* Total usable main memory size */
 129         l_ulong         freeram;        /* Available memory size */
 130         l_ulong         sharedram;      /* Amount of shared memory */
 131         l_ulong         bufferram;      /* Memory used by buffers */
 132         l_ulong         totalswap;      /* Total swap space size */
 133         l_ulong         freeswap;       /* swap space still available */
 134         l_ushort        procs;          /* Number of current processes */
 135         l_ushort        pads;
 136         l_ulong         totalbig;
 137         l_ulong         freebig;
 138         l_uint          mem_unit;
 139         char            _f[20-2*sizeof(l_long)-sizeof(l_int)];  /* padding */
 140 };
 141
 142 struct l_pselect6arg {
 143         l_uintptr_t     ss;
 144         l_size_t        ss_len;
 145 };
 146
 147 static int      linux_utimensat_nsec_valid(l_long);
 148
 149
 150 int
 151 linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
 152 {
 153         struct l_sysinfo sysinfo;
 154         vm_object_t object;
 155         int i, j;
 156         struct timespec ts;
 157
 158         bzero(&sysinfo, sizeof(sysinfo));
 159         getnanouptime(&ts);
 160         if (ts.tv_nsec != 0)
 161                 ts.tv_sec++;
 162         sysinfo.uptime = ts.tv_sec;
 163
 164         /* Use the information from the mib to get our load averages */
 165         for (i = 0; i < 3; i++)
 166                 sysinfo.loads[i] = averunnable.ldavg[i] *
 167                     LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
 168
 169         sysinfo.totalram = physmem * PAGE_SIZE;
 170         sysinfo.freeram = sysinfo.totalram - vm_wire_count() * PAGE_SIZE;
 171
 172         sysinfo.sharedram = 0;
 173         mtx_lock(&vm_object_list_mtx);
 174         TAILQ_FOREACH(object, &vm_object_list, object_list)
 175                 if (object->shadow_count > 1)
 176                         sysinfo.sharedram += object->resident_page_count;
 177         mtx_unlock(&vm_object_list_mtx);
 178
 179         sysinfo.sharedram *= PAGE_SIZE;
 180         sysinfo.bufferram = 0;
 181
 182         swap_pager_status(&i, &j);
 183         sysinfo.totalswap = i * PAGE_SIZE;
 184         sysinfo.freeswap = (i - j) * PAGE_SIZE;
 185
 186         sysinfo.procs = nprocs;
 187
 188         /* The following are only present in newer Linux kernels. */
 189         sysinfo.totalbig = 0;
 190         sysinfo.freebig = 0;
 191         sysinfo.mem_unit = 1;
 192
 193         return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
 194 }
 195
 196 #ifdef LINUX_LEGACY_SYSCALLS
 197 int
 198 linux_alarm(struct thread *td, struct linux_alarm_args *args)
 199 {
 200         struct itimerval it, old_it;
 201         u_int secs;
 202         int error;
 203
 204         secs = args->secs;
 205         /*
 206          * Linux alarm() is always successful. Limit secs to INT32_MAX / 2
 207          * to match kern_setitimer()'s limit to avoid error from it.
 208          *
 209          * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit
 210          * platforms.
 211          */
 212         if (secs > INT32_MAX / 2)
 213                 secs = INT32_MAX / 2;
 214
 215         it.it_value.tv_sec = secs;
 216         it.it_value.tv_usec = 0;
 217         timevalclear(&it.it_interval);
 218         error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
 219         KASSERT(error == 0, ("kern_setitimer returns %d", error));
 220
 221         if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) ||
 222             old_it.it_value.tv_usec >= 500000)
 223                 old_it.it_value.tv_sec++;
 224         td->td_retval[0] = old_it.it_value.tv_sec;
 225         return (0);
 226 }
 227 #endif
 228
 229 int
 230 linux_brk(struct thread *td, struct linux_brk_args *args)
 231 {
 232         struct vmspace *vm = td->td_proc->p_vmspace;
 233         uintptr_t new, old;
 234
 235         old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize);
 236         new = (uintptr_t)args->dsend;
 237         if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new))
 238                 td->td_retval[0] = (register_t)new;
 239         else
 240                 td->td_retval[0] = (register_t)old;
 241
 242         return (0);
 243 }
 244
 245 #if defined(__i386__)
 246 /* XXX: what about amd64/linux32? */
 247 /* linux_uselib: load the Linux a.out library named by args->library into the caller's address space; returns 0 or an errno value. */
 248 int
 249 linux_uselib(struct thread *td, struct linux_uselib_args *args)
 250 {
 251         struct nameidata ni;
 252         struct vnode *vp;
 253         struct exec *a_out;     /* a.out header, temporarily mapped from exec_map */
 254         vm_map_t map;
 255         vm_map_entry_t entry;
 256         struct vattr attr;
 257         vm_offset_t vmaddr;
 258         unsigned long file_offset;      /* offset of the image within the file */
 259         unsigned long bss_size;         /* a_bss rounded up to a page multiple */
 260         char *library;
 261         ssize_t aresid;
 262         int error;
 263         bool locked, opened, textset;   /* cleanup state: vnode locked / VOP_OPEN done / VOP_SET_TEXT to undo */
 264 /* Translate the library path; LCONVPATHEXIST is defined elsewhere (presumably returns on failure -- confirm). */
 265         LCONVPATHEXIST(td, args->library, &library);
 266 /* Nothing has been acquired yet; the cleanup label keys off these values. */
 267         a_out = NULL;
 268         vp = NULL;
 269         locked = false;
 270         textset = false;
 271         opened = false;
 272 /* Look the library up with LOCKLEAF; the translated path is freed as soon as namei() returns. */
 273         NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
 274             UIO_SYSSPACE, library, td);
 275         error = namei(&ni);
 276         LFREEPATH(library);
 277         if (error)
 278                 goto cleanup;
 279
 280         vp = ni.ni_vp;
 281         NDFREE(&ni, NDF_ONLY_PNBUF);
 282
 283         /*
 284          * From here on down, we have a locked vnode that must be unlocked.
 285          * XXX: The code below largely duplicates exec_check_permissions().
 286          */
 287         locked = true;
 288
 289         /* Executable? */
 290         error = VOP_GETATTR(vp, &attr, td->td_ucred);
 291         if (error)
 292                 goto cleanup;
 293 /* Reject noexec mounts, files with no execute bit at all, and anything that is not a regular file. */
 294         if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 295             ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) {
 296                 /* NOTE(review): this comment used to say "EACCESS is what exec(2) returns", yet ENOEXEC is set. */
 297                 error = ENOEXEC;
 298                 goto cleanup;
 299         }
 300
 301         /* Sensible size? */
 302         if (attr.va_size == 0) {
 303                 error = ENOEXEC;
 304                 goto cleanup;
 305         }
 306
 307         /* Can we access it? */
 308         error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 309         if (error)
 310                 goto cleanup;
 311
 312         /*
 313          * XXX: This should use vn_open() so that it is properly authorized,
 314          * and to reduce code redundancy all over the place here.
 315          * XXX: Not really, it duplicates far more of exec_check_permissions()
 316          * than vn_open().
 317          */
 318 #ifdef MAC
 319         error = mac_vnode_check_open(td->td_ucred, vp, VREAD);
 320         if (error)
 321                 goto cleanup;
 322 #endif
 323         error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 324         if (error)
 325                 goto cleanup;
 326         opened = true;
 327
 328         /* Map the first page of the file (the a.out header) read-only into exec_map. */
 329         error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE,
 330             VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
 331         if (error)
 332                 goto cleanup;
 333
 334         /* Is it a Linux binary?  Bits 16-23 of a_magic must be 0x64. */
 335         if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
 336                 error = ENOEXEC;
 337                 goto cleanup;
 338         }
 339
 340         /*
 341          * While we are here, we should REALLY do some more checks
 342          */
 343
 344         /* Set file/virtual offset based on a.out variant. */
 345         switch ((int)(a_out->a_magic & 0xffff)) {
 346         case 0413:                      /* ZMAGIC */
 347                 file_offset = 1024;
 348                 break;
 349         case 0314:                      /* QMAGIC */
 350                 file_offset = 0;
 351                 break;
 352         default:
 353                 error = ENOEXEC;
 354                 goto cleanup;
 355         }
 356
 357         bss_size = round_page(a_out->a_bss);
 358
 359         /* Check various fields in header for validity/bounds: text and data sizes must be page multiples. */
 360         if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
 361                 error = ENOEXEC;
 362                 goto cleanup;
 363         }
 364
 365         /* text + data can't exceed file size */
 366         if (a_out->a_data + a_out->a_text > attr.va_size) {
 367                 error = EFAULT;
 368                 goto cleanup;
 369         }
 370
 371         /*
 372          * text/data/bss must not exceed limits
 373          * XXX - this is not complete. it should check current usage PLUS
 374          * the resources needed by this library.
 375          */
 376         PROC_LOCK(td->td_proc);
 377         if (a_out->a_text > maxtsiz ||
 378             a_out->a_data + bss_size > lim_cur_proc(td->td_proc, RLIMIT_DATA) ||
 379             racct_set(td->td_proc, RACCT_DATA, a_out->a_data +
 380             bss_size) != 0) {
 381                 PROC_UNLOCK(td->td_proc);
 382                 error = ENOMEM;
 383                 goto cleanup;
 384         }
 385         PROC_UNLOCK(td->td_proc);
 386
 387         /*
 388          * Prevent more writers.
 389          */
 390         error = VOP_SET_TEXT(vp);
 391         if (error != 0)
 392                 goto cleanup;
 393         textset = true;
 394
 395         /*
 396          * Lock no longer needed
 397          */
 398         locked = false;
 399         VOP_UNLOCK(vp);
 400
 401         /*
 402          * Check if file_offset page aligned. Currently we cannot handle
 403          * misaligned file offsets, and so we read in the entire image
 404          * (what a waste).
 405          */
 406         if (file_offset & PAGE_MASK) {
 407                 /* Map text+data read/write/execute */
 408 /* Unaligned offset (the ZMAGIC value 1024 when pages are larger): copy the image into anonymous memory. */
 409                 /* a_entry is the load address and is page aligned */
 410                 vmaddr = trunc_page(a_out->a_entry);
 411
 412                 /* get anon user mapping, read+write+execute */
 413                 error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 414                     &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE,
 415                     VM_PROT_ALL, VM_PROT_ALL, 0);
 416                 if (error)
 417                         goto cleanup;
 418 /* Read text+data from the file straight into the new user mapping; a short read is treated as ENOEXEC. */
 419                 error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset,
 420                     a_out->a_text + a_out->a_data, UIO_USERSPACE, 0,
 421                     td->td_ucred, NOCRED, &aresid, td);
 422                 if (error != 0)
 423                         goto cleanup;
 424                 if (aresid != 0) {
 425                         error = ENOEXEC;
 426                         goto cleanup;
 427                 }
 428         } else {
 429                 /*
 430                  * for QMAGIC, a_entry is 20 bytes beyond the load address
 431                  * to skip the executable header
 432                  */
 433                 vmaddr = trunc_page(a_out->a_entry);
 434
 435                 /*
 436                  * Map it all into the process's space as a single
 437                  * copy-on-write "data" segment.
 438                  */
 439                 map = &td->td_proc->p_vmspace->vm_map;
 440                 error = vm_mmap(map, &vmaddr,
 441                     a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
 442                     MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
 443                 if (error)
 444                         goto cleanup;
 445                 vm_map_lock(map);
 446                 if (!vm_map_lookup_entry(map, vmaddr, &entry)) {
 447                         vm_map_unlock(map);
 448                         error = EDOOFUS;
 449                         goto cleanup;
 450                 }
 451                 entry->eflags |= MAP_ENTRY_VN_EXEC;
 452                 vm_map_unlock(map);
 453                 textset = false;        /* skip VOP_UNSET_TEXT_CHECKED in cleanup; presumably the map entry now owns the text reference -- confirm */
 454         }
 455
 456         if (bss_size != 0) {
 457                 /* Calculate BSS start address */
 458                 vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
 459                     a_out->a_data;
 460
 461                 /* allocate some 'anon' space */
 462                 error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 463                     &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL,
 464                     VM_PROT_ALL, 0);
 465                 if (error)
 466                         goto cleanup;
 467         }
 468 /* Common exit, also reached on success: release whatever the three state flags say is still held. */
 469 cleanup:
 470         if (opened) {
 471                 if (locked)
 472                         VOP_UNLOCK(vp);
 473                 locked = false;
 474                 VOP_CLOSE(vp, FREAD, td->td_ucred, td);
 475         }
 476         if (textset) {
 477                 if (!locked) {
 478                         locked = true;
 479                         VOP_LOCK(vp, LK_SHARED | LK_RETRY);
 480                 }
 481                 VOP_UNSET_TEXT_CHECKED(vp);
 482         }
 483         if (locked)
 484                 VOP_UNLOCK(vp);
 485 /* NOTE(review): no vrele()/vput() of vp is visible in this function -- confirm the namei() reference is released. */
 486         /* Release the temporary mapping. */
 487         if (a_out)
 488                 kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE);
 489
 490         return (error);
 491 }
 492
 493 #endif  /* __i386__ */
 494
 495 #ifdef LINUX_LEGACY_SYSCALLS
 496 int
 497 linux_select(struct thread *td, struct linux_select_args *args)
 498 {
 499         l_timeval ltv;
 500         struct timeval tv0, tv1, utv, *tvp;
 501         int error;
 502
 503         /*
 504          * Store current time for computation of the amount of
 505          * time left.
 506          */
 507         if (args->timeout) {
 508                 if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
 509                         goto select_out;
 510                 utv.tv_sec = ltv.tv_sec;
 511                 utv.tv_usec = ltv.tv_usec;
 512
 513                 if (itimerfix(&utv)) {
 514                         /*
 515                          * The timeval was invalid.  Convert it to something
 516                          * valid that will act as it does under Linux.
 517                          */
 518                         utv.tv_sec += utv.tv_usec / 1000000;
 519                         utv.tv_usec %= 1000000;
 520                         if (utv.tv_usec < 0) {
 521                                 utv.tv_sec -= 1;
 522                                 utv.tv_usec += 1000000;
 523                         }
 524                         if (utv.tv_sec < 0)
 525                                 timevalclear(&utv);
 526                 }
 527                 microtime(&tv0);
 528                 tvp = &utv;
 529         } else
 530                 tvp = NULL;
 531
 532         error = kern_select(td, args->nfds, args->readfds, args->writefds,
 533             args->exceptfds, tvp, LINUX_NFDBITS);
 534         if (error)
 535                 goto select_out;
 536
 537         if (args->timeout) {
 538                 if (td->td_retval[0]) {
 539                         /*
 540                          * Compute how much time was left of the timeout,
 541                          * by subtracting the current time and the time
 542                          * before we started the call, and subtracting
 543                          * that result from the user-supplied value.
 544                          */
 545                         microtime(&tv1);
 546                         timevalsub(&tv1, &tv0);
 547                         timevalsub(&utv, &tv1);
 548                         if (utv.tv_sec < 0)
 549                                 timevalclear(&utv);
 550                 } else
 551                         timevalclear(&utv);
 552                 ltv.tv_sec = utv.tv_sec;
 553                 ltv.tv_usec = utv.tv_usec;
 554                 if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
 555                         goto select_out;
 556         }
 557
 558 select_out:
 559         return (error);
 560 }
 561 #endif
 562
 563 int
 564 linux_mremap(struct thread *td, struct linux_mremap_args *args)
 565 {
 566         uintptr_t addr;
 567         size_t len;
 568         int error = 0;
 569
 570         if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
 571                 td->td_retval[0] = 0;
 572                 return (EINVAL);
 573         }
 574
 575         /*
 576          * Check for the page alignment.
 577          * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
 578          */
 579         if (args->addr & PAGE_MASK) {
 580                 td->td_retval[0] = 0;
 581                 return (EINVAL);
 582         }
 583
 584         args->new_len = round_page(args->new_len);
 585         args->old_len = round_page(args->old_len);
 586
 587         if (args->new_len > args->old_len) {
 588                 td->td_retval[0] = 0;
 589                 return (ENOMEM);
 590         }
 591
 592         if (args->new_len < args->old_len) {
 593                 addr = args->addr + args->new_len;
 594                 len = args->old_len - args->new_len;
 595                 error = kern_munmap(td, addr, len);
 596         }
 597
 598         td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
 599         return (error);
 600 }
 601
 602 #define LINUX_MS_ASYNC       0x0001
 603 #define LINUX_MS_INVALIDATE  0x0002
 604 #define LINUX_MS_SYNC        0x0004
 605
 606 int
 607 linux_msync(struct thread *td, struct linux_msync_args *args)
 608 {
 609
 610         return (kern_msync(td, args->addr, args->len,
 611             args->fl & ~LINUX_MS_SYNC));
 612 }
 613
 614 #ifdef LINUX_LEGACY_SYSCALLS
 615 int
 616 linux_time(struct thread *td, struct linux_time_args *args)
 617 {
 618         struct timeval tv;
 619         l_time_t tm;
 620         int error;
 621
 622         microtime(&tv);
 623         tm = tv.tv_sec;
 624         if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
 625                 return (error);
 626         td->td_retval[0] = tm;
 627         return (0);
 628 }
 629 #endif
 630
 631 struct l_times_argv {
 632         l_clock_t       tms_utime;
 633         l_clock_t       tms_stime;
 634         l_clock_t       tms_cutime;
 635         l_clock_t       tms_cstime;
 636 };
 637
 638
 639 /*
 640  * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value.
 641  * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK
 642  * auxiliary vector entry.
 643  */
 644 #define CLK_TCK         100
 645
 646 #define CONVOTCK(r)     (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK))
 647 #define CONVNTCK(r)     (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz))
 648
 649 #define CONVTCK(r)      (linux_kernver(td) >= LINUX_KERNVER_2004000 ?           \
 650                             CONVNTCK(r) : CONVOTCK(r))
 651
 652 int
 653 linux_times(struct thread *td, struct linux_times_args *args)
 654 {
 655         struct timeval tv, utime, stime, cutime, cstime;
 656         struct l_times_argv tms;
 657         struct proc *p;
 658         int error;
 659
 660         if (args->buf != NULL) {
 661                 p = td->td_proc;
 662                 PROC_LOCK(p);
 663                 PROC_STATLOCK(p);
 664                 calcru(p, &utime, &stime);
 665                 PROC_STATUNLOCK(p);
 666                 calccru(p, &cutime, &cstime);
 667                 PROC_UNLOCK(p);
 668
 669                 tms.tms_utime = CONVTCK(utime);
 670                 tms.tms_stime = CONVTCK(stime);
 671
 672                 tms.tms_cutime = CONVTCK(cutime);
 673                 tms.tms_cstime = CONVTCK(cstime);
 674
 675                 if ((error = copyout(&tms, args->buf, sizeof(tms))))
 676                         return (error);
 677         }
 678
 679         microuptime(&tv);
 680         td->td_retval[0] = (int)CONVTCK(tv);
 681         return (0);
 682 }
 683
 684 int
 685 linux_newuname(struct thread *td, struct linux_newuname_args *args)
 686 {
 687         struct l_new_utsname utsname;
 688         char osname[LINUX_MAX_UTSNAME];
 689         char osrelease[LINUX_MAX_UTSNAME];
 690         char *p;
 691
 692         linux_get_osname(td, osname);
 693         linux_get_osrelease(td, osrelease);
 694
 695         bzero(&utsname, sizeof(utsname));
 696         strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
 697         getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
 698         getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
 699         strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
 700         strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
 701         for (p = utsname.version; *p != '\0'; ++p)
 702                 if (*p == '\n') {
 703                         *p = '\0';
 704                         break;
 705                 }
 706         strlcpy(utsname.machine, linux_kplatform, LINUX_MAX_UTSNAME);
 707
 708         return (copyout(&utsname, args->buf, sizeof(utsname)));
 709 }
 710
 711 struct l_utimbuf {
 712         l_time_t l_actime;
 713         l_time_t l_modtime;
 714 };
 715
 716 #ifdef LINUX_LEGACY_SYSCALLS
 717 int
 718 linux_utime(struct thread *td, struct linux_utime_args *args)
 719 {
 720         struct timeval tv[2], *tvp;
 721         struct l_utimbuf lut;
 722         char *fname;
 723         int error;
 724
 725         LCONVPATHEXIST(td, args->fname, &fname);
 726
 727         if (args->times) {
 728                 if ((error = copyin(args->times, &lut, sizeof lut))) {
 729                         LFREEPATH(fname);
 730                         return (error);
 731                 }
 732                 tv[0].tv_sec = lut.l_actime;
 733                 tv[0].tv_usec = 0;
 734                 tv[1].tv_sec = lut.l_modtime;
 735                 tv[1].tv_usec = 0;
 736                 tvp = tv;
 737         } else
 738                 tvp = NULL;
 739
 740         error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp,
 741             UIO_SYSSPACE);
 742         LFREEPATH(fname);
 743         return (error);
 744 }
 745 #endif
 746
 747 #ifdef LINUX_LEGACY_SYSCALLS
 748 int
 749 linux_utimes(struct thread *td, struct linux_utimes_args *args)
 750 {
 751         l_timeval ltv[2];
 752         struct timeval tv[2], *tvp = NULL;
 753         char *fname;
 754         int error;
 755
 756         LCONVPATHEXIST(td, args->fname, &fname);
 757
 758         if (args->tptr != NULL) {
 759                 if ((error = copyin(args->tptr, ltv, sizeof ltv))) {
 760                         LFREEPATH(fname);
 761                         return (error);
 762                 }
 763                 tv[0].tv_sec = ltv[0].tv_sec;
 764                 tv[0].tv_usec = ltv[0].tv_usec;
 765                 tv[1].tv_sec = ltv[1].tv_sec;
 766                 tv[1].tv_usec = ltv[1].tv_usec;
 767                 tvp = tv;
 768         }
 769
 770         error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE,
 771             tvp, UIO_SYSSPACE);
 772         LFREEPATH(fname);
 773         return (error);
 774 }
 775 #endif
 776
 777 static int
 778 linux_utimensat_nsec_valid(l_long nsec)
 779 {
 780
 781         if (nsec == LINUX_UTIME_OMIT || nsec == LINUX_UTIME_NOW)
 782                 return (0);
 783         if (nsec >= 0 && nsec <= 999999999)
 784                 return (0);
 785         return (1);
 786 }
 787
 788 int
 789 linux_utimensat(struct thread *td, struct linux_utimensat_args *args)
 790 {
 791         struct l_timespec l_times[2];
 792         struct timespec times[2], *timesp = NULL;
 793         char *path = NULL;
 794         int error, dfd, flags = 0;
 795
 796         dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 797
 798         if (args->flags & ~LINUX_AT_SYMLINK_NOFOLLOW)
 799                 return (EINVAL);
 800
 801         if (args->times != NULL) {
 802                 error = copyin(args->times, l_times, sizeof(l_times));
 803                 if (error != 0)
 804                         return (error);
 805
 806                 if (linux_utimensat_nsec_valid(l_times[0].tv_nsec) != 0 ||
 807                     linux_utimensat_nsec_valid(l_times[1].tv_nsec) != 0)
 808                         return (EINVAL);
 809
 810                 times[0].tv_sec = l_times[0].tv_sec;
 811                 switch (l_times[0].tv_nsec)
 812                 {
 813                 case LINUX_UTIME_OMIT:
 814                         times[0].tv_nsec = UTIME_OMIT;
 815                         break;
 816                 case LINUX_UTIME_NOW:
 817                         times[0].tv_nsec = UTIME_NOW;
 818                         break;
 819                 default:
 820                         times[0].tv_nsec = l_times[0].tv_nsec;
 821                 }
 822
 823                 times[1].tv_sec = l_times[1].tv_sec;
 824                 switch (l_times[1].tv_nsec)
 825                 {
 826                 case LINUX_UTIME_OMIT:
 827                         times[1].tv_nsec = UTIME_OMIT;
 828                         break;
 829                 case LINUX_UTIME_NOW:
 830                         times[1].tv_nsec = UTIME_NOW;
 831                         break;
 832                 default:
 833                         times[1].tv_nsec = l_times[1].tv_nsec;
 834                         break;
 835                 }
 836                 timesp = times;
 837
 838                 /* This breaks POSIX, but is what the Linux kernel does
 839                  * _on purpose_ (documented in the man page for utimensat(2)),
 840                  * so we must follow that behaviour. */
 841                 if (times[0].tv_nsec == UTIME_OMIT &&
 842                     times[1].tv_nsec == UTIME_OMIT)
 843                         return (0);
 844         }
 845
 846         if (args->pathname != NULL)
 847                 LCONVPATHEXIST_AT(td, args->pathname, &path, dfd);
 848         else if (args->flags != 0)
 849                 return (EINVAL);
 850
 851         if (args->flags & LINUX_AT_SYMLINK_NOFOLLOW)
 852                 flags |= AT_SYMLINK_NOFOLLOW;
 853
 854         if (path == NULL)
 855                 error = kern_futimens(td, dfd, timesp, UIO_SYSSPACE);
 856         else {
 857                 error = kern_utimensat(td, dfd, path, UIO_SYSSPACE, timesp,
 858                         UIO_SYSSPACE, flags);
 859                 LFREEPATH(path);
 860         }
 861
 862         return (error);
 863 }
 864
 865 #ifdef LINUX_LEGACY_SYSCALLS
 866 int
 867 linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
 868 {
 869         l_timeval ltv[2];
 870         struct timeval tv[2], *tvp = NULL;
 871         char *fname;
 872         int error, dfd;
 873
 874         dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 875         LCONVPATHEXIST_AT(td, args->filename, &fname, dfd);
 876
 877         if (args->utimes != NULL) {
 878                 if ((error = copyin(args->utimes, ltv, sizeof ltv))) {
 879                         LFREEPATH(fname);
 880                         return (error);
 881                 }
 882                 tv[0].tv_sec = ltv[0].tv_sec;
 883                 tv[0].tv_usec = ltv[0].tv_usec;
 884                 tv[1].tv_sec = ltv[1].tv_sec;
 885                 tv[1].tv_usec = ltv[1].tv_usec;
 886                 tvp = tv;
 887         }
 888
 889         error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
 890         LFREEPATH(fname);
 891         return (error);
 892 }
 893 #endif
 894
 895 static int
 896 linux_common_wait(struct thread *td, int pid, int *statusp,
 897     int options, struct __wrusage *wrup)
 898 {
 899         siginfo_t siginfo;
 900         idtype_t idtype;
 901         id_t id;
 902         int error, status, tmpstat;
 903
 904         if (pid == WAIT_ANY) {
 905                 idtype = P_ALL;
 906                 id = 0;
 907         } else if (pid < 0) {
 908                 idtype = P_PGID;
 909                 id = (id_t)-pid;
 910         } else {
 911                 idtype = P_PID;
 912                 id = (id_t)pid;
 913         }
 914
 915         /*
 916          * For backward compatibility we implicitly add flags WEXITED
 917          * and WTRAPPED here.
 918          */
 919         options |= WEXITED | WTRAPPED;
 920         error = kern_wait6(td, idtype, id, &status, options, wrup, &siginfo);
 921         if (error)
 922                 return (error);
 923
 924         if (statusp) {
 925                 tmpstat = status & 0xffff;
 926                 if (WIFSIGNALED(tmpstat)) {
 927                         tmpstat = (tmpstat & 0xffffff80) |
 928                             bsd_to_linux_signal(WTERMSIG(tmpstat));
 929                 } else if (WIFSTOPPED(tmpstat)) {
 930                         tmpstat = (tmpstat & 0xffff00ff) |
 931                             (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8);
 932 #if defined(__amd64__) && !defined(COMPAT_LINUX32)
 933                         if (WSTOPSIG(status) == SIGTRAP) {
 934                                 tmpstat = linux_ptrace_status(td,
 935                                     siginfo.si_pid, tmpstat);
 936                         }
 937 #endif
 938                 } else if (WIFCONTINUED(tmpstat)) {
 939                         tmpstat = 0xffff;
 940                 }
 941                 error = copyout(&tmpstat, statusp, sizeof(int));
 942         }
 943
 944         return (error);
 945 }
 946
 947 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 948 int
 949 linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
 950 {
 951         struct linux_wait4_args wait4_args;
 952
 953         wait4_args.pid = args->pid;
 954         wait4_args.status = args->status;
 955         wait4_args.options = args->options;
 956         wait4_args.rusage = NULL;
 957
 958         return (linux_wait4(td, &wait4_args));
 959 }
 960 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 961
 962 int
 963 linux_wait4(struct thread *td, struct linux_wait4_args *args)
 964 {
 965         int error, options;
 966         struct __wrusage wru, *wrup;
 967
 968         if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG |
 969             LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
 970                 return (EINVAL);
 971
 972         options = WEXITED;
 973         linux_to_bsd_waitopts(args->options, &options);
 974
 975         if (args->rusage != NULL)
 976                 wrup = &wru;
 977         else
 978                 wrup = NULL;
 979         error = linux_common_wait(td, args->pid, args->status, options, wrup);
 980         if (error != 0)
 981                 return (error);
 982         if (args->rusage != NULL)
 983                 error = linux_copyout_rusage(&wru.wru_self, args->rusage);
 984         return (error);
 985 }
 986
 987 int
 988 linux_waitid(struct thread *td, struct linux_waitid_args *args)
 989 {
 990         int status, options, sig;
 991         struct __wrusage wru;
 992         siginfo_t siginfo;
 993         l_siginfo_t lsi;
 994         idtype_t idtype;
 995         struct proc *p;
 996         int error;
 997
 998         options = 0;
 999         linux_to_bsd_waitopts(args->options, &options);
1000
1001         if (options & ~(WNOHANG | WNOWAIT | WEXITED | WUNTRACED | WCONTINUED))
1002                 return (EINVAL);
1003         if (!(options & (WEXITED | WUNTRACED | WCONTINUED)))
1004                 return (EINVAL);
1005
1006         switch (args->idtype) {
1007         case LINUX_P_ALL:
1008                 idtype = P_ALL;
1009                 break;
1010         case LINUX_P_PID:
1011                 if (args->id <= 0)
1012                         return (EINVAL);
1013                 idtype = P_PID;
1014                 break;
1015         case LINUX_P_PGID:
1016                 if (args->id <= 0)
1017                         return (EINVAL);
1018                 idtype = P_PGID;
1019                 break;
1020         default:
1021                 return (EINVAL);
1022         }
1023
1024         error = kern_wait6(td, idtype, args->id, &status, options,
1025             &wru, &siginfo);
1026         if (error != 0)
1027                 return (error);
1028         if (args->rusage != NULL) {
1029                 error = linux_copyout_rusage(&wru.wru_children,
1030                     args->rusage);
1031                 if (error != 0)
1032                         return (error);
1033         }
1034         if (args->info != NULL) {
1035                 p = td->td_proc;
1036                 bzero(&lsi, sizeof(lsi));
1037                 if (td->td_retval[0] != 0) {
1038                         sig = bsd_to_linux_signal(siginfo.si_signo);
1039                         siginfo_to_lsiginfo(&siginfo, &lsi, sig);
1040                 }
1041                 error = copyout(&lsi, args->info, sizeof(lsi));
1042         }
1043         td->td_retval[0] = 0;
1044
1045         return (error);
1046 }
1047
1048 #ifdef LINUX_LEGACY_SYSCALLS
1049 int
1050 linux_mknod(struct thread *td, struct linux_mknod_args *args)
1051 {
1052         char *path;
1053         int error;
1054
1055         LCONVPATHCREAT(td, args->path, &path);
1056
1057         switch (args->mode & S_IFMT) {
1058         case S_IFIFO:
1059         case S_IFSOCK:
1060                 error = kern_mkfifoat(td, AT_FDCWD, path, UIO_SYSSPACE,
1061                     args->mode);
1062                 break;
1063
1064         case S_IFCHR:
1065         case S_IFBLK:
1066                 error = kern_mknodat(td, AT_FDCWD, path, UIO_SYSSPACE,
1067                     args->mode, args->dev);
1068                 break;
1069
1070         case S_IFDIR:
1071                 error = EPERM;
1072                 break;
1073
1074         case 0:
1075                 args->mode |= S_IFREG;
1076                 /* FALLTHROUGH */
1077         case S_IFREG:
1078                 error = kern_openat(td, AT_FDCWD, path, UIO_SYSSPACE,
1079                     O_WRONLY | O_CREAT | O_TRUNC, args->mode);
1080                 if (error == 0)
1081                         kern_close(td, td->td_retval[0]);
1082                 break;
1083
1084         default:
1085                 error = EINVAL;
1086                 break;
1087         }
1088         LFREEPATH(path);
1089         return (error);
1090 }
1091 #endif
1092
1093 int
1094 linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
1095 {
1096         char *path;
1097         int error, dfd;
1098
1099         dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
1100         LCONVPATHCREAT_AT(td, args->filename, &path, dfd);
1101
1102         switch (args->mode & S_IFMT) {
1103         case S_IFIFO:
1104         case S_IFSOCK:
1105                 error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode);
1106                 break;
1107
1108         case S_IFCHR:
1109         case S_IFBLK:
1110                 error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode,
1111                     args->dev);
1112                 break;
1113
1114         case S_IFDIR:
1115                 error = EPERM;
1116                 break;
1117
1118         case 0:
1119                 args->mode |= S_IFREG;
1120                 /* FALLTHROUGH */
1121         case S_IFREG:
1122                 error = kern_openat(td, dfd, path, UIO_SYSSPACE,
1123                     O_WRONLY | O_CREAT | O_TRUNC, args->mode);
1124                 if (error == 0)
1125                         kern_close(td, td->td_retval[0]);
1126                 break;
1127
1128         default:
1129                 error = EINVAL;
1130                 break;
1131         }
1132         LFREEPATH(path);
1133         return (error);
1134 }
1135
1136 /*
1137  * UGH! This is just about the dumbest idea I've ever heard!!
1138  */
1139 int
1140 linux_personality(struct thread *td, struct linux_personality_args *args)
1141 {
1142         struct linux_pemuldata *pem;
1143         struct proc *p = td->td_proc;
1144         uint32_t old;
1145
1146         PROC_LOCK(p);
1147         pem = pem_find(p);
1148         old = pem->persona;
1149         if (args->per != 0xffffffff)
1150                 pem->persona = args->per;
1151         PROC_UNLOCK(p);
1152
1153         td->td_retval[0] = old;
1154         return (0);
1155 }
1156
/*
 * Linux-side layout of an interval timer value, built from Linux
 * timevals.  Used as the copyin/copyout buffer by linux_setitimer()
 * and linux_getitimer().
 */
struct l_itimerval {
        l_timeval it_interval;  /* copied to/from itimerval.it_interval */
        l_timeval it_value;     /* copied to/from itimerval.it_value */
};
1161
/*
 * Copy an interval timer value member by member from *lip (source) into
 * *bip (destination).  Both arguments only need it_interval/it_value
 * members that carry tv_sec/tv_usec, so the macro converts in either
 * direction between the native and the Linux representation.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed
 * by a semicolon behaves as a single statement, e.g. in an unbraced
 * if/else.  Each argument is evaluated more than once.
 */
#define B2L_ITIMERVAL(bip, lip) do {                                    \
        (bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec;          \
        (bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec;        \
        (bip)->it_value.tv_sec = (lip)->it_value.tv_sec;                \
        (bip)->it_value.tv_usec = (lip)->it_value.tv_usec;              \
} while (0)
1167
1168 int
1169 linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
1170 {
1171         int error;
1172         struct l_itimerval ls;
1173         struct itimerval aitv, oitv;
1174
1175         if (uap->itv == NULL) {
1176                 uap->itv = uap->oitv;
1177                 return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
1178         }
1179
1180         error = copyin(uap->itv, &ls, sizeof(ls));
1181         if (error != 0)
1182                 return (error);
1183         B2L_ITIMERVAL(&aitv, &ls);
1184         error = kern_setitimer(td, uap->which, &aitv, &oitv);
1185         if (error != 0 || uap->oitv == NULL)
1186                 return (error);
1187         B2L_ITIMERVAL(&ls, &oitv);
1188
1189         return (copyout(&ls, uap->oitv, sizeof(ls)));
1190 }
1191
1192 int
1193 linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
1194 {
1195         int error;
1196         struct l_itimerval ls;
1197         struct itimerval aitv;
1198
1199         error = kern_getitimer(td, uap->which, &aitv);
1200         if (error != 0)
1201                 return (error);
1202         B2L_ITIMERVAL(&ls, &aitv);
1203         return (copyout(&ls, uap->itv, sizeof(ls)));
1204 }
1205
1206 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1207 int
1208 linux_nice(struct thread *td, struct linux_nice_args *args)
1209 {
1210
1211         return (kern_setpriority(td, PRIO_PROCESS, 0, args->inc));
1212 }
1213 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1214
1215 int
1216 linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
1217 {
1218         struct ucred *newcred, *oldcred;
1219         l_gid_t *linux_gidset;
1220         gid_t *bsd_gidset;
1221         int ngrp, error;
1222         struct proc *p;
1223
1224         ngrp = args->gidsetsize;
1225         if (ngrp < 0 || ngrp >= ngroups_max + 1)
1226                 return (EINVAL);
1227         linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK);
1228         error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
1229         if (error)
1230                 goto out;
1231         newcred = crget();
1232         crextend(newcred, ngrp + 1);
1233         p = td->td_proc;
1234         PROC_LOCK(p);
1235         oldcred = p->p_ucred;
1236         crcopy(newcred, oldcred);
1237
1238         /*
1239          * cr_groups[0] holds egid. Setting the whole set from
1240          * the supplied set will cause egid to be changed too.
1241          * Keep cr_groups[0] unchanged to prevent that.
1242          */
1243
1244         if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS)) != 0) {
1245                 PROC_UNLOCK(p);
1246                 crfree(newcred);
1247                 goto out;
1248         }
1249
1250         if (ngrp > 0) {
1251                 newcred->cr_ngroups = ngrp + 1;
1252
1253                 bsd_gidset = newcred->cr_groups;
1254                 ngrp--;
1255                 while (ngrp >= 0) {
1256                         bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
1257                         ngrp--;
1258                 }
1259         } else
1260                 newcred->cr_ngroups = 1;
1261
1262         setsugid(p);
1263         proc_set_cred(p, newcred);
1264         PROC_UNLOCK(p);
1265         crfree(oldcred);
1266         error = 0;
1267 out:
1268         free(linux_gidset, M_LINUX);
1269         return (error);
1270 }
1271
1272 int
1273 linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
1274 {
1275         struct ucred *cred;
1276         l_gid_t *linux_gidset;
1277         gid_t *bsd_gidset;
1278         int bsd_gidsetsz, ngrp, error;
1279
1280         cred = td->td_ucred;
1281         bsd_gidset = cred->cr_groups;
1282         bsd_gidsetsz = cred->cr_ngroups - 1;
1283
1284         /*
1285          * cr_groups[0] holds egid. Returning the whole set
1286          * here will cause a duplicate. Exclude cr_groups[0]
1287          * to prevent that.
1288          */
1289
1290         if ((ngrp = args->gidsetsize) == 0) {
1291                 td->td_retval[0] = bsd_gidsetsz;
1292                 return (0);
1293         }
1294
1295         if (ngrp < bsd_gidsetsz)
1296                 return (EINVAL);
1297
1298         ngrp = 0;
1299         linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
1300             M_LINUX, M_WAITOK);
1301         while (ngrp < bsd_gidsetsz) {
1302                 linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
1303                 ngrp++;
1304         }
1305
1306         error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
1307         free(linux_gidset, M_LINUX);
1308         if (error)
1309                 return (error);
1310
1311         td->td_retval[0] = ngrp;
1312         return (0);
1313 }
1314
1315 int
1316 linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
1317 {
1318         struct rlimit bsd_rlim;
1319         struct l_rlimit rlim;
1320         u_int which;
1321         int error;
1322
1323         if (args->resource >= LINUX_RLIM_NLIMITS)
1324                 return (EINVAL);
1325
1326         which = linux_to_bsd_resource[args->resource];
1327         if (which == -1)
1328                 return (EINVAL);
1329
1330         error = copyin(args->rlim, &rlim, sizeof(rlim));
1331         if (error)
1332                 return (error);
1333
1334         bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
1335         bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
1336         return (kern_setrlimit(td, which, &bsd_rlim));
1337 }
1338
1339 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1340 int
1341 linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
1342 {
1343         struct l_rlimit rlim;
1344         struct rlimit bsd_rlim;
1345         u_int which;
1346
1347         if (args->resource >= LINUX_RLIM_NLIMITS)
1348                 return (EINVAL);
1349
1350         which = linux_to_bsd_resource[args->resource];
1351         if (which == -1)
1352                 return (EINVAL);
1353
1354         lim_rlimit(td, which, &bsd_rlim);
1355
1356 #ifdef COMPAT_LINUX32
1357         rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
1358         if (rlim.rlim_cur == UINT_MAX)
1359                 rlim.rlim_cur = INT_MAX;
1360         rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
1361         if (rlim.rlim_max == UINT_MAX)
1362                 rlim.rlim_max = INT_MAX;
1363 #else
1364         rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
1365         if (rlim.rlim_cur == ULONG_MAX)
1366                 rlim.rlim_cur = LONG_MAX;
1367         rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
1368         if (rlim.rlim_max == ULONG_MAX)
1369                 rlim.rlim_max = LONG_MAX;
1370 #endif
1371         return (copyout(&rlim, args->rlim, sizeof(rlim)));
1372 }
1373 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1374
1375 int
1376 linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
1377 {
1378         struct l_rlimit rlim;
1379         struct rlimit bsd_rlim;
1380         u_int which;
1381
1382         if (args->resource >= LINUX_RLIM_NLIMITS)
1383                 return (EINVAL);
1384
1385         which = linux_to_bsd_resource[args->resource];
1386         if (which == -1)
1387                 return (EINVAL);
1388
1389         lim_rlimit(td, which, &bsd_rlim);
1390
1391         rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
1392         rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
1393         return (copyout(&rlim, args->rlim, sizeof(rlim)));
1394 }
1395
1396 int
1397 linux_sched_setscheduler(struct thread *td,
1398     struct linux_sched_setscheduler_args *args)
1399 {
1400         struct sched_param sched_param;
1401         struct thread *tdt;
1402         int error, policy;
1403
1404         switch (args->policy) {
1405         case LINUX_SCHED_OTHER:
1406                 policy = SCHED_OTHER;
1407                 break;
1408         case LINUX_SCHED_FIFO:
1409                 policy = SCHED_FIFO;
1410                 break;
1411         case LINUX_SCHED_RR:
1412                 policy = SCHED_RR;
1413                 break;
1414         default:
1415                 return (EINVAL);
1416         }
1417
1418         error = copyin(args->param, &sched_param, sizeof(sched_param));
1419         if (error)
1420                 return (error);
1421
1422         if (linux_map_sched_prio) {
1423                 switch (policy) {
1424                 case SCHED_OTHER:
1425                         if (sched_param.sched_priority != 0)
1426                                 return (EINVAL);
1427
1428                         sched_param.sched_priority =
1429                             PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
1430                         break;
1431                 case SCHED_FIFO:
1432                 case SCHED_RR:
1433                         if (sched_param.sched_priority < 1 ||
1434                             sched_param.sched_priority >= LINUX_MAX_RT_PRIO)
1435                                 return (EINVAL);
1436
1437                         /*
1438                          * Map [1, LINUX_MAX_RT_PRIO - 1] to
1439                          * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
1440                          */
1441                         sched_param.sched_priority =
1442                             (sched_param.sched_priority - 1) *
1443                             (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
1444                             (LINUX_MAX_RT_PRIO - 1);
1445                         break;
1446                 }
1447         }
1448
1449         tdt = linux_tdfind(td, args->pid, -1);
1450         if (tdt == NULL)
1451                 return (ESRCH);
1452
1453         error = kern_sched_setscheduler(td, tdt, policy, &sched_param);
1454         PROC_UNLOCK(tdt->td_proc);
1455         return (error);
1456 }
1457
1458 int
1459 linux_sched_getscheduler(struct thread *td,
1460     struct linux_sched_getscheduler_args *args)
1461 {
1462         struct thread *tdt;
1463         int error, policy;
1464
1465         tdt = linux_tdfind(td, args->pid, -1);
1466         if (tdt == NULL)
1467                 return (ESRCH);
1468
1469         error = kern_sched_getscheduler(td, tdt, &policy);
1470         PROC_UNLOCK(tdt->td_proc);
1471
1472         switch (policy) {
1473         case SCHED_OTHER:
1474                 td->td_retval[0] = LINUX_SCHED_OTHER;
1475                 break;
1476         case SCHED_FIFO:
1477                 td->td_retval[0] = LINUX_SCHED_FIFO;
1478                 break;
1479         case SCHED_RR:
1480                 td->td_retval[0] = LINUX_SCHED_RR;
1481                 break;
1482         }
1483         return (error);
1484 }
1485
1486 int
1487 linux_sched_get_priority_max(struct thread *td,
1488     struct linux_sched_get_priority_max_args *args)
1489 {
1490         struct sched_get_priority_max_args bsd;
1491
1492         if (linux_map_sched_prio) {
1493                 switch (args->policy) {
1494                 case LINUX_SCHED_OTHER:
1495                         td->td_retval[0] = 0;
1496                         return (0);
1497                 case LINUX_SCHED_FIFO:
1498                 case LINUX_SCHED_RR:
1499                         td->td_retval[0] = LINUX_MAX_RT_PRIO - 1;
1500                         return (0);
1501                 default:
1502                         return (EINVAL);
1503                 }
1504         }
1505
1506         switch (args->policy) {
1507         case LINUX_SCHED_OTHER:
1508                 bsd.policy = SCHED_OTHER;
1509                 break;
1510         case LINUX_SCHED_FIFO:
1511                 bsd.policy = SCHED_FIFO;
1512                 break;
1513         case LINUX_SCHED_RR:
1514                 bsd.policy = SCHED_RR;
1515                 break;
1516         default:
1517                 return (EINVAL);
1518         }
1519         return (sys_sched_get_priority_max(td, &bsd));
1520 }
1521
1522 int
1523 linux_sched_get_priority_min(struct thread *td,
1524     struct linux_sched_get_priority_min_args *args)
1525 {
1526         struct sched_get_priority_min_args bsd;
1527
1528         if (linux_map_sched_prio) {
1529                 switch (args->policy) {
1530                 case LINUX_SCHED_OTHER:
1531                         td->td_retval[0] = 0;
1532                         return (0);
1533                 case LINUX_SCHED_FIFO:
1534                 case LINUX_SCHED_RR:
1535                         td->td_retval[0] = 1;
1536                         return (0);
1537                 default:
1538                         return (EINVAL);
1539                 }
1540         }
1541
1542         switch (args->policy) {
1543         case LINUX_SCHED_OTHER:
1544                 bsd.policy = SCHED_OTHER;
1545                 break;
1546         case LINUX_SCHED_FIFO:
1547                 bsd.policy = SCHED_FIFO;
1548                 break;
1549         case LINUX_SCHED_RR:
1550                 bsd.policy = SCHED_RR;
1551                 break;
1552         default:
1553                 return (EINVAL);
1554         }
1555         return (sys_sched_get_priority_min(td, &bsd));
1556 }
1557
/*
 * Argument values understood by linux_reboot().
 *
 * REBOOT_CAD_ON .. REBOOT_POWEROFF are the accepted 'cmd' values.
 * REBOOT_MAGIC1 is the only accepted 'magic1'; REBOOT_MAGIC2,
 * REBOOT_MAGIC2A and REBOOT_MAGIC2B are the accepted 'magic2' values.
 */
#define REBOOT_CAD_ON   0x89abcdef
#define REBOOT_CAD_OFF  0
#define REBOOT_HALT     0xcdef0123
#define REBOOT_RESTART  0x01234567
#define REBOOT_RESTART2 0xA1B2C3D4
#define REBOOT_POWEROFF 0x4321FEDC
#define REBOOT_MAGIC1   0xfee1dead
#define REBOOT_MAGIC2   0x28121969
#define REBOOT_MAGIC2A  0x05121996
#define REBOOT_MAGIC2B  0x16041998
1568
1569 int
1570 linux_reboot(struct thread *td, struct linux_reboot_args *args)
1571 {
1572         struct reboot_args bsd_args;
1573
1574         if (args->magic1 != REBOOT_MAGIC1)
1575                 return (EINVAL);
1576
1577         switch (args->magic2) {
1578         case REBOOT_MAGIC2:
1579         case REBOOT_MAGIC2A:
1580         case REBOOT_MAGIC2B:
1581                 break;
1582         default:
1583                 return (EINVAL);
1584         }
1585
1586         switch (args->cmd) {
1587         case REBOOT_CAD_ON:
1588         case REBOOT_CAD_OFF:
1589                 return (priv_check(td, PRIV_REBOOT));
1590         case REBOOT_HALT:
1591                 bsd_args.opt = RB_HALT;
1592                 break;
1593         case REBOOT_RESTART:
1594         case REBOOT_RESTART2:
1595                 bsd_args.opt = 0;
1596                 break;
1597         case REBOOT_POWEROFF:
1598                 bsd_args.opt = RB_POWEROFF;
1599                 break;
1600         default:
1601                 return (EINVAL);
1602         }
1603         return (sys_reboot(td, &bsd_args));
1604 }
1605
1606
1607 int
1608 linux_getpid(struct thread *td, struct linux_getpid_args *args)
1609 {
1610
1611         td->td_retval[0] = td->td_proc->p_pid;
1612
1613         return (0);
1614 }
1615
1616 int
1617 linux_gettid(struct thread *td, struct linux_gettid_args *args)
1618 {
1619         struct linux_emuldata *em;
1620
1621         em = em_find(td);
1622         KASSERT(em != NULL, ("gettid: emuldata not found.\n"));
1623
1624         td->td_retval[0] = em->em_tid;
1625
1626         return (0);
1627 }
1628
1629
1630 int
1631 linux_getppid(struct thread *td, struct linux_getppid_args *args)
1632 {
1633
1634         td->td_retval[0] = kern_getppid(td);
1635         return (0);
1636 }
1637
1638 int
1639 linux_getgid(struct thread *td, struct linux_getgid_args *args)
1640 {
1641
1642         td->td_retval[0] = td->td_ucred->cr_rgid;
1643         return (0);
1644 }
1645
1646 int
1647 linux_getuid(struct thread *td, struct linux_getuid_args *args)
1648 {
1649
1650         td->td_retval[0] = td->td_ucred->cr_ruid;
1651         return (0);
1652 }
1653
1654 int
1655 linux_getsid(struct thread *td, struct linux_getsid_args *args)
1656 {
1657
1658         return (kern_getsid(td, args->pid));
1659 }
1660
/*
 * Handler for unimplemented system calls: always fails with ENOSYS.
 * Neither argument is examined.
 */
int
linux_nosys(struct thread *td, struct nosys_args *ignore)
{

        (void)td;
        (void)ignore;
        return (ENOSYS);
}
1667
1668 int
1669 linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
1670 {
1671         int error;
1672
1673         error = kern_getpriority(td, args->which, args->who);
1674         td->td_retval[0] = 20 - td->td_retval[0];
1675         return (error);
1676 }
1677
1678 int
1679 linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
1680 {
1681         int name[2];
1682
1683         name[0] = CTL_KERN;
1684         name[1] = KERN_HOSTNAME;
1685         return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
1686             args->len, 0, 0));
1687 }
1688
1689 int
1690 linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
1691 {
1692         int name[2];
1693
1694         name[0] = CTL_KERN;
1695         name[1] = KERN_NISDOMAINNAME;
1696         return (userland_sysctl(td, name, 2, 0, 0, 0, args->name,
1697             args->len, 0, 0));
1698 }
1699
1700 int
1701 linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
1702 {
1703
1704         LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid,
1705             args->error_code);
1706
1707         /*
1708          * XXX: we should send a signal to the parent if
1709          * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
1710          * as it doesnt occur often.
1711          */
1712         exit1(td, args->error_code, 0);
1713                 /* NOTREACHED */
1714 }
1715
/*
 * Capability ABI versions accepted in l_user_cap_header.version by
 * linux_capget() and linux_capset().  Version 1 transfers one
 * l_user_cap_data element; versions 2 and 3 transfer two.
 */
#define _LINUX_CAPABILITY_VERSION_1  0x19980330
#define _LINUX_CAPABILITY_VERSION_2  0x20071026
#define _LINUX_CAPABILITY_VERSION_3  0x20080522
1719
/*
 * Header copied in from user space by linux_capget() and
 * linux_capset().
 */
struct l_user_cap_header {
        l_int   version;        /* one of _LINUX_CAPABILITY_VERSION_* */
        l_int   pid;            /* must be 0; anything else gets EPERM */
};
1724
/*
 * One element of capability data exchanged with user space by
 * linux_capget() and linux_capset().  linux_capget() always reports all
 * three sets as zero; linux_capset() rejects any non-zero set.
 */
struct l_user_cap_data {
        l_int   effective;
        l_int   permitted;
        l_int   inheritable;
};
1730
1731 int
1732 linux_capget(struct thread *td, struct linux_capget_args *uap)
1733 {
1734         struct l_user_cap_header luch;
1735         struct l_user_cap_data lucd[2];
1736         int error, u32s;
1737
1738         if (uap->hdrp == NULL)
1739                 return (EFAULT);
1740
1741         error = copyin(uap->hdrp, &luch, sizeof(luch));
1742         if (error != 0)
1743                 return (error);
1744
1745         switch (luch.version) {
1746         case _LINUX_CAPABILITY_VERSION_1:
1747                 u32s = 1;
1748                 break;
1749         case _LINUX_CAPABILITY_VERSION_2:
1750         case _LINUX_CAPABILITY_VERSION_3:
1751                 u32s = 2;
1752                 break;
1753         default:
1754                 luch.version = _LINUX_CAPABILITY_VERSION_1;
1755                 error = copyout(&luch, uap->hdrp, sizeof(luch));
1756                 if (error)
1757                         return (error);
1758                 return (EINVAL);
1759         }
1760
1761         if (luch.pid)
1762                 return (EPERM);
1763
1764         if (uap->datap) {
1765                 /*
1766                  * The current implementation doesn't support setting
1767                  * a capability (it's essentially a stub) so indicate
1768                  * that no capabilities are currently set or available
1769                  * to request.
1770                  */
1771                 memset(&lucd, 0, u32s * sizeof(lucd[0]));
1772                 error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0]));
1773         }
1774
1775         return (error);
1776 }
1777
1778 int
1779 linux_capset(struct thread *td, struct linux_capset_args *uap)
1780 {
1781         struct l_user_cap_header luch;
1782         struct l_user_cap_data lucd[2];
1783         int error, i, u32s;
1784
1785         if (uap->hdrp == NULL || uap->datap == NULL)
1786                 return (EFAULT);
1787
1788         error = copyin(uap->hdrp, &luch, sizeof(luch));
1789         if (error != 0)
1790                 return (error);
1791
1792         switch (luch.version) {
1793         case _LINUX_CAPABILITY_VERSION_1:
1794                 u32s = 1;
1795                 break;
1796         case _LINUX_CAPABILITY_VERSION_2:
1797         case _LINUX_CAPABILITY_VERSION_3:
1798                 u32s = 2;
1799                 break;
1800         default:
1801                 luch.version = _LINUX_CAPABILITY_VERSION_1;
1802                 error = copyout(&luch, uap->hdrp, sizeof(luch));
1803                 if (error)
1804                         return (error);
1805                 return (EINVAL);
1806         }
1807
1808         if (luch.pid)
1809                 return (EPERM);
1810
1811         error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0]));
1812         if (error != 0)
1813                 return (error);
1814
1815         /* We currently don't support setting any capabilities. */
1816         for (i = 0; i < u32s; i++) {
1817                 if (lucd[i].effective || lucd[i].permitted ||
1818                     lucd[i].inheritable) {
1819                         linux_msg(td,
1820                             "capset[%d] effective=0x%x, permitted=0x%x, "
1821                             "inheritable=0x%x is not implemented", i,
1822                             (int)lucd[i].effective, (int)lucd[i].permitted,
1823                             (int)lucd[i].inheritable);
1824                         return (EPERM);
1825                 }
1826         }
1827
1828         return (0);
1829 }
1830
1831 int
1832 linux_prctl(struct thread *td, struct linux_prctl_args *args)
1833 {
1834         int error = 0, max_size;
1835         struct proc *p = td->td_proc;
1836         char comm[LINUX_MAX_COMM_LEN];
1837         int pdeath_signal;
1838
1839         switch (args->option) {
1840         case LINUX_PR_SET_PDEATHSIG:
1841                 if (!LINUX_SIG_VALID(args->arg2))
1842                         return (EINVAL);
1843                 pdeath_signal = linux_to_bsd_signal(args->arg2);
1844                 return (kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_CTL,
1845                     &pdeath_signal));
1846         case LINUX_PR_GET_PDEATHSIG:
1847                 error = kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_STATUS,
1848                     &pdeath_signal);
1849                 if (error != 0)
1850                         return (error);
1851                 pdeath_signal = bsd_to_linux_signal(pdeath_signal);
1852                 return (copyout(&pdeath_signal,
1853                     (void *)(register_t)args->arg2,
1854                     sizeof(pdeath_signal)));
1855                 break;
1856         case LINUX_PR_GET_KEEPCAPS:
1857                 /*
1858                  * Indicate that we always clear the effective and
1859                  * permitted capability sets when the user id becomes
1860                  * non-zero (actually the capability sets are simply
1861                  * always zero in the current implementation).
1862                  */
1863                 td->td_retval[0] = 0;
1864                 break;
1865         case LINUX_PR_SET_KEEPCAPS:
1866                 /*
1867                  * Ignore requests to keep the effective and permitted
1868                  * capability sets when the user id becomes non-zero.
1869                  */
1870                 break;
1871         case LINUX_PR_SET_NAME:
1872                 /*
1873                  * To be on the safe side we need to make sure to not
1874                  * overflow the size a Linux program expects. We already
1875                  * do this here in the copyin, so that we don't need to
1876                  * check on copyout.
1877                  */
1878                 max_size = MIN(sizeof(comm), sizeof(p->p_comm));
1879                 error = copyinstr((void *)(register_t)args->arg2, comm,
1880                     max_size, NULL);
1881
1882                 /* Linux silently truncates the name if it is too long. */
1883                 if (error == ENAMETOOLONG) {
1884                         /*
1885                          * XXX: copyinstr() isn't documented to populate the
1886                          * array completely, so do a copyin() to be on the
1887                          * safe side. This should be changed in case
1888                          * copyinstr() is changed to guarantee this.
1889                          */
1890                         error = copyin((void *)(register_t)args->arg2, comm,
1891                             max_size - 1);
1892                         comm[max_size - 1] = '\0';
1893                 }
1894                 if (error)
1895                         return (error);
1896
1897                 PROC_LOCK(p);
1898                 strlcpy(p->p_comm, comm, sizeof(p->p_comm));
1899                 PROC_UNLOCK(p);
1900                 break;
1901         case LINUX_PR_GET_NAME:
1902                 PROC_LOCK(p);
1903                 strlcpy(comm, p->p_comm, sizeof(comm));
1904                 PROC_UNLOCK(p);
1905                 error = copyout(comm, (void *)(register_t)args->arg2,
1906                     strlen(comm) + 1);
1907                 break;
1908         default:
1909                 error = EINVAL;
1910                 break;
1911         }
1912
1913         return (error);
1914 }
1915
1916 int
1917 linux_sched_setparam(struct thread *td,
1918     struct linux_sched_setparam_args *uap)
1919 {
1920         struct sched_param sched_param;
1921         struct thread *tdt;
1922         int error, policy;
1923
1924         error = copyin(uap->param, &sched_param, sizeof(sched_param));
1925         if (error)
1926                 return (error);
1927
1928         tdt = linux_tdfind(td, uap->pid, -1);
1929         if (tdt == NULL)
1930                 return (ESRCH);
1931
1932         if (linux_map_sched_prio) {
1933                 error = kern_sched_getscheduler(td, tdt, &policy);
1934                 if (error)
1935                         goto out;
1936
1937                 switch (policy) {
1938                 case SCHED_OTHER:
1939                         if (sched_param.sched_priority != 0) {
1940                                 error = EINVAL;
1941                                 goto out;
1942                         }
1943                         sched_param.sched_priority =
1944                             PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
1945                         break;
1946                 case SCHED_FIFO:
1947                 case SCHED_RR:
1948                         if (sched_param.sched_priority < 1 ||
1949                             sched_param.sched_priority >= LINUX_MAX_RT_PRIO) {
1950                                 error = EINVAL;
1951                                 goto out;
1952                         }
1953                         /*
1954                          * Map [1, LINUX_MAX_RT_PRIO - 1] to
1955                          * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
1956                          */
1957                         sched_param.sched_priority =
1958                             (sched_param.sched_priority - 1) *
1959                             (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
1960                             (LINUX_MAX_RT_PRIO - 1);
1961                         break;
1962                 }
1963         }
1964
1965         error = kern_sched_setparam(td, tdt, &sched_param);
1966 out:    PROC_UNLOCK(tdt->td_proc);
1967         return (error);
1968 }
1969
1970 int
1971 linux_sched_getparam(struct thread *td,
1972     struct linux_sched_getparam_args *uap)
1973 {
1974         struct sched_param sched_param;
1975         struct thread *tdt;
1976         int error, policy;
1977
1978         tdt = linux_tdfind(td, uap->pid, -1);
1979         if (tdt == NULL)
1980                 return (ESRCH);
1981
1982         error = kern_sched_getparam(td, tdt, &sched_param);
1983         if (error) {
1984                 PROC_UNLOCK(tdt->td_proc);
1985                 return (error);
1986         }
1987
1988         if (linux_map_sched_prio) {
1989                 error = kern_sched_getscheduler(td, tdt, &policy);
1990                 PROC_UNLOCK(tdt->td_proc);
1991                 if (error)
1992                         return (error);
1993
1994                 switch (policy) {
1995                 case SCHED_OTHER:
1996                         sched_param.sched_priority = 0;
1997                         break;
1998                 case SCHED_FIFO:
1999                 case SCHED_RR:
2000                         /*
2001                          * Map [0, RTP_PRIO_MAX - RTP_PRIO_MIN] to
2002                          * [1, LINUX_MAX_RT_PRIO - 1] (rounding up).
2003                          */
2004                         sched_param.sched_priority =
2005                             (sched_param.sched_priority *
2006                             (LINUX_MAX_RT_PRIO - 1) +
2007                             (RTP_PRIO_MAX - RTP_PRIO_MIN - 1)) /
2008                             (RTP_PRIO_MAX - RTP_PRIO_MIN) + 1;
2009                         break;
2010                 }
2011         } else
2012                 PROC_UNLOCK(tdt->td_proc);
2013
2014         error = copyout(&sched_param, uap->param, sizeof(sched_param));
2015         return (error);
2016 }
2017
2018 /*
2019  * Get affinity of a process.
2020  */
2021 int
2022 linux_sched_getaffinity(struct thread *td,
2023     struct linux_sched_getaffinity_args *args)
2024 {
2025         int error;
2026         struct thread *tdt;
2027
2028         if (args->len < sizeof(cpuset_t))
2029                 return (EINVAL);
2030
2031         tdt = linux_tdfind(td, args->pid, -1);
2032         if (tdt == NULL)
2033                 return (ESRCH);
2034
2035         PROC_UNLOCK(tdt->td_proc);
2036
2037         error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
2038             tdt->td_tid, sizeof(cpuset_t), (cpuset_t *)args->user_mask_ptr);
2039         if (error == 0)
2040                 td->td_retval[0] = sizeof(cpuset_t);
2041
2042         return (error);
2043 }
2044
2045 /*
2046  *  Set affinity of a process.
2047  */
2048 int
2049 linux_sched_setaffinity(struct thread *td,
2050     struct linux_sched_setaffinity_args *args)
2051 {
2052         struct thread *tdt;
2053
2054         if (args->len < sizeof(cpuset_t))
2055                 return (EINVAL);
2056
2057         tdt = linux_tdfind(td, args->pid, -1);
2058         if (tdt == NULL)
2059                 return (ESRCH);
2060
2061         PROC_UNLOCK(tdt->td_proc);
2062
2063         return (kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
2064             tdt->td_tid, sizeof(cpuset_t), (cpuset_t *) args->user_mask_ptr));
2065 }
2066
2067 struct linux_rlimit64 {
2068         uint64_t        rlim_cur;
2069         uint64_t        rlim_max;
2070 };
2071
2072 int
2073 linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args)
2074 {
2075         struct rlimit rlim, nrlim;
2076         struct linux_rlimit64 lrlim;
2077         struct proc *p;
2078         u_int which;
2079         int flags;
2080         int error;
2081
2082         if (args->resource >= LINUX_RLIM_NLIMITS)
2083                 return (EINVAL);
2084
2085         which = linux_to_bsd_resource[args->resource];
2086         if (which == -1)
2087                 return (EINVAL);
2088
2089         if (args->new != NULL) {
2090                 /*
2091                  * Note. Unlike FreeBSD where rlim is signed 64-bit Linux
2092                  * rlim is unsigned 64-bit. FreeBSD treats negative limits
2093                  * as INFINITY so we do not need a conversion even.
2094                  */
2095                 error = copyin(args->new, &nrlim, sizeof(nrlim));
2096                 if (error != 0)
2097                         return (error);
2098         }
2099
2100         flags = PGET_HOLD | PGET_NOTWEXIT;
2101         if (args->new != NULL)
2102                 flags |= PGET_CANDEBUG;
2103         else
2104                 flags |= PGET_CANSEE;
2105         if (args->pid == 0) {
2106                 p = td->td_proc;
2107                 PHOLD(p);
2108         } else {
2109                 error = pget(args->pid, flags, &p);
2110                 if (error != 0)
2111                         return (error);
2112         }
2113         if (args->old != NULL) {
2114                 PROC_LOCK(p);
2115                 lim_rlimit_proc(p, which, &rlim);
2116                 PROC_UNLOCK(p);
2117                 if (rlim.rlim_cur == RLIM_INFINITY)
2118                         lrlim.rlim_cur = LINUX_RLIM_INFINITY;
2119                 else
2120                         lrlim.rlim_cur = rlim.rlim_cur;
2121                 if (rlim.rlim_max == RLIM_INFINITY)
2122                         lrlim.rlim_max = LINUX_RLIM_INFINITY;
2123                 else
2124                         lrlim.rlim_max = rlim.rlim_max;
2125                 error = copyout(&lrlim, args->old, sizeof(lrlim));
2126                 if (error != 0)
2127                         goto out;
2128         }
2129
2130         if (args->new != NULL)
2131                 error = kern_proc_setrlimit(td, p, which, &nrlim);
2132
2133  out:
2134         PRELE(p);
2135         return (error);
2136 }
2137
2138 int
2139 linux_pselect6(struct thread *td, struct linux_pselect6_args *args)
2140 {
2141         struct timeval utv, tv0, tv1, *tvp;
2142         struct l_pselect6arg lpse6;
2143         struct l_timespec lts;
2144         struct timespec uts;
2145         l_sigset_t l_ss;
2146         sigset_t *ssp;
2147         sigset_t ss;
2148         int error;
2149
2150         ssp = NULL;
2151         if (args->sig != NULL) {
2152                 error = copyin(args->sig, &lpse6, sizeof(lpse6));
2153                 if (error != 0)
2154                         return (error);
2155                 if (lpse6.ss_len != sizeof(l_ss))
2156                         return (EINVAL);
2157                 if (lpse6.ss != 0) {
2158                         error = copyin(PTRIN(lpse6.ss), &l_ss,
2159                             sizeof(l_ss));
2160                         if (error != 0)
2161                                 return (error);
2162                         linux_to_bsd_sigset(&l_ss, &ss);
2163                         ssp = &ss;
2164                 }
2165         }
2166
2167         /*
2168          * Currently glibc changes nanosecond number to microsecond.
2169          * This mean losing precision but for now it is hardly seen.
2170          */
2171         if (args->tsp != NULL) {
2172                 error = copyin(args->tsp, &lts, sizeof(lts));
2173                 if (error != 0)
2174                         return (error);
2175                 error = linux_to_native_timespec(&uts, &lts);
2176                 if (error != 0)
2177                         return (error);
2178
2179                 TIMESPEC_TO_TIMEVAL(&utv, &uts);
2180                 if (itimerfix(&utv))
2181                         return (EINVAL);
2182
2183                 microtime(&tv0);
2184                 tvp = &utv;
2185         } else
2186                 tvp = NULL;
2187
2188         error = kern_pselect(td, args->nfds, args->readfds, args->writefds,
2189             args->exceptfds, tvp, ssp, LINUX_NFDBITS);
2190
2191         if (error == 0 && args->tsp != NULL) {
2192                 if (td->td_retval[0] != 0) {
2193                         /*
2194                          * Compute how much time was left of the timeout,
2195                          * by subtracting the current time and the time
2196                          * before we started the call, and subtracting
2197                          * that result from the user-supplied value.
2198                          */
2199
2200                         microtime(&tv1);
2201                         timevalsub(&tv1, &tv0);
2202                         timevalsub(&utv, &tv1);
2203                         if (utv.tv_sec < 0)
2204                                 timevalclear(&utv);
2205                 } else
2206                         timevalclear(&utv);
2207
2208                 TIMEVAL_TO_TIMESPEC(&utv, &uts);
2209
2210                 error = native_to_linux_timespec(&lts, &uts);
2211                 if (error == 0)
2212                         error = copyout(&lts, args->tsp, sizeof(lts));
2213         }
2214
2215         return (error);
2216 }
2217
2218 int
2219 linux_ppoll(struct thread *td, struct linux_ppoll_args *args)
2220 {
2221         struct timespec ts0, ts1;
2222         struct l_timespec lts;
2223         struct timespec uts, *tsp;
2224         l_sigset_t l_ss;
2225         sigset_t *ssp;
2226         sigset_t ss;
2227         int error;
2228
2229         if (args->sset != NULL) {
2230                 if (args->ssize != sizeof(l_ss))
2231                         return (EINVAL);
2232                 error = copyin(args->sset, &l_ss, sizeof(l_ss));
2233                 if (error)
2234                         return (error);
2235                 linux_to_bsd_sigset(&l_ss, &ss);
2236                 ssp = &ss;
2237         } else
2238                 ssp = NULL;
2239         if (args->tsp != NULL) {
2240                 error = copyin(args->tsp, &lts, sizeof(lts));
2241                 if (error)
2242                         return (error);
2243                 error = linux_to_native_timespec(&uts, &lts);
2244                 if (error != 0)
2245                         return (error);
2246
2247                 nanotime(&ts0);
2248                 tsp = &uts;
2249         } else
2250                 tsp = NULL;
2251
2252         error = kern_poll(td, args->fds, args->nfds, tsp, ssp);
2253
2254         if (error == 0 && args->tsp != NULL) {
2255                 if (td->td_retval[0]) {
2256                         nanotime(&ts1);
2257                         timespecsub(&ts1, &ts0, &ts1);
2258                         timespecsub(&uts, &ts1, &uts);
2259                         if (uts.tv_sec < 0)
2260                                 timespecclear(&uts);
2261                 } else
2262                         timespecclear(&uts);
2263
2264                 error = native_to_linux_timespec(&lts, &uts);
2265                 if (error == 0)
2266                         error = copyout(&lts, args->tsp, sizeof(lts));
2267         }
2268
2269         return (error);
2270 }
2271
2272 int
2273 linux_sched_rr_get_interval(struct thread *td,
2274     struct linux_sched_rr_get_interval_args *uap)
2275 {
2276         struct timespec ts;
2277         struct l_timespec lts;
2278         struct thread *tdt;
2279         int error;
2280
2281         /*
2282          * According to man in case the invalid pid specified
2283          * EINVAL should be returned.
2284          */
2285         if (uap->pid < 0)
2286                 return (EINVAL);
2287
2288         tdt = linux_tdfind(td, uap->pid, -1);
2289         if (tdt == NULL)
2290                 return (ESRCH);
2291
2292         error = kern_sched_rr_get_interval_td(td, tdt, &ts);
2293         PROC_UNLOCK(tdt->td_proc);
2294         if (error != 0)
2295                 return (error);
2296         error = native_to_linux_timespec(&lts, &ts);
2297         if (error != 0)
2298                 return (error);
2299         return (copyout(&lts, uap->interval, sizeof(lts)));
2300 }
2301
2302 /*
2303  * In case when the Linux thread is the initial thread in
2304  * the thread group thread id is equal to the process id.
2305  * Glibc depends on this magic (assert in pthread_getattr_np.c).
2306  */
2307 struct thread *
2308 linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid)
2309 {
2310         struct linux_emuldata *em;
2311         struct thread *tdt;
2312         struct proc *p;
2313
2314         tdt = NULL;
2315         if (tid == 0 || tid == td->td_tid) {
2316                 tdt = td;
2317                 PROC_LOCK(tdt->td_proc);
2318         } else if (tid > PID_MAX)
2319                 tdt = tdfind(tid, pid);
2320         else {
2321                 /*
2322                  * Initial thread where the tid equal to the pid.
2323                  */
2324                 p = pfind(tid);
2325                 if (p != NULL) {
2326                         if (SV_PROC_ABI(p) != SV_ABI_LINUX) {
2327                                 /*
2328                                  * p is not a Linuxulator process.
2329                                  */
2330                                 PROC_UNLOCK(p);
2331                                 return (NULL);
2332                         }
2333                         FOREACH_THREAD_IN_PROC(p, tdt) {
2334                                 em = em_find(tdt);
2335                                 if (tid == em->em_tid)
2336                                         return (tdt);
2337                         }
2338                         PROC_UNLOCK(p);
2339                 }
2340                 return (NULL);
2341         }
2342
2343         return (tdt);
2344 }
2345
2346 void
2347 linux_to_bsd_waitopts(int options, int *bsdopts)
2348 {
2349
2350         if (options & LINUX_WNOHANG)
2351                 *bsdopts |= WNOHANG;
2352         if (options & LINUX_WUNTRACED)
2353                 *bsdopts |= WUNTRACED;
2354         if (options & LINUX_WEXITED)
2355                 *bsdopts |= WEXITED;
2356         if (options & LINUX_WCONTINUED)
2357                 *bsdopts |= WCONTINUED;
2358         if (options & LINUX_WNOWAIT)
2359                 *bsdopts |= WNOWAIT;
2360
2361         if (options & __WCLONE)
2362                 *bsdopts |= WLINUXCLONE;
2363 }
2364
2365 int
2366 linux_getrandom(struct thread *td, struct linux_getrandom_args *args)
2367 {
2368         struct uio uio;
2369         struct iovec iov;
2370         int error;
2371
2372         if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM))
2373                 return (EINVAL);
2374         if (args->count > INT_MAX)
2375                 args->count = INT_MAX;
2376
2377         iov.iov_base = args->buf;
2378         iov.iov_len = args->count;
2379
2380         uio.uio_iov = &iov;
2381         uio.uio_iovcnt = 1;
2382         uio.uio_resid = iov.iov_len;
2383         uio.uio_segflg = UIO_USERSPACE;
2384         uio.uio_rw = UIO_READ;
2385         uio.uio_td = td;
2386
2387         error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK);
2388         if (error == 0)
2389                 td->td_retval[0] = args->count - uio.uio_resid;
2390         return (error);
2391 }
2392
2393 int
2394 linux_mincore(struct thread *td, struct linux_mincore_args *args)
2395 {
2396
2397         /* Needs to be page-aligned */
2398         if (args->start & PAGE_MASK)
2399                 return (EINVAL);
2400         return (kern_mincore(td, args->start, args->len, args->vec));
2401 }
2402
2403 #define SYSLOG_TAG      "<6>"
2404
2405 int
2406 linux_syslog(struct thread *td, struct linux_syslog_args *args)
2407 {
2408         char buf[128], *src, *dst;
2409         u_int seq;
2410         int buflen, error;
2411
2412         if (args->type != LINUX_SYSLOG_ACTION_READ_ALL) {
2413                 linux_msg(td, "syslog unsupported type 0x%x", args->type);
2414                 return (EINVAL);
2415         }
2416
2417         if (args->len < 6) {
2418                 td->td_retval[0] = 0;
2419                 return (0);
2420         }
2421
2422         error = priv_check(td, PRIV_MSGBUF);
2423         if (error)
2424                 return (error);
2425
2426         mtx_lock(&msgbuf_lock);
2427         msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
2428         mtx_unlock(&msgbuf_lock);
2429
2430         dst = args->buf;
2431         error = copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG));
2432         /* The -1 is to skip the trailing '\0'. */
2433         dst += sizeof(SYSLOG_TAG) - 1;
2434
2435         while (error == 0) {
2436                 mtx_lock(&msgbuf_lock);
2437                 buflen = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
2438                 mtx_unlock(&msgbuf_lock);
2439
2440                 if (buflen == 0)
2441                         break;
2442
2443                 for (src = buf; src < buf + buflen && error == 0; src++) {
2444                         if (*src == '\0')
2445                                 continue;
2446
2447                         if (dst >= args->buf + args->len)
2448                                 goto out;
2449
2450                         error = copyout(src, dst, 1);
2451                         dst++;
2452
2453                         if (*src == '\n' && *(src + 1) != '<' &&
2454                             dst + sizeof(SYSLOG_TAG) < args->buf + args->len) {
2455                                 error = copyout(&SYSLOG_TAG,
2456                                     dst, sizeof(SYSLOG_TAG));
2457                                 dst += sizeof(SYSLOG_TAG) - 1;
2458                         }
2459                 }
2460         }
2461 out:
2462         td->td_retval[0] = dst - args->buf;
2463         return (error);
2464 }
2465
2466 int
2467 linux_getcpu(struct thread *td, struct linux_getcpu_args *args)
2468 {
2469         int cpu, error, node;
2470
2471         cpu = td->td_oncpu; /* Make sure it doesn't change during copyout(9) */
2472         error = 0;
2473         node = cpuid_to_pcpu[cpu]->pc_domain;
2474
2475         if (args->cpu != NULL)
2476                 error = copyout(&cpu, args->cpu, sizeof(l_int));
2477         if (args->node != NULL)
2478                 error = copyout(&node, args->node, sizeof(l_int));
2479         return (error);
2480 }