sys/vm/vm_phys.c

   1 /*-
   2  * Copyright (c) 2002-2006 Rice University
   3  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
   4  * All rights reserved.
   5  *
   6  * This software was developed for the FreeBSD Project by Alan L. Cox,
   7  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  25  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  28  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 /*
  33  *      Physical memory system implementation
  34  *
  35  * Any external functions defined by this module are only to be used by the
  36  * virtual memory system.
  37  */
  38
  39 #include <sys/cdefs.h>
  40 __FBSDID("$FreeBSD$");
  41
  42 #include "opt_ddb.h"
  43 #include "opt_vm.h"
  44
  45 #include <sys/param.h>
  46 #include <sys/systm.h>
  47 #include <sys/lock.h>
  48 #include <sys/kernel.h>
  49 #include <sys/malloc.h>
  50 #include <sys/mutex.h>
  51 #if MAXMEMDOM > 1
  52 #include <sys/proc.h>
  53 #endif
  54 #include <sys/queue.h>
  55 #include <sys/rwlock.h>
  56 #include <sys/sbuf.h>
  57 #include <sys/sysctl.h>
  58 #include <sys/tree.h>
  59 #include <sys/vmmeter.h>
  60 #include <sys/seq.h>
  61
  62 #include <ddb/ddb.h>
  63
  64 #include <vm/vm.h>
  65 #include <vm/vm_param.h>
  66 #include <vm/vm_kern.h>
  67 #include <vm/vm_object.h>
  68 #include <vm/vm_page.h>
  69 #include <vm/vm_phys.h>
  70
  71 #include <vm/vm_domain.h>
  72
  73 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
  74     "Too many physsegs.");
  75
  76 struct mem_affinity *mem_affinity;
  77 int *mem_locality;
  78
  79 int vm_ndomains = 1;
  80
  81 struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
  82 int vm_phys_nsegs;
  83
  84 struct vm_phys_fictitious_seg;
  85 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
  86     struct vm_phys_fictitious_seg *);
  87
  88 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
  89     RB_INITIALIZER(_vm_phys_fictitious_tree);
  90
  91 struct vm_phys_fictitious_seg {
  92         RB_ENTRY(vm_phys_fictitious_seg) node;
  93         /* Memory region data */
  94         vm_paddr_t      start;
  95         vm_paddr_t      end;
  96         vm_page_t       first_page;
  97 };
  98
  99 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
 100     vm_phys_fictitious_cmp);
 101
 102 static struct rwlock vm_phys_fictitious_reg_lock;
 103 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
 104
 105 static struct vm_freelist
 106     vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
 107
 108 static int vm_nfreelists;
 109
 110 /*
 111  * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 112  */
 113 static int vm_freelist_to_flind[VM_NFREELIST];
 114
 115 CTASSERT(VM_FREELIST_DEFAULT == 0);
 116
 117 #ifdef VM_FREELIST_ISADMA
 118 #define VM_ISADMA_BOUNDARY      16777216
 119 #endif
 120 #ifdef VM_FREELIST_DMA32
 121 #define VM_DMA32_BOUNDARY       ((vm_paddr_t)1 << 32)
 122 #endif
 123
 124 /*
 125  * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 126  * the ordering of the free list boundaries.
 127  */
 128 #if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY)
 129 CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY);
 130 #endif
 131 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
 132 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
 133 #endif
 134
 135 static int cnt_prezero;
 136 SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
 137     &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");
 138
 139 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
 140 SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
 141     NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");
 142
 143 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
 144 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
 145     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
 146
 147 #if MAXMEMDOM > 1
 148 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
 149 SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
 150     NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
 151 #endif
 152
 153 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
 154     &vm_ndomains, 0, "Number of physical memory domains available.");
 155
 156 /*
 157  * Default to first-touch + round-robin.
 158  */
 159 static struct mtx vm_default_policy_mtx;
 160 MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex",
 161     MTX_DEF);
 162 #if MAXMEMDOM > 1
 163 static struct vm_domain_policy vm_default_policy =
 164     VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
 165 #else
 166 /* Use round-robin so the domain policy code will only try once per allocation */
 167 static struct vm_domain_policy vm_default_policy =
 168     VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0);
 169 #endif
 170
 171 static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
 172     int order);
 173 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
 174 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
 175 static int vm_phys_paddr_to_segind(vm_paddr_t pa);
 176 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
 177     int order);
 178
 179 static int
 180 sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS)
 181 {
 182         char policy_name[32];
 183         int error;
 184
 185         mtx_lock(&vm_default_policy_mtx);
 186
 187         /* Map policy to output string */
 188         switch (vm_default_policy.p.policy) {
 189         case VM_POLICY_FIRST_TOUCH:
 190                 strcpy(policy_name, "first-touch");
 191                 break;
 192         case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
 193                 strcpy(policy_name, "first-touch-rr");
 194                 break;
 195         case VM_POLICY_ROUND_ROBIN:
 196         default:
 197                 strcpy(policy_name, "rr");
 198                 break;
 199         }
 200         mtx_unlock(&vm_default_policy_mtx);
 201
 202         error = sysctl_handle_string(oidp, &policy_name[0],
 203             sizeof(policy_name), req);
 204         if (error != 0 || req->newptr == NULL)
 205                 return (error);
 206
 207         mtx_lock(&vm_default_policy_mtx);
 208         /* Set: match on the subset of policies that make sense as a default */
 209         if (strcmp("first-touch-rr", policy_name) == 0) {
 210                 vm_domain_policy_set(&vm_default_policy,
 211                     VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
 212         } else if (strcmp("first-touch", policy_name) == 0) {
 213                 vm_domain_policy_set(&vm_default_policy,
 214                     VM_POLICY_FIRST_TOUCH, 0);
 215         } else if (strcmp("rr", policy_name) == 0) {
 216                 vm_domain_policy_set(&vm_default_policy,
 217                     VM_POLICY_ROUND_ROBIN, 0);
 218         } else {
 219                 error = EINVAL;
 220                 goto finish;
 221         }
 222
 223         error = 0;
 224 finish:
 225         mtx_unlock(&vm_default_policy_mtx);
 226         return (error);
 227 }
 228
 229 SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW,
 230     0, 0, sysctl_vm_default_policy, "A",
 231     "Default policy (rr, first-touch, first-touch-rr");
 232
 233 /*
 234  * Red-black tree helpers for vm fictitious range management.
 235  */
 236 static inline int
 237 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
 238     struct vm_phys_fictitious_seg *range)
 239 {
 240
 241         KASSERT(range->start != 0 && range->end != 0,
 242             ("Invalid range passed on search for vm_fictitious page"));
 243         if (p->start >= range->end)
 244                 return (1);
 245         if (p->start < range->start)
 246                 return (-1);
 247
 248         return (0);
 249 }
 250
 251 static int
 252 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
 253     struct vm_phys_fictitious_seg *p2)
 254 {
 255
 256         /* Check if this is a search for a page */
 257         if (p1->end == 0)
 258                 return (vm_phys_fictitious_in_range(p1, p2));
 259
 260         KASSERT(p2->end != 0,
 261     ("Invalid range passed as second parameter to vm fictitious comparison"));
 262
 263         /* Searching to add a new range */
 264         if (p1->end <= p2->start)
 265                 return (-1);
 266         if (p1->start >= p2->end)
 267                 return (1);
 268
 269         panic("Trying to add overlapping vm fictitious ranges:\n"
 270             "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
 271             (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
 272 }
 273
 274 static __inline int
 275 vm_rr_selectdomain(void)
 276 {
 277 #if MAXMEMDOM > 1
 278         struct thread *td;
 279
 280         td = curthread;
 281
 282         td->td_dom_rr_idx++;
 283         td->td_dom_rr_idx %= vm_ndomains;
 284         return (td->td_dom_rr_idx);
 285 #else
 286         return (0);
 287 #endif
 288 }
 289
 290 /*
 291  * Initialise a VM domain iterator.
 292  *
 293  * Check the thread policy, then the proc policy,
 294  * then default to the system policy.
 295  *
 296  * Later on the various layers will have this logic
 297  * plumbed into them and the phys code will be explicitly
 298  * handed a VM domain policy to use.
 299  */
 300 static void
 301 vm_policy_iterator_init(struct vm_domain_iterator *vi)
 302 {
 303 #if MAXMEMDOM > 1
 304         struct vm_domain_policy lcl;
 305 #endif
 306
 307         vm_domain_iterator_init(vi);
 308
 309 #if MAXMEMDOM > 1
 310         /* Copy out the thread policy */
 311         vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy);
 312         if (lcl.p.policy != VM_POLICY_NONE) {
 313                 /* Thread policy is present; use it */
 314                 vm_domain_iterator_set_policy(vi, &lcl);
 315                 return;
 316         }
 317
 318         vm_domain_policy_localcopy(&lcl,
 319             &curthread->td_proc->p_vm_dom_policy);
 320         if (lcl.p.policy != VM_POLICY_NONE) {
 321                 /* Process policy is present; use it */
 322                 vm_domain_iterator_set_policy(vi, &lcl);
 323                 return;
 324         }
 325 #endif
 326         /* Use system default policy */
 327         vm_domain_iterator_set_policy(vi, &vm_default_policy);
 328 }
 329
 330 static void
 331 vm_policy_iterator_finish(struct vm_domain_iterator *vi)
 332 {
 333
 334         vm_domain_iterator_cleanup(vi);
 335 }
 336
 337 boolean_t
 338 vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
 339 {
 340         struct vm_phys_seg *s;
 341         int idx;
 342
 343         while ((idx = ffsl(mask)) != 0) {
 344                 idx--;  /* ffsl counts from 1 */
 345                 mask &= ~(1UL << idx);
 346                 s = &vm_phys_segs[idx];
 347                 if (low < s->end && high > s->start)
 348                         return (TRUE);
 349         }
 350         return (FALSE);
 351 }
 352
 353 /*
 354  * Outputs the state of the physical memory allocator, specifically,
 355  * the amount of physical memory in each free list.
 356  */
 357 static int
 358 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
 359 {
 360         struct sbuf sbuf;
 361         struct vm_freelist *fl;
 362         int dom, error, flind, oind, pind;
 363
 364         error = sysctl_wire_old_buffer(req, 0);
 365         if (error != 0)
 366                 return (error);
 367         sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
 368         for (dom = 0; dom < vm_ndomains; dom++) {
 369                 sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
 370                 for (flind = 0; flind < vm_nfreelists; flind++) {
 371                         sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
 372                             "\n  ORDER (SIZE)  |  NUMBER"
 373                             "\n              ", flind);
 374                         for (pind = 0; pind < VM_NFREEPOOL; pind++)
 375                                 sbuf_printf(&sbuf, "  |  POOL %d", pind);
 376                         sbuf_printf(&sbuf, "\n--            ");
 377                         for (pind = 0; pind < VM_NFREEPOOL; pind++)
 378                                 sbuf_printf(&sbuf, "-- --      ");
 379                         sbuf_printf(&sbuf, "--\n");
 380                         for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
 381                                 sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
 382                                     1 << (PAGE_SHIFT - 10 + oind));
 383                                 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 384                                 fl = vm_phys_free_queues[dom][flind][pind];
 385                                         sbuf_printf(&sbuf, "  |  %6d",
 386                                             fl[oind].lcnt);
 387                                 }
 388                                 sbuf_printf(&sbuf, "\n");
 389                         }
 390                 }
 391         }
 392         error = sbuf_finish(&sbuf);
 393         sbuf_delete(&sbuf);
 394         return (error);
 395 }
 396
 397 /*
 398  * Outputs the set of physical memory segments.
 399  */
 400 static int
 401 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
 402 {
 403         struct sbuf sbuf;
 404         struct vm_phys_seg *seg;
 405         int error, segind;
 406
 407         error = sysctl_wire_old_buffer(req, 0);
 408         if (error != 0)
 409                 return (error);
 410         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 411         for (segind = 0; segind < vm_phys_nsegs; segind++) {
 412                 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
 413                 seg = &vm_phys_segs[segind];
 414                 sbuf_printf(&sbuf, "start:     %#jx\n",
 415                     (uintmax_t)seg->start);
 416                 sbuf_printf(&sbuf, "end:       %#jx\n",
 417                     (uintmax_t)seg->end);
 418                 sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
 419                 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
 420         }
 421         error = sbuf_finish(&sbuf);
 422         sbuf_delete(&sbuf);
 423         return (error);
 424 }
 425
 426 /*
 427  * Return affinity, or -1 if there's no affinity information.
 428  */
 429 int
 430 vm_phys_mem_affinity(int f, int t)
 431 {
 432
 433 #if MAXMEMDOM > 1
 434         if (mem_locality == NULL)
 435                 return (-1);
 436         if (f >= vm_ndomains || t >= vm_ndomains)
 437                 return (-1);
 438         return (mem_locality[f * vm_ndomains + t]);
 439 #else
 440         return (-1);
 441 #endif
 442 }
 443
 444 #if MAXMEMDOM > 1
 445 /*
 446  * Outputs the VM locality table.
 447  */
 448 static int
 449 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
 450 {
 451         struct sbuf sbuf;
 452         int error, i, j;
 453
 454         error = sysctl_wire_old_buffer(req, 0);
 455         if (error != 0)
 456                 return (error);
 457         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 458
 459         sbuf_printf(&sbuf, "\n");
 460
 461         for (i = 0; i < vm_ndomains; i++) {
 462                 sbuf_printf(&sbuf, "%d: ", i);
 463                 for (j = 0; j < vm_ndomains; j++) {
 464                         sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
 465                 }
 466                 sbuf_printf(&sbuf, "\n");
 467         }
 468         error = sbuf_finish(&sbuf);
 469         sbuf_delete(&sbuf);
 470         return (error);
 471 }
 472 #endif
 473
 474 static void
 475 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
 476 {
 477
 478         m->order = order;
 479         if (tail)
 480                 TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
 481         else
 482                 TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
 483         fl[order].lcnt++;
 484 }
 485
 486 static void
 487 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
 488 {
 489
 490         TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
 491         fl[order].lcnt--;
 492         m->order = VM_NFREEORDER;
 493 }
 494
 495 /*
 496  * Create a physical memory segment.
 497  */
 498 static void
 499 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
 500 {
 501         struct vm_phys_seg *seg;
 502
 503         KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
 504             ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
 505         KASSERT(domain < vm_ndomains,
 506             ("vm_phys_create_seg: invalid domain provided"));
 507         seg = &vm_phys_segs[vm_phys_nsegs++];
 508         while (seg > vm_phys_segs && (seg - 1)->start >= end) {
 509                 *seg = *(seg - 1);
 510                 seg--;
 511         }
 512         seg->start = start;
 513         seg->end = end;
 514         seg->domain = domain;
 515 }
 516
 517 static void
 518 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
 519 {
 520         int i;
 521
 522         if (mem_affinity == NULL) {
 523                 _vm_phys_create_seg(start, end, 0);
 524                 return;
 525         }
 526
 527         for (i = 0;; i++) {
 528                 if (mem_affinity[i].end == 0)
 529                         panic("Reached end of affinity info");
 530                 if (mem_affinity[i].end <= start)
 531                         continue;
 532                 if (mem_affinity[i].start > start)
 533                         panic("No affinity info for start %jx",
 534                             (uintmax_t)start);
 535                 if (mem_affinity[i].end >= end) {
 536                         _vm_phys_create_seg(start, end,
 537                             mem_affinity[i].domain);
 538                         break;
 539                 }
 540                 _vm_phys_create_seg(start, mem_affinity[i].end,
 541                     mem_affinity[i].domain);
 542                 start = mem_affinity[i].end;
 543         }
 544 }
 545
 546 /*
 547  * Add a physical memory segment.
 548  */
 549 void
 550 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
 551 {
 552         vm_paddr_t paddr;
 553
 554         KASSERT((start & PAGE_MASK) == 0,
 555             ("vm_phys_define_seg: start is not page aligned"));
 556         KASSERT((end & PAGE_MASK) == 0,
 557             ("vm_phys_define_seg: end is not page aligned"));
 558
 559         /*
 560          * Split the physical memory segment if it spans two or more free
 561          * list boundaries.
 562          */
 563         paddr = start;
 564 #ifdef  VM_FREELIST_ISADMA
 565         if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) {
 566                 vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY);
 567                 paddr = VM_ISADMA_BOUNDARY;
 568         }
 569 #endif
 570 #ifdef  VM_FREELIST_LOWMEM
 571         if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
 572                 vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
 573                 paddr = VM_LOWMEM_BOUNDARY;
 574         }
 575 #endif
 576 #ifdef  VM_FREELIST_DMA32
 577         if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
 578                 vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
 579                 paddr = VM_DMA32_BOUNDARY;
 580         }
 581 #endif
 582         vm_phys_create_seg(paddr, end);
 583 }
 584
 585 /*
 586  * Initialize the physical memory allocator.
 587  *
 588  * Requires that vm_page_array is initialized!
 589  */
 590 void
 591 vm_phys_init(void)
 592 {
 593         struct vm_freelist *fl;
 594         struct vm_phys_seg *seg;
 595         u_long npages;
 596         int dom, flind, freelist, oind, pind, segind;
 597
 598         /*
 599          * Compute the number of free lists, and generate the mapping from the
 600          * manifest constants VM_FREELIST_* to the free list indices.
 601          *
 602          * Initially, the entries of vm_freelist_to_flind[] are set to either
 603          * 0 or 1 to indicate which free lists should be created.
 604          */
 605         npages = 0;
 606         for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
 607                 seg = &vm_phys_segs[segind];
 608 #ifdef  VM_FREELIST_ISADMA
 609                 if (seg->end <= VM_ISADMA_BOUNDARY)
 610                         vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1;
 611                 else
 612 #endif
 613 #ifdef  VM_FREELIST_LOWMEM
 614                 if (seg->end <= VM_LOWMEM_BOUNDARY)
 615                         vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
 616                 else
 617 #endif
 618 #ifdef  VM_FREELIST_DMA32
 619                 if (
 620 #ifdef  VM_DMA32_NPAGES_THRESHOLD
 621                     /*
 622                      * Create the DMA32 free list only if the amount of
 623                      * physical memory above physical address 4G exceeds the
 624                      * given threshold.
 625                      */
 626                     npages > VM_DMA32_NPAGES_THRESHOLD &&
 627 #endif
 628                     seg->end <= VM_DMA32_BOUNDARY)
 629                         vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
 630                 else
 631 #endif
 632                 {
 633                         npages += atop(seg->end - seg->start);
 634                         vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
 635                 }
 636         }
 637         /* Change each entry into a running total of the free lists. */
 638         for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
 639                 vm_freelist_to_flind[freelist] +=
 640                     vm_freelist_to_flind[freelist - 1];
 641         }
 642         vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
 643         KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
 644         /* Change each entry into a free list index. */
 645         for (freelist = 0; freelist < VM_NFREELIST; freelist++)
 646                 vm_freelist_to_flind[freelist]--;
 647
 648         /*
 649          * Initialize the first_page and free_queues fields of each physical
 650          * memory segment.
 651          */
 652 #ifdef VM_PHYSSEG_SPARSE
 653         npages = 0;
 654 #endif
 655         for (segind = 0; segind < vm_phys_nsegs; segind++) {
 656                 seg = &vm_phys_segs[segind];
 657 #ifdef VM_PHYSSEG_SPARSE
 658                 seg->first_page = &vm_page_array[npages];
 659                 npages += atop(seg->end - seg->start);
 660 #else
 661                 seg->first_page = PHYS_TO_VM_PAGE(seg->start);
 662 #endif
 663 #ifdef  VM_FREELIST_ISADMA
 664                 if (seg->end <= VM_ISADMA_BOUNDARY) {
 665                         flind = vm_freelist_to_flind[VM_FREELIST_ISADMA];
 666                         KASSERT(flind >= 0,
 667                             ("vm_phys_init: ISADMA flind < 0"));
 668                 } else
 669 #endif
 670 #ifdef  VM_FREELIST_LOWMEM
 671                 if (seg->end <= VM_LOWMEM_BOUNDARY) {
 672                         flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
 673                         KASSERT(flind >= 0,
 674                             ("vm_phys_init: LOWMEM flind < 0"));
 675                 } else
 676 #endif
 677 #ifdef  VM_FREELIST_DMA32
 678                 if (seg->end <= VM_DMA32_BOUNDARY) {
 679                         flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
 680                         KASSERT(flind >= 0,
 681                             ("vm_phys_init: DMA32 flind < 0"));
 682                 } else
 683 #endif
 684                 {
 685                         flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
 686                         KASSERT(flind >= 0,
 687                             ("vm_phys_init: DEFAULT flind < 0"));
 688                 }
 689                 seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
 690         }
 691
 692         /*
 693          * Initialize the free queues.
 694          */
 695         for (dom = 0; dom < vm_ndomains; dom++) {
 696                 for (flind = 0; flind < vm_nfreelists; flind++) {
 697                         for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 698                                 fl = vm_phys_free_queues[dom][flind][pind];
 699                                 for (oind = 0; oind < VM_NFREEORDER; oind++)
 700                                         TAILQ_INIT(&fl[oind].pl);
 701                         }
 702                 }
 703         }
 704
 705         rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
 706 }
 707
 708 /*
 709  * Split a contiguous, power of two-sized set of physical pages.
 710  */
 711 static __inline void
 712 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
 713 {
 714         vm_page_t m_buddy;
 715
 716         while (oind > order) {
 717                 oind--;
 718                 m_buddy = &m[1 << oind];
 719                 KASSERT(m_buddy->order == VM_NFREEORDER,
 720                     ("vm_phys_split_pages: page %p has unexpected order %d",
 721                     m_buddy, m_buddy->order));
 722                 vm_freelist_add(fl, m_buddy, oind, 0);
 723         }
 724 }
 725
 726 /*
 727  * Initialize a physical page and add it to the free lists.
 728  */
 729 void
 730 vm_phys_add_page(vm_paddr_t pa)
 731 {
 732         vm_page_t m;
 733         struct vm_domain *vmd;
 734
 735         vm_cnt.v_page_count++;
 736         m = vm_phys_paddr_to_vm_page(pa);
 737         m->phys_addr = pa;
 738         m->queue = PQ_NONE;
 739         m->segind = vm_phys_paddr_to_segind(pa);
 740         vmd = vm_phys_domain(m);
 741         vmd->vmd_page_count++;
 742         vmd->vmd_segs |= 1UL << m->segind;
 743         KASSERT(m->order == VM_NFREEORDER,
 744             ("vm_phys_add_page: page %p has unexpected order %d",
 745             m, m->order));
 746         m->pool = VM_FREEPOOL_DEFAULT;
 747         pmap_page_init(m);
 748         mtx_lock(&vm_page_queue_free_mtx);
 749         vm_phys_freecnt_adj(m, 1);
 750         vm_phys_free_pages(m, 0);
 751         mtx_unlock(&vm_page_queue_free_mtx);
 752 }
 753
 754 /*
 755  * Allocate a contiguous, power of two-sized set of physical pages
 756  * from the free lists.
 757  *
 758  * The free page queues must be locked.
 759  */
 760 vm_page_t
 761 vm_phys_alloc_pages(int pool, int order)
 762 {
 763         vm_page_t m;
 764         int domain, flind;
 765         struct vm_domain_iterator vi;
 766
 767         KASSERT(pool < VM_NFREEPOOL,
 768             ("vm_phys_alloc_pages: pool %d is out of range", pool));
 769         KASSERT(order < VM_NFREEORDER,
 770             ("vm_phys_alloc_pages: order %d is out of range", order));
 771
 772         vm_policy_iterator_init(&vi);
 773
 774         while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
 775                 for (flind = 0; flind < vm_nfreelists; flind++) {
 776                         m = vm_phys_alloc_domain_pages(domain, flind, pool,
 777                             order);
 778                         if (m != NULL)
 779                                 return (m);
 780                 }
 781         }
 782
 783         vm_policy_iterator_finish(&vi);
 784         return (NULL);
 785 }
 786
 787 /*
 788  * Allocate a contiguous, power of two-sized set of physical pages from the
 789  * specified free list.  The free list must be specified using one of the
 790  * manifest constants VM_FREELIST_*.
 791  *
 792  * The free page queues must be locked.
 793  */
 794 vm_page_t
 795 vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
 796 {
 797         vm_page_t m;
 798         struct vm_domain_iterator vi;
 799         int domain;
 800
 801         KASSERT(freelist < VM_NFREELIST,
 802             ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
 803             freelist));
 804         KASSERT(pool < VM_NFREEPOOL,
 805             ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
 806         KASSERT(order < VM_NFREEORDER,
 807             ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
 808
 809         vm_policy_iterator_init(&vi);
 810
 811         while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
 812                 m = vm_phys_alloc_domain_pages(domain,
 813                     vm_freelist_to_flind[freelist], pool, order);
 814                 if (m != NULL)
 815                         return (m);
 816         }
 817
 818         vm_policy_iterator_finish(&vi);
 819         return (NULL);
 820 }
 821
 822 static vm_page_t
 823 vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order)
 824 {
 825         struct vm_freelist *fl;
 826         struct vm_freelist *alt;
 827         int oind, pind;
 828         vm_page_t m;
 829
 830         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 831         fl = &vm_phys_free_queues[domain][flind][pool][0];
 832         for (oind = order; oind < VM_NFREEORDER; oind++) {
 833                 m = TAILQ_FIRST(&fl[oind].pl);
 834                 if (m != NULL) {
 835                         vm_freelist_rem(fl, m, oind);
 836                         vm_phys_split_pages(m, oind, fl, order);
 837                         return (m);
 838                 }
 839         }
 840
 841         /*
 842          * The given pool was empty.  Find the largest
 843          * contiguous, power-of-two-sized set of pages in any
 844          * pool.  Transfer these pages to the given pool, and
 845          * use them to satisfy the allocation.
 846          */
 847         for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
 848                 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 849                         alt = &vm_phys_free_queues[domain][flind][pind][0];
 850                         m = TAILQ_FIRST(&alt[oind].pl);
 851                         if (m != NULL) {
 852                                 vm_freelist_rem(alt, m, oind);
 853                                 vm_phys_set_pool(pool, m, oind);
 854                                 vm_phys_split_pages(m, oind, fl, order);
 855                                 return (m);
 856                         }
 857                 }
 858         }
 859         return (NULL);
 860 }
 861
 862 /*
 863  * Find the vm_page corresponding to the given physical address.
 864  */
 865 vm_page_t
 866 vm_phys_paddr_to_vm_page(vm_paddr_t pa)
 867 {
 868         struct vm_phys_seg *seg;
 869         int segind;
 870
 871         for (segind = 0; segind < vm_phys_nsegs; segind++) {
 872                 seg = &vm_phys_segs[segind];
 873                 if (pa >= seg->start && pa < seg->end)
 874                         return (&seg->first_page[atop(pa - seg->start)]);
 875         }
 876         return (NULL);
 877 }
 878
 879 vm_page_t
 880 vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
 881 {
 882         struct vm_phys_fictitious_seg tmp, *seg;
 883         vm_page_t m;
 884
 885         m = NULL;
 886         tmp.start = pa;
 887         tmp.end = 0;
 888
 889         rw_rlock(&vm_phys_fictitious_reg_lock);
 890         seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
 891         rw_runlock(&vm_phys_fictitious_reg_lock);
 892         if (seg == NULL)
 893                 return (NULL);
 894
 895         m = &seg->first_page[atop(pa - seg->start)];
 896         KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
 897
 898         return (m);
 899 }
 900
 901 static inline void
 902 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
 903     long page_count, vm_memattr_t memattr)
 904 {
 905         long i;
 906
 907         for (i = 0; i < page_count; i++) {
 908                 vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
 909                 range[i].oflags &= ~VPO_UNMANAGED;
 910                 range[i].busy_lock = VPB_UNBUSIED;
 911         }
 912 }
 913
 914 int
 915 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
 916     vm_memattr_t memattr)
 917 {
 918         struct vm_phys_fictitious_seg *seg;
 919         vm_page_t fp;
 920         long page_count;
 921 #ifdef VM_PHYSSEG_DENSE
 922         long pi, pe;
 923         long dpage_count;
 924 #endif
 925
 926         KASSERT(start < end,
 927             ("Start of segment isn't less than end (start: %jx end: %jx)",
 928             (uintmax_t)start, (uintmax_t)end));
 929
 930         page_count = (end - start) / PAGE_SIZE;
 931
 932 #ifdef VM_PHYSSEG_DENSE
 933         pi = atop(start);
 934         pe = atop(end);
 935         if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
 936                 fp = &vm_page_array[pi - first_page];
 937                 if ((pe - first_page) > vm_page_array_size) {
 938                         /*
 939                          * We have a segment that starts inside
 940                          * of vm_page_array, but ends outside of it.
 941                          *
 942                          * Use vm_page_array pages for those that are
 943                          * inside of the vm_page_array range, and
 944                          * allocate the remaining ones.
 945                          */
 946                         dpage_count = vm_page_array_size - (pi - first_page);
 947                         vm_phys_fictitious_init_range(fp, start, dpage_count,
 948                             memattr);
 949                         page_count -= dpage_count;
 950                         start += ptoa(dpage_count);
 951                         goto alloc;
 952                 }
 953                 /*
 954                  * We can allocate the full range from vm_page_array,
 955                  * so there's no need to register the range in the tree.
 956                  */
 957                 vm_phys_fictitious_init_range(fp, start, page_count, memattr);
 958                 return (0);
 959         } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
 960                 /*
 961                  * We have a segment that ends inside of vm_page_array,
 962                  * but starts outside of it.
 963                  */
 964                 fp = &vm_page_array[0];
 965                 dpage_count = pe - first_page;
 966                 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
 967                     memattr);
 968                 end -= ptoa(dpage_count);
 969                 page_count -= dpage_count;
 970                 goto alloc;
 971         } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
 972                 /*
 973                  * Trying to register a fictitious range that expands before
 974                  * and after vm_page_array.
 975                  */
 976                 return (EINVAL);
 977         } else {
 978 alloc:
 979 #endif
 980                 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
 981                     M_WAITOK | M_ZERO);
 982 #ifdef VM_PHYSSEG_DENSE
 983         }
 984 #endif
 985         vm_phys_fictitious_init_range(fp, start, page_count, memattr);
 986
 987         seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
 988         seg->start = start;
 989         seg->end = end;
 990         seg->first_page = fp;
 991
 992         rw_wlock(&vm_phys_fictitious_reg_lock);
 993         RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
 994         rw_wunlock(&vm_phys_fictitious_reg_lock);
 995
 996         return (0);
 997 }
 998
 999 void
1000 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
1001 {
1002         struct vm_phys_fictitious_seg *seg, tmp;
1003 #ifdef VM_PHYSSEG_DENSE
1004         long pi, pe;
1005 #endif
1006
1007         KASSERT(start < end,
1008             ("Start of segment isn't less than end (start: %jx end: %jx)",
1009             (uintmax_t)start, (uintmax_t)end));
1010
1011 #ifdef VM_PHYSSEG_DENSE
1012         pi = atop(start);
1013         pe = atop(end);
1014         if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1015                 if ((pe - first_page) <= vm_page_array_size) {
1016                         /*
1017                          * This segment was allocated using vm_page_array
1018                          * only, there's nothing to do since those pages
1019                          * were never added to the tree.
1020                          */
1021                         return;
1022                 }
1023                 /*
1024                  * We have a segment that starts inside
1025                  * of vm_page_array, but ends outside of it.
1026                  *
1027                  * Calculate how many pages were added to the
1028                  * tree and free them.
1029                  */
1030                 start = ptoa(first_page + vm_page_array_size);
1031         } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1032                 /*
1033                  * We have a segment that ends inside of vm_page_array,
1034                  * but starts outside of it.
1035                  */
1036                 end = ptoa(first_page);
1037         } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1038                 /* Since it's not possible to register such a range, panic. */
1039                 panic(
1040                     "Unregistering not registered fictitious range [%#jx:%#jx]",
1041                     (uintmax_t)start, (uintmax_t)end);
1042         }
1043 #endif
1044         tmp.start = start;
1045         tmp.end = 0;
1046
1047         rw_wlock(&vm_phys_fictitious_reg_lock);
1048         seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
1049         if (seg->start != start || seg->end != end) {
1050                 rw_wunlock(&vm_phys_fictitious_reg_lock);
1051                 panic(
1052                     "Unregistering not registered fictitious range [%#jx:%#jx]",
1053                     (uintmax_t)start, (uintmax_t)end);
1054         }
1055         RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
1056         rw_wunlock(&vm_phys_fictitious_reg_lock);
1057         free(seg->first_page, M_FICT_PAGES);
1058         free(seg, M_FICT_PAGES);
1059 }
1060
1061 /*
1062  * Find the segment containing the given physical address.
1063  */
1064 static int
1065 vm_phys_paddr_to_segind(vm_paddr_t pa)
1066 {
1067         struct vm_phys_seg *seg;
1068         int segind;
1069
1070         for (segind = 0; segind < vm_phys_nsegs; segind++) {
1071                 seg = &vm_phys_segs[segind];
1072                 if (pa >= seg->start && pa < seg->end)
1073                         return (segind);
1074         }
1075         panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" ,
1076             (uintmax_t)pa);
1077 }
1078
1079 /*
1080  * Free a contiguous, power of two-sized set of physical pages.
1081  *
1082  * The free page queues must be locked.
1083  */
1084 void
1085 vm_phys_free_pages(vm_page_t m, int order)
1086 {
1087         struct vm_freelist *fl;
1088         struct vm_phys_seg *seg;
1089         vm_paddr_t pa;
1090         vm_page_t m_buddy;
1091
1092         KASSERT(m->order == VM_NFREEORDER,
1093             ("vm_phys_free_pages: page %p has unexpected order %d",
1094             m, m->order));
1095         KASSERT(m->pool < VM_NFREEPOOL,
1096             ("vm_phys_free_pages: page %p has unexpected pool %d",
1097             m, m->pool));
1098         KASSERT(order < VM_NFREEORDER,
1099             ("vm_phys_free_pages: order %d is out of range", order));
1100         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1101         seg = &vm_phys_segs[m->segind];
1102         if (order < VM_NFREEORDER - 1) {
1103                 pa = VM_PAGE_TO_PHYS(m);
1104                 do {
1105                         pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
1106                         if (pa < seg->start || pa >= seg->end)
1107                                 break;
1108                         m_buddy = &seg->first_page[atop(pa - seg->start)];
1109                         if (m_buddy->order != order)
1110                                 break;
1111                         fl = (*seg->free_queues)[m_buddy->pool];
1112                         vm_freelist_rem(fl, m_buddy, order);
1113                         if (m_buddy->pool != m->pool)
1114                                 vm_phys_set_pool(m->pool, m_buddy, order);
1115                         order++;
1116                         pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
1117                         m = &seg->first_page[atop(pa - seg->start)];
1118                 } while (order < VM_NFREEORDER - 1);
1119         }
1120         fl = (*seg->free_queues)[m->pool];
1121         vm_freelist_add(fl, m, order, 1);
1122 }
1123
1124 /*
1125  * Free a contiguous, arbitrarily sized set of physical pages.
1126  *
1127  * The free page queues must be locked.
1128  */
1129 void
1130 vm_phys_free_contig(vm_page_t m, u_long npages)
1131 {
1132         u_int n;
1133         int order;
1134
1135         /*
1136          * Avoid unnecessary coalescing by freeing the pages in the largest
1137          * possible power-of-two-sized subsets.
1138          */
1139         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1140         for (;; npages -= n) {
1141                 /*
1142                  * Unsigned "min" is used here so that "order" is assigned
1143                  * "VM_NFREEORDER - 1" when "m"'s physical address is zero
1144                  * or the low-order bits of its physical address are zero
1145                  * because the size of a physical address exceeds the size of
1146                  * a long.
1147                  */
1148                 order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
1149                     VM_NFREEORDER - 1);
1150                 n = 1 << order;
1151                 if (npages < n)
1152                         break;
1153                 vm_phys_free_pages(m, order);
1154                 m += n;
1155         }
1156         /* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
1157         for (; npages > 0; npages -= n) {
1158                 order = flsl(npages) - 1;
1159                 n = 1 << order;
1160                 vm_phys_free_pages(m, order);
1161                 m += n;
1162         }
1163 }
1164
1165 /*
1166  * Set the pool for a contiguous, power of two-sized set of physical pages.
1167  */
1168 void
1169 vm_phys_set_pool(int pool, vm_page_t m, int order)
1170 {
1171         vm_page_t m_tmp;
1172
1173         for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
1174                 m_tmp->pool = pool;
1175 }
1176
1177 /*
1178  * Search for the given physical page "m" in the free lists.  If the search
1179  * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
1180  * FALSE, indicating that "m" is not in the free lists.
1181  *
1182  * The free page queues must be locked.
1183  */
1184 boolean_t
1185 vm_phys_unfree_page(vm_page_t m)
1186 {
1187         struct vm_freelist *fl;
1188         struct vm_phys_seg *seg;
1189         vm_paddr_t pa, pa_half;
1190         vm_page_t m_set, m_tmp;
1191         int order;
1192
1193         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1194
1195         /*
1196          * First, find the contiguous, power of two-sized set of free
1197          * physical pages containing the given physical page "m" and
1198          * assign it to "m_set".
1199          */
1200         seg = &vm_phys_segs[m->segind];
1201         for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
1202             order < VM_NFREEORDER - 1; ) {
1203                 order++;
1204                 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
1205                 if (pa >= seg->start)
1206                         m_set = &seg->first_page[atop(pa - seg->start)];
1207                 else
1208                         return (FALSE);
1209         }
1210         if (m_set->order < order)
1211                 return (FALSE);
1212         if (m_set->order == VM_NFREEORDER)
1213                 return (FALSE);
1214         KASSERT(m_set->order < VM_NFREEORDER,
1215             ("vm_phys_unfree_page: page %p has unexpected order %d",
1216             m_set, m_set->order));
1217
1218         /*
1219          * Next, remove "m_set" from the free lists.  Finally, extract
1220          * "m" from "m_set" using an iterative algorithm: While "m_set"
1221          * is larger than a page, shrink "m_set" by returning the half
1222          * of "m_set" that does not contain "m" to the free lists.
1223          */
1224         fl = (*seg->free_queues)[m_set->pool];
1225         order = m_set->order;
1226         vm_freelist_rem(fl, m_set, order);
1227         while (order > 0) {
1228                 order--;
1229                 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
1230                 if (m->phys_addr < pa_half)
1231                         m_tmp = &seg->first_page[atop(pa_half - seg->start)];
1232                 else {
1233                         m_tmp = m_set;
1234                         m_set = &seg->first_page[atop(pa_half - seg->start)];
1235                 }
1236                 vm_freelist_add(fl, m_tmp, order, 0);
1237         }
1238         KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
1239         return (TRUE);
1240 }
1241
1242 /*
1243  * Try to zero one physical page.  Used by an idle priority thread.
1244  */
1245 boolean_t
1246 vm_phys_zero_pages_idle(void)
1247 {
1248         static struct vm_freelist *fl;
1249         static int flind, oind, pind;
1250         vm_page_t m, m_tmp;
1251         int domain;
1252
1253         domain = vm_rr_selectdomain();
1254         fl = vm_phys_free_queues[domain][0][0];
1255         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1256         for (;;) {
1257                 TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) {
1258                         for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
1259                                 if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
1260                                         vm_phys_unfree_page(m_tmp);
1261                                         vm_phys_freecnt_adj(m, -1);
1262                                         mtx_unlock(&vm_page_queue_free_mtx);
1263                                         pmap_zero_page_idle(m_tmp);
1264                                         m_tmp->flags |= PG_ZERO;
1265                                         mtx_lock(&vm_page_queue_free_mtx);
1266                                         vm_phys_freecnt_adj(m, 1);
1267                                         vm_phys_free_pages(m_tmp, 0);
1268                                         vm_page_zero_count++;
1269                                         cnt_prezero++;
1270                                         return (TRUE);
1271                                 }
1272                         }
1273                 }
1274                 oind++;
1275                 if (oind == VM_NFREEORDER) {
1276                         oind = 0;
1277                         pind++;
1278                         if (pind == VM_NFREEPOOL) {
1279                                 pind = 0;
1280                                 flind++;
1281                                 if (flind == vm_nfreelists)
1282                                         flind = 0;
1283                         }
1284                         fl = vm_phys_free_queues[domain][flind][pind];
1285                 }
1286         }
1287 }
1288
1289 /*
1290  * Allocate a contiguous set of physical pages of the given size
1291  * "npages" from the free lists.  All of the physical pages must be at
1292  * or above the given physical address "low" and below the given
1293  * physical address "high".  The given value "alignment" determines the
1294  * alignment of the first physical page in the set.  If the given value
1295  * "boundary" is non-zero, then the set of physical pages cannot cross
1296  * any physical address boundary that is a multiple of that value.  Both
1297  * "alignment" and "boundary" must be a power of two.
1298  */
1299 vm_page_t
1300 vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
1301     u_long alignment, vm_paddr_t boundary)
1302 {
1303         struct vm_freelist *fl;
1304         struct vm_phys_seg *seg;
1305         vm_paddr_t pa, pa_last, size;
1306         vm_page_t m, m_ret;
1307         u_long npages_end;
1308         int domain, flind, oind, order, pind;
1309         struct vm_domain_iterator vi;
1310
1311         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1312         size = npages << PAGE_SHIFT;
1313         KASSERT(size != 0,
1314             ("vm_phys_alloc_contig: size must not be 0"));
1315         KASSERT((alignment & (alignment - 1)) == 0,
1316             ("vm_phys_alloc_contig: alignment must be a power of 2"));
1317         KASSERT((boundary & (boundary - 1)) == 0,
1318             ("vm_phys_alloc_contig: boundary must be a power of 2"));
1319         /* Compute the queue that is the best fit for npages. */
1320         for (order = 0; (1 << order) < npages; order++);
1321
1322         vm_policy_iterator_init(&vi);
1323
1324 restartdom:
1325         if (vm_domain_iterator_run(&vi, &domain) != 0) {
1326                 vm_policy_iterator_finish(&vi);
1327                 return (NULL);
1328         }
1329
1330         for (flind = 0; flind < vm_nfreelists; flind++) {
1331                 for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
1332                         for (pind = 0; pind < VM_NFREEPOOL; pind++) {
1333                                 fl = &vm_phys_free_queues[domain][flind][pind][0];
1334                                 TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
1335                                         /*
1336                                          * A free list may contain physical pages
1337                                          * from one or more segments.
1338                                          */
1339                                         seg = &vm_phys_segs[m_ret->segind];
1340                                         if (seg->start > high ||
1341                                             low >= seg->end)
1342                                                 continue;
1343
1344                                         /*
1345                                          * Is the size of this allocation request
1346                                          * larger than the largest block size?
1347                                          */
1348                                         if (order >= VM_NFREEORDER) {
1349                                                 /*
1350                                                  * Determine if a sufficient number
1351                                                  * of subsequent blocks to satisfy
1352                                                  * the allocation request are free.
1353                                                  */
1354                                                 pa = VM_PAGE_TO_PHYS(m_ret);
1355                                                 pa_last = pa + size;
1356                                                 for (;;) {
1357                                                         pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
1358                                                         if (pa >= pa_last)
1359                                                                 break;
1360                                                         if (pa < seg->start ||
1361                                                             pa >= seg->end)
1362                                                                 break;
1363                                                         m = &seg->first_page[atop(pa - seg->start)];
1364                                                         if (m->order != VM_NFREEORDER - 1)
1365                                                                 break;
1366                                                 }
1367                                                 /* If not, continue to the next block. */
1368                                                 if (pa < pa_last)
1369                                                         continue;
1370                                         }
1371
1372                                         /*
1373                                          * Determine if the blocks are within the given range,
1374                                          * satisfy the given alignment, and do not cross the
1375                                          * given boundary.
1376                                          */
1377                                         pa = VM_PAGE_TO_PHYS(m_ret);
1378                                         if (pa >= low &&
1379                                             pa + size <= high &&
1380                                             (pa & (alignment - 1)) == 0 &&
1381                                             ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
1382                                                 goto done;
1383                                 }
1384                         }
1385                 }
1386         }
1387         if (!vm_domain_iterator_isdone(&vi))
1388                 goto restartdom;
1389         vm_policy_iterator_finish(&vi);
1390         return (NULL);
1391 done:
1392         for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
1393                 fl = (*seg->free_queues)[m->pool];
1394                 vm_freelist_rem(fl, m, m->order);
1395         }
1396         if (m_ret->pool != VM_FREEPOOL_DEFAULT)
1397                 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
1398         fl = (*seg->free_queues)[m_ret->pool];
1399         vm_phys_split_pages(m_ret, oind, fl, order);
1400         /* Return excess pages to the free lists. */
1401         npages_end = roundup2(npages, 1 << imin(oind, order));
1402         if (npages < npages_end)
1403                 vm_phys_free_contig(&m_ret[npages], npages_end - npages);
1404         return (m_ret);
1405 }
1406
1407 #ifdef DDB
1408 /*
1409  * Show the number of physical pages in each of the free lists.
1410  */
1411 DB_SHOW_COMMAND(freepages, db_show_freepages)
1412 {
1413         struct vm_freelist *fl;
1414         int flind, oind, pind, dom;
1415
1416         for (dom = 0; dom < vm_ndomains; dom++) {
1417                 db_printf("DOMAIN: %d\n", dom);
1418                 for (flind = 0; flind < vm_nfreelists; flind++) {
1419                         db_printf("FREE LIST %d:\n"
1420                             "\n  ORDER (SIZE)  |  NUMBER"
1421                             "\n              ", flind);
1422                         for (pind = 0; pind < VM_NFREEPOOL; pind++)
1423                                 db_printf("  |  POOL %d", pind);
1424                         db_printf("\n--            ");
1425                         for (pind = 0; pind < VM_NFREEPOOL; pind++)
1426                                 db_printf("-- --      ");
1427                         db_printf("--\n");
1428                         for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
1429                                 db_printf("  %2.2d (%6.6dK)", oind,
1430                                     1 << (PAGE_SHIFT - 10 + oind));
1431                                 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
1432                                 fl = vm_phys_free_queues[dom][flind][pind];
1433                                         db_printf("  |  %6.6d", fl[oind].lcnt);
1434                                 }
1435                                 db_printf("\n");
1436                         }
1437                         db_printf("\n");
1438                 }
1439                 db_printf("\n");
1440         }
1441 }
1442 #endif