/*-
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 *	Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#if MAXMEMDOM > 1
#include <sys/proc.h>
#endif
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");

struct mem_affinity *mem_affinity;
int *mem_locality;

int vm_ndomains = 1;

struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
int vm_phys_nsegs;
struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(_vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);
static struct rwlock vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];

static int vm_nfreelists;

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);
#ifdef VM_FREELIST_ISADMA
#define	VM_ISADMA_BOUNDARY	16777216
#endif
#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY)
CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY);
#endif
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int cnt_prezero;
SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
    &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");
static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");

static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
    int order);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order);
/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
	    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}
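
/*
 * Select a memory domain for the calling thread, cycling round-robin
 * through the available domains via a per-thread index.
 */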
static int
vm_rr_selectdomain(void)
{
#if MAXMEMDOM > 1
	struct thread *td;

	td = curthread;

	td->td_dom_rr_idx++;
	td->td_dom_rr_idx %= vm_ndomains;
	return (td->td_dom_rr_idx);
#else
	return (0);
#endif
}
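
/*
 * Return TRUE if the physical address range [low, high) intersects any
 * segment whose index is set in "mask".
 */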
boolean_t
vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
{
	struct vm_phys_seg *s;
	int idx;

	while ((idx = ffsl(mask)) != 0) {
		idx--;	/* ffsl counts from 1 */
		mask &= ~(1UL << idx);
		s = &vm_phys_segs[idx];
		if (low < s->end && high > s->start)
			return (TRUE);
	}
	return (FALSE);
}
/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f, int t)
{

	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
}
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
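
/*
 * Add "m" to the free list "fl" at order "order", either at the tail
 * ("tail" != 0) or at the head, and update the list's length count.
 */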
static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

	m->order = order;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}
/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}
static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
}
/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef VM_FREELIST_ISADMA
	if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY);
		paddr = VM_ISADMA_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}
/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	u_long npages;
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
	npages = 0;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef VM_FREELIST_ISADMA
		if (seg->end <= VM_ISADMA_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1;
		else
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef VM_FREELIST_DMA32
		if (
#ifdef VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
			npages += atop(seg->end - seg->start);
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef VM_FREELIST_ISADMA
		if (seg->end <= VM_ISADMA_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_ISADMA];
			KASSERT(flind >= 0,
			    ("vm_phys_init: ISADMA flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}
/*
 * Split a contiguous, power of two-sized set of physical pages.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, 0);
	}
}
/*
 * Initialize a physical page and add it to the free lists.
 */
void
vm_phys_add_page(vm_paddr_t pa)
{
	vm_page_t m;
	struct vm_domain *vmd;

	vm_cnt.v_page_count++;
	m = vm_phys_paddr_to_vm_page(pa);
	m->phys_addr = pa;
	m->queue = PQ_NONE;
	m->segind = vm_phys_paddr_to_segind(pa);
	vmd = vm_phys_domain(m);
	vmd->vmd_page_count++;
	vmd->vmd_segs |= 1UL << m->segind;
	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_add_page: page %p has unexpected order %d",
	    m, m->order));
	m->pool = VM_FREEPOOL_DEFAULT;
	pmap_page_init(m);
	mtx_lock(&vm_page_queue_free_mtx);
	vm_phys_freecnt_adj(m, 1);
	vm_phys_free_pages(m, 0);
	mtx_unlock(&vm_page_queue_free_mtx);
}
/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int pool, int order)
{
	vm_page_t m;
	int dom, domain, flind;

	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_pages: order %d is out of range", order));

	for (dom = 0; dom < vm_ndomains; dom++) {
		domain = vm_rr_selectdomain();
		for (flind = 0; flind < vm_nfreelists; flind++) {
			m = vm_phys_alloc_domain_pages(domain, flind, pool,
			    order);
			if (m != NULL)
				return (m);
		}
	}
	return (NULL);
}
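
/*
 * Example (sketch, not from the original source): with the free page
 * queues locked, a caller might request a single page or an order-4
 * (16-page) chunk from the default pool:
 *
 *	m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, 0);
 *	m16 = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, 4);
 *
 * A NULL return means that no free list in any domain could satisfy
 * the request.
 */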
/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
{
	vm_page_t m;
	int dom, domain;

	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
	for (dom = 0; dom < vm_ndomains; dom++) {
		domain = vm_rr_selectdomain();
		m = vm_phys_alloc_domain_pages(domain,
		    vm_freelist_to_flind[freelist], pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}
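
/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified domain and free list, stealing from other pools if the
 * requested pool is empty.
 */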
static vm_page_t
vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order)
{
	struct vm_freelist *fl;
	struct vm_freelist *alt;
	int oind, pind;
	vm_page_t m;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			vm_phys_split_pages(m, oind, fl, order);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_set_pool(pool, m, oind);
				vm_phys_split_pages(m, oind, fl, order);
				return (m);
			}
		}
	}
	return (NULL);
}
/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (&seg->first_page[atop(pa - seg->start)]);
	}
	return (NULL);
}
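
/*
 * Find the vm_page corresponding to the given physical address within a
 * registered fictitious range.
 */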
vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}
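
/*
 * Initialize "page_count" fictitious pages beginning at physical address
 * "start", marking each page managed and unbusied.
 */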
static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}
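
/*
 * Register a fictitious physical memory range [start, end).  Pages that
 * overlap vm_page_array are initialized in place; any remainder is
 * allocated from M_FICT_PAGES and inserted into the fictitious range tree.
 */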
int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	}
alloc:
#endif

	fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
	    M_WAITOK | M_ZERO);
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}
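
/*
 * Unregister a range previously registered by
 * vm_phys_fictitious_reg_range(), freeing any pages that were allocated
 * outside of vm_page_array.
 */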
void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif

	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}
/*
 * Find the segment containing the given physical address.
 */
static int
vm_phys_paddr_to_segind(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (segind);
	}
	panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment",
	    (uintmax_t)pa);
}
/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(m->pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	seg = &vm_phys_segs[m->segind];
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = &seg->first_page[atop(pa - seg->start)];
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[m->pool];
	vm_freelist_add(fl, m, order, 1);
}
/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	u_int n;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	for (;; npages -= n) {
		/*
		 * Unsigned "min" is used here so that "order" is assigned
		 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
		 * or the low-order bits of its physical address are zero
		 * because the size of a physical address exceeds the size of
		 * a long.
		 */
		order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
		    VM_NFREEORDER - 1);
		n = 1 << order;
		if (npages < n)
			break;
		vm_phys_free_pages(m, order);
		m += n;
	}
	/* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
	for (; npages > 0; npages -= n) {
		order = flsl(npages) - 1;
		n = 1 << order;
		vm_phys_free_pages(m, order);
		m += n;
	}
}
/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}
/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
 * FALSE, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
boolean_t
vm_phys_unfree_page(vm_page_t m)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_half;
	vm_page_t m_set, m_tmp;
	int order;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	seg = &vm_phys_segs[m->segind];
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = &seg->first_page[atop(pa - seg->start)];
		else
			return (FALSE);
	}
	if (m_set->order < order)
		return (FALSE);
	if (m_set->order == VM_NFREEORDER)
		return (FALSE);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
		else {
			m_tmp = m_set;
			m_set = &seg->first_page[atop(pa_half - seg->start)];
		}
		vm_freelist_add(fl, m_tmp, order, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (TRUE);
}
/*
 * Try to zero one physical page.  Used by an idle priority thread.
 */
boolean_t
vm_phys_zero_pages_idle(void)
{
	static struct vm_freelist *fl;
	static int flind, oind, pind;
	vm_page_t m, m_tmp;
	int domain;

	domain = vm_rr_selectdomain();
	fl = vm_phys_free_queues[domain][0][0];
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	for (;;) {
		TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) {
			for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
				if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
					vm_phys_unfree_page(m_tmp);
					vm_phys_freecnt_adj(m, -1);
					mtx_unlock(&vm_page_queue_free_mtx);
					pmap_zero_page_idle(m_tmp);
					m_tmp->flags |= PG_ZERO;
					mtx_lock(&vm_page_queue_free_mtx);
					vm_phys_freecnt_adj(m, 1);
					vm_phys_free_pages(m_tmp, 0);
					vm_page_zero_count++;
					cnt_prezero++;
					return (TRUE);
				}
			}
		}
		oind++;
		if (oind == VM_NFREEORDER) {
			oind = 0;
			pind++;
			if (pind == VM_NFREEPOOL) {
				pind = 0;
				flind++;
				if (flind == vm_nfreelists)
					flind = 0;
			}
			fl = vm_phys_free_queues[domain][flind][pind];
		}
	}
}
/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_last, size;
	vm_page_t m, m_ret;
	u_long npages_end;
	int dom, domain, flind, oind, order, pind;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	size = npages << PAGE_SHIFT;
	KASSERT(size != 0,
	    ("vm_phys_alloc_contig: size must not be 0"));
	KASSERT((alignment & (alignment - 1)) == 0,
	    ("vm_phys_alloc_contig: alignment must be a power of 2"));
	KASSERT((boundary & (boundary - 1)) == 0,
	    ("vm_phys_alloc_contig: boundary must be a power of 2"));
	/* Compute the queue that is the best fit for npages. */
	for (order = 0; (1 << order) < npages; order++);
	dom = 0;
restartdom:
	domain = vm_rr_selectdomain();
	for (flind = 0; flind < vm_nfreelists; flind++) {
		for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = &vm_phys_free_queues[domain][flind][pind][0];
				TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
					/*
					 * A free list may contain physical pages
					 * from one or more segments.
					 */
					seg = &vm_phys_segs[m_ret->segind];
					if (seg->start > high ||
					    low >= seg->end)
						continue;

					/*
					 * Is the size of this allocation request
					 * larger than the largest block size?
					 */
					if (order >= VM_NFREEORDER) {
						/*
						 * Determine if a sufficient number
						 * of subsequent blocks to satisfy
						 * the allocation request are free.
						 */
						pa = VM_PAGE_TO_PHYS(m_ret);
						pa_last = pa + size;
						for (;;) {
							pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
							if (pa >= pa_last)
								break;
							if (pa < seg->start ||
							    pa >= seg->end)
								break;
							m = &seg->first_page[atop(pa - seg->start)];
							if (m->order != VM_NFREEORDER - 1)
								break;
						}
						/* If not, continue to the next block. */
						if (pa < pa_last)
							continue;
					}

					/*
					 * Determine if the blocks are within the given range,
					 * satisfy the given alignment, and do not cross the
					 * given boundary.
					 */
					pa = VM_PAGE_TO_PHYS(m_ret);
					if (pa >= low &&
					    pa + size <= high &&
					    (pa & (alignment - 1)) == 0 &&
					    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
						goto done;
				}
			}
		}
	}
	if (++dom < vm_ndomains)
		goto restartdom;
	return (NULL);
done:
	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
		fl = (*seg->free_queues)[m->pool];
		vm_freelist_rem(fl, m, m->order);
	}
	if (m_ret->pool != VM_FREEPOOL_DEFAULT)
		vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
	fl = (*seg->free_queues)[m_ret->pool];
	vm_phys_split_pages(m_ret, oind, fl, order);
	/* Return excess pages to the free lists. */
	npages_end = roundup2(npages, 1 << imin(oind, order));
	if (npages < npages_end)
		vm_phys_free_contig(&m_ret[npages], npages_end - npages);
	return (m_ret);
}
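
/*
 * Example (sketch, not from the original source): with the free page
 * queues locked, 16 contiguous pages below 4GB, aligned to 64KB and not
 * crossing a 1MB boundary, might be requested as:
 *
 *	m = vm_phys_alloc_contig(16, 0, (vm_paddr_t)1 << 32,
 *	    64 * 1024, 1024 * 1024);
 *
 * Callers normally reach this function through vm_page_alloc_contig().
 */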
#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif