/*-
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *	Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>

#include <vm/vm_domain.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");

#ifdef VM_NUMA_ALLOC
struct mem_affinity *mem_affinity;
int *mem_locality;
#endif

int vm_ndomains = 1;

struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
int vm_phys_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(_vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
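
/*
 * The free queues form a four-dimensional array: one set of buddy queues
 * per (NUMA domain, free list, free pool) triple, with one queue per
 * allocation order.  The entry fl[oind] holds 2^oind-page blocks whose
 * first page has m->order == oind.
 */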

static int vm_nfreelists;

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_ISADMA
#define	VM_ISADMA_BOUNDARY	16777216
#endif
#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY)
CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY);
#endif
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");

#ifdef VM_NUMA_ALLOC
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

/*
 * Default to first-touch + round-robin.
 */
static struct mtx vm_default_policy_mtx;
MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex",
    MTX_DEF);
#ifdef VM_NUMA_ALLOC
static struct vm_domain_policy vm_default_policy =
    VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
#else
/* Use round-robin so the domain policy code will only try once per allocation */
static struct vm_domain_policy vm_default_policy =
    VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0);
#endif

static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
    int order);
static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
    vm_paddr_t boundary);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order);

static int
sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS)
{
	char policy_name[32];
	int error;

	mtx_lock(&vm_default_policy_mtx);

	/* Map policy to output string */
	switch (vm_default_policy.p.policy) {
	case VM_POLICY_FIRST_TOUCH:
		strcpy(policy_name, "first-touch");
		break;
	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
		strcpy(policy_name, "first-touch-rr");
		break;
	case VM_POLICY_ROUND_ROBIN:
	default:
		strcpy(policy_name, "rr");
		break;
	}
	mtx_unlock(&vm_default_policy_mtx);

	error = sysctl_handle_string(oidp, &policy_name[0],
	    sizeof(policy_name), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vm_default_policy_mtx);
	/* Set: match on the subset of policies that make sense as a default */
	if (strcmp("first-touch-rr", policy_name) == 0) {
		vm_domain_policy_set(&vm_default_policy,
		    VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
	} else if (strcmp("first-touch", policy_name) == 0) {
		vm_domain_policy_set(&vm_default_policy,
		    VM_POLICY_FIRST_TOUCH, 0);
	} else if (strcmp("rr", policy_name) == 0) {
		vm_domain_policy_set(&vm_default_policy,
		    VM_POLICY_ROUND_ROBIN, 0);
	} else {
		error = EINVAL;
		goto finish;
	}

	error = 0;
finish:
	mtx_unlock(&vm_default_policy_mtx);
	return (error);
}

SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW,
    0, 0, sysctl_vm_default_policy, "A",
    "Default policy (rr, first-touch, first-touch-rr)");

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
	    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}

static __inline int
vm_rr_selectdomain(void)
{
#ifdef VM_NUMA_ALLOC
	struct thread *td;

	td = curthread;

	td->td_dom_rr_idx++;
	td->td_dom_rr_idx %= vm_ndomains;
	return (td->td_dom_rr_idx);
#else
	return (0);
#endif
}

/*
 * Initialise a VM domain iterator.
 *
 * Check the thread policy, then the proc policy,
 * then default to the system policy.
 *
 * Later on the various layers will have this logic
 * plumbed into them and the phys code will be explicitly
 * handed a VM domain policy to use.
 */
static void
vm_policy_iterator_init(struct vm_domain_iterator *vi)
{
#ifdef VM_NUMA_ALLOC
	struct vm_domain_policy lcl;
#endif

	vm_domain_iterator_init(vi);

#ifdef VM_NUMA_ALLOC
	/* Copy out the thread policy */
	vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy);
	if (lcl.p.policy != VM_POLICY_NONE) {
		/* Thread policy is present; use it */
		vm_domain_iterator_set_policy(vi, &lcl);
		return;
	}

	vm_domain_policy_localcopy(&lcl,
	    &curthread->td_proc->p_vm_dom_policy);
	if (lcl.p.policy != VM_POLICY_NONE) {
		/* Process policy is present; use it */
		vm_domain_iterator_set_policy(vi, &lcl);
		return;
	}
#endif
	/* Use system default policy */
	vm_domain_iterator_set_policy(vi, &vm_default_policy);
}

static void
vm_policy_iterator_finish(struct vm_domain_iterator *vi)
{

	vm_domain_iterator_cleanup(vi);
}

boolean_t
vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
{
	struct vm_phys_seg *s;
	int idx;

	while ((idx = ffsl(mask)) != 0) {
		idx--;	/* ffsl counts from 1 */
		mask &= ~(1UL << idx);
		s = &vm_phys_segs[idx];
		if (low < s->end && high > s->start)
			return (TRUE);
	}
	return (FALSE);
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f, int t)
{

#ifdef VM_NUMA_ALLOC
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}
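
/*
 * The locality table is stored row-major: the distance from domain f to
 * domain t lives at index (f * vm_ndomains + t).  For example, with two
 * domains the table {10, 21, 21, 10} would encode a local distance of 10
 * and a remote distance of 21 in both directions.
 */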

#ifdef VM_NUMA_ALLOC
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

	m->order = order;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef VM_NUMA_ALLOC
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef VM_FREELIST_ISADMA
	if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY);
		paddr = VM_ISADMA_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	u_long npages;
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
	npages = 0;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef VM_FREELIST_ISADMA
		if (seg->end <= VM_ISADMA_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1;
		else
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef VM_FREELIST_DMA32
		if (
#ifdef VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
			npages += atop(seg->end - seg->start);
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef VM_FREELIST_ISADMA
		if (seg->end <= VM_ISADMA_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_ISADMA];
			KASSERT(flind >= 0,
			    ("vm_phys_init: ISADMA flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, 0);
	}
}
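
/*
 * Worked example: splitting an order-3 block (8 pages) to satisfy an
 * order-0 request frees the upper halves at successively smaller orders:
 * pages 4-7 go back at order 2, pages 2-3 at order 1, and page 1 at
 * order 0, leaving page 0 as the allocation.
 */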

/*
 * Initialize a physical page and add it to the free lists.
 */
void
vm_phys_add_page(vm_paddr_t pa)
{
	vm_page_t m;
	struct vm_domain *vmd;

	vm_cnt.v_page_count++;
	m = vm_phys_paddr_to_vm_page(pa);
	m->busy_lock = VPB_UNBUSIED;
	m->phys_addr = pa;
	m->queue = PQ_NONE;
	m->segind = vm_phys_paddr_to_segind(pa);
	vmd = vm_phys_domain(m);
	vmd->vmd_page_count++;
	vmd->vmd_segs |= 1UL << m->segind;
	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_add_page: page %p has unexpected order %d",
	    m, m->order));
	m->pool = VM_FREEPOOL_DEFAULT;
	pmap_page_init(m);
	mtx_lock(&vm_page_queue_free_mtx);
	vm_phys_freecnt_adj(m, 1);
	vm_phys_free_pages(m, 0);
	mtx_unlock(&vm_page_queue_free_mtx);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int pool, int order)
{
	vm_page_t m;
	int domain, flind;
	struct vm_domain_iterator vi;

	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_pages: order %d is out of range", order));

	vm_policy_iterator_init(&vi);

	while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			m = vm_phys_alloc_domain_pages(domain, flind, pool,
			    order);
			if (m != NULL)
				return (m);
		}
	}

	vm_policy_iterator_finish(&vi);
	return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
{
	vm_page_t m;
	struct vm_domain_iterator vi;
	int domain;

	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	vm_policy_iterator_init(&vi);

	while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
		m = vm_phys_alloc_domain_pages(domain,
		    vm_freelist_to_flind[freelist], pool, order);
		if (m != NULL)
			return (m);
	}

	vm_policy_iterator_finish(&vi);
	return (NULL);
}

static vm_page_t
vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order)
{
	struct vm_freelist *fl;
	struct vm_freelist *alt;
	int oind, pind;
	vm_page_t m;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			vm_phys_split_pages(m, oind, fl, order);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_set_pool(pool, m, oind);
				vm_phys_split_pages(m, oind, fl, order);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (&seg->first_page[atop(pa - seg->start)]);
	}
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {

alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK | M_ZERO);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif

	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg == NULL || seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}

/*
 * Find the segment containing the given physical address.
 */
static int
vm_phys_paddr_to_segind(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (segind);
	}
	panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment",
	    (uintmax_t)pa);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(m->pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	seg = &vm_phys_segs[m->segind];
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = &seg->first_page[atop(pa - seg->start)];
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[m->pool];
	vm_freelist_add(fl, m, order, 1);
}
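
/*
 * Note on the buddy computation above: for an order-k block, the buddy's
 * address differs from the block's address in exactly bit (PAGE_SHIFT + k),
 * so "pa ^= 1 << (PAGE_SHIFT + order)" flips between a block and its buddy.
 * After a successful merge, order has been incremented, and masking pa with
 * ~((1 << (PAGE_SHIFT + order)) - 1) yields the start of the merged,
 * next-higher-order block.
 */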

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	u_int n;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	for (;; npages -= n) {
		/*
		 * Unsigned "min" is used here so that "order" is assigned
		 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
		 * or the low-order bits of its physical address are zero
		 * because the size of a physical address exceeds the size of
		 * a long.
		 */
		order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
		    VM_NFREEORDER - 1);
		n = 1 << order;
		if (npages < n)
			break;
		vm_phys_free_pages(m, order);
		m += n;
	}
	/* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
	for (; npages > 0; npages -= n) {
		order = flsl(npages) - 1;
		n = 1 << order;
		vm_phys_free_pages(m, order);
		m += n;
	}
}
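
/*
 * Worked example (assuming PAGE_SHIFT == 12): freeing npages = 13 starting
 * at a 16-page-aligned address frees an order-3 block (8 pages), then an
 * order-2 block (4 pages), then an order-0 block (1 page), rather than 13
 * separate order-0 frees that would immediately have to be coalesced again.
 */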

/*
 * Scan physical memory between the specified addresses "low" and "high" for a
 * run of contiguous physical pages that satisfy the specified conditions, and
 * return the lowest page in the run.  The specified "alignment" determines
 * the alignment of the lowest physical page in the run.  If the specified
 * "boundary" is non-zero, then the run of physical pages cannot span a
 * physical address that is a multiple of "boundary".
 *
 * "npages" must be greater than zero.  Both "alignment" and "boundary" must
 * be a power of two.
 */
vm_page_t
vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary, int options)
{
	vm_paddr_t pa_end;
	vm_page_t m_end, m_run, m_start;
	struct vm_phys_seg *seg;
	int segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	if (low >= high)
		return (NULL);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high)
			break;
		if (low >= seg->end)
			continue;
		if (low <= seg->start)
			m_start = seg->first_page;
		else
			m_start = &seg->first_page[atop(low - seg->start)];
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
			continue;
		m_end = &seg->first_page[atop(pa_end - seg->start)];
		m_run = vm_page_scan_contig(npages, m_start, m_end,
		    alignment, boundary, options);
		if (m_run != NULL)
			return (m_run);
	}
	return (NULL);
}

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
 * FALSE, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
boolean_t
vm_phys_unfree_page(vm_page_t m)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_half;
	vm_page_t m_set, m_tmp;
	int order;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	seg = &vm_phys_segs[m->segind];
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = &seg->first_page[atop(pa - seg->start)];
		else
			return (FALSE);
	}
	if (m_set->order < order)
		return (FALSE);
	if (m_set->order == VM_NFREEORDER)
		return (FALSE);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
		else {
			m_tmp = m_set;
			m_set = &seg->first_page[atop(pa_half - seg->start)];
		}
		vm_freelist_add(fl, m_tmp, order, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (TRUE);
}
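
/*
 * Example of the halving loop above: if "m" is page 5 of a free order-3
 * block starting at page 0, the loop returns pages 0-3 (order 2), then
 * pages 6-7 (order 1), then page 4 (order 0) to the free lists, leaving
 * m_set == m == page 5.
 */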

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa_end, pa_start;
	vm_page_t m_run;
	struct vm_domain_iterator vi;
	struct vm_phys_seg *seg;
	int domain, segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	if (low >= high)
		return (NULL);
	vm_policy_iterator_init(&vi);
restartdom:
	if (vm_domain_iterator_run(&vi, &domain) != 0) {
		vm_policy_iterator_finish(&vi);
		return (NULL);
	}
	m_run = NULL;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high || seg->domain != domain)
			continue;
		if (low >= seg->end)
			break;
		if (low <= seg->start)
			pa_start = seg->start;
		else
			pa_start = low;
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - pa_start < ptoa(npages))
			continue;
		m_run = vm_phys_alloc_seg_contig(seg, npages, low, high,
		    alignment, boundary);
		if (m_run != NULL)
			break;
	}
	if (m_run == NULL && !vm_domain_iterator_isdone(&vi))
		goto restartdom;
	vm_policy_iterator_finish(&vi);
	return (m_run);
}

/*
 * Allocate a run of contiguous physical pages from the free list for the
 * specified segment.
 */
static vm_page_t
vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	vm_paddr_t pa, pa_end, size;
	vm_page_t m, m_ret;
	u_long npages_end;
	int oind, order, pind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	/* Compute the queue that is the best fit for npages. */
	for (order = 0; (1 << order) < npages; order++);
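	/*
	 * E.g., npages = 5 yields order = 3, the smallest order whose
	 * block size (8 pages) is at least npages.
	 */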
	/* Search for a run satisfying the specified conditions. */
	size = npages << PAGE_SHIFT;
	for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
	    oind++) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			fl = (*seg->free_queues)[pind];
			TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
				/*
				 * Is the size of this allocation request
				 * larger than the largest block size?
				 */
				if (order >= VM_NFREEORDER) {
					/*
					 * Determine if a sufficient number of
					 * subsequent blocks to satisfy the
					 * allocation request are free.
					 */
					pa = VM_PAGE_TO_PHYS(m_ret);
					pa_end = pa + size;
					for (;;) {
						pa += 1 << (PAGE_SHIFT +
						    VM_NFREEORDER - 1);
						if (pa >= pa_end ||
						    pa < seg->start ||
						    pa >= seg->end)
							break;
						m = &seg->first_page[atop(pa -
						    seg->start)];
						if (m->order != VM_NFREEORDER -
						    1)
							break;
					}
					/* If not, go to the next block. */
					if (pa < pa_end)
						continue;
				}

				/*
				 * Determine if the blocks are within the
				 * given range, satisfy the given alignment,
				 * and do not cross the given boundary.
				 */
				pa = VM_PAGE_TO_PHYS(m_ret);
				pa_end = pa + size;
				if (pa >= low && pa_end <= high &&
				    (pa & (alignment - 1)) == 0 &&
				    rounddown2(pa ^ (pa_end - 1),
				    boundary) == 0)
					goto done;
			}
		}
	}
	return (NULL);
done:
	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
		fl = (*seg->free_queues)[m->pool];
		vm_freelist_rem(fl, m, m->order);
	}
	if (m_ret->pool != VM_FREEPOOL_DEFAULT)
		vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
	fl = (*seg->free_queues)[m_ret->pool];
	vm_phys_split_pages(m_ret, oind, fl, order);
	/* Return excess pages to the free lists. */
	npages_end = roundup2(npages, 1 << imin(oind, order));
	if (npages < npages_end)
		vm_phys_free_contig(&m_ret[npages], npages_end - npages);
	return (m_ret);
}
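
/*
 * A note on the boundary test above: "rounddown2(pa ^ (pa_end - 1),
 * boundary) == 0" holds exactly when the first and last byte of the run
 * agree in every address bit at or above the boundary bit, i.e. when the
 * run does not cross a "boundary"-aligned address.
 */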

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif