/*-
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_reserv.h>

/*
 * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
 * domain.  These extra lists are stored at the end of the regular
 * free lists starting with VM_NFREELIST.
 */
#define VM_RAW_NFREELIST        (VM_NFREELIST + VM_NDOMAIN - 1)
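
/*
 * Illustrative layout (not part of the original file, and assuming
 * VM_FREELIST_DEFAULT == 0): with VM_NFREELIST == 2 and VM_NDOMAIN == 3,
 * VM_RAW_NFREELIST == 4.  Queues 0 and 1 are the regular free lists,
 * with queue 0 holding domain 0's default-list memory, while queues 2
 * and 3 hold the VM_FREELIST_DEFAULT memory of domains 1 and 2.
 */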

struct vm_freelist {
        struct pglist pl;       /* list of free blocks of one order */
        int lcnt;               /* length of pl */
};

struct vm_phys_seg {
        vm_paddr_t      start;  /* first physical address in the segment */
        vm_paddr_t      end;    /* last physical address, exclusive */
        vm_page_t       first_page;     /* vm_page for the page at "start" */
        int             domain; /* memory domain owning the segment */
        struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
};

struct mem_affinity *mem_affinity;

static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];

static int vm_phys_nsegs;

#define VM_PHYS_FICTITIOUS_NSEGS        8
static struct vm_phys_fictitious_seg {
        vm_paddr_t      start;
        vm_paddr_t      end;
        vm_page_t       first_page;
} vm_phys_fictitious_segs[VM_PHYS_FICTITIOUS_NSEGS];
static struct mtx vm_phys_fictitious_reg_mtx;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist
    vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
static struct vm_freelist
(*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];

static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;

static int cnt_prezero;
SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
    &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");

#if VM_NDOMAIN > 1
static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
#endif

static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
    int order);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
    int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order);

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sbuf;
        struct vm_freelist *fl;
        int error, flind, oind, pind;

        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
        for (flind = 0; flind < vm_nfreelists; flind++) {
                sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
                    "\n  ORDER (SIZE)  |  NUMBER"
                    "\n              ", flind);
                for (pind = 0; pind < VM_NFREEPOOL; pind++)
                        sbuf_printf(&sbuf, "  |  POOL %d", pind);
                sbuf_printf(&sbuf, "\n--            ");
                for (pind = 0; pind < VM_NFREEPOOL; pind++)
                        sbuf_printf(&sbuf, "-- --      ");
                sbuf_printf(&sbuf, "--\n");
                for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
                        sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
                            1 << (PAGE_SHIFT - 10 + oind));
                        for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                fl = vm_phys_free_queues[flind][pind];
                                sbuf_printf(&sbuf, "  |  %6d", fl[oind].lcnt);
                        }
                        sbuf_printf(&sbuf, "\n");
                }
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}
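
/*
 * Illustrative "sysctl vm.phys_free" output (all counts invented; the
 * exact columns depend on VM_NFREEPOOL and VM_NFREEORDER):
 *
 * FREE LIST 0:
 *
 *   ORDER (SIZE)  |  NUMBER
 *                 |  POOL 0  |  POOL 1  |  POOL 2
 * --            -- --      -- --      -- --      --
 *   12 ( 16384K)  |      31  |       0  |       0
 *   ...
 *    0 (     4K)  |     183  |      12  |       5
 */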

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sbuf;
        struct vm_phys_seg *seg;
        int error, segind;

        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
                seg = &vm_phys_segs[segind];
                sbuf_printf(&sbuf, "start:     %#jx\n",
                    (uintmax_t)seg->start);
                sbuf_printf(&sbuf, "end:       %#jx\n",
                    (uintmax_t)seg->end);
                sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
                sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}

#if VM_NDOMAIN > 1
/*
 * Outputs the set of free list lookup lists.
 */
static int
sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sbuf;
        int domain, error, flind, ndomains;

        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
        ndomains = vm_nfreelists - VM_NFREELIST + 1;
        for (domain = 0; domain < ndomains; domain++) {
                sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain);
                for (flind = 0; flind < vm_nfreelists; flind++)
                        sbuf_printf(&sbuf, "  [%d]:\t%p\n", flind,
                            vm_phys_lookup_lists[domain][flind]);
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}
#endif

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
{
        struct vm_phys_seg *seg;
#ifdef VM_PHYSSEG_SPARSE
        long pages;
        int segind;

        pages = 0;
        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                seg = &vm_phys_segs[segind];
                pages += atop(seg->end - seg->start);
        }
#endif
        KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
            ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
        seg = &vm_phys_segs[vm_phys_nsegs++];
        seg->start = start;
        seg->end = end;
        seg->domain = domain;
#ifdef VM_PHYSSEG_SPARSE
        seg->first_page = &vm_page_array[pages];
#else
        seg->first_page = PHYS_TO_VM_PAGE(start);
#endif
#if VM_NDOMAIN > 1
        /*
         * Redirect the default free list for a non-zero domain to
         * that domain's private list, stored after the regular lists.
         */
        if (flind == VM_FREELIST_DEFAULT && domain != 0) {
                flind = VM_NFREELIST + (domain - 1);
                if (flind >= vm_nfreelists)
                        vm_nfreelists = flind + 1;
        }
#endif
        seg->free_queues = &vm_phys_free_queues[flind];
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
{
        int i;

        if (mem_affinity == NULL) {
                _vm_phys_create_seg(start, end, flind, 0);
                return;
        }

        for (i = 0;; i++) {
                if (mem_affinity[i].end == 0)
                        panic("Reached end of affinity info");
                if (mem_affinity[i].end <= start)
                        continue;
                if (mem_affinity[i].start > start)
                        panic("No affinity info for start %jx",
                            (uintmax_t)start);
                if (mem_affinity[i].end >= end) {
                        _vm_phys_create_seg(start, end, flind,
                            mem_affinity[i].domain);
                        break;
                }
                _vm_phys_create_seg(start, mem_affinity[i].end, flind,
                    mem_affinity[i].domain);
                start = mem_affinity[i].end;
        }
}
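
/*
 * Illustrative example (values invented): with mem_affinity[] =
 * { { 0, 4GB, domain 0 }, { 4GB, 8GB, domain 1 }, { 0, 0, 0 } },
 * a request covering [3GB, 5GB) is split at the affinity boundary
 * into two segments: [3GB, 4GB) in domain 0 and [4GB, 5GB) in
 * domain 1.
 */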

/*
 * Initialize the physical memory allocator.
 */
void
vm_phys_init(void)
{
        struct vm_freelist *fl;
        int flind, i, oind, pind;
#if VM_NDOMAIN > 1
        int ndomains, j;
#endif

        for (i = 0; phys_avail[i + 1] != 0; i += 2) {
#ifdef  VM_FREELIST_ISADMA
                /* 16777216 bytes == 16MB, the ISA DMA address limit. */
                if (phys_avail[i] < 16777216) {
                        if (phys_avail[i + 1] > 16777216) {
                                vm_phys_create_seg(phys_avail[i], 16777216,
                                    VM_FREELIST_ISADMA);
                                vm_phys_create_seg(16777216, phys_avail[i + 1],
                                    VM_FREELIST_DEFAULT);
                        } else {
                                vm_phys_create_seg(phys_avail[i],
                                    phys_avail[i + 1], VM_FREELIST_ISADMA);
                        }
                        if (VM_FREELIST_ISADMA >= vm_nfreelists)
                                vm_nfreelists = VM_FREELIST_ISADMA + 1;
                } else
#endif
#ifdef  VM_FREELIST_HIGHMEM
                if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
                        if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
                                vm_phys_create_seg(phys_avail[i],
                                    VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
                                vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
                                    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
                        } else {
                                vm_phys_create_seg(phys_avail[i],
                                    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
                        }
                        if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
                                vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
                } else
#endif
                vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
                    VM_FREELIST_DEFAULT);
        }
        for (flind = 0; flind < vm_nfreelists; flind++) {
                for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                        fl = vm_phys_free_queues[flind][pind];
                        for (oind = 0; oind < VM_NFREEORDER; oind++)
                                TAILQ_INIT(&fl[oind].pl);
                }
        }
#if VM_NDOMAIN > 1
        /*
         * Build a free list lookup list for each domain.  All of the
         * memory domain lists are inserted at the VM_FREELIST_DEFAULT
         * index in a round-robin order starting with the current
         * domain.
         */
        ndomains = vm_nfreelists - VM_NFREELIST + 1;
        for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
                for (i = 0; i < ndomains; i++)
                        vm_phys_lookup_lists[i][flind] =
                            &vm_phys_free_queues[flind];
        for (i = 0; i < ndomains; i++)
                for (j = 0; j < ndomains; j++) {
                        flind = (i + j) % ndomains;
                        if (flind == 0)
                                flind = VM_FREELIST_DEFAULT;
                        else
                                flind += VM_NFREELIST - 1;
                        vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
                            &vm_phys_free_queues[flind];
                }
        for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
             flind++)
                for (i = 0; i < ndomains; i++)
                        vm_phys_lookup_lists[i][flind + ndomains - 1] =
                            &vm_phys_free_queues[flind];
#else
        for (flind = 0; flind < vm_nfreelists; flind++)
                vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
#endif

        mtx_init(&vm_phys_fictitious_reg_mtx, "vmfctr", NULL, MTX_DEF);
}
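
/*
 * Illustrative lookup-list contents (assuming VM_FREELIST_DEFAULT == 0,
 * VM_NFREELIST == 2, and two domains, so ndomains == 2): raw queue 0 is
 * domain 0's default list, queue 1 the second regular list, and queue 2
 * domain 1's default list.  Writing "queues" for vm_phys_free_queues,
 * the loops above then produce
 *
 *      vm_phys_lookup_lists[0] = { &queues[0], &queues[2], &queues[1] }
 *      vm_phys_lookup_lists[1] = { &queues[2], &queues[0], &queues[1] }
 *
 * so each domain tries its own default memory first and the other
 * domain's second.
 */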

/*
 * Split a contiguous, power of two-sized set of physical pages.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
{
        vm_page_t m_buddy;

        while (oind > order) {
                oind--;
                m_buddy = &m[1 << oind];
                KASSERT(m_buddy->order == VM_NFREEORDER,
                    ("vm_phys_split_pages: page %p has unexpected order %d",
                    m_buddy, m_buddy->order));
                m_buddy->order = oind;
                TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
                fl[oind].lcnt++;
        }
}
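
/*
 * Illustrative example: splitting an order-3 block at "m" down to an
 * order-0 request returns m[4] to the order-2 list, m[2] to the
 * order-1 list, and m[1] to the order-0 list, leaving m itself as
 * the order-0 allocation.
 */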

/*
 * Initialize a physical page and add it to the free lists.
 */
void
vm_phys_add_page(vm_paddr_t pa)
{
        vm_page_t m;

        cnt.v_page_count++;
        m = vm_phys_paddr_to_vm_page(pa);
        m->phys_addr = pa;
        m->queue = PQ_NONE;
        m->segind = vm_phys_paddr_to_segind(pa);
        m->flags = PG_FREE;
        KASSERT(m->order == VM_NFREEORDER,
            ("vm_phys_add_page: page %p has unexpected order %d",
            m, m->order));
        m->pool = VM_FREEPOOL_DEFAULT;
        pmap_page_init(m);
        mtx_lock(&vm_page_queue_free_mtx);
        cnt.v_free_count++;
        vm_phys_free_pages(m, 0);
        mtx_unlock(&vm_page_queue_free_mtx);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int pool, int order)
{
        vm_page_t m;
        int domain, flind;

        KASSERT(pool < VM_NFREEPOOL,
            ("vm_phys_alloc_pages: pool %d is out of range", pool));
        KASSERT(order < VM_NFREEORDER,
            ("vm_phys_alloc_pages: order %d is out of range", order));

#if VM_NDOMAIN > 1
        domain = PCPU_GET(domain);
#else
        domain = 0;
#endif
        for (flind = 0; flind < vm_nfreelists; flind++) {
                m = vm_phys_alloc_domain_pages(domain, flind, pool, order);
                if (m != NULL)
                        return (m);
        }
        return (NULL);
}

/*
 * Find and dequeue a free page of the specified pool and order from the
 * given free list.
 */
vm_page_t
vm_phys_alloc_freelist_pages(int flind, int pool, int order)
{
#if VM_NDOMAIN > 1
        vm_page_t m;
        int i, ndomains;
#endif
        int domain;

        KASSERT(flind < VM_NFREELIST,
            ("vm_phys_alloc_freelist_pages: freelist %d is out of range", flind));
        KASSERT(pool < VM_NFREEPOOL,
            ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
        KASSERT(order < VM_NFREEORDER,
            ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

#if VM_NDOMAIN > 1
        /*
         * This routine expects to be called with a VM_FREELIST_* constant.
         * On a system with multiple domains we need to adjust the flind
         * appropriately.  If it is for VM_FREELIST_DEFAULT we need to
         * iterate over the per-domain lists.
         */
        domain = PCPU_GET(domain);
        ndomains = vm_nfreelists - VM_NFREELIST + 1;
        if (flind == VM_FREELIST_DEFAULT) {
                m = NULL;
                for (i = 0; i < ndomains; i++, flind++) {
                        m = vm_phys_alloc_domain_pages(domain, flind, pool,
                            order);
                        if (m != NULL)
                                break;
                }
                return (m);
        } else if (flind > VM_FREELIST_DEFAULT)
                flind += ndomains - 1;
#else
        domain = 0;
#endif
        return (vm_phys_alloc_domain_pages(domain, flind, pool, order));
}
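
/*
 * Illustrative mapping (assuming VM_FREELIST_DEFAULT == 0,
 * VM_NFREELIST == 2, and ndomains == 2): lookup-list slots 0 and 1
 * hold the per-domain default lists, so a caller passing the constant
 * for the second regular list (flind == 1) is shifted by ndomains - 1
 * to slot 2, where that list actually resides.
 */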

static vm_page_t
vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order)
{
        struct vm_freelist *fl;
        struct vm_freelist *alt;
        int oind, pind;
        vm_page_t m;

        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        fl = (*vm_phys_lookup_lists[domain][flind])[pool];
        for (oind = order; oind < VM_NFREEORDER; oind++) {
                m = TAILQ_FIRST(&fl[oind].pl);
                if (m != NULL) {
                        TAILQ_REMOVE(&fl[oind].pl, m, pageq);
                        fl[oind].lcnt--;
                        m->order = VM_NFREEORDER;
                        vm_phys_split_pages(m, oind, fl, order);
                        return (m);
                }
        }

        /*
         * The given pool was empty.  Find the largest
         * contiguous, power-of-two-sized set of pages in any
         * pool.  Transfer these pages to the given pool, and
         * use them to satisfy the allocation.
         */
        for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
                for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                        alt = (*vm_phys_lookup_lists[domain][flind])[pind];
                        m = TAILQ_FIRST(&alt[oind].pl);
                        if (m != NULL) {
                                TAILQ_REMOVE(&alt[oind].pl, m, pageq);
                                alt[oind].lcnt--;
                                m->order = VM_NFREEORDER;
                                vm_phys_set_pool(pool, m, oind);
                                vm_phys_split_pages(m, oind, fl, order);
                                return (m);
                        }
                }
        }
        return (NULL);
}

/*
 * Allocate physical memory from phys_avail[], a zero-terminated array
 * of {start, end} physical address pairs.
 */
vm_paddr_t
vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment)
{
        vm_paddr_t pa;
        int i;

        size = round_page(size);
        for (i = 0; phys_avail[i + 1] != 0; i += 2) {
                if (phys_avail[i + 1] - phys_avail[i] < size)
                        continue;
                pa = phys_avail[i];
                phys_avail[i] += size;
                return (pa);
        }
        panic("vm_phys_bootstrap_alloc");
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
        struct vm_phys_seg *seg;
        int segind;

        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                seg = &vm_phys_segs[segind];
                if (pa >= seg->start && pa < seg->end)
                        return (&seg->first_page[atop(pa - seg->start)]);
        }
        return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
        struct vm_phys_fictitious_seg *seg;
        vm_page_t m;
        int segind;

        m = NULL;
        for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
                seg = &vm_phys_fictitious_segs[segind];
                if (pa >= seg->start && pa < seg->end) {
                        m = &seg->first_page[atop(pa - seg->start)];
                        KASSERT((m->flags & PG_FICTITIOUS) != 0,
                            ("%p not fictitious", m));
                        break;
                }
        }
        return (m);
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
        struct vm_phys_fictitious_seg *seg;
        vm_page_t fp;
        long i, page_count;
        int segind;
#ifdef VM_PHYSSEG_DENSE
        long pi;
        boolean_t malloced;
#endif

        page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
        /*
         * Reuse entries from the static vm_page_array when the range
         * falls inside it; otherwise, allocate fake page structures.
         */
        pi = atop(start);
        if (pi >= first_page && atop(end) < vm_page_array_size) {
                fp = &vm_page_array[pi - first_page];
                malloced = FALSE;
        } else
#endif
        {
                fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
                    M_WAITOK | M_ZERO);
#ifdef VM_PHYSSEG_DENSE
                malloced = TRUE;
#endif
        }
        for (i = 0; i < page_count; i++) {
                vm_page_initfake(&fp[i], start + PAGE_SIZE * i, memattr);
                pmap_page_init(&fp[i]);
                fp[i].oflags &= ~(VPO_BUSY | VPO_UNMANAGED);
        }
        mtx_lock(&vm_phys_fictitious_reg_mtx);
        for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
                seg = &vm_phys_fictitious_segs[segind];
                if (seg->start == 0 && seg->end == 0) {
                        seg->start = start;
                        seg->end = end;
                        seg->first_page = fp;
                        mtx_unlock(&vm_phys_fictitious_reg_mtx);
                        return (0);
                }
        }
        mtx_unlock(&vm_phys_fictitious_reg_mtx);
#ifdef VM_PHYSSEG_DENSE
        if (malloced)
#endif
                free(fp, M_FICT_PAGES);
        return (EBUSY);
}
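
/*
 * Illustrative (hypothetical) driver usage, registering a device
 * memory aperture so that it can be covered by fake pages; the names
 * "mmio_base" and "mmio_size" are invented for this sketch:
 *
 *      error = vm_phys_fictitious_reg_range(mmio_base,
 *          mmio_base + mmio_size, VM_MEMATTR_UNCACHEABLE);
 *      if (error == 0)
 *              m = vm_phys_fictitious_to_vm_page(mmio_base);
 */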

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
        struct vm_phys_fictitious_seg *seg;
        vm_page_t fp;
        int segind;
#ifdef VM_PHYSSEG_DENSE
        long pi;
#endif

#ifdef VM_PHYSSEG_DENSE
        pi = atop(start);
#endif

        mtx_lock(&vm_phys_fictitious_reg_mtx);
        for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
                seg = &vm_phys_fictitious_segs[segind];
                if (seg->start == start && seg->end == end) {
                        seg->start = seg->end = 0;
                        fp = seg->first_page;
                        seg->first_page = NULL;
                        mtx_unlock(&vm_phys_fictitious_reg_mtx);
#ifdef VM_PHYSSEG_DENSE
                        if (pi < first_page || atop(end) >= vm_page_array_size)
#endif
                                free(fp, M_FICT_PAGES);
                        return;
                }
        }
        mtx_unlock(&vm_phys_fictitious_reg_mtx);
        KASSERT(0, ("Unregistering not registered fictitious range"));
}

/*
 * Find the segment containing the given physical address.
 */
static int
vm_phys_paddr_to_segind(vm_paddr_t pa)
{
        struct vm_phys_seg *seg;
        int segind;

        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                seg = &vm_phys_segs[segind];
                if (pa >= seg->start && pa < seg->end)
                        return (segind);
        }
        panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment",
            (uintmax_t)pa);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        vm_paddr_t pa, pa_buddy;
        vm_page_t m_buddy;

        KASSERT(m->order == VM_NFREEORDER,
            ("vm_phys_free_pages: page %p has unexpected order %d",
            m, m->order));
        KASSERT(m->pool < VM_NFREEPOOL,
            ("vm_phys_free_pages: page %p has unexpected pool %d",
            m, m->pool));
        KASSERT(order < VM_NFREEORDER,
            ("vm_phys_free_pages: order %d is out of range", order));
        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        pa = VM_PAGE_TO_PHYS(m);
        seg = &vm_phys_segs[m->segind];
        while (order < VM_NFREEORDER - 1) {
                /* The buddy differs only in bit (PAGE_SHIFT + order). */
                pa_buddy = pa ^ (1 << (PAGE_SHIFT + order));
                if (pa_buddy < seg->start ||
                    pa_buddy >= seg->end)
                        break;
                m_buddy = &seg->first_page[atop(pa_buddy - seg->start)];
                if (m_buddy->order != order)
                        break;
                fl = (*seg->free_queues)[m_buddy->pool];
                TAILQ_REMOVE(&fl[m_buddy->order].pl, m_buddy, pageq);
                fl[m_buddy->order].lcnt--;
                m_buddy->order = VM_NFREEORDER;
                if (m_buddy->pool != m->pool)
                        vm_phys_set_pool(m->pool, m_buddy, order);
                order++;
                pa &= ~((1 << (PAGE_SHIFT + order)) - 1);
                m = &seg->first_page[atop(pa - seg->start)];
        }
        m->order = order;
        fl = (*seg->free_queues)[m->pool];
        TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
        fl[order].lcnt++;
}
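
/*
 * Illustrative coalescing walk (assuming PAGE_SHIFT == 12): freeing
 * the order-0 page at 0x5000 first checks its buddy at
 * 0x5000 ^ 0x1000 == 0x4000; if that page is free at order 0, the
 * pair merges into an order-1 block at 0x4000, whose buddy is
 * 0x4000 ^ 0x2000 == 0x6000, and so on until a buddy is missing or
 * the maximum order is reached.
 */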

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
        vm_page_t m_tmp;

        for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
                m_tmp->pool = pool;
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
 * FALSE, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
boolean_t
vm_phys_unfree_page(vm_page_t m)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        vm_paddr_t pa, pa_half;
        vm_page_t m_set, m_tmp;
        int order;

        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);

        /*
         * First, find the contiguous, power of two-sized set of free
         * physical pages containing the given physical page "m" and
         * assign it to "m_set".
         */
        seg = &vm_phys_segs[m->segind];
        for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
            order < VM_NFREEORDER - 1; ) {
                order++;
                pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
                if (pa >= seg->start)
                        m_set = &seg->first_page[atop(pa - seg->start)];
                else
                        return (FALSE);
        }
        if (m_set->order < order)
                return (FALSE);
        if (m_set->order == VM_NFREEORDER)
                return (FALSE);
        KASSERT(m_set->order < VM_NFREEORDER,
            ("vm_phys_unfree_page: page %p has unexpected order %d",
            m_set, m_set->order));

        /*
         * Next, remove "m_set" from the free lists.  Finally, extract
         * "m" from "m_set" using an iterative algorithm: While "m_set"
         * is larger than a page, shrink "m_set" by returning the half
         * of "m_set" that does not contain "m" to the free lists.
         */
        fl = (*seg->free_queues)[m_set->pool];
        order = m_set->order;
        TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
        fl[order].lcnt--;
        m_set->order = VM_NFREEORDER;
        while (order > 0) {
                order--;
                pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
                if (m->phys_addr < pa_half)
                        m_tmp = &seg->first_page[atop(pa_half - seg->start)];
                else {
                        m_tmp = m_set;
                        m_set = &seg->first_page[atop(pa_half - seg->start)];
                }
                m_tmp->order = order;
                TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
                fl[order].lcnt++;
        }
        KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
        return (TRUE);
}
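
/*
 * Illustrative extraction (assuming PAGE_SHIFT == 12): unfreeing the
 * page at 0x3000 from a free order-2 block at 0x0000 removes the
 * whole block, returns [0x0000, 0x2000) to the order-1 list, then
 * returns the page at 0x2000 to the order-0 list, leaving only
 * 0x3000 allocated.
 */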

/*
 * Try to zero one physical page.  Used by an idle priority thread.
 */
boolean_t
vm_phys_zero_pages_idle(void)
{
        static struct vm_freelist *fl = vm_phys_free_queues[0][0];
        static int flind, oind, pind;
        vm_page_t m, m_tmp;

        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        for (;;) {
                TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
                        for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
                                if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
                                        vm_phys_unfree_page(m_tmp);
                                        cnt.v_free_count--;
                                        mtx_unlock(&vm_page_queue_free_mtx);
                                        pmap_zero_page_idle(m_tmp);
                                        m_tmp->flags |= PG_ZERO;
                                        mtx_lock(&vm_page_queue_free_mtx);
                                        cnt.v_free_count++;
                                        vm_phys_free_pages(m_tmp, 0);
                                        vm_page_zero_count++;
                                        cnt_prezero++;
                                        return (TRUE);
                                }
                        }
                }
                oind++;
                if (oind == VM_NFREEORDER) {
                        oind = 0;
                        pind++;
                        if (pind == VM_NFREEPOOL) {
                                pind = 0;
                                flind++;
                                if (flind == vm_nfreelists)
                                        flind = 0;
                        }
                        fl = vm_phys_free_queues[flind][pind];
                }
        }
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
    unsigned long alignment, unsigned long boundary)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        struct vnode *vp;
        vm_paddr_t pa, pa_last, size;
        vm_page_t deferred_vdrop_list, m, m_ret;
        int domain, flind, i, oind, order, pind;

#if VM_NDOMAIN > 1
        domain = PCPU_GET(domain);
#else
        domain = 0;
#endif
        size = npages << PAGE_SHIFT;
        KASSERT(size != 0,
            ("vm_phys_alloc_contig: size must not be 0"));
        KASSERT((alignment & (alignment - 1)) == 0,
            ("vm_phys_alloc_contig: alignment must be a power of 2"));
        KASSERT((boundary & (boundary - 1)) == 0,
            ("vm_phys_alloc_contig: boundary must be a power of 2"));
        deferred_vdrop_list = NULL;
        /* Compute the queue that is the best fit for npages. */
        for (order = 0; (1 << order) < npages; order++);
        mtx_lock(&vm_page_queue_free_mtx);
#if VM_NRESERVLEVEL > 0
retry:
#endif
        for (flind = 0; flind < vm_nfreelists; flind++) {
                for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
                        for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                fl = (*vm_phys_lookup_lists[domain][flind])
                                    [pind];
                                TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
                                        /*
                                         * A free list may contain physical pages
                                         * from one or more segments.
                                         */
                                        seg = &vm_phys_segs[m_ret->segind];
                                        if (seg->start > high ||
                                            low >= seg->end)
                                                continue;

                                        /*
                                         * Is the size of this allocation request
                                         * larger than the largest block size?
                                         */
                                        if (order >= VM_NFREEORDER) {
                                                /*
                                                 * Determine if a sufficient number
                                                 * of subsequent blocks to satisfy
                                                 * the allocation request are free.
                                                 */
                                                pa = VM_PAGE_TO_PHYS(m_ret);
                                                pa_last = pa + size;
                                                for (;;) {
                                                        pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
                                                        if (pa >= pa_last)
                                                                break;
                                                        if (pa < seg->start ||
                                                            pa >= seg->end)
                                                                break;
                                                        m = &seg->first_page[atop(pa - seg->start)];
                                                        if (m->order != VM_NFREEORDER - 1)
                                                                break;
                                                }
                                                /* If not, continue to the next block. */
                                                if (pa < pa_last)
                                                        continue;
                                        }

                                        /*
                                         * Determine if the blocks are within the given range,
                                         * satisfy the given alignment, and do not cross the
                                         * given boundary.
                                         */
                                        pa = VM_PAGE_TO_PHYS(m_ret);
                                        if (pa >= low &&
                                            pa + size <= high &&
                                            (pa & (alignment - 1)) == 0 &&
                                            ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
                                                goto done;
                                }
                        }
                }
        }
#if VM_NRESERVLEVEL > 0
        if (vm_reserv_reclaim_contig(size, low, high, alignment, boundary))
                goto retry;
#endif
        mtx_unlock(&vm_page_queue_free_mtx);
        return (NULL);
done:
        for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
                fl = (*seg->free_queues)[m->pool];
                TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
                fl[m->order].lcnt--;
                m->order = VM_NFREEORDER;
        }
        if (m_ret->pool != VM_FREEPOOL_DEFAULT)
                vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
        fl = (*seg->free_queues)[m_ret->pool];
        vm_phys_split_pages(m_ret, oind, fl, order);
        for (i = 0; i < npages; i++) {
                m = &m_ret[i];
                vp = vm_page_alloc_init(m);
                if (vp != NULL) {
                        /*
                         * Enqueue the vnode for deferred vdrop().
                         *
                         * Unmanaged pages don't use "pageq", so it
                         * can be safely abused to construct a short-
                         * lived queue of vnodes.
                         */
                        m->pageq.tqe_prev = (void *)vp;
                        m->pageq.tqe_next = deferred_vdrop_list;
                        deferred_vdrop_list = m;
                }
        }
        for (; i < roundup2(npages, 1 << imin(oind, order)); i++) {
                m = &m_ret[i];
                KASSERT(m->order == VM_NFREEORDER,
                    ("vm_phys_alloc_contig: page %p has unexpected order %d",
                    m, m->order));
                vm_phys_free_pages(m, 0);
        }
        mtx_unlock(&vm_page_queue_free_mtx);
        while (deferred_vdrop_list != NULL) {
                vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
                deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
        }
        return (m_ret);
}
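
/*
 * Illustrative boundary check (values invented): with
 * boundary == 0x200000 (2MB), the test
 * ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0 requires the first
 * and last byte of the run to lie in the same 2MB-aligned window.
 * For size == 0x20000, pa == 0x1f0000 fails (the run spans 0x200000)
 * while pa == 0x200000 passes.
 */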

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
        struct vm_freelist *fl;
        int flind, oind, pind;

        for (flind = 0; flind < vm_nfreelists; flind++) {
                db_printf("FREE LIST %d:\n"
                    "\n  ORDER (SIZE)  |  NUMBER"
                    "\n              ", flind);
                for (pind = 0; pind < VM_NFREEPOOL; pind++)
                        db_printf("  |  POOL %d", pind);
                db_printf("\n--            ");
                for (pind = 0; pind < VM_NFREEPOOL; pind++)
                        db_printf("-- --      ");
                db_printf("--\n");
                for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
                        db_printf("  %2.2d (%6.6dK)", oind,
                            1 << (PAGE_SHIFT - 10 + oind));
                        for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                fl = vm_phys_free_queues[flind][pind];
                                db_printf("  |  %6.6d", fl[oind].lcnt);
                        }
                        db_printf("\n");
                }
                db_printf("\n");
        }
}
#endif
1056 #endif