/*-
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *	Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#if MAXMEMDOM > 1
#include <sys/proc.h>
#endif
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");

struct mem_affinity *mem_affinity;

int vm_ndomains = 1;

struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
int vm_phys_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(_vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
        RB_ENTRY(vm_phys_fictitious_seg) node;
        /* Memory region data */
        vm_paddr_t      start;
        vm_paddr_t      end;
        vm_page_t       first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];

static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;

static int cnt_prezero;
SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
    &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
    int order);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
    int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order);

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

        KASSERT(range->start != 0 && range->end != 0,
            ("Invalid range passed on search for vm_fictitious page"));
        if (p->start >= range->end)
                return (1);
        if (p->start < range->start)
                return (-1);

        return (0);
}
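
/*
 * Note on the comparison convention: a lookup for a single page is encoded
 * as a degenerate segment whose "end" is zero (see
 * vm_phys_fictitious_to_vm_page()).  For example, given a registered range
 * [0x10000:0x20000), the query { start = 0x14000, end = 0 } compares equal
 * to that range and the search returns it.
 */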

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

        /* Check if this is a search for a page */
        if (p1->end == 0)
                return (vm_phys_fictitious_in_range(p1, p2));

        KASSERT(p2->end != 0,
    ("Invalid range passed as second parameter to vm fictitious comparison"));

        /* Searching to add a new range */
        if (p1->end <= p2->start)
                return (-1);
        if (p1->start >= p2->end)
                return (1);

        panic("Trying to add overlapping vm fictitious ranges:\n"
            "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
            (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}

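/*
 * Select a memory domain in round-robin fashion: for example, with
 * vm_ndomains == 2, consecutive calls from the same thread return
 * 0, 1, 0, 1, ..., spreading physical page allocations across domains.
 */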
static __inline int
vm_rr_selectdomain(void)
{
#if MAXMEMDOM > 1
        struct thread *td;

        td = curthread;

        td->td_dom_rr_idx++;
        td->td_dom_rr_idx %= vm_ndomains;
        return (td->td_dom_rr_idx);
#else
        return (0);
#endif
}

boolean_t
vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
{
        struct vm_phys_seg *s;
        int idx;

        while ((idx = ffsl(mask)) != 0) {
                idx--;  /* ffsl counts from 1 */
                mask &= ~(1UL << idx);
                s = &vm_phys_segs[idx];
                if (low < s->end && high > s->start)
                        return (TRUE);
        }
        return (FALSE);
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sbuf;
        struct vm_freelist *fl;
        int dom, error, flind, oind, pind;

        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
        for (dom = 0; dom < vm_ndomains; dom++) {
                sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
                for (flind = 0; flind < vm_nfreelists; flind++) {
                        sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
                            "\n  ORDER (SIZE)  |  NUMBER"
                            "\n              ", flind);
                        for (pind = 0; pind < VM_NFREEPOOL; pind++)
                                sbuf_printf(&sbuf, "  |  POOL %d", pind);
                        sbuf_printf(&sbuf, "\n--            ");
                        for (pind = 0; pind < VM_NFREEPOOL; pind++)
                                sbuf_printf(&sbuf, "-- --      ");
                        sbuf_printf(&sbuf, "--\n");
                        for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
                                sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
                                    1 << (PAGE_SHIFT - 10 + oind));
                                for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                        fl = vm_phys_free_queues[dom][flind][pind];
                                        sbuf_printf(&sbuf, "  |  %6d",
                                            fl[oind].lcnt);
                                }
                                sbuf_printf(&sbuf, "\n");
                        }
                }
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sbuf;
        struct vm_phys_seg *seg;
        int error, segind;

        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
                seg = &vm_phys_segs[segind];
                sbuf_printf(&sbuf, "start:     %#jx\n",
                    (uintmax_t)seg->start);
                sbuf_printf(&sbuf, "end:       %#jx\n",
                    (uintmax_t)seg->end);
                sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
                sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

        m->order = order;
        if (tail)
                TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
        else
                TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
        fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

        TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
        fl[order].lcnt--;
        m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
{
        struct vm_phys_seg *seg;

        KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
            ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
        KASSERT(domain < vm_ndomains,
            ("vm_phys_create_seg: invalid domain provided"));
        seg = &vm_phys_segs[vm_phys_nsegs++];
        while (seg > vm_phys_segs && (seg - 1)->start >= end) {
                *seg = *(seg - 1);
                seg--;
        }
        seg->start = start;
        seg->end = end;
        seg->domain = domain;
        seg->free_queues = &vm_phys_free_queues[domain][flind];
}
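
/*
 * Note: the insertion loop above keeps vm_phys_segs[] sorted by start
 * address.  For example, if the segment [2G:3G) is registered first and
 * [0:1G) second, the second registration shifts the first entry up so
 * that [0:1G) lands at index 0.
 */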

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
{
        int i;

        if (mem_affinity == NULL) {
                _vm_phys_create_seg(start, end, flind, 0);
                return;
        }

        for (i = 0;; i++) {
                if (mem_affinity[i].end == 0)
                        panic("Reached end of affinity info");
                if (mem_affinity[i].end <= start)
                        continue;
                if (mem_affinity[i].start > start)
                        panic("No affinity info for start %jx",
                            (uintmax_t)start);
                if (mem_affinity[i].end >= end) {
                        _vm_phys_create_seg(start, end, flind,
                            mem_affinity[i].domain);
                        break;
                }
                _vm_phys_create_seg(start, mem_affinity[i].end, flind,
                    mem_affinity[i].domain);
                start = mem_affinity[i].end;
        }
}
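
/*
 * For illustration, assume mem_affinity[] maps [0:4G) to domain 0 and
 * [4G:8G) to domain 1.  Creating the segment [3G:5G) then yields two
 * segments: [3G:4G) in domain 0 and [4G:5G) in domain 1.
 */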

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{

        KASSERT((start & PAGE_MASK) == 0,
            ("vm_phys_define_seg: start is not page aligned"));
        KASSERT((end & PAGE_MASK) == 0,
            ("vm_phys_define_seg: end is not page aligned"));
#ifdef  VM_FREELIST_ISADMA
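        /* 16777216 below is 2^24 (16 MB), the addressing limit of ISA DMA. */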
        if (start < 16777216) {
                if (end > 16777216) {
                        vm_phys_create_seg(start, 16777216,
                            VM_FREELIST_ISADMA);
                        vm_phys_create_seg(16777216, end, VM_FREELIST_DEFAULT);
                } else
                        vm_phys_create_seg(start, end, VM_FREELIST_ISADMA);
                if (VM_FREELIST_ISADMA >= vm_nfreelists)
                        vm_nfreelists = VM_FREELIST_ISADMA + 1;
        } else
#endif
#ifdef  VM_FREELIST_HIGHMEM
        if (end > VM_HIGHMEM_ADDRESS) {
                if (start < VM_HIGHMEM_ADDRESS) {
                        vm_phys_create_seg(start, VM_HIGHMEM_ADDRESS,
                            VM_FREELIST_DEFAULT);
                        vm_phys_create_seg(VM_HIGHMEM_ADDRESS, end,
                            VM_FREELIST_HIGHMEM);
                } else
                        vm_phys_create_seg(start, end, VM_FREELIST_HIGHMEM);
                if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
                        vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
        } else
#endif
        vm_phys_create_seg(start, end, VM_FREELIST_DEFAULT);
}

/*
 * Initialize the physical memory allocator.
 */
void
vm_phys_init(void)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
#ifdef VM_PHYSSEG_SPARSE
        long pages;
#endif
        int dom, flind, oind, pind, segind;

#ifdef VM_PHYSSEG_SPARSE
        pages = 0;
#endif
        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
                seg->first_page = &vm_page_array[pages];
                pages += atop(seg->end - seg->start);
#else
                seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
        }
        for (dom = 0; dom < vm_ndomains; dom++) {
                for (flind = 0; flind < vm_nfreelists; flind++) {
                        for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                fl = vm_phys_free_queues[dom][flind][pind];
                                for (oind = 0; oind < VM_NFREEORDER; oind++)
                                        TAILQ_INIT(&fl[oind].pl);
                        }
                }
        }
        rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
{
        vm_page_t m_buddy;

        while (oind > order) {
                oind--;
                m_buddy = &m[1 << oind];
                KASSERT(m_buddy->order == VM_NFREEORDER,
                    ("vm_phys_split_pages: page %p has unexpected order %d",
                    m_buddy, m_buddy->order));
                vm_freelist_add(fl, m_buddy, oind, 0);
        }
}
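
/*
 * For illustration: splitting an order-3 block (8 pages) down to an
 * order-1 request frees the upper order-2 buddy (pages 4-7) and then the
 * order-1 buddy (pages 2-3), leaving the order-1 block at pages 0-1 for
 * the caller.
 */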

/*
 * Initialize a physical page and add it to the free lists.
 */
void
vm_phys_add_page(vm_paddr_t pa)
{
        vm_page_t m;
        struct vm_domain *vmd;

        vm_cnt.v_page_count++;
        m = vm_phys_paddr_to_vm_page(pa);
        m->phys_addr = pa;
        m->queue = PQ_NONE;
        m->segind = vm_phys_paddr_to_segind(pa);
        vmd = vm_phys_domain(m);
        vmd->vmd_page_count++;
        vmd->vmd_segs |= 1UL << m->segind;
        KASSERT(m->order == VM_NFREEORDER,
            ("vm_phys_add_page: page %p has unexpected order %d",
            m, m->order));
        m->pool = VM_FREEPOOL_DEFAULT;
        pmap_page_init(m);
        mtx_lock(&vm_page_queue_free_mtx);
        vm_phys_freecnt_adj(m, 1);
        vm_phys_free_pages(m, 0);
        mtx_unlock(&vm_page_queue_free_mtx);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int pool, int order)
{
        vm_page_t m;
        int dom, domain, flind;

        KASSERT(pool < VM_NFREEPOOL,
            ("vm_phys_alloc_pages: pool %d is out of range", pool));
        KASSERT(order < VM_NFREEORDER,
            ("vm_phys_alloc_pages: order %d is out of range", order));

        for (dom = 0; dom < vm_ndomains; dom++) {
                domain = vm_rr_selectdomain();
                for (flind = 0; flind < vm_nfreelists; flind++) {
                        m = vm_phys_alloc_domain_pages(domain, flind, pool,
                            order);
                        if (m != NULL)
                                return (m);
                }
        }
        return (NULL);
}

/*
 * Find and dequeue a free page on the given free list, with the
 * specified pool and order.
 */
vm_page_t
vm_phys_alloc_freelist_pages(int flind, int pool, int order)
{
        vm_page_t m;
        int dom, domain;

        KASSERT(flind < VM_NFREELIST,
            ("vm_phys_alloc_freelist_pages: freelist %d is out of range", flind));
        KASSERT(pool < VM_NFREEPOOL,
            ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
        KASSERT(order < VM_NFREEORDER,
            ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

        for (dom = 0; dom < vm_ndomains; dom++) {
                domain = vm_rr_selectdomain();
                m = vm_phys_alloc_domain_pages(domain, flind, pool, order);
                if (m != NULL)
                        return (m);
        }
        return (NULL);
}

static vm_page_t
vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order)
{
        struct vm_freelist *fl;
        struct vm_freelist *alt;
        int oind, pind;
        vm_page_t m;

        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        fl = &vm_phys_free_queues[domain][flind][pool][0];
        for (oind = order; oind < VM_NFREEORDER; oind++) {
                m = TAILQ_FIRST(&fl[oind].pl);
                if (m != NULL) {
                        vm_freelist_rem(fl, m, oind);
                        vm_phys_split_pages(m, oind, fl, order);
                        return (m);
                }
        }

        /*
         * The given pool was empty.  Find the largest
         * contiguous, power-of-two-sized set of pages in any
         * pool.  Transfer these pages to the given pool, and
         * use them to satisfy the allocation.
         */
        for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
                for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                        alt = &vm_phys_free_queues[domain][flind][pind][0];
                        m = TAILQ_FIRST(&alt[oind].pl);
                        if (m != NULL) {
                                vm_freelist_rem(alt, m, oind);
                                vm_phys_set_pool(pool, m, oind);
                                vm_phys_split_pages(m, oind, fl, order);
                                return (m);
                        }
                }
        }
        return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
        struct vm_phys_seg *seg;
        int segind;

        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                seg = &vm_phys_segs[segind];
                if (pa >= seg->start && pa < seg->end)
                        return (&seg->first_page[atop(pa - seg->start)]);
        }
        return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
        struct vm_phys_fictitious_seg tmp, *seg;
        vm_page_t m;

        m = NULL;
        tmp.start = pa;
        tmp.end = 0;

        rw_rlock(&vm_phys_fictitious_reg_lock);
        seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
        rw_runlock(&vm_phys_fictitious_reg_lock);
        if (seg == NULL)
                return (NULL);

        m = &seg->first_page[atop(pa - seg->start)];
        KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

        return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
        long i;

        for (i = 0; i < page_count; i++) {
                vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
                range[i].oflags &= ~VPO_UNMANAGED;
                range[i].busy_lock = VPB_UNBUSIED;
        }
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
        struct vm_phys_fictitious_seg *seg;
        vm_page_t fp;
        long page_count;
#ifdef VM_PHYSSEG_DENSE
        long pi, pe;
        long dpage_count;
#endif

        KASSERT(start < end,
            ("Start of segment isn't less than end (start: %jx end: %jx)",
            (uintmax_t)start, (uintmax_t)end));

        page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
        pi = atop(start);
        pe = atop(end);
        if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
                fp = &vm_page_array[pi - first_page];
                if ((pe - first_page) > vm_page_array_size) {
                        /*
                         * We have a segment that starts inside
                         * of vm_page_array, but ends outside of it.
                         *
                         * Use vm_page_array pages for those that are
                         * inside of the vm_page_array range, and
                         * allocate the remaining ones.
                         */
                        dpage_count = vm_page_array_size - (pi - first_page);
                        vm_phys_fictitious_init_range(fp, start, dpage_count,
                            memattr);
                        page_count -= dpage_count;
                        start += ptoa(dpage_count);
                        goto alloc;
                }
                /*
                 * We can allocate the full range from vm_page_array,
                 * so there's no need to register the range in the tree.
                 */
                vm_phys_fictitious_init_range(fp, start, page_count, memattr);
                return (0);
        } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
                /*
                 * We have a segment that ends inside of vm_page_array,
                 * but starts outside of it.
                 */
                fp = &vm_page_array[0];
                dpage_count = pe - first_page;
                vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
                    memattr);
                end -= ptoa(dpage_count);
                page_count -= dpage_count;
                goto alloc;
        } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
                /*
                 * Trying to register a fictitious range that expands before
                 * and after vm_page_array.
                 */
                return (EINVAL);
        } else {
alloc:
#endif
                fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
                    M_WAITOK | M_ZERO);
#ifdef VM_PHYSSEG_DENSE
        }
#endif
        vm_phys_fictitious_init_range(fp, start, page_count, memattr);

        seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
        seg->start = start;
        seg->end = end;
        seg->first_page = fp;

        rw_wlock(&vm_phys_fictitious_reg_lock);
        RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
        rw_wunlock(&vm_phys_fictitious_reg_lock);

        return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
        struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
        long pi, pe;
#endif

        KASSERT(start < end,
            ("Start of segment isn't less than end (start: %jx end: %jx)",
            (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
        pi = atop(start);
        pe = atop(end);
        if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
                if ((pe - first_page) <= vm_page_array_size) {
                        /*
                         * This segment was allocated using vm_page_array
                         * only, there's nothing to do since those pages
                         * were never added to the tree.
                         */
                        return;
                }
                /*
                 * We have a segment that starts inside
                 * of vm_page_array, but ends outside of it.
                 *
                 * Calculate how many pages were added to the
                 * tree and free them.
                 */
                start = ptoa(first_page + vm_page_array_size);
        } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
                /*
                 * We have a segment that ends inside of vm_page_array,
                 * but starts outside of it.
                 */
                end = ptoa(first_page);
        } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
                /* Since it's not possible to register such a range, panic. */
                panic(
                    "Unregistering not registered fictitious range [%#jx:%#jx]",
                    (uintmax_t)start, (uintmax_t)end);
        }
#endif
        tmp.start = start;
        tmp.end = 0;

        rw_wlock(&vm_phys_fictitious_reg_lock);
        seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
        if (seg == NULL || seg->start != start || seg->end != end) {
                rw_wunlock(&vm_phys_fictitious_reg_lock);
                panic(
                    "Unregistering not registered fictitious range [%#jx:%#jx]",
                    (uintmax_t)start, (uintmax_t)end);
        }
        RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
        rw_wunlock(&vm_phys_fictitious_reg_lock);
        free(seg->first_page, M_FICT_PAGES);
        free(seg, M_FICT_PAGES);
}

/*
 * Find the segment containing the given physical address.
 */
static int
vm_phys_paddr_to_segind(vm_paddr_t pa)
{
        struct vm_phys_seg *seg;
        int segind;

        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                seg = &vm_phys_segs[segind];
                if (pa >= seg->start && pa < seg->end)
                        return (segind);
        }
        panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment",
            (uintmax_t)pa);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
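/*
 * Buddy arithmetic: the physical address of a block's order-k buddy is
 * found by flipping bit (PAGE_SHIFT + k).  For example, with 4 KB pages
 * (PAGE_SHIFT == 12), the order-0 buddy of the page at 0x3000 is
 * 0x3000 ^ 0x1000 == 0x2000, and the merged order-1 block starts at
 * 0x3000 & ~0x1fff == 0x2000.
 */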
void
vm_phys_free_pages(vm_page_t m, int order)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        vm_paddr_t pa;
        vm_page_t m_buddy;

        KASSERT(m->order == VM_NFREEORDER,
            ("vm_phys_free_pages: page %p has unexpected order %d",
            m, m->order));
        KASSERT(m->pool < VM_NFREEPOOL,
            ("vm_phys_free_pages: page %p has unexpected pool %d",
            m, m->pool));
        KASSERT(order < VM_NFREEORDER,
            ("vm_phys_free_pages: order %d is out of range", order));
        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        seg = &vm_phys_segs[m->segind];
        if (order < VM_NFREEORDER - 1) {
                pa = VM_PAGE_TO_PHYS(m);
                do {
                        pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
                        if (pa < seg->start || pa >= seg->end)
                                break;
                        m_buddy = &seg->first_page[atop(pa - seg->start)];
                        if (m_buddy->order != order)
                                break;
                        fl = (*seg->free_queues)[m_buddy->pool];
                        vm_freelist_rem(fl, m_buddy, order);
                        if (m_buddy->pool != m->pool)
                                vm_phys_set_pool(m->pool, m_buddy, order);
                        order++;
                        pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
                        m = &seg->first_page[atop(pa - seg->start)];
                } while (order < VM_NFREEORDER - 1);
        }
        fl = (*seg->free_queues)[m->pool];
        vm_freelist_add(fl, m, order, 1);
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
        u_int n;
        int order;

        /*
         * Avoid unnecessary coalescing by freeing the pages in the largest
         * possible power-of-two-sized subsets.
         */
        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        for (;; npages -= n) {
                /*
                 * Unsigned "min" is used here so that "order" is assigned
                 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
                 * or the low-order bits of its physical address are zero
                 * because the size of a physical address exceeds the size of
                 * a long.
                 */
                order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
                    VM_NFREEORDER - 1);
                n = 1 << order;
                if (npages < n)
                        break;
                vm_phys_free_pages(m, order);
                m += n;
        }
        /* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
        for (; npages > 0; npages -= n) {
                order = flsl(npages) - 1;
                n = 1 << order;
                vm_phys_free_pages(m, order);
                m += n;
        }
}
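
/*
 * Worked example: freeing npages == 13 starting at page frame 12.  The
 * first loop sees ffsl(12) - 1 == 2, so it frees an order-2 block (4
 * pages) at pfn 12 and advances to pfn 16, where the alignment would
 * allow order 4 (16 pages) but only 9 pages remain.  The second loop
 * then frees an order-3 block (8 pages) at pfn 16 and an order-0 page
 * at pfn 24.
 */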

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
        vm_page_t m_tmp;

        for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
                m_tmp->pool = pool;
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
 * FALSE, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
boolean_t
vm_phys_unfree_page(vm_page_t m)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        vm_paddr_t pa, pa_half;
        vm_page_t m_set, m_tmp;
        int order;

        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);

        /*
         * First, find the contiguous, power of two-sized set of free
         * physical pages containing the given physical page "m" and
         * assign it to "m_set".
         */
        seg = &vm_phys_segs[m->segind];
        for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
            order < VM_NFREEORDER - 1; ) {
                order++;
                pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
                if (pa >= seg->start)
                        m_set = &seg->first_page[atop(pa - seg->start)];
                else
                        return (FALSE);
        }
        if (m_set->order < order)
                return (FALSE);
        if (m_set->order == VM_NFREEORDER)
                return (FALSE);
        KASSERT(m_set->order < VM_NFREEORDER,
            ("vm_phys_unfree_page: page %p has unexpected order %d",
            m_set, m_set->order));

        /*
         * Next, remove "m_set" from the free lists.  Finally, extract
         * "m" from "m_set" using an iterative algorithm: While "m_set"
         * is larger than a page, shrink "m_set" by returning the half
         * of "m_set" that does not contain "m" to the free lists.
         */
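        /*
         * For example, extracting the page at pfn 5 from a free order-3
         * block at pfn 0 returns the order-2 block at pfn 0, the order-1
         * block at pfn 6, and the order-0 page at pfn 4 to the free
         * lists, leaving "m_set" equal to "m".
         */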
        fl = (*seg->free_queues)[m_set->pool];
        order = m_set->order;
        vm_freelist_rem(fl, m_set, order);
        while (order > 0) {
                order--;
                pa_half = m_set->phys_addr ^
                    ((vm_paddr_t)1 << (PAGE_SHIFT + order));
                if (m->phys_addr < pa_half) {
                        m_tmp = &seg->first_page[atop(pa_half - seg->start)];
                } else {
                        m_tmp = m_set;
                        m_set = &seg->first_page[atop(pa_half - seg->start)];
                }
                vm_freelist_add(fl, m_tmp, order, 0);
        }
        KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
        return (TRUE);
}

/*
 * Try to zero one physical page.  Used by an idle priority thread.
 */
boolean_t
vm_phys_zero_pages_idle(void)
{
        static struct vm_freelist *fl;
        static int flind, oind, pind;
        vm_page_t m, m_tmp;
        int domain;

        domain = vm_rr_selectdomain();
        fl = vm_phys_free_queues[domain][0][0];
        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        for (;;) {
                TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) {
                        for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
                                if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
                                        vm_phys_unfree_page(m_tmp);
                                        vm_phys_freecnt_adj(m, -1);
                                        mtx_unlock(&vm_page_queue_free_mtx);
                                        pmap_zero_page_idle(m_tmp);
                                        m_tmp->flags |= PG_ZERO;
                                        mtx_lock(&vm_page_queue_free_mtx);
                                        vm_phys_freecnt_adj(m, 1);
                                        vm_phys_free_pages(m_tmp, 0);
                                        vm_page_zero_count++;
                                        cnt_prezero++;
                                        return (TRUE);
                                }
                        }
                }
                oind++;
                if (oind == VM_NFREEORDER) {
                        oind = 0;
                        pind++;
                        if (pind == VM_NFREEPOOL) {
                                pind = 0;
                                flind++;
                                if (flind == vm_nfreelists)
                                        flind = 0;
                        }
                        fl = vm_phys_free_queues[domain][flind][pind];
                }
        }
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
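/*
 * For example, with boundary == 0x10000 (64 KB), the boundary test below,
 * ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0, accepts a candidate
 * block only if the first and last byte of the run agree in every bit
 * above bit 15, i.e., the run fits inside one 64 KB-aligned window.
 */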
vm_page_t
vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        vm_paddr_t pa, pa_last, size;
        vm_page_t m, m_ret;
        u_long npages_end;
        int dom, domain, flind, oind, order, pind;

        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        size = npages << PAGE_SHIFT;
        KASSERT(size != 0,
            ("vm_phys_alloc_contig: size must not be 0"));
        KASSERT((alignment & (alignment - 1)) == 0,
            ("vm_phys_alloc_contig: alignment must be a power of 2"));
        KASSERT((boundary & (boundary - 1)) == 0,
            ("vm_phys_alloc_contig: boundary must be a power of 2"));
        /* Compute the queue that is the best fit for npages. */
        for (order = 0; (1 << order) < npages; order++);
        dom = 0;
restartdom:
        domain = vm_rr_selectdomain();
        for (flind = 0; flind < vm_nfreelists; flind++) {
                for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
                    oind++) {
                        for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                fl = &vm_phys_free_queues[domain][flind][pind][0];
                                TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
                                        /*
                                         * A free list may contain physical pages
                                         * from one or more segments.
                                         */
                                        seg = &vm_phys_segs[m_ret->segind];
                                        if (seg->start > high ||
                                            low >= seg->end)
                                                continue;

                                        /*
                                         * Is the size of this allocation request
                                         * larger than the largest block size?
                                         */
                                        if (order >= VM_NFREEORDER) {
                                                /*
                                                 * Determine if a sufficient number
                                                 * of subsequent blocks to satisfy
                                                 * the allocation request are free.
                                                 */
                                                pa = VM_PAGE_TO_PHYS(m_ret);
                                                pa_last = pa + size;
                                                for (;;) {
                                                        pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
                                                        if (pa >= pa_last)
                                                                break;
                                                        if (pa < seg->start ||
                                                            pa >= seg->end)
                                                                break;
                                                        m = &seg->first_page[atop(pa - seg->start)];
                                                        if (m->order != VM_NFREEORDER - 1)
                                                                break;
                                                }
                                                /* If not, continue to the next block. */
                                                if (pa < pa_last)
                                                        continue;
                                        }

                                        /*
                                         * Determine if the blocks are within the given range,
                                         * satisfy the given alignment, and do not cross the
                                         * given boundary.
                                         */
                                        pa = VM_PAGE_TO_PHYS(m_ret);
                                        if (pa >= low &&
                                            pa + size <= high &&
                                            (pa & (alignment - 1)) == 0 &&
                                            ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
                                                goto done;
                                }
                        }
                }
        }
        if (++dom < vm_ndomains)
                goto restartdom;
        return (NULL);
done:
        for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
                fl = (*seg->free_queues)[m->pool];
                vm_freelist_rem(fl, m, m->order);
        }
        if (m_ret->pool != VM_FREEPOOL_DEFAULT)
                vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
        fl = (*seg->free_queues)[m_ret->pool];
        vm_phys_split_pages(m_ret, oind, fl, order);
        /* Return excess pages to the free lists. */
        npages_end = roundup2(npages, 1 << imin(oind, order));
        if (npages < npages_end)
                vm_phys_free_contig(&m_ret[npages], npages_end - npages);
        return (m_ret);
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
        struct vm_freelist *fl;
        int flind, oind, pind, dom;

        for (dom = 0; dom < vm_ndomains; dom++) {
                db_printf("DOMAIN: %d\n", dom);
                for (flind = 0; flind < vm_nfreelists; flind++) {
                        db_printf("FREE LIST %d:\n"
                            "\n  ORDER (SIZE)  |  NUMBER"
                            "\n              ", flind);
                        for (pind = 0; pind < VM_NFREEPOOL; pind++)
                                db_printf("  |  POOL %d", pind);
                        db_printf("\n--            ");
                        for (pind = 0; pind < VM_NFREEPOOL; pind++)
                                db_printf("-- --      ");
                        db_printf("--\n");
                        for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
                                db_printf("  %2.2d (%6.6dK)", oind,
                                    1 << (PAGE_SHIFT - 10 + oind));
                                for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                        fl = vm_phys_free_queues[dom][flind][pind];
                                        db_printf("  |  %6.6d", fl[oind].lcnt);
                                }
                                db_printf("\n");
                        }
                        db_printf("\n");
                }
                db_printf("\n");
        }
}
#endif