sys/vm/vm_phys.c
1 /*-
2  * Copyright (c) 2002-2006 Rice University
3  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
4  * All rights reserved.
5  *
6  * This software was developed for the FreeBSD Project by Alan L. Cox,
7  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
25  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
28  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31
32 /*
33  *      Physical memory system implementation
34  *
35  * Any external functions defined by this module are only to be used by the
36  * virtual memory system.
37  */
38
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41
42 #include "opt_ddb.h"
43 #include "opt_vm.h"
44
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/lock.h>
48 #include <sys/kernel.h>
49 #include <sys/malloc.h>
50 #include <sys/mutex.h>
51 #include <sys/proc.h>
52 #include <sys/queue.h>
53 #include <sys/rwlock.h>
54 #include <sys/sbuf.h>
55 #include <sys/sysctl.h>
56 #include <sys/tree.h>
57 #include <sys/vmmeter.h>
58 #include <sys/seq.h>
59
60 #include <ddb/ddb.h>
61
62 #include <vm/vm.h>
63 #include <vm/vm_param.h>
64 #include <vm/vm_kern.h>
65 #include <vm/vm_object.h>
66 #include <vm/vm_page.h>
67 #include <vm/vm_phys.h>
68
69 #include <vm/vm_domain.h>
70
71 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
72     "Too many physsegs.");
73
74 #ifdef VM_NUMA_ALLOC
75 struct mem_affinity *mem_affinity;
76 int *mem_locality;
77 #endif
78
79 int vm_ndomains = 1;
80
81 struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
82 int vm_phys_nsegs;
83
84 struct vm_phys_fictitious_seg;
85 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
86     struct vm_phys_fictitious_seg *);
87
88 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
89     RB_INITIALIZER(_vm_phys_fictitious_tree);
90
91 struct vm_phys_fictitious_seg {
92         RB_ENTRY(vm_phys_fictitious_seg) node;
93         /* Memory region data */
94         vm_paddr_t      start;
95         vm_paddr_t      end;
96         vm_page_t       first_page;
97 };
98
99 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
100     vm_phys_fictitious_cmp);
101
102 static struct rwlock vm_phys_fictitious_reg_lock;
103 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
104
105 static struct vm_freelist
106     vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
107
108 static int vm_nfreelists;
109
110 /*
111  * Provides the mapping from VM_FREELIST_* to free list indices (flind).
112  */
113 static int vm_freelist_to_flind[VM_NFREELIST];
114
115 CTASSERT(VM_FREELIST_DEFAULT == 0);
116
117 #ifdef VM_FREELIST_ISADMA
118 #define VM_ISADMA_BOUNDARY      16777216
119 #endif
120 #ifdef VM_FREELIST_DMA32
121 #define VM_DMA32_BOUNDARY       ((vm_paddr_t)1 << 32)
122 #endif
123
124 /*
125  * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
126  * the ordering of the free list boundaries.
127  */
128 #if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY)
129 CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY);
130 #endif
131 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
132 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
133 #endif
134
135 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
136 SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
137     NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");
138
139 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
140 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
141     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
142
143 #ifdef VM_NUMA_ALLOC
144 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
145 SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
146     NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
147 #endif
148
149 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
150     &vm_ndomains, 0, "Number of physical memory domains available.");
151
152 /*
153  * Default to first-touch + round-robin.
154  */
155 static struct mtx vm_default_policy_mtx;
156 MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex",
157     MTX_DEF);
158 #ifdef VM_NUMA_ALLOC
159 static struct vm_domain_policy vm_default_policy =
160     VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
161 #else
162 /* Use round-robin so the domain policy code will only try once per allocation */
163 static struct vm_domain_policy vm_default_policy =
164     VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0);
165 #endif
166
167 static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
168     int order);
169 static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
170     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
171     vm_paddr_t boundary);
172 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
173 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
174 static int vm_phys_paddr_to_segind(vm_paddr_t pa);
175 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
176     int order);
177
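/*
 * Report the default VM domain allocation policy as a string and, on a
 * write, update it.  Only "rr", "first-touch", and "first-touch-rr" are
 * accepted.
 */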
178 static int
179 sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS)
180 {
181         char policy_name[32];
182         int error;
183
184         mtx_lock(&vm_default_policy_mtx);
185
186         /* Map policy to output string */
187         switch (vm_default_policy.p.policy) {
188         case VM_POLICY_FIRST_TOUCH:
189                 strcpy(policy_name, "first-touch");
190                 break;
191         case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
192                 strcpy(policy_name, "first-touch-rr");
193                 break;
194         case VM_POLICY_ROUND_ROBIN:
195         default:
196                 strcpy(policy_name, "rr");
197                 break;
198         }
199         mtx_unlock(&vm_default_policy_mtx);
200
201         error = sysctl_handle_string(oidp, &policy_name[0],
202             sizeof(policy_name), req);
203         if (error != 0 || req->newptr == NULL)
204                 return (error);
205
206         mtx_lock(&vm_default_policy_mtx);
207         /* Set: match on the subset of policies that make sense as a default */
208         if (strcmp("first-touch-rr", policy_name) == 0) {
209                 vm_domain_policy_set(&vm_default_policy,
210                     VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
211         } else if (strcmp("first-touch", policy_name) == 0) {
212                 vm_domain_policy_set(&vm_default_policy,
213                     VM_POLICY_FIRST_TOUCH, 0);
214         } else if (strcmp("rr", policy_name) == 0) {
215                 vm_domain_policy_set(&vm_default_policy,
216                     VM_POLICY_ROUND_ROBIN, 0);
217         } else {
218                 error = EINVAL;
219                 goto finish;
220         }
221
222         error = 0;
223 finish:
224         mtx_unlock(&vm_default_policy_mtx);
225         return (error);
226 }
227
228 SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW,
229     0, 0, sysctl_vm_default_policy, "A",
230     "Default policy (rr, first-touch, first-touch-rr");
231
232 /*
233  * Red-black tree helpers for vm fictitious range management.
234  */
235 static inline int
236 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
237     struct vm_phys_fictitious_seg *range)
238 {
239
240         KASSERT(range->start != 0 && range->end != 0,
241             ("Invalid range passed on search for vm_fictitious page"));
242         if (p->start >= range->end)
243                 return (1);
244         if (p->start < range->start)
245                 return (-1);
246
247         return (0);
248 }
249
250 static int
251 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
252     struct vm_phys_fictitious_seg *p2)
253 {
254
255         /* Check if this is a search for a page */
256         if (p1->end == 0)
257                 return (vm_phys_fictitious_in_range(p1, p2));
258
259         KASSERT(p2->end != 0,
260     ("Invalid range passed as second parameter to vm fictitious comparison"));
261
262         /* Searching to add a new range */
263         if (p1->end <= p2->start)
264                 return (-1);
265         if (p1->start >= p2->end)
266                 return (1);
267
268         panic("Trying to add overlapping vm fictitious ranges:\n"
269             "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
270             (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
271 }
272
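/*
 * Select the next domain for a round-robin allocation by advancing the
 * per-thread round-robin index.  Always returns domain 0 when NUMA
 * allocation support is not compiled in.
 */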
273 static __inline int
274 vm_rr_selectdomain(void)
275 {
276 #ifdef VM_NUMA_ALLOC
277         struct thread *td;
278
279         td = curthread;
280
281         td->td_dom_rr_idx++;
282         td->td_dom_rr_idx %= vm_ndomains;
283         return (td->td_dom_rr_idx);
284 #else
285         return (0);
286 #endif
287 }
288
289 /*
290  * Initialise a VM domain iterator.
291  *
292  * Check the thread policy, then the proc policy,
293  * then default to the system policy.
294  *
295  * Later on the various layers will have this logic
296  * plumbed into them and the phys code will be explicitly
297  * handed a VM domain policy to use.
298  */
299 static void
300 vm_policy_iterator_init(struct vm_domain_iterator *vi)
301 {
302 #ifdef VM_NUMA_ALLOC
303         struct vm_domain_policy lcl;
304 #endif
305
306         vm_domain_iterator_init(vi);
307
308 #ifdef VM_NUMA_ALLOC
309         /* Copy out the thread policy */
310         vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy);
311         if (lcl.p.policy != VM_POLICY_NONE) {
312                 /* Thread policy is present; use it */
313                 vm_domain_iterator_set_policy(vi, &lcl);
314                 return;
315         }
316
317         vm_domain_policy_localcopy(&lcl,
318             &curthread->td_proc->p_vm_dom_policy);
319         if (lcl.p.policy != VM_POLICY_NONE) {
320                 /* Process policy is present; use it */
321                 vm_domain_iterator_set_policy(vi, &lcl);
322                 return;
323         }
324 #endif
325         /* Use system default policy */
326         vm_domain_iterator_set_policy(vi, &vm_default_policy);
327 }
328
329 static void
330 vm_policy_iterator_finish(struct vm_domain_iterator *vi)
331 {
332
333         vm_domain_iterator_cleanup(vi);
334 }
335
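/*
 * Return TRUE if any physical segment whose index is set in "mask"
 * overlaps the physical address range [low, high).
 */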
336 boolean_t
337 vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
338 {
339         struct vm_phys_seg *s;
340         int idx;
341
342         while ((idx = ffsl(mask)) != 0) {
343                 idx--;  /* ffsl counts from 1 */
344                 mask &= ~(1UL << idx);
345                 s = &vm_phys_segs[idx];
346                 if (low < s->end && high > s->start)
347                         return (TRUE);
348         }
349         return (FALSE);
350 }
351
352 /*
353  * Outputs the state of the physical memory allocator, specifically,
354  * the amount of physical memory in each free list.
355  */
356 static int
357 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
358 {
359         struct sbuf sbuf;
360         struct vm_freelist *fl;
361         int dom, error, flind, oind, pind;
362
363         error = sysctl_wire_old_buffer(req, 0);
364         if (error != 0)
365                 return (error);
366         sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
367         for (dom = 0; dom < vm_ndomains; dom++) {
368                 sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
369                 for (flind = 0; flind < vm_nfreelists; flind++) {
370                         sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
371                             "\n  ORDER (SIZE)  |  NUMBER"
372                             "\n              ", flind);
373                         for (pind = 0; pind < VM_NFREEPOOL; pind++)
374                                 sbuf_printf(&sbuf, "  |  POOL %d", pind);
375                         sbuf_printf(&sbuf, "\n--            ");
376                         for (pind = 0; pind < VM_NFREEPOOL; pind++)
377                                 sbuf_printf(&sbuf, "-- --      ");
378                         sbuf_printf(&sbuf, "--\n");
379                         for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
380                                 sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
381                                     1 << (PAGE_SHIFT - 10 + oind));
382                                 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
383                                 fl = vm_phys_free_queues[dom][flind][pind];
384                                         sbuf_printf(&sbuf, "  |  %6d",
385                                             fl[oind].lcnt);
386                                 }
387                                 sbuf_printf(&sbuf, "\n");
388                         }
389                 }
390         }
391         error = sbuf_finish(&sbuf);
392         sbuf_delete(&sbuf);
393         return (error);
394 }
395
396 /*
397  * Outputs the set of physical memory segments.
398  */
399 static int
400 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
401 {
402         struct sbuf sbuf;
403         struct vm_phys_seg *seg;
404         int error, segind;
405
406         error = sysctl_wire_old_buffer(req, 0);
407         if (error != 0)
408                 return (error);
409         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
410         for (segind = 0; segind < vm_phys_nsegs; segind++) {
411                 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
412                 seg = &vm_phys_segs[segind];
413                 sbuf_printf(&sbuf, "start:     %#jx\n",
414                     (uintmax_t)seg->start);
415                 sbuf_printf(&sbuf, "end:       %#jx\n",
416                     (uintmax_t)seg->end);
417                 sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
418                 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
419         }
420         error = sbuf_finish(&sbuf);
421         sbuf_delete(&sbuf);
422         return (error);
423 }
424
425 /*
426  * Return affinity, or -1 if there's no affinity information.
427  */
428 int
429 vm_phys_mem_affinity(int f, int t)
430 {
431
432 #ifdef VM_NUMA_ALLOC
433         if (mem_locality == NULL)
434                 return (-1);
435         if (f >= vm_ndomains || t >= vm_ndomains)
436                 return (-1);
437         return (mem_locality[f * vm_ndomains + t]);
438 #else
439         return (-1);
440 #endif
441 }
442
443 #ifdef VM_NUMA_ALLOC
444 /*
445  * Outputs the VM locality table.
446  */
447 static int
448 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
449 {
450         struct sbuf sbuf;
451         int error, i, j;
452
453         error = sysctl_wire_old_buffer(req, 0);
454         if (error != 0)
455                 return (error);
456         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
457
458         sbuf_printf(&sbuf, "\n");
459
460         for (i = 0; i < vm_ndomains; i++) {
461                 sbuf_printf(&sbuf, "%d: ", i);
462                 for (j = 0; j < vm_ndomains; j++) {
463                         sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
464                 }
465                 sbuf_printf(&sbuf, "\n");
466         }
467         error = sbuf_finish(&sbuf);
468         sbuf_delete(&sbuf);
469         return (error);
470 }
471 #endif
472
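/*
 * Add the page "m" to the free list "fl" at the given order, placing it
 * at either the head or the tail of the queue.
 */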
473 static void
474 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
475 {
476
477         m->order = order;
478         if (tail)
479                 TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
480         else
481                 TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
482         fl[order].lcnt++;
483 }
484
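/*
 * Remove the page "m" from the free list "fl", resetting its order to
 * VM_NFREEORDER to mark it as no longer free.
 */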
485 static void
486 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
487 {
488
489         TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
490         fl[order].lcnt--;
491         m->order = VM_NFREEORDER;
492 }
493
494 /*
495  * Create a physical memory segment.
496  */
497 static void
498 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
499 {
500         struct vm_phys_seg *seg;
501
502         KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
503             ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
504         KASSERT(domain < vm_ndomains,
505             ("vm_phys_create_seg: invalid domain provided"));
506         seg = &vm_phys_segs[vm_phys_nsegs++];
507         while (seg > vm_phys_segs && (seg - 1)->start >= end) {
508                 *seg = *(seg - 1);
509                 seg--;
510         }
511         seg->start = start;
512         seg->end = end;
513         seg->domain = domain;
514 }
515
516 static void
517 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
518 {
519 #ifdef VM_NUMA_ALLOC
520         int i;
521
522         if (mem_affinity == NULL) {
523                 _vm_phys_create_seg(start, end, 0);
524                 return;
525         }
526
527         for (i = 0;; i++) {
528                 if (mem_affinity[i].end == 0)
529                         panic("Reached end of affinity info");
530                 if (mem_affinity[i].end <= start)
531                         continue;
532                 if (mem_affinity[i].start > start)
533                         panic("No affinity info for start %jx",
534                             (uintmax_t)start);
535                 if (mem_affinity[i].end >= end) {
536                         _vm_phys_create_seg(start, end,
537                             mem_affinity[i].domain);
538                         break;
539                 }
540                 _vm_phys_create_seg(start, mem_affinity[i].end,
541                     mem_affinity[i].domain);
542                 start = mem_affinity[i].end;
543         }
544 #else
545         _vm_phys_create_seg(start, end, 0);
546 #endif
547 }
548
549 /*
550  * Add a physical memory segment.
551  */
552 void
553 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
554 {
555         vm_paddr_t paddr;
556
557         KASSERT((start & PAGE_MASK) == 0,
558             ("vm_phys_add_seg: start is not page aligned"));
559         KASSERT((end & PAGE_MASK) == 0,
560             ("vm_phys_add_seg: end is not page aligned"));
561
562         /*
563          * Split the physical memory segment if it spans two or more free
564          * list boundaries.
565          */
566         paddr = start;
567 #ifdef  VM_FREELIST_ISADMA
568         if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) {
569                 vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY);
570                 paddr = VM_ISADMA_BOUNDARY;
571         }
572 #endif
573 #ifdef  VM_FREELIST_LOWMEM
574         if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
575                 vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
576                 paddr = VM_LOWMEM_BOUNDARY;
577         }
578 #endif
579 #ifdef  VM_FREELIST_DMA32
580         if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
581                 vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
582                 paddr = VM_DMA32_BOUNDARY;
583         }
584 #endif
585         vm_phys_create_seg(paddr, end);
586 }
587
588 /*
589  * Initialize the physical memory allocator.
590  *
591  * Requires that vm_page_array is initialized!
592  */
593 void
594 vm_phys_init(void)
595 {
596         struct vm_freelist *fl;
597         struct vm_phys_seg *seg;
598         u_long npages;
599         int dom, flind, freelist, oind, pind, segind;
600
601         /*
602          * Compute the number of free lists, and generate the mapping from the
603          * manifest constants VM_FREELIST_* to the free list indices.
604          *
605          * Initially, the entries of vm_freelist_to_flind[] are set to either
606          * 0 or 1 to indicate which free lists should be created.
607          */
608         npages = 0;
609         for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
610                 seg = &vm_phys_segs[segind];
611 #ifdef  VM_FREELIST_ISADMA
612                 if (seg->end <= VM_ISADMA_BOUNDARY)
613                         vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1;
614                 else
615 #endif
616 #ifdef  VM_FREELIST_LOWMEM
617                 if (seg->end <= VM_LOWMEM_BOUNDARY)
618                         vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
619                 else
620 #endif
621 #ifdef  VM_FREELIST_DMA32
622                 if (
623 #ifdef  VM_DMA32_NPAGES_THRESHOLD
624                     /*
625                      * Create the DMA32 free list only if the amount of
626                      * physical memory above physical address 4G exceeds the
627                      * given threshold.
628                      */
629                     npages > VM_DMA32_NPAGES_THRESHOLD &&
630 #endif
631                     seg->end <= VM_DMA32_BOUNDARY)
632                         vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
633                 else
634 #endif
635                 {
636                         npages += atop(seg->end - seg->start);
637                         vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
638                 }
639         }
640         /* Change each entry into a running total of the free lists. */
641         for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
642                 vm_freelist_to_flind[freelist] +=
643                     vm_freelist_to_flind[freelist - 1];
644         }
645         vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
646         KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
647         /* Change each entry into a free list index. */
648         for (freelist = 0; freelist < VM_NFREELIST; freelist++)
649                 vm_freelist_to_flind[freelist]--;
650
651         /*
652          * Initialize the first_page and free_queues fields of each physical
653          * memory segment.
654          */
655 #ifdef VM_PHYSSEG_SPARSE
656         npages = 0;
657 #endif
658         for (segind = 0; segind < vm_phys_nsegs; segind++) {
659                 seg = &vm_phys_segs[segind];
660 #ifdef VM_PHYSSEG_SPARSE
661                 seg->first_page = &vm_page_array[npages];
662                 npages += atop(seg->end - seg->start);
663 #else
664                 seg->first_page = PHYS_TO_VM_PAGE(seg->start);
665 #endif
666 #ifdef  VM_FREELIST_ISADMA
667                 if (seg->end <= VM_ISADMA_BOUNDARY) {
668                         flind = vm_freelist_to_flind[VM_FREELIST_ISADMA];
669                         KASSERT(flind >= 0,
670                             ("vm_phys_init: ISADMA flind < 0"));
671                 } else
672 #endif
673 #ifdef  VM_FREELIST_LOWMEM
674                 if (seg->end <= VM_LOWMEM_BOUNDARY) {
675                         flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
676                         KASSERT(flind >= 0,
677                             ("vm_phys_init: LOWMEM flind < 0"));
678                 } else
679 #endif
680 #ifdef  VM_FREELIST_DMA32
681                 if (seg->end <= VM_DMA32_BOUNDARY) {
682                         flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
683                         KASSERT(flind >= 0,
684                             ("vm_phys_init: DMA32 flind < 0"));
685                 } else
686 #endif
687                 {
688                         flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
689                         KASSERT(flind >= 0,
690                             ("vm_phys_init: DEFAULT flind < 0"));
691                 }
692                 seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
693         }
694
695         /*
696          * Initialize the free queues.
697          */
698         for (dom = 0; dom < vm_ndomains; dom++) {
699                 for (flind = 0; flind < vm_nfreelists; flind++) {
700                         for (pind = 0; pind < VM_NFREEPOOL; pind++) {
701                                 fl = vm_phys_free_queues[dom][flind][pind];
702                                 for (oind = 0; oind < VM_NFREEORDER; oind++)
703                                         TAILQ_INIT(&fl[oind].pl);
704                         }
705                 }
706         }
707
708         rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
709 }
710
711 /*
712  * Split a contiguous, power of two-sized set of physical pages.
713  */
714 static __inline void
715 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
716 {
717         vm_page_t m_buddy;
718
719         while (oind > order) {
720                 oind--;
721                 m_buddy = &m[1 << oind];
722                 KASSERT(m_buddy->order == VM_NFREEORDER,
723                     ("vm_phys_split_pages: page %p has unexpected order %d",
724                     m_buddy, m_buddy->order));
725                 vm_freelist_add(fl, m_buddy, oind, 0);
726         }
727 }
728
729 /*
730  * Initialize a physical page and add it to the free lists.
731  */
732 void
733 vm_phys_add_page(vm_paddr_t pa)
734 {
735         vm_page_t m;
736         struct vm_domain *vmd;
737
738         vm_cnt.v_page_count++;
739         m = vm_phys_paddr_to_vm_page(pa);
740         m->busy_lock = VPB_UNBUSIED;
741         m->phys_addr = pa;
742         m->queue = PQ_NONE;
743         m->segind = vm_phys_paddr_to_segind(pa);
744         vmd = vm_phys_domain(m);
745         vmd->vmd_page_count++;
746         vmd->vmd_segs |= 1UL << m->segind;
747         KASSERT(m->order == VM_NFREEORDER,
748             ("vm_phys_add_page: page %p has unexpected order %d",
749             m, m->order));
750         m->pool = VM_FREEPOOL_DEFAULT;
751         pmap_page_init(m);
752         mtx_lock(&vm_page_queue_free_mtx);
753         vm_phys_freecnt_adj(m, 1);
754         vm_phys_free_pages(m, 0);
755         mtx_unlock(&vm_page_queue_free_mtx);
756 }
757
758 /*
759  * Allocate a contiguous, power of two-sized set of physical pages
760  * from the free lists.
761  *
762  * The free page queues must be locked.
763  */
764 vm_page_t
765 vm_phys_alloc_pages(int pool, int order)
766 {
767         vm_page_t m;
768         int domain, flind;
769         struct vm_domain_iterator vi;
770
771         KASSERT(pool < VM_NFREEPOOL,
772             ("vm_phys_alloc_pages: pool %d is out of range", pool));
773         KASSERT(order < VM_NFREEORDER,
774             ("vm_phys_alloc_pages: order %d is out of range", order));
775
776         vm_policy_iterator_init(&vi);
777
778         while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
779                 for (flind = 0; flind < vm_nfreelists; flind++) {
780                         m = vm_phys_alloc_domain_pages(domain, flind, pool,
781                             order);
782                         if (m != NULL)
783                                 return (m);
784                 }
785         }
786
787         vm_policy_iterator_finish(&vi);
788         return (NULL);
789 }
790
791 /*
792  * Allocate a contiguous, power of two-sized set of physical pages from the
793  * specified free list.  The free list must be specified using one of the
794  * manifest constants VM_FREELIST_*.
795  *
796  * The free page queues must be locked.
797  */
798 vm_page_t
799 vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
800 {
801         vm_page_t m;
802         struct vm_domain_iterator vi;
803         int domain;
804
805         KASSERT(freelist < VM_NFREELIST,
806             ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
807             freelist));
808         KASSERT(pool < VM_NFREEPOOL,
809             ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
810         KASSERT(order < VM_NFREEORDER,
811             ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
812
813         vm_policy_iterator_init(&vi);
814
815         while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
816                 m = vm_phys_alloc_domain_pages(domain,
817                     vm_freelist_to_flind[freelist], pool, order);
818                 if (m != NULL)
819                         return (m);
820         }
821
822         vm_policy_iterator_finish(&vi);
823         return (NULL);
824 }
825
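/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified domain and free list.  If the requested pool is empty, steal
 * the largest available block from another pool and split it.
 *
 * The free page queues must be locked.
 */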
826 static vm_page_t
827 vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order)
828 {
829         struct vm_freelist *fl;
830         struct vm_freelist *alt;
831         int oind, pind;
832         vm_page_t m;
833
834         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
835         fl = &vm_phys_free_queues[domain][flind][pool][0];
836         for (oind = order; oind < VM_NFREEORDER; oind++) {
837                 m = TAILQ_FIRST(&fl[oind].pl);
838                 if (m != NULL) {
839                         vm_freelist_rem(fl, m, oind);
840                         vm_phys_split_pages(m, oind, fl, order);
841                         return (m);
842                 }
843         }
844
845         /*
846          * The given pool was empty.  Find the largest
847          * contiguous, power-of-two-sized set of pages in any
848          * pool.  Transfer these pages to the given pool, and
849          * use them to satisfy the allocation.
850          */
851         for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
852                 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
853                         alt = &vm_phys_free_queues[domain][flind][pind][0];
854                         m = TAILQ_FIRST(&alt[oind].pl);
855                         if (m != NULL) {
856                                 vm_freelist_rem(alt, m, oind);
857                                 vm_phys_set_pool(pool, m, oind);
858                                 vm_phys_split_pages(m, oind, fl, order);
859                                 return (m);
860                         }
861                 }
862         }
863         return (NULL);
864 }
865
866 /*
867  * Find the vm_page corresponding to the given physical address.
868  */
869 vm_page_t
870 vm_phys_paddr_to_vm_page(vm_paddr_t pa)
871 {
872         struct vm_phys_seg *seg;
873         int segind;
874
875         for (segind = 0; segind < vm_phys_nsegs; segind++) {
876                 seg = &vm_phys_segs[segind];
877                 if (pa >= seg->start && pa < seg->end)
878                         return (&seg->first_page[atop(pa - seg->start)]);
879         }
880         return (NULL);
881 }
882
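/*
 * Find the fictitious vm_page corresponding to the given physical address
 * within a registered fictitious range, or return NULL if the address is
 * not part of any registered range.
 */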
883 vm_page_t
884 vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
885 {
886         struct vm_phys_fictitious_seg tmp, *seg;
887         vm_page_t m;
888
889         m = NULL;
890         tmp.start = pa;
891         tmp.end = 0;
892
893         rw_rlock(&vm_phys_fictitious_reg_lock);
894         seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
895         rw_runlock(&vm_phys_fictitious_reg_lock);
896         if (seg == NULL)
897                 return (NULL);
898
899         m = &seg->first_page[atop(pa - seg->start)];
900         KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
901
902         return (m);
903 }
904
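/*
 * Initialize "page_count" fictitious pages in "range", starting at the
 * physical address "start" and using the given memory attribute.
 */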
905 static inline void
906 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
907     long page_count, vm_memattr_t memattr)
908 {
909         long i;
910
911         for (i = 0; i < page_count; i++) {
912                 vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
913                 range[i].oflags &= ~VPO_UNMANAGED;
914                 range[i].busy_lock = VPB_UNBUSIED;
915         }
916 }
917
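/*
 * Register the range [start, end) as fictitious physical memory.  Where
 * possible (VM_PHYSSEG_DENSE), entries from vm_page_array are reused;
 * otherwise the page structures are allocated from M_FICT_PAGES.
 */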
918 int
919 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
920     vm_memattr_t memattr)
921 {
922         struct vm_phys_fictitious_seg *seg;
923         vm_page_t fp;
924         long page_count;
925 #ifdef VM_PHYSSEG_DENSE
926         long pi, pe;
927         long dpage_count;
928 #endif
929
930         KASSERT(start < end,
931             ("Start of segment isn't less than end (start: %jx end: %jx)",
932             (uintmax_t)start, (uintmax_t)end));
933
934         page_count = (end - start) / PAGE_SIZE;
935
936 #ifdef VM_PHYSSEG_DENSE
937         pi = atop(start);
938         pe = atop(end);
939         if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
940                 fp = &vm_page_array[pi - first_page];
941                 if ((pe - first_page) > vm_page_array_size) {
942                         /*
943                          * We have a segment that starts inside
944                          * of vm_page_array, but ends outside of it.
945                          *
946                          * Use vm_page_array pages for those that are
947                          * inside of the vm_page_array range, and
948                          * allocate the remaining ones.
949                          */
950                         dpage_count = vm_page_array_size - (pi - first_page);
951                         vm_phys_fictitious_init_range(fp, start, dpage_count,
952                             memattr);
953                         page_count -= dpage_count;
954                         start += ptoa(dpage_count);
955                         goto alloc;
956                 }
957                 /*
958                  * We can allocate the full range from vm_page_array,
959                  * so there's no need to register the range in the tree.
960                  */
961                 vm_phys_fictitious_init_range(fp, start, page_count, memattr);
962                 return (0);
963         } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
964                 /*
965                  * We have a segment that ends inside of vm_page_array,
966                  * but starts outside of it.
967                  */
968                 fp = &vm_page_array[0];
969                 dpage_count = pe - first_page;
970                 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
971                     memattr);
972                 end -= ptoa(dpage_count);
973                 page_count -= dpage_count;
974                 goto alloc;
975         } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
976                 /*
977                  * Trying to register a fictitious range that expands before
978                  * and after vm_page_array.
979                  */
980                 return (EINVAL);
981         } else {
982 alloc:
983 #endif
984                 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
985                     M_WAITOK | M_ZERO);
986 #ifdef VM_PHYSSEG_DENSE
987         }
988 #endif
989         vm_phys_fictitious_init_range(fp, start, page_count, memattr);
990
991         seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
992         seg->start = start;
993         seg->end = end;
994         seg->first_page = fp;
995
996         rw_wlock(&vm_phys_fictitious_reg_lock);
997         RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
998         rw_wunlock(&vm_phys_fictitious_reg_lock);
999
1000         return (0);
1001 }
1002
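/*
 * Unregister the fictitious range [start, end) and free any page
 * structures that were allocated when it was registered.
 */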
1003 void
1004 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
1005 {
1006         struct vm_phys_fictitious_seg *seg, tmp;
1007 #ifdef VM_PHYSSEG_DENSE
1008         long pi, pe;
1009 #endif
1010
1011         KASSERT(start < end,
1012             ("Start of segment isn't less than end (start: %jx end: %jx)",
1013             (uintmax_t)start, (uintmax_t)end));
1014
1015 #ifdef VM_PHYSSEG_DENSE
1016         pi = atop(start);
1017         pe = atop(end);
1018         if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1019                 if ((pe - first_page) <= vm_page_array_size) {
1020                         /*
1021                          * This segment was allocated using vm_page_array
1022                          * only, there's nothing to do since those pages
1023                          * were never added to the tree.
1024                          */
1025                         return;
1026                 }
1027                 /*
1028                  * We have a segment that starts inside
1029                  * of vm_page_array, but ends outside of it.
1030                  *
1031                  * Calculate how many pages were added to the
1032                  * tree and free them.
1033                  */
1034                 start = ptoa(first_page + vm_page_array_size);
1035         } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1036                 /*
1037                  * We have a segment that ends inside of vm_page_array,
1038                  * but starts outside of it.
1039                  */
1040                 end = ptoa(first_page);
1041         } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1042                 /* Since it's not possible to register such a range, panic. */
1043                 panic(
1044                     "Unregistering not registered fictitious range [%#jx:%#jx]",
1045                     (uintmax_t)start, (uintmax_t)end);
1046         }
1047 #endif
1048         tmp.start = start;
1049         tmp.end = 0;
1050
1051         rw_wlock(&vm_phys_fictitious_reg_lock);
1052         seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
1053         if (seg->start != start || seg->end != end) {
1054                 rw_wunlock(&vm_phys_fictitious_reg_lock);
1055                 panic(
1056                     "Unregistering not registered fictitious range [%#jx:%#jx]",
1057                     (uintmax_t)start, (uintmax_t)end);
1058         }
1059         RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
1060         rw_wunlock(&vm_phys_fictitious_reg_lock);
1061         free(seg->first_page, M_FICT_PAGES);
1062         free(seg, M_FICT_PAGES);
1063 }
1064
1065 /*
1066  * Find the segment containing the given physical address.
1067  */
1068 static int
1069 vm_phys_paddr_to_segind(vm_paddr_t pa)
1070 {
1071         struct vm_phys_seg *seg;
1072         int segind;
1073
1074         for (segind = 0; segind < vm_phys_nsegs; segind++) {
1075                 seg = &vm_phys_segs[segind];
1076                 if (pa >= seg->start && pa < seg->end)
1077                         return (segind);
1078         }
1079         panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment",
1080             (uintmax_t)pa);
1081 }
1082
1083 /*
1084  * Free a contiguous, power of two-sized set of physical pages.
1085  *
1086  * The free page queues must be locked.
1087  */
1088 void
1089 vm_phys_free_pages(vm_page_t m, int order)
1090 {
1091         struct vm_freelist *fl;
1092         struct vm_phys_seg *seg;
1093         vm_paddr_t pa;
1094         vm_page_t m_buddy;
1095
1096         KASSERT(m->order == VM_NFREEORDER,
1097             ("vm_phys_free_pages: page %p has unexpected order %d",
1098             m, m->order));
1099         KASSERT(m->pool < VM_NFREEPOOL,
1100             ("vm_phys_free_pages: page %p has unexpected pool %d",
1101             m, m->pool));
1102         KASSERT(order < VM_NFREEORDER,
1103             ("vm_phys_free_pages: order %d is out of range", order));
1104         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1105         seg = &vm_phys_segs[m->segind];
1106         if (order < VM_NFREEORDER - 1) {
1107                 pa = VM_PAGE_TO_PHYS(m);
1108                 do {
1109                         pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
1110                         if (pa < seg->start || pa >= seg->end)
1111                                 break;
1112                         m_buddy = &seg->first_page[atop(pa - seg->start)];
1113                         if (m_buddy->order != order)
1114                                 break;
1115                         fl = (*seg->free_queues)[m_buddy->pool];
1116                         vm_freelist_rem(fl, m_buddy, order);
1117                         if (m_buddy->pool != m->pool)
1118                                 vm_phys_set_pool(m->pool, m_buddy, order);
1119                         order++;
1120                         pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
1121                         m = &seg->first_page[atop(pa - seg->start)];
1122                 } while (order < VM_NFREEORDER - 1);
1123         }
1124         fl = (*seg->free_queues)[m->pool];
1125         vm_freelist_add(fl, m, order, 1);
1126 }
1127
1128 /*
1129  * Free a contiguous, arbitrarily sized set of physical pages.
1130  *
1131  * The free page queues must be locked.
1132  */
1133 void
1134 vm_phys_free_contig(vm_page_t m, u_long npages)
1135 {
1136         u_int n;
1137         int order;
1138
1139         /*
1140          * Avoid unnecessary coalescing by freeing the pages in the largest
1141          * possible power-of-two-sized subsets.
1142          */
1143         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1144         for (;; npages -= n) {
1145                 /*
1146                  * Unsigned "min" is used here so that "order" is assigned
1147                  * "VM_NFREEORDER - 1" when "m"'s physical address is zero
1148                  * or the low-order bits of its physical address are zero
1149                  * because the size of a physical address exceeds the size of
1150                  * a long.
1151                  */
1152                 order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
1153                     VM_NFREEORDER - 1);
1154                 n = 1 << order;
1155                 if (npages < n)
1156                         break;
1157                 vm_phys_free_pages(m, order);
1158                 m += n;
1159         }
1160         /* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
1161         for (; npages > 0; npages -= n) {
1162                 order = flsl(npages) - 1;
1163                 n = 1 << order;
1164                 vm_phys_free_pages(m, order);
1165                 m += n;
1166         }
1167 }
1168
1169 /*
1170  * Scan physical memory between the specified addresses "low" and "high" for a
1171  * run of contiguous physical pages that satisfy the specified conditions, and
1172  * return the lowest page in the run.  The specified "alignment" determines
1173  * the alignment of the lowest physical page in the run.  If the specified
1174  * "boundary" is non-zero, then the run of physical pages cannot span a
1175  * physical address that is a multiple of "boundary".
1176  *
1177  * "npages" must be greater than zero.  Both "alignment" and "boundary" must
1178  * be a power of two.
1179  */
1180 vm_page_t
1181 vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
1182     u_long alignment, vm_paddr_t boundary, int options)
1183 {
1184         vm_paddr_t pa_end;
1185         vm_page_t m_end, m_run, m_start;
1186         struct vm_phys_seg *seg;
1187         int segind;
1188
1189         KASSERT(npages > 0, ("npages is 0"));
1190         KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1191         KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1192         if (low >= high)
1193                 return (NULL);
1194         for (segind = 0; segind < vm_phys_nsegs; segind++) {
1195                 seg = &vm_phys_segs[segind];
1196                 if (seg->start >= high)
1197                         break;
1198                 if (low >= seg->end)
1199                         continue;
1200                 if (low <= seg->start)
1201                         m_start = seg->first_page;
1202                 else
1203                         m_start = &seg->first_page[atop(low - seg->start)];
1204                 if (high < seg->end)
1205                         pa_end = high;
1206                 else
1207                         pa_end = seg->end;
1208                 if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
1209                         continue;
1210                 m_end = &seg->first_page[atop(pa_end - seg->start)];
1211                 m_run = vm_page_scan_contig(npages, m_start, m_end,
1212                     alignment, boundary, options);
1213                 if (m_run != NULL)
1214                         return (m_run);
1215         }
1216         return (NULL);
1217 }
1218
1219 /*
1220  * Set the pool for a contiguous, power of two-sized set of physical pages. 
1221  */
1222 void
1223 vm_phys_set_pool(int pool, vm_page_t m, int order)
1224 {
1225         vm_page_t m_tmp;
1226
1227         for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
1228                 m_tmp->pool = pool;
1229 }
1230
1231 /*
1232  * Search for the given physical page "m" in the free lists.  If the search
1233  * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
1234  * FALSE, indicating that "m" is not in the free lists.
1235  *
1236  * The free page queues must be locked.
1237  */
1238 boolean_t
1239 vm_phys_unfree_page(vm_page_t m)
1240 {
1241         struct vm_freelist *fl;
1242         struct vm_phys_seg *seg;
1243         vm_paddr_t pa, pa_half;
1244         vm_page_t m_set, m_tmp;
1245         int order;
1246
1247         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1248
1249         /*
1250          * First, find the contiguous, power of two-sized set of free
1251          * physical pages containing the given physical page "m" and
1252          * assign it to "m_set".
1253          */
1254         seg = &vm_phys_segs[m->segind];
1255         for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
1256             order < VM_NFREEORDER - 1; ) {
1257                 order++;
1258                 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
1259                 if (pa >= seg->start)
1260                         m_set = &seg->first_page[atop(pa - seg->start)];
1261                 else
1262                         return (FALSE);
1263         }
1264         if (m_set->order < order)
1265                 return (FALSE);
1266         if (m_set->order == VM_NFREEORDER)
1267                 return (FALSE);
1268         KASSERT(m_set->order < VM_NFREEORDER,
1269             ("vm_phys_unfree_page: page %p has unexpected order %d",
1270             m_set, m_set->order));
1271
1272         /*
1273          * Next, remove "m_set" from the free lists.  Finally, extract
1274          * "m" from "m_set" using an iterative algorithm: While "m_set"
1275          * is larger than a page, shrink "m_set" by returning the half
1276          * of "m_set" that does not contain "m" to the free lists.
1277          */
1278         fl = (*seg->free_queues)[m_set->pool];
1279         order = m_set->order;
1280         vm_freelist_rem(fl, m_set, order);
1281         while (order > 0) {
1282                 order--;
1283                 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
1284                 if (m->phys_addr < pa_half)
1285                         m_tmp = &seg->first_page[atop(pa_half - seg->start)];
1286                 else {
1287                         m_tmp = m_set;
1288                         m_set = &seg->first_page[atop(pa_half - seg->start)];
1289                 }
1290                 vm_freelist_add(fl, m_tmp, order, 0);
1291         }
1292         KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
1293         return (TRUE);
1294 }
1295
1296 /*
1297  * Allocate a contiguous set of physical pages of the given size
1298  * "npages" from the free lists.  All of the physical pages must be at
1299  * or above the given physical address "low" and below the given
1300  * physical address "high".  The given value "alignment" determines the
1301  * alignment of the first physical page in the set.  If the given value
1302  * "boundary" is non-zero, then the set of physical pages cannot cross
1303  * any physical address boundary that is a multiple of that value.  Both
1304  * "alignment" and "boundary" must be a power of two.
1305  */
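/*
 * A minimal usage sketch (hypothetical caller, not taken from this file):
 * allocate 16 contiguous pages below 4GB, 64KB-aligned, with the free
 * page queues locked:
 *
 *      mtx_lock(&vm_page_queue_free_mtx);
 *      m = vm_phys_alloc_contig(16, 0, (vm_paddr_t)1 << 32, 64 * 1024, 0);
 *      mtx_unlock(&vm_page_queue_free_mtx);
 */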
1306 vm_page_t
1307 vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
1308     u_long alignment, vm_paddr_t boundary)
1309 {
1310         vm_paddr_t pa_end, pa_start;
1311         vm_page_t m_run;
1312         struct vm_domain_iterator vi;
1313         struct vm_phys_seg *seg;
1314         int domain, segind;
1315
1316         KASSERT(npages > 0, ("npages is 0"));
1317         KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1318         KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1319         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1320         if (low >= high)
1321                 return (NULL);
1322         vm_policy_iterator_init(&vi);
1323 restartdom:
1324         if (vm_domain_iterator_run(&vi, &domain) != 0) {
1325                 vm_policy_iterator_finish(&vi);
1326                 return (NULL);
1327         }
1328         m_run = NULL;
1329         for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
1330                 seg = &vm_phys_segs[segind];
1331                 if (seg->start >= high || seg->domain != domain)
1332                         continue;
1333                 if (low >= seg->end)
1334                         break;
1335                 if (low <= seg->start)
1336                         pa_start = seg->start;
1337                 else
1338                         pa_start = low;
1339                 if (high < seg->end)
1340                         pa_end = high;
1341                 else
1342                         pa_end = seg->end;
1343                 if (pa_end - pa_start < ptoa(npages))
1344                         continue;
1345                 m_run = vm_phys_alloc_seg_contig(seg, npages, low, high,
1346                     alignment, boundary);
1347                 if (m_run != NULL)
1348                         break;
1349         }
1350         if (m_run == NULL && !vm_domain_iterator_isdone(&vi))
1351                 goto restartdom;
1352         vm_policy_iterator_finish(&vi);
1353         return (m_run);
1354 }
1355
1356 /*
1357  * Allocate a run of contiguous physical pages from the free list for the
1358  * specified segment.
1359  */
1360 static vm_page_t
1361 vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
1362     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1363 {
1364         struct vm_freelist *fl;
1365         vm_paddr_t pa, pa_end, size;
1366         vm_page_t m, m_ret;
1367         u_long npages_end;
1368         int oind, order, pind;
1369
1370         KASSERT(npages > 0, ("npages is 0"));
1371         KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1372         KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1373         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1374         /* Compute the queue that is the best fit for npages. */
1375         for (order = 0; (1 << order) < npages; order++);
1376         /* Search for a run satisfying the specified conditions. */
1377         size = npages << PAGE_SHIFT;
1378         for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
1379             oind++) {
1380                 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
1381                         fl = (*seg->free_queues)[pind];
1382                         TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
1383                                 /*
1384                                  * Is the size of this allocation request
1385                                  * larger than the largest block size?
1386                                  */
1387                                 if (order >= VM_NFREEORDER) {
1388                                         /*
1389                                          * Determine if a sufficient number of
1390                                          * subsequent blocks to satisfy the
1391                                          * allocation request are free.
1392                                          */
1393                                         pa = VM_PAGE_TO_PHYS(m_ret);
1394                                         pa_end = pa + size;
1395                                         for (;;) {
1396                                                 pa += 1 << (PAGE_SHIFT +
1397                                                     VM_NFREEORDER - 1);
1398                                                 if (pa >= pa_end ||
1399                                                     pa < seg->start ||
1400                                                     pa >= seg->end)
1401                                                         break;
1402                                                 m = &seg->first_page[atop(pa -
1403                                                     seg->start)];
1404                                                 if (m->order != VM_NFREEORDER -
1405                                                     1)
1406                                                         break;
1407                                         }
1408                                         /* If not, go to the next block. */
1409                                         if (pa < pa_end)
1410                                                 continue;
1411                                 }
1412
1413                                 /*
1414                                  * Determine if the blocks are within the
1415                                  * given range, satisfy the given alignment,
1416                                  * and do not cross the given boundary.
1417                                  */
1418                                 pa = VM_PAGE_TO_PHYS(m_ret);
1419                                 pa_end = pa + size;
1420                                 if (pa >= low && pa_end <= high &&
1421                                     (pa & (alignment - 1)) == 0 &&
1422                                     rounddown2(pa ^ (pa_end - 1), boundary) == 0)
1423                                         goto done;
1424                         }
1425                 }
1426         }
1427         return (NULL);
1428 done:
1429         for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
1430                 fl = (*seg->free_queues)[m->pool];
1431                 vm_freelist_rem(fl, m, m->order);
1432         }
1433         if (m_ret->pool != VM_FREEPOOL_DEFAULT)
1434                 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
1435         fl = (*seg->free_queues)[m_ret->pool];
1436         vm_phys_split_pages(m_ret, oind, fl, order);
1437         /* Return excess pages to the free lists. */
1438         npages_end = roundup2(npages, 1 << imin(oind, order));
1439         if (npages < npages_end)
1440                 vm_phys_free_contig(&m_ret[npages], npages_end - npages);
1441         return (m_ret);
1442 }
1443
1444 #ifdef DDB
1445 /*
1446  * Show the number of physical pages in each of the free lists.
1447  */
1448 DB_SHOW_COMMAND(freepages, db_show_freepages)
1449 {
1450         struct vm_freelist *fl;
1451         int flind, oind, pind, dom;
1452
1453         for (dom = 0; dom < vm_ndomains; dom++) {
1454                 db_printf("DOMAIN: %d\n", dom);
1455                 for (flind = 0; flind < vm_nfreelists; flind++) {
1456                         db_printf("FREE LIST %d:\n"
1457                             "\n  ORDER (SIZE)  |  NUMBER"
1458                             "\n              ", flind);
1459                         for (pind = 0; pind < VM_NFREEPOOL; pind++)
1460                                 db_printf("  |  POOL %d", pind);
1461                         db_printf("\n--            ");
1462                         for (pind = 0; pind < VM_NFREEPOOL; pind++)
1463                                 db_printf("-- --      ");
1464                         db_printf("--\n");
1465                         for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
1466                                 db_printf("  %2.2d (%6.6dK)", oind,
1467                                     1 << (PAGE_SHIFT - 10 + oind));
1468                                 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
1469                                 fl = vm_phys_free_queues[dom][flind][pind];
1470                                         db_printf("  |  %6.6d", fl[oind].lcnt);
1471                                 }
1472                                 db_printf("\n");
1473                         }
1474                         db_printf("\n");
1475                 }
1476                 db_printf("\n");
1477         }
1478 }
1479 #endif