/*-
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_reserv.h>

/*
 * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
 * domain.  These extra lists are stored at the end of the regular
 * free lists starting with VM_NFREELIST.
 */
#define VM_RAW_NFREELIST        (VM_NFREELIST + VM_NDOMAIN - 1)
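/*
 * Illustrative example (values are hypothetical): with VM_NFREELIST == 2
 * and VM_NDOMAIN == 3, the raw free queues hold four lists.  Indices 0
 * and 1 are the regular free lists, with domain 0's default list at
 * VM_FREELIST_DEFAULT, while indices 2 and 3 hold the default lists for
 * domains 1 and 2, respectively.
 */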

struct vm_freelist {
        struct pglist pl;
        int lcnt;
};

struct vm_phys_seg {
        vm_paddr_t      start;
        vm_paddr_t      end;
        vm_page_t       first_page;
        int             domain;
        struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
};

struct mem_affinity *mem_affinity;

static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];

static int vm_phys_nsegs;

#define VM_PHYS_FICTITIOUS_NSEGS        8
static struct vm_phys_fictitious_seg {
        vm_paddr_t      start;
        vm_paddr_t      end;
        vm_page_t       first_page;
} vm_phys_fictitious_segs[VM_PHYS_FICTITIOUS_NSEGS];
static struct mtx vm_phys_fictitious_reg_mtx;
MALLOC_DEFINE(M_FICT_PAGES, "", "");

static struct vm_freelist
    vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
static struct vm_freelist
(*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];

static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;

static int cnt_prezero;
SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
    &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");

#if VM_NDOMAIN > 1
static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
#endif

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
    int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order);

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sbuf;
        struct vm_freelist *fl;
        int error, flind, oind, pind;

        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
        for (flind = 0; flind < vm_nfreelists; flind++) {
                sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
                    "\n  ORDER (SIZE)  |  NUMBER"
                    "\n              ", flind);
                for (pind = 0; pind < VM_NFREEPOOL; pind++)
                        sbuf_printf(&sbuf, "  |  POOL %d", pind);
                sbuf_printf(&sbuf, "\n--            ");
                for (pind = 0; pind < VM_NFREEPOOL; pind++)
                        sbuf_printf(&sbuf, "-- --      ");
                sbuf_printf(&sbuf, "--\n");
                for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
                        sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
                            1 << (PAGE_SHIFT - 10 + oind));
                        for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                fl = vm_phys_free_queues[flind][pind];
                                sbuf_printf(&sbuf, "  |  %6d", fl[oind].lcnt);
                        }
                        sbuf_printf(&sbuf, "\n");
                }
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sbuf;
        struct vm_phys_seg *seg;
        int error, segind;

        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
                seg = &vm_phys_segs[segind];
                sbuf_printf(&sbuf, "start:     %#jx\n",
                    (uintmax_t)seg->start);
                sbuf_printf(&sbuf, "end:       %#jx\n",
                    (uintmax_t)seg->end);
                sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
                sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}

#if VM_NDOMAIN > 1
/*
 * Outputs the set of free list lookup lists.
 */
static int
sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sbuf;
        int domain, error, flind, ndomains;

        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
        ndomains = vm_nfreelists - VM_NFREELIST + 1;
        for (domain = 0; domain < ndomains; domain++) {
                sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain);
                for (flind = 0; flind < vm_nfreelists; flind++)
                        sbuf_printf(&sbuf, "  [%d]:\t%p\n", flind,
                            vm_phys_lookup_lists[domain][flind]);
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}
#endif

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
{
        struct vm_phys_seg *seg;
#ifdef VM_PHYSSEG_SPARSE
        long pages;
        int segind;

        pages = 0;
        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                seg = &vm_phys_segs[segind];
                pages += atop(seg->end - seg->start);
        }
#endif
        KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
            ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
        seg = &vm_phys_segs[vm_phys_nsegs++];
        seg->start = start;
        seg->end = end;
        seg->domain = domain;
#ifdef VM_PHYSSEG_SPARSE
        seg->first_page = &vm_page_array[pages];
#else
        seg->first_page = PHYS_TO_VM_PAGE(start);
#endif
#if VM_NDOMAIN > 1
        if (flind == VM_FREELIST_DEFAULT && domain != 0) {
                flind = VM_NFREELIST + (domain - 1);
                if (flind >= vm_nfreelists)
                        vm_nfreelists = flind + 1;
        }
#endif
        seg->free_queues = &vm_phys_free_queues[flind];
}

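/*
 * Create one or more physical memory segments covering [start, end).
 * When NUMA affinity information is available, the range is split at
 * each mem_affinity boundary so that every resulting segment lies
 * entirely within a single memory domain.
 */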
static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
{
        int i;

        if (mem_affinity == NULL) {
                _vm_phys_create_seg(start, end, flind, 0);
                return;
        }

        for (i = 0;; i++) {
                if (mem_affinity[i].end == 0)
                        panic("Reached end of affinity info");
                if (mem_affinity[i].end <= start)
                        continue;
                if (mem_affinity[i].start > start)
                        panic("No affinity info for start %jx",
                            (uintmax_t)start);
                if (mem_affinity[i].end >= end) {
                        _vm_phys_create_seg(start, end, flind,
                            mem_affinity[i].domain);
                        break;
                }
                _vm_phys_create_seg(start, mem_affinity[i].end, flind,
                    mem_affinity[i].domain);
                start = mem_affinity[i].end;
        }
}

/*
 * Initialize the physical memory allocator.
 */
void
vm_phys_init(void)
{
        struct vm_freelist *fl;
        int flind, i, oind, pind;
#if VM_NDOMAIN > 1
        int ndomains, j;
#endif

        for (i = 0; phys_avail[i + 1] != 0; i += 2) {
#ifdef  VM_FREELIST_ISADMA
                if (phys_avail[i] < 16777216) {
                        if (phys_avail[i + 1] > 16777216) {
                                vm_phys_create_seg(phys_avail[i], 16777216,
                                    VM_FREELIST_ISADMA);
                                vm_phys_create_seg(16777216, phys_avail[i + 1],
                                    VM_FREELIST_DEFAULT);
                        } else {
                                vm_phys_create_seg(phys_avail[i],
                                    phys_avail[i + 1], VM_FREELIST_ISADMA);
                        }
                        if (VM_FREELIST_ISADMA >= vm_nfreelists)
                                vm_nfreelists = VM_FREELIST_ISADMA + 1;
                } else
#endif
#ifdef  VM_FREELIST_HIGHMEM
                if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
                        if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
                                vm_phys_create_seg(phys_avail[i],
                                    VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
                                vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
                                    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
                        } else {
                                vm_phys_create_seg(phys_avail[i],
                                    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
                        }
                        if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
                                vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
                } else
#endif
                vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
                    VM_FREELIST_DEFAULT);
        }
        for (flind = 0; flind < vm_nfreelists; flind++) {
                for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                        fl = vm_phys_free_queues[flind][pind];
                        for (oind = 0; oind < VM_NFREEORDER; oind++)
                                TAILQ_INIT(&fl[oind].pl);
                }
        }
#if VM_NDOMAIN > 1
        /*
         * Build a free list lookup list for each domain.  All of the
         * memory domain lists are inserted at the VM_FREELIST_DEFAULT
         * index in a round-robin order starting with the current
         * domain.
         */
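        /*
         * For example, with two domains, domain 0's lookup list probes
         * its own default list first and then domain 1's default list
         * (stored at index VM_NFREELIST), while domain 1's lookup list
         * does the reverse; the remaining regular free lists follow in
         * their usual order.
         */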
        ndomains = vm_nfreelists - VM_NFREELIST + 1;
        for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
                for (i = 0; i < ndomains; i++)
                        vm_phys_lookup_lists[i][flind] =
                            &vm_phys_free_queues[flind];
        for (i = 0; i < ndomains; i++)
                for (j = 0; j < ndomains; j++) {
                        flind = (i + j) % ndomains;
                        if (flind == 0)
                                flind = VM_FREELIST_DEFAULT;
                        else
                                flind += VM_NFREELIST - 1;
                        vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
                            &vm_phys_free_queues[flind];
                }
        for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
             flind++)
                for (i = 0; i < ndomains; i++)
                        vm_phys_lookup_lists[i][flind + ndomains - 1] =
                            &vm_phys_free_queues[flind];
#else
        for (flind = 0; flind < vm_nfreelists; flind++)
                vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
#endif

        mtx_init(&vm_phys_fictitious_reg_mtx, "vmfctr", NULL, MTX_DEF);
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 */
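/*
 * For example, splitting an order 3 block (8 pages) down to order 1
 * returns the upper order 2 and order 1 buddies to the free lists and
 * leaves the caller holding the first order 1 block (2 pages) of the
 * original run.
 */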
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
{
        vm_page_t m_buddy;

        while (oind > order) {
                oind--;
                m_buddy = &m[1 << oind];
                KASSERT(m_buddy->order == VM_NFREEORDER,
                    ("vm_phys_split_pages: page %p has unexpected order %d",
                    m_buddy, m_buddy->order));
                m_buddy->order = oind;
                TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
                fl[oind].lcnt++;
        }
}

/*
 * Initialize a physical page and add it to the free lists.
 */
void
vm_phys_add_page(vm_paddr_t pa)
{
        vm_page_t m;

        cnt.v_page_count++;
        m = vm_phys_paddr_to_vm_page(pa);
        m->phys_addr = pa;
        m->queue = PQ_NONE;
        m->segind = vm_phys_paddr_to_segind(pa);
        m->flags = PG_FREE;
        KASSERT(m->order == VM_NFREEORDER,
            ("vm_phys_add_page: page %p has unexpected order %d",
            m, m->order));
        m->pool = VM_FREEPOOL_DEFAULT;
        pmap_page_init(m);
        mtx_lock(&vm_page_queue_free_mtx);
        cnt.v_free_count++;
        vm_phys_free_pages(m, 0);
        mtx_unlock(&vm_page_queue_free_mtx);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int pool, int order)
{
        vm_page_t m;
        int flind;

        for (flind = 0; flind < vm_nfreelists; flind++) {
                m = vm_phys_alloc_freelist_pages(flind, pool, order);
                if (m != NULL)
                        return (m);
        }
        return (NULL);
}

/*
 * Find and dequeue a free page on the given free list, with the
 * specified pool and order
 */
vm_page_t
vm_phys_alloc_freelist_pages(int flind, int pool, int order)
{
        struct vm_freelist *fl;
        struct vm_freelist *alt;
        int domain, oind, pind;
        vm_page_t m;

        KASSERT(flind < VM_NFREELIST,
            ("vm_phys_alloc_freelist_pages: freelist %d is out of range", flind));
        KASSERT(pool < VM_NFREEPOOL,
            ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
        KASSERT(order < VM_NFREEORDER,
            ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

#if VM_NDOMAIN > 1
        domain = PCPU_GET(domain);
#else
        domain = 0;
#endif
        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        fl = (*vm_phys_lookup_lists[domain][flind])[pool];
        for (oind = order; oind < VM_NFREEORDER; oind++) {
                m = TAILQ_FIRST(&fl[oind].pl);
                if (m != NULL) {
                        TAILQ_REMOVE(&fl[oind].pl, m, pageq);
                        fl[oind].lcnt--;
                        m->order = VM_NFREEORDER;
                        vm_phys_split_pages(m, oind, fl, order);
                        return (m);
                }
        }

        /*
         * The given pool was empty.  Find the largest
         * contiguous, power-of-two-sized set of pages in any
         * pool.  Transfer these pages to the given pool, and
         * use them to satisfy the allocation.
         */
        for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
                for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                        alt = (*vm_phys_lookup_lists[domain][flind])[pind];
                        m = TAILQ_FIRST(&alt[oind].pl);
                        if (m != NULL) {
                                TAILQ_REMOVE(&alt[oind].pl, m, pageq);
                                alt[oind].lcnt--;
                                m->order = VM_NFREEORDER;
                                vm_phys_set_pool(pool, m, oind);
                                vm_phys_split_pages(m, oind, fl, order);
                                return (m);
                        }
                }
        }
        return (NULL);
}

/*
 * Allocate physical memory from phys_avail[].
 */
vm_paddr_t
vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment)
{
        vm_paddr_t pa;
        int i;

        size = round_page(size);
        for (i = 0; phys_avail[i + 1] != 0; i += 2) {
                if (phys_avail[i + 1] - phys_avail[i] < size)
                        continue;
                pa = phys_avail[i];
                phys_avail[i] += size;
                return (pa);
        }
        panic("vm_phys_bootstrap_alloc");
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
        struct vm_phys_seg *seg;
        int segind;

        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                seg = &vm_phys_segs[segind];
                if (pa >= seg->start && pa < seg->end)
                        return (&seg->first_page[atop(pa - seg->start)]);
        }
        return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
        struct vm_phys_fictitious_seg *seg;
        vm_page_t m;
        int segind;

        m = NULL;
        for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
                seg = &vm_phys_fictitious_segs[segind];
                if (pa >= seg->start && pa < seg->end) {
                        m = &seg->first_page[atop(pa - seg->start)];
                        KASSERT((m->flags & PG_FICTITIOUS) != 0,
                            ("%p not fictitious", m));
                        break;
                }
        }
        return (m);
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
        struct vm_phys_fictitious_seg *seg;
        vm_page_t fp;
        long i, page_count;
        int segind;
#ifdef VM_PHYSSEG_DENSE
        long pi;
        boolean_t malloced;
#endif

        page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
        pi = atop(start);
        if (pi >= first_page && atop(end) < vm_page_array_size) {
                fp = &vm_page_array[pi - first_page];
                malloced = FALSE;
        } else
#endif
        {
                fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
                    M_WAITOK | M_ZERO);
#ifdef VM_PHYSSEG_DENSE
                malloced = TRUE;
#endif
        }
        for (i = 0; i < page_count; i++) {
                vm_page_initfake(&fp[i], start + PAGE_SIZE * i, memattr);
                pmap_page_init(&fp[i]);
                fp[i].oflags &= ~(VPO_BUSY | VPO_UNMANAGED);
        }
        mtx_lock(&vm_phys_fictitious_reg_mtx);
        for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
                seg = &vm_phys_fictitious_segs[segind];
                if (seg->start == 0 && seg->end == 0) {
                        seg->start = start;
                        seg->end = end;
                        seg->first_page = fp;
                        mtx_unlock(&vm_phys_fictitious_reg_mtx);
                        return (0);
                }
        }
        mtx_unlock(&vm_phys_fictitious_reg_mtx);
#ifdef VM_PHYSSEG_DENSE
        if (malloced)
#endif
                free(fp, M_FICT_PAGES);
        return (EBUSY);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
        struct vm_phys_fictitious_seg *seg;
        vm_page_t fp;
        int segind;
#ifdef VM_PHYSSEG_DENSE
        long pi;
#endif

#ifdef VM_PHYSSEG_DENSE
        pi = atop(start);
#endif

        mtx_lock(&vm_phys_fictitious_reg_mtx);
        for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
                seg = &vm_phys_fictitious_segs[segind];
                if (seg->start == start && seg->end == end) {
                        seg->start = seg->end = 0;
                        fp = seg->first_page;
                        seg->first_page = NULL;
                        mtx_unlock(&vm_phys_fictitious_reg_mtx);
#ifdef VM_PHYSSEG_DENSE
                        if (pi < first_page || atop(end) >= vm_page_array_size)
#endif
                                free(fp, M_FICT_PAGES);
                        return;
                }
        }
        mtx_unlock(&vm_phys_fictitious_reg_mtx);
        KASSERT(0, ("Unregistering not registered fictitious range"));
}

/*
 * Find the segment containing the given physical address.
 */
static int
vm_phys_paddr_to_segind(vm_paddr_t pa)
{
        struct vm_phys_seg *seg;
        int segind;

        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                seg = &vm_phys_segs[segind];
                if (pa >= seg->start && pa < seg->end)
                        return (segind);
        }
        panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment",
            (uintmax_t)pa);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        vm_paddr_t pa, pa_buddy;
        vm_page_t m_buddy;

        KASSERT(m->order == VM_NFREEORDER,
            ("vm_phys_free_pages: page %p has unexpected order %d",
            m, m->order));
        KASSERT(m->pool < VM_NFREEPOOL,
            ("vm_phys_free_pages: page %p has unexpected pool %d",
            m, m->pool));
        KASSERT(order < VM_NFREEORDER,
            ("vm_phys_free_pages: order %d is out of range", order));
        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        pa = VM_PAGE_TO_PHYS(m);
        seg = &vm_phys_segs[m->segind];
        while (order < VM_NFREEORDER - 1) {
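                /*
                 * The buddy of a 2^order page block differs from it only
                 * in bit (PAGE_SHIFT + order) of its physical address, so
                 * toggling that bit yields the buddy's address.  For
                 * example, with 4KB pages (PAGE_SHIFT == 12), the order 0
                 * buddy of the page at 0x3000 is the page at 0x2000.
                 */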
                pa_buddy = pa ^ (1 << (PAGE_SHIFT + order));
                if (pa_buddy < seg->start ||
                    pa_buddy >= seg->end)
                        break;
                m_buddy = &seg->first_page[atop(pa_buddy - seg->start)];
                if (m_buddy->order != order)
                        break;
                fl = (*seg->free_queues)[m_buddy->pool];
                TAILQ_REMOVE(&fl[m_buddy->order].pl, m_buddy, pageq);
                fl[m_buddy->order].lcnt--;
                m_buddy->order = VM_NFREEORDER;
                if (m_buddy->pool != m->pool)
                        vm_phys_set_pool(m->pool, m_buddy, order);
                order++;
                pa &= ~((1 << (PAGE_SHIFT + order)) - 1);
                m = &seg->first_page[atop(pa - seg->start)];
        }
        m->order = order;
        fl = (*seg->free_queues)[m->pool];
        TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
        fl[order].lcnt++;
}

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
        vm_page_t m_tmp;

        for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
                m_tmp->pool = pool;
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
 * FALSE, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
boolean_t
vm_phys_unfree_page(vm_page_t m)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        vm_paddr_t pa, pa_half;
        vm_page_t m_set, m_tmp;
        int order;

        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);

        /*
         * First, find the contiguous, power of two-sized set of free
         * physical pages containing the given physical page "m" and
         * assign it to "m_set".
         */
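        /*
         * Each candidate set is found by clearing the low-order address
         * bits of "m", i.e., rounding down to the start of the aligned,
         * 2^order page block that would contain "m" at that order.
         */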
        seg = &vm_phys_segs[m->segind];
        for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
            order < VM_NFREEORDER - 1; ) {
                order++;
                pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
                if (pa >= seg->start)
                        m_set = &seg->first_page[atop(pa - seg->start)];
                else
                        return (FALSE);
        }
        if (m_set->order < order)
                return (FALSE);
        if (m_set->order == VM_NFREEORDER)
                return (FALSE);
        KASSERT(m_set->order < VM_NFREEORDER,
            ("vm_phys_unfree_page: page %p has unexpected order %d",
            m_set, m_set->order));

        /*
         * Next, remove "m_set" from the free lists.  Finally, extract
         * "m" from "m_set" using an iterative algorithm: While "m_set"
         * is larger than a page, shrink "m_set" by returning the half
         * of "m_set" that does not contain "m" to the free lists.
         */
        fl = (*seg->free_queues)[m_set->pool];
        order = m_set->order;
        TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
        fl[order].lcnt--;
        m_set->order = VM_NFREEORDER;
        while (order > 0) {
                order--;
                pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
                if (m->phys_addr < pa_half)
                        m_tmp = &seg->first_page[atop(pa_half - seg->start)];
                else {
                        m_tmp = m_set;
                        m_set = &seg->first_page[atop(pa_half - seg->start)];
                }
                m_tmp->order = order;
                TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
                fl[order].lcnt++;
        }
        KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
        return (TRUE);
}

/*
 * Try to zero one physical page.  Used by an idle priority thread.
 */
boolean_t
vm_phys_zero_pages_idle(void)
{
        static struct vm_freelist *fl = vm_phys_free_queues[0][0];
        static int flind, oind, pind;
        vm_page_t m, m_tmp;

        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        for (;;) {
                TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
                        for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
                                if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
                                        vm_phys_unfree_page(m_tmp);
                                        cnt.v_free_count--;
                                        mtx_unlock(&vm_page_queue_free_mtx);
                                        pmap_zero_page_idle(m_tmp);
                                        m_tmp->flags |= PG_ZERO;
                                        mtx_lock(&vm_page_queue_free_mtx);
                                        cnt.v_free_count++;
                                        vm_phys_free_pages(m_tmp, 0);
                                        vm_page_zero_count++;
                                        cnt_prezero++;
                                        return (TRUE);
                                }
                        }
                }
                oind++;
                if (oind == VM_NFREEORDER) {
                        oind = 0;
                        pind++;
                        if (pind == VM_NFREEPOOL) {
                                pind = 0;
                                flind++;
                                if (flind == vm_nfreelists)
                                        flind = 0;
                        }
                        fl = vm_phys_free_queues[flind][pind];
                }
        }
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
    unsigned long alignment, unsigned long boundary)
{
        struct vm_freelist *fl;
        struct vm_phys_seg *seg;
        struct vnode *vp;
        vm_paddr_t pa, pa_last, size;
        vm_page_t deferred_vdrop_list, m, m_ret;
        int domain, flind, i, oind, order, pind;

#if VM_NDOMAIN > 1
        domain = PCPU_GET(domain);
#else
        domain = 0;
#endif
        size = npages << PAGE_SHIFT;
        KASSERT(size != 0,
            ("vm_phys_alloc_contig: size must not be 0"));
        KASSERT((alignment & (alignment - 1)) == 0,
            ("vm_phys_alloc_contig: alignment must be a power of 2"));
        KASSERT((boundary & (boundary - 1)) == 0,
            ("vm_phys_alloc_contig: boundary must be a power of 2"));
        deferred_vdrop_list = NULL;
        /* Compute the queue that is the best fit for npages. */
        for (order = 0; (1 << order) < npages; order++);
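        /*
         * This yields the smallest order whose block size covers npages,
         * i.e., the ceiling of log2(npages); e.g., npages == 3 gives
         * order 2, a 4 page block.
         */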
        mtx_lock(&vm_page_queue_free_mtx);
#if VM_NRESERVLEVEL > 0
retry:
#endif
        for (flind = 0; flind < vm_nfreelists; flind++) {
                for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
                        for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                fl = (*vm_phys_lookup_lists[domain][flind])
                                    [pind];
                                TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
                                        /*
                                         * A free list may contain physical pages
                                         * from one or more segments.
                                         */
                                        seg = &vm_phys_segs[m_ret->segind];
                                        if (seg->start > high ||
                                            low >= seg->end)
                                                continue;

                                        /*
                                         * Is the size of this allocation request
                                         * larger than the largest block size?
                                         */
                                        if (order >= VM_NFREEORDER) {
                                                /*
                                                 * Determine if a sufficient number
                                                 * of subsequent blocks to satisfy
                                                 * the allocation request are free.
                                                 */
                                                pa = VM_PAGE_TO_PHYS(m_ret);
                                                pa_last = pa + size;
                                                for (;;) {
                                                        pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
                                                        if (pa >= pa_last)
                                                                break;
                                                        if (pa < seg->start ||
                                                            pa >= seg->end)
                                                                break;
                                                        m = &seg->first_page[atop(pa - seg->start)];
                                                        if (m->order != VM_NFREEORDER - 1)
                                                                break;
                                                }
                                                /* If not, continue to the next block. */
                                                if (pa < pa_last)
                                                        continue;
                                        }

                                        /*
                                         * Determine if the blocks are within the given range,
                                         * satisfy the given alignment, and do not cross the
                                         * given boundary.
                                         */
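                                        /*
                                         * The boundary test works because
                                         * pa and pa + size - 1 agree in
                                         * every bit above log2(boundary)
                                         * exactly when the run lies within
                                         * one boundary-aligned block.
                                         */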
                                        pa = VM_PAGE_TO_PHYS(m_ret);
                                        if (pa >= low &&
                                            pa + size <= high &&
                                            (pa & (alignment - 1)) == 0 &&
                                            ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
                                                goto done;
                                }
                        }
                }
        }
#if VM_NRESERVLEVEL > 0
        if (vm_reserv_reclaim_contig(size, low, high, alignment, boundary))
                goto retry;
#endif
        mtx_unlock(&vm_page_queue_free_mtx);
        return (NULL);
done:
        for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
                fl = (*seg->free_queues)[m->pool];
                TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
                fl[m->order].lcnt--;
                m->order = VM_NFREEORDER;
        }
        if (m_ret->pool != VM_FREEPOOL_DEFAULT)
                vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
        fl = (*seg->free_queues)[m_ret->pool];
        vm_phys_split_pages(m_ret, oind, fl, order);
        for (i = 0; i < npages; i++) {
                m = &m_ret[i];
                vp = vm_page_alloc_init(m);
                if (vp != NULL) {
                        /*
                         * Enqueue the vnode for deferred vdrop().
                         *
                         * Unmanaged pages don't use "pageq", so it
                         * can be safely abused to construct a short-
                         * lived queue of vnodes.
                         */
                        m->pageq.tqe_prev = (void *)vp;
                        m->pageq.tqe_next = deferred_vdrop_list;
                        deferred_vdrop_list = m;
                }
        }
        for (; i < roundup2(npages, 1 << imin(oind, order)); i++) {
                m = &m_ret[i];
                KASSERT(m->order == VM_NFREEORDER,
                    ("vm_phys_alloc_contig: page %p has unexpected order %d",
                    m, m->order));
                vm_phys_free_pages(m, 0);
        }
        mtx_unlock(&vm_page_queue_free_mtx);
        while (deferred_vdrop_list != NULL) {
                vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
                deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
        }
        return (m_ret);
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
        struct vm_freelist *fl;
        int flind, oind, pind;

        for (flind = 0; flind < vm_nfreelists; flind++) {
                db_printf("FREE LIST %d:\n"
                    "\n  ORDER (SIZE)  |  NUMBER"
                    "\n              ", flind);
                for (pind = 0; pind < VM_NFREEPOOL; pind++)
                        db_printf("  |  POOL %d", pind);
                db_printf("\n--            ");
                for (pind = 0; pind < VM_NFREEPOOL; pind++)
                        db_printf("-- --      ");
                db_printf("--\n");
                for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
                        db_printf("  %2.2d (%6.6dK)", oind,
                            1 << (PAGE_SHIFT - 10 + oind));
                        for (pind = 0; pind < VM_NFREEPOOL; pind++) {
                                fl = vm_phys_free_queues[flind][pind];
                                db_printf("  |  %6.6d", fl[oind].lcnt);
                        }
                        db_printf("\n");
                }
                db_printf("\n");
        }
}
#endif