1 /*-
2  * Copyright (c) 2002-2006 Rice University
3  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
4  * All rights reserved.
5  *
6  * This software was developed for the FreeBSD Project by Alan L. Cox,
7  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
25  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
28  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34
35 #include "opt_ddb.h"
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/lock.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mutex.h>
43 #include <sys/queue.h>
44 #include <sys/sbuf.h>
45 #include <sys/sysctl.h>
46 #include <sys/vmmeter.h>
47 #include <sys/vnode.h>
48
49 #include <ddb/ddb.h>
50
51 #include <vm/vm.h>
52 #include <vm/vm_param.h>
53 #include <vm/vm_kern.h>
54 #include <vm/vm_object.h>
55 #include <vm/vm_page.h>
56 #include <vm/vm_phys.h>
57 #include <vm/vm_reserv.h>
58
59 /*
60  * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
61  * domain.  These extra lists are stored at the end of the regular
62  * free lists starting with VM_NFREELIST.
63  */
64 #define VM_RAW_NFREELIST        (VM_NFREELIST + VM_NDOMAIN - 1)
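/*
 * For illustration only (the actual values are platform-dependent): with
 * VM_NFREELIST == 3 and VM_NDOMAIN == 2, VM_RAW_NFREELIST is 4.  Indices
 * 0 .. 2 are the regular free lists, with VM_FREELIST_DEFAULT holding
 * domain 0's default memory, and index 3 (VM_NFREELIST + 0) holds the
 * VM_FREELIST_DEFAULT memory belonging to domain 1.
 */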
65
66 struct vm_freelist {
67         struct pglist pl;
68         int lcnt;
69 };
70
71 struct vm_phys_seg {
72         vm_paddr_t      start;
73         vm_paddr_t      end;
74         vm_page_t       first_page;
75         int             domain;
76         struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
77 };
78
79 struct mem_affinity *mem_affinity;
80
81 static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
82
83 static int vm_phys_nsegs;
84
85 static struct vm_freelist
86     vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
87 static struct vm_freelist
88 (*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];
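/*
 * vm_phys_lookup_lists[domain][i] points at the i-th set of free queues
 * that a CPU in "domain" should search; vm_phys_init() orders the
 * per-domain default entries so that each domain searches its own memory
 * before that of other domains.
 */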
89
90 static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;
91
92 static int cnt_prezero;
93 SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
94     &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");
95
96 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
97 SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
98     NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");
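/* Readable from userland, e.g. "sysctl vm.phys_free". */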
99
100 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
101 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
102     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
103
104 #if VM_NDOMAIN > 1
105 static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
106 SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
107     NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
108 #endif
109
110 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
111     int domain);
112 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
113 static int vm_phys_paddr_to_segind(vm_paddr_t pa);
114 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
115     int order);
116
117 /*
118  * Outputs the state of the physical memory allocator, specifically,
119  * the amount of physical memory in each free list.
120  */
121 static int
122 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
123 {
124         struct sbuf sbuf;
125         struct vm_freelist *fl;
126         int error, flind, oind, pind;
127
128         error = sysctl_wire_old_buffer(req, 0);
129         if (error != 0)
130                 return (error);
131         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
132         for (flind = 0; flind < vm_nfreelists; flind++) {
133                 sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
134                     "\n  ORDER (SIZE)  |  NUMBER"
135                     "\n              ", flind);
136                 for (pind = 0; pind < VM_NFREEPOOL; pind++)
137                         sbuf_printf(&sbuf, "  |  POOL %d", pind);
138                 sbuf_printf(&sbuf, "\n--            ");
139                 for (pind = 0; pind < VM_NFREEPOOL; pind++)
140                         sbuf_printf(&sbuf, "-- --      ");
141                 sbuf_printf(&sbuf, "--\n");
142                 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
143                         sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
144                             1 << (PAGE_SHIFT - 10 + oind));
145                         for (pind = 0; pind < VM_NFREEPOOL; pind++) {
146                                 fl = vm_phys_free_queues[flind][pind];
147                                 sbuf_printf(&sbuf, "  |  %6d", fl[oind].lcnt);
148                         }
149                         sbuf_printf(&sbuf, "\n");
150                 }
151         }
152         error = sbuf_finish(&sbuf);
153         sbuf_delete(&sbuf);
154         return (error);
155 }
156
157 /*
158  * Outputs the set of physical memory segments.
159  */
160 static int
161 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
162 {
163         struct sbuf sbuf;
164         struct vm_phys_seg *seg;
165         int error, segind;
166
167         error = sysctl_wire_old_buffer(req, 0);
168         if (error != 0)
169                 return (error);
170         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
171         for (segind = 0; segind < vm_phys_nsegs; segind++) {
172                 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
173                 seg = &vm_phys_segs[segind];
174                 sbuf_printf(&sbuf, "start:     %#jx\n",
175                     (uintmax_t)seg->start);
176                 sbuf_printf(&sbuf, "end:       %#jx\n",
177                     (uintmax_t)seg->end);
178                 sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
179                 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
180         }
181         error = sbuf_finish(&sbuf);
182         sbuf_delete(&sbuf);
183         return (error);
184 }
185
186 #if VM_NDOMAIN > 1
187 /*
188  * Outputs the set of free list lookup lists.
189  */
190 static int
191 sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
192 {
193         struct sbuf sbuf;
194         int domain, error, flind, ndomains;
195
196         error = sysctl_wire_old_buffer(req, 0);
197         if (error != 0)
198                 return (error);
199         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
200         ndomains = vm_nfreelists - VM_NFREELIST + 1;
201         for (domain = 0; domain < ndomains; domain++) {
202                 sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain);
203                 for (flind = 0; flind < vm_nfreelists; flind++)
204                         sbuf_printf(&sbuf, "  [%d]:\t%p\n", flind,
205                             vm_phys_lookup_lists[domain][flind]);
206         }
207         error = sbuf_finish(&sbuf);
208         sbuf_delete(&sbuf);
209         return (error);
210 }
211 #endif
212
213 /*
214  * Create a physical memory segment.
215  */
216 static void
217 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
218 {
219         struct vm_phys_seg *seg;
220 #ifdef VM_PHYSSEG_SPARSE
221         long pages;
222         int segind;
223
224         pages = 0;
225         for (segind = 0; segind < vm_phys_nsegs; segind++) {
226                 seg = &vm_phys_segs[segind];
227                 pages += atop(seg->end - seg->start);
228         }
229 #endif
230         KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
231             ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
232         seg = &vm_phys_segs[vm_phys_nsegs++];
233         seg->start = start;
234         seg->end = end;
235         seg->domain = domain;
236 #ifdef VM_PHYSSEG_SPARSE
237         seg->first_page = &vm_page_array[pages];
238 #else
239         seg->first_page = PHYS_TO_VM_PAGE(start);
240 #endif
241 #if VM_NDOMAIN > 1
242         if (flind == VM_FREELIST_DEFAULT && domain != 0) {
243                 flind = VM_NFREELIST + (domain - 1);
244                 if (flind >= vm_nfreelists)
245                         vm_nfreelists = flind + 1;
246         }
247 #endif
248         seg->free_queues = &vm_phys_free_queues[flind];
249 }
250
251 static void
252 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
253 {
254         int i;
255
256         if (mem_affinity == NULL) {
257                 _vm_phys_create_seg(start, end, flind, 0);
258                 return;
259         }
260
261         for (i = 0;; i++) {
262                 if (mem_affinity[i].end == 0)
263                         panic("Reached end of affinity info");
264                 if (mem_affinity[i].end <= start)
265                         continue;
266                 if (mem_affinity[i].start > start)
267                         panic("No affinity info for start %jx",
268                             (uintmax_t)start);
269                 if (mem_affinity[i].end >= end) {
270                         _vm_phys_create_seg(start, end, flind,
271                             mem_affinity[i].domain);
272                         break;
273                 }
274                 _vm_phys_create_seg(start, mem_affinity[i].end, flind,
275                     mem_affinity[i].domain);
276                 start = mem_affinity[i].end;
277         }
278 }
279
280 /*
281  * Initialize the physical memory allocator.
282  */
283 void
284 vm_phys_init(void)
285 {
286         struct vm_freelist *fl;
287         int flind, i, oind, pind;
288 #if VM_NDOMAIN > 1
289         int ndomains, j;
290 #endif
291
292         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
293 #ifdef  VM_FREELIST_ISADMA
294                 if (phys_avail[i] < 16777216) {
295                         if (phys_avail[i + 1] > 16777216) {
296                                 vm_phys_create_seg(phys_avail[i], 16777216,
297                                     VM_FREELIST_ISADMA);
298                                 vm_phys_create_seg(16777216, phys_avail[i + 1],
299                                     VM_FREELIST_DEFAULT);
300                         } else {
301                                 vm_phys_create_seg(phys_avail[i],
302                                     phys_avail[i + 1], VM_FREELIST_ISADMA);
303                         }
304                         if (VM_FREELIST_ISADMA >= vm_nfreelists)
305                                 vm_nfreelists = VM_FREELIST_ISADMA + 1;
306                 } else
307 #endif
308 #ifdef  VM_FREELIST_HIGHMEM
309                 if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
310                         if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
311                                 vm_phys_create_seg(phys_avail[i],
312                                     VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
313                                 vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
314                                     phys_avail[i + 1], VM_FREELIST_HIGHMEM);
315                         } else {
316                                 vm_phys_create_seg(phys_avail[i],
317                                     phys_avail[i + 1], VM_FREELIST_HIGHMEM);
318                         }
319                         if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
320                                 vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
321                 } else
322 #endif
323                 vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
324                     VM_FREELIST_DEFAULT);
325         }
326         for (flind = 0; flind < vm_nfreelists; flind++) {
327                 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
328                         fl = vm_phys_free_queues[flind][pind];
329                         for (oind = 0; oind < VM_NFREEORDER; oind++)
330                                 TAILQ_INIT(&fl[oind].pl);
331                 }
332         }
333 #if VM_NDOMAIN > 1
334         /*
335          * Build a free list lookup list for each domain.  All of the
336          * memory domain lists are inserted at the VM_FREELIST_DEFAULT
337          * index in a round-robin order starting with the current
338          * domain.
339          */
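        /*
         * As a concrete sketch (illustrative, assuming ndomains == 2 and
         * VM_FREELIST_DEFAULT == 0): domain 0 searches its own default
         * queues first and then domain 1's (raw indices 0, then
         * VM_NFREELIST), while domain 1 searches them in the opposite
         * order; the remaining free lists follow after the per-domain
         * default entries.
         */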
340         ndomains = vm_nfreelists - VM_NFREELIST + 1;
341         for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
342                 for (i = 0; i < ndomains; i++)
343                         vm_phys_lookup_lists[i][flind] =
344                             &vm_phys_free_queues[flind];
345         for (i = 0; i < ndomains; i++)
346                 for (j = 0; j < ndomains; j++) {
347                         flind = (i + j) % ndomains;
348                         if (flind == 0)
349                                 flind = VM_FREELIST_DEFAULT;
350                         else
351                                 flind += VM_NFREELIST - 1;
352                         vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
353                             &vm_phys_free_queues[flind];
354                 }
355         for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
356              flind++)
357                 for (i = 0; i < ndomains; i++)
358                         vm_phys_lookup_lists[i][flind + ndomains - 1] =
359                             &vm_phys_free_queues[flind];
360 #else
361         for (flind = 0; flind < vm_nfreelists; flind++)
362                 vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
363 #endif
364 }
365
366 /*
367  * Split a contiguous, power of two-sized set of physical pages.
368  */
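/*
 * For example (illustrative, assuming 4KB pages): splitting an order-3
 * block at physical address 0x8000000 down to order 0 pushes the order-2
 * buddy at 0x8004000, the order-1 buddy at 0x8002000, and the order-0
 * buddy at 0x8001000 onto their free lists, leaving the single page at
 * 0x8000000 for the caller.
 */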
369 static __inline void
370 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
371 {
372         vm_page_t m_buddy;
373
374         while (oind > order) {
375                 oind--;
376                 m_buddy = &m[1 << oind];
377                 KASSERT(m_buddy->order == VM_NFREEORDER,
378                     ("vm_phys_split_pages: page %p has unexpected order %d",
379                     m_buddy, m_buddy->order));
380                 m_buddy->order = oind;
381                 TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
382                 fl[oind].lcnt++;
383         }
384 }
385
386 /*
387  * Initialize a physical page and add it to the free lists.
388  */
389 void
390 vm_phys_add_page(vm_paddr_t pa)
391 {
392         vm_page_t m;
393
394         cnt.v_page_count++;
395         m = vm_phys_paddr_to_vm_page(pa);
396         m->phys_addr = pa;
397         m->queue = PQ_NONE;
398         m->segind = vm_phys_paddr_to_segind(pa);
399         m->flags = PG_FREE;
400         KASSERT(m->order == VM_NFREEORDER,
401             ("vm_phys_add_page: page %p has unexpected order %d",
402             m, m->order));
403         m->pool = VM_FREEPOOL_DEFAULT;
404         pmap_page_init(m);
405         mtx_lock(&vm_page_queue_free_mtx);
406         cnt.v_free_count++;
407         vm_phys_free_pages(m, 0);
408         mtx_unlock(&vm_page_queue_free_mtx);
409 }
410
411 /*
412  * Allocate a contiguous, power of two-sized set of physical pages
413  * from the free lists.
414  *
415  * The free page queues must be locked.
416  */
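/*
 * A minimal usage sketch (illustrative): with the free queue lock held,
 *
 *	m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, 4);
 *
 * requests a naturally aligned run of 16 pages (order 4) from the default
 * pool, returning NULL if no free list can satisfy the request.
 */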
417 vm_page_t
418 vm_phys_alloc_pages(int pool, int order)
419 {
420         vm_page_t m;
421         int flind;
422
423         for (flind = 0; flind < vm_nfreelists; flind++) {
424                 m = vm_phys_alloc_freelist_pages(flind, pool, order);
425                 if (m != NULL)
426                         return (m);
427         }
428         return (NULL);
429 }
430
431 /*
432  * Find and dequeue a free page on the given free list, with the
433  * specified pool and order.
434  */
435 vm_page_t
436 vm_phys_alloc_freelist_pages(int flind, int pool, int order)
437 {       
438         struct vm_freelist *fl;
439         struct vm_freelist *alt;
440         int domain, oind, pind;
441         vm_page_t m;
442
443         KASSERT(flind < VM_NFREELIST,
444             ("vm_phys_alloc_freelist_pages: freelist %d is out of range", flind));
445         KASSERT(pool < VM_NFREEPOOL,
446             ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
447         KASSERT(order < VM_NFREEORDER,
448             ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
449
450 #if VM_NDOMAIN > 1
451         domain = PCPU_GET(domain);
452 #else
453         domain = 0;
454 #endif
455         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
456         fl = (*vm_phys_lookup_lists[domain][flind])[pool];
457         for (oind = order; oind < VM_NFREEORDER; oind++) {
458                 m = TAILQ_FIRST(&fl[oind].pl);
459                 if (m != NULL) {
460                         TAILQ_REMOVE(&fl[oind].pl, m, pageq);
461                         fl[oind].lcnt--;
462                         m->order = VM_NFREEORDER;
463                         vm_phys_split_pages(m, oind, fl, order);
464                         return (m);
465                 }
466         }
467
468         /*
469          * The given pool was empty.  Find the largest
470          * contiguous, power-of-two-sized set of pages in any
471          * pool.  Transfer these pages to the given pool, and
472          * use them to satisfy the allocation.
473          */
474         for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
475                 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
476                         alt = (*vm_phys_lookup_lists[domain][flind])[pind];
477                         m = TAILQ_FIRST(&alt[oind].pl);
478                         if (m != NULL) {
479                                 TAILQ_REMOVE(&alt[oind].pl, m, pageq);
480                                 alt[oind].lcnt--;
481                                 m->order = VM_NFREEORDER;
482                                 vm_phys_set_pool(pool, m, oind);
483                                 vm_phys_split_pages(m, oind, fl, order);
484                                 return (m);
485                         }
486                 }
487         }
488         return (NULL);
489 }
490
491 /*
492  * Allocate physical memory from phys_avail[].
493  */
494 vm_paddr_t
495 vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment)
496 {
497         vm_paddr_t pa;
498         int i;
499
500         size = round_page(size);
501         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
502                 if (phys_avail[i + 1] - phys_avail[i] < size)
503                         continue;
504                 pa = phys_avail[i];
505                 phys_avail[i] += size;
506                 return (pa);
507         }
508         panic("vm_phys_bootstrap_alloc");
509 }
510
511 /*
512  * Find the vm_page corresponding to the given physical address.
513  */
514 vm_page_t
515 vm_phys_paddr_to_vm_page(vm_paddr_t pa)
516 {
517         struct vm_phys_seg *seg;
518         int segind;
519
520         for (segind = 0; segind < vm_phys_nsegs; segind++) {
521                 seg = &vm_phys_segs[segind];
522                 if (pa >= seg->start && pa < seg->end)
523                         return (&seg->first_page[atop(pa - seg->start)]);
524         }
525         return (NULL);
526 }
527
528 /*
529  * Find the segment containing the given physical address.
530  */
531 static int
532 vm_phys_paddr_to_segind(vm_paddr_t pa)
533 {
534         struct vm_phys_seg *seg;
535         int segind;
536
537         for (segind = 0; segind < vm_phys_nsegs; segind++) {
538                 seg = &vm_phys_segs[segind];
539                 if (pa >= seg->start && pa < seg->end)
540                         return (segind);
541         }
542         panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment",
543             (uintmax_t)pa);
544 }
545
546 /*
547  * Free a contiguous, power of two-sized set of physical pages.
548  *
549  * The free page queues must be locked.
550  */
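/*
 * Coalescing sketch (illustrative, assuming 4KB pages): freeing the
 * order-0 page at 0x5000 computes its buddy as 0x5000 ^ 0x1000 == 0x4000;
 * if that page is free at order 0, the pair becomes an order-1 block at
 * 0x4000 whose buddy is 0x4000 ^ 0x2000 == 0x6000, and merging continues
 * until a buddy is allocated, of a different order, or outside the
 * segment.
 */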
551 void
552 vm_phys_free_pages(vm_page_t m, int order)
553 {
554         struct vm_freelist *fl;
555         struct vm_phys_seg *seg;
556         vm_paddr_t pa, pa_buddy;
557         vm_page_t m_buddy;
558
559         KASSERT(m->order == VM_NFREEORDER,
560             ("vm_phys_free_pages: page %p has unexpected order %d",
561             m, m->order));
562         KASSERT(m->pool < VM_NFREEPOOL,
563             ("vm_phys_free_pages: page %p has unexpected pool %d",
564             m, m->pool));
565         KASSERT(order < VM_NFREEORDER,
566             ("vm_phys_free_pages: order %d is out of range", order));
567         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
568         pa = VM_PAGE_TO_PHYS(m);
569         seg = &vm_phys_segs[m->segind];
570         while (order < VM_NFREEORDER - 1) {
571                 pa_buddy = pa ^ (1 << (PAGE_SHIFT + order));
572                 if (pa_buddy < seg->start ||
573                     pa_buddy >= seg->end)
574                         break;
575                 m_buddy = &seg->first_page[atop(pa_buddy - seg->start)];
576                 if (m_buddy->order != order)
577                         break;
578                 fl = (*seg->free_queues)[m_buddy->pool];
579                 TAILQ_REMOVE(&fl[m_buddy->order].pl, m_buddy, pageq);
580                 fl[m_buddy->order].lcnt--;
581                 m_buddy->order = VM_NFREEORDER;
582                 if (m_buddy->pool != m->pool)
583                         vm_phys_set_pool(m->pool, m_buddy, order);
584                 order++;
585                 pa &= ~((1 << (PAGE_SHIFT + order)) - 1);
586                 m = &seg->first_page[atop(pa - seg->start)];
587         }
588         m->order = order;
589         fl = (*seg->free_queues)[m->pool];
590         TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
591         fl[order].lcnt++;
592 }
593
594 /*
595  * Set the pool for a contiguous, power of two-sized set of physical pages. 
596  */
597 void
598 vm_phys_set_pool(int pool, vm_page_t m, int order)
599 {
600         vm_page_t m_tmp;
601
602         for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
603                 m_tmp->pool = pool;
604 }
605
606 /*
607  * Search for the given physical page "m" in the free lists.  If the search
608  * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
609  * FALSE, indicating that "m" is not in the free lists.
610  *
611  * The free page queues must be locked.
612  */
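/*
 * Extraction sketch (illustrative, assuming 4KB pages): if "m" is the
 * page at 0x3000 inside a free order-2 block at 0x0000, that block is
 * dequeued, its lower order-1 half at 0x0000 is returned to the free
 * lists, then the order-0 page at 0x2000 is returned as well, leaving
 * only "m" removed from the free lists.
 */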
613 boolean_t
614 vm_phys_unfree_page(vm_page_t m)
615 {
616         struct vm_freelist *fl;
617         struct vm_phys_seg *seg;
618         vm_paddr_t pa, pa_half;
619         vm_page_t m_set, m_tmp;
620         int order;
621
622         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
623
624         /*
625          * First, find the contiguous, power of two-sized set of free
626          * physical pages containing the given physical page "m" and
627          * assign it to "m_set".
628          */
629         seg = &vm_phys_segs[m->segind];
630         for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
631             order < VM_NFREEORDER - 1; ) {
632                 order++;
633                 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
634                 if (pa >= seg->start)
635                         m_set = &seg->first_page[atop(pa - seg->start)];
636                 else
637                         return (FALSE);
638         }
639         if (m_set->order < order)
640                 return (FALSE);
641         if (m_set->order == VM_NFREEORDER)
642                 return (FALSE);
643         KASSERT(m_set->order < VM_NFREEORDER,
644             ("vm_phys_unfree_page: page %p has unexpected order %d",
645             m_set, m_set->order));
646
647         /*
648          * Next, remove "m_set" from the free lists.  Finally, extract
649          * "m" from "m_set" using an iterative algorithm: While "m_set"
650          * is larger than a page, shrink "m_set" by returning the half
651          * of "m_set" that does not contain "m" to the free lists.
652          */
653         fl = (*seg->free_queues)[m_set->pool];
654         order = m_set->order;
655         TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
656         fl[order].lcnt--;
657         m_set->order = VM_NFREEORDER;
658         while (order > 0) {
659                 order--;
660                 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
661                 if (m->phys_addr < pa_half)
662                         m_tmp = &seg->first_page[atop(pa_half - seg->start)];
663                 else {
664                         m_tmp = m_set;
665                         m_set = &seg->first_page[atop(pa_half - seg->start)];
666                 }
667                 m_tmp->order = order;
668                 TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
669                 fl[order].lcnt++;
670         }
671         KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
672         return (TRUE);
673 }
674
675 /*
676  * Try to zero one physical page.  Used by an idle priority thread.
677  */
678 boolean_t
679 vm_phys_zero_pages_idle(void)
680 {
681         static struct vm_freelist *fl = vm_phys_free_queues[0][0];
682         static int flind, oind, pind;
683         vm_page_t m, m_tmp;
684
685         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
686         for (;;) {
687                 TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
688                         for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
689                                 if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
690                                         vm_phys_unfree_page(m_tmp);
691                                         cnt.v_free_count--;
692                                         mtx_unlock(&vm_page_queue_free_mtx);
693                                         pmap_zero_page_idle(m_tmp);
694                                         m_tmp->flags |= PG_ZERO;
695                                         mtx_lock(&vm_page_queue_free_mtx);
696                                         cnt.v_free_count++;
697                                         vm_phys_free_pages(m_tmp, 0);
698                                         vm_page_zero_count++;
699                                         cnt_prezero++;
700                                         return (TRUE);
701                                 }
702                         }
703                 }
704                 oind++;
705                 if (oind == VM_NFREEORDER) {
706                         oind = 0;
707                         pind++;
708                         if (pind == VM_NFREEPOOL) {
709                                 pind = 0;
710                                 flind++;
711                                 if (flind == vm_nfreelists)
712                                         flind = 0;
713                         }
714                         fl = vm_phys_free_queues[flind][pind];
715                 }
716         }
717 }
718
719 /*
720  * Allocate a contiguous set of physical pages of the given size
721  * "npages" from the free lists.  All of the physical pages must be at
722  * or above the given physical address "low" and below the given
723  * physical address "high".  The given value "alignment" determines the
724  * alignment of the first physical page in the set.  If the given value
725  * "boundary" is non-zero, then the set of physical pages cannot cross
726  * any physical address boundary that is a multiple of that value.  Both
727  * "alignment" and "boundary" must be a power of two.
728  */
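/*
 * A minimal usage sketch (illustrative; in-kernel consumers normally
 * reach this path through contigmalloc()): a request for 16 contiguous
 * pages in the low 4GB, aligned to 64KB and not crossing a 1MB boundary,
 * would look like
 *
 *	m = vm_phys_alloc_contig(16, 0, 0xffffffff, 64 * 1024,
 *	    1024 * 1024);
 *
 * with the run starting at VM_PAGE_TO_PHYS(m) on success and NULL
 * returned on failure.
 */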
729 vm_page_t
730 vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
731     unsigned long alignment, unsigned long boundary)
732 {
733         struct vm_freelist *fl;
734         struct vm_phys_seg *seg;
735         struct vnode *vp;
736         vm_paddr_t pa, pa_last, size;
737         vm_page_t deferred_vdrop_list, m, m_ret;
738         int domain, flind, i, oind, order, pind;
739
740 #if VM_NDOMAIN > 1
741         domain = PCPU_GET(domain);
742 #else
743         domain = 0;
744 #endif
745         size = npages << PAGE_SHIFT;
746         KASSERT(size != 0,
747             ("vm_phys_alloc_contig: size must not be 0"));
748         KASSERT((alignment & (alignment - 1)) == 0,
749             ("vm_phys_alloc_contig: alignment must be a power of 2"));
750         KASSERT((boundary & (boundary - 1)) == 0,
751             ("vm_phys_alloc_contig: boundary must be a power of 2"));
752         deferred_vdrop_list = NULL;
753         /* Compute the queue that is the best fit for npages. */
754         for (order = 0; (1 << order) < npages; order++);
755         mtx_lock(&vm_page_queue_free_mtx);
756 #if VM_NRESERVLEVEL > 0
757 retry:
758 #endif
759         for (flind = 0; flind < vm_nfreelists; flind++) {
760                 for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
761                         for (pind = 0; pind < VM_NFREEPOOL; pind++) {
762                                 fl = (*vm_phys_lookup_lists[domain][flind])
763                                     [pind];
764                                 TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
765                                         /*
766                                          * A free list may contain physical pages
767                                          * from one or more segments.
768                                          */
769                                         seg = &vm_phys_segs[m_ret->segind];
770                                         if (seg->start > high ||
771                                             low >= seg->end)
772                                                 continue;
773
774                                         /*
775                                          * Is the size of this allocation request
776                                          * larger than the largest block size?
777                                          */
778                                         if (order >= VM_NFREEORDER) {
779                                                 /*
780                                                  * Determine if a sufficient number
781                                                  * of subsequent blocks to satisfy
782                                                  * the allocation request are free.
783                                                  */
784                                                 pa = VM_PAGE_TO_PHYS(m_ret);
785                                                 pa_last = pa + size;
786                                                 for (;;) {
787                                                         pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
788                                                         if (pa >= pa_last)
789                                                                 break;
790                                                         if (pa < seg->start ||
791                                                             pa >= seg->end)
792                                                                 break;
793                                                         m = &seg->first_page[atop(pa - seg->start)];
794                                                         if (m->order != VM_NFREEORDER - 1)
795                                                                 break;
796                                                 }
797                                                 /* If not, continue to the next block. */
798                                                 if (pa < pa_last)
799                                                         continue;
800                                         }
801
802                                         /*
803                                          * Determine if the blocks are within the given range,
804                                          * satisfy the given alignment, and do not cross the
805                                          * given boundary.
806                                          */
807                                         pa = VM_PAGE_TO_PHYS(m_ret);
808                                         if (pa >= low &&
809                                             pa + size <= high &&
810                                             (pa & (alignment - 1)) == 0 &&
811                                             ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
812                                                 goto done;
813                                 }
814                         }
815                 }
816         }
817 #if VM_NRESERVLEVEL > 0
818         if (vm_reserv_reclaim_contig(size, low, high, alignment, boundary))
819                 goto retry;
820 #endif
821         mtx_unlock(&vm_page_queue_free_mtx);
822         return (NULL);
823 done:
824         for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
825                 fl = (*seg->free_queues)[m->pool];
826                 TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
827                 fl[m->order].lcnt--;
828                 m->order = VM_NFREEORDER;
829         }
830         if (m_ret->pool != VM_FREEPOOL_DEFAULT)
831                 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
832         fl = (*seg->free_queues)[m_ret->pool];
833         vm_phys_split_pages(m_ret, oind, fl, order);
834         for (i = 0; i < npages; i++) {
835                 m = &m_ret[i];
836                 vp = vm_page_alloc_init(m);
837                 if (vp != NULL) {
838                         /*
839                          * Enqueue the vnode for deferred vdrop().
840                          *
841                          * Unmanaged pages don't use "pageq", so it
842                          * can be safely abused to construct a short-
843                          * lived queue of vnodes.
844                          */
845                         m->pageq.tqe_prev = (void *)vp;
846                         m->pageq.tqe_next = deferred_vdrop_list;
847                         deferred_vdrop_list = m;
848                 }
849         }
850         for (; i < roundup2(npages, 1 << imin(oind, order)); i++) {
851                 m = &m_ret[i];
852                 KASSERT(m->order == VM_NFREEORDER,
853                     ("vm_phys_alloc_contig: page %p has unexpected order %d",
854                     m, m->order));
855                 vm_phys_free_pages(m, 0);
856         }
857         mtx_unlock(&vm_page_queue_free_mtx);
858         while (deferred_vdrop_list != NULL) {
859                 vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
860                 deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
861         }
862         return (m_ret);
863 }
864
865 #ifdef DDB
866 /*
867  * Show the number of physical pages in each of the free lists.
868  */
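/* Invoked from the ddb(4) prompt as "show freepages". */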
869 DB_SHOW_COMMAND(freepages, db_show_freepages)
870 {
871         struct vm_freelist *fl;
872         int flind, oind, pind;
873
874         for (flind = 0; flind < vm_nfreelists; flind++) {
875                 db_printf("FREE LIST %d:\n"
876                     "\n  ORDER (SIZE)  |  NUMBER"
877                     "\n              ", flind);
878                 for (pind = 0; pind < VM_NFREEPOOL; pind++)
879                         db_printf("  |  POOL %d", pind);
880                 db_printf("\n--            ");
881                 for (pind = 0; pind < VM_NFREEPOOL; pind++)
882                         db_printf("-- --      ");
883                 db_printf("--\n");
884                 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
885                         db_printf("  %2.2d (%6.6dK)", oind,
886                             1 << (PAGE_SHIFT - 10 + oind));
887                         for (pind = 0; pind < VM_NFREEPOOL; pind++) {
888                                 fl = vm_phys_free_queues[flind][pind];
889                                 db_printf("  |  %6.6d", fl[oind].lcnt);
890                         }
891                         db_printf("\n");
892                 }
893                 db_printf("\n");
894         }
895 }
896 #endif