]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/vm/vm_contig.c
Add buffer corruption protection (RedZone) for kernel's malloc(9).
[FreeBSD/FreeBSD.git] / sys / vm / vm_contig.c
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *      from: @(#)vm_page.c     7.4 (Berkeley) 5/7/91
33  */
34
35 /*-
36  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
37  * All rights reserved.
38  *
39  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
40  *
41  * Permission to use, copy, modify and distribute this software and
42  * its documentation is hereby granted, provided that both the copyright
43  * notice and this permission notice appear in all copies of the
44  * software, derivative works or modified versions, and any portions
45  * thereof, and that both notices appear in supporting documentation.
46  *
47  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
48  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
49  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
50  *
51  * Carnegie Mellon requests users of this software to return to
52  *
53  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
54  *  School of Computer Science
55  *  Carnegie Mellon University
56  *  Pittsburgh PA 15213-3890
57  *
58  * any improvements or extensions that they make and grant Carnegie the
59  * rights to redistribute these changes.
60  */
61
62 #include <sys/cdefs.h>
63 __FBSDID("$FreeBSD$");
64
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/lock.h>
68 #include <sys/malloc.h>
69 #include <sys/mutex.h>
70 #include <sys/proc.h>
71 #include <sys/kernel.h>
72 #include <sys/linker_set.h>
73 #include <sys/sysctl.h>
74 #include <sys/vmmeter.h>
75 #include <sys/vnode.h>
76
77 #include <vm/vm.h>
78 #include <vm/vm_param.h>
79 #include <vm/vm_kern.h>
80 #include <vm/pmap.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_object.h>
83 #include <vm/vm_page.h>
84 #include <vm/vm_pageout.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_extern.h>
87
88 static int
89 vm_contig_launder_page(vm_page_t m)
90 {
91         vm_object_t object;
92         vm_page_t m_tmp;
93         struct vnode *vp;
94
95         object = m->object;
96         if (!VM_OBJECT_TRYLOCK(object))
97                 return (EAGAIN);
98         if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) {
99                 VM_OBJECT_UNLOCK(object);
100                 vm_page_lock_queues();
101                 return (EBUSY);
102         }
103         vm_page_test_dirty(m);
104         if (m->dirty == 0 && m->hold_count == 0)
105                 pmap_remove_all(m);
106         if (m->dirty) {
107                 if (object->type == OBJT_VNODE) {
108                         vm_page_unlock_queues();
109                         vp = object->handle;
110                         VM_OBJECT_UNLOCK(object);
111                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
112                         VM_OBJECT_LOCK(object);
113                         vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
114                         VM_OBJECT_UNLOCK(object);
115                         VOP_UNLOCK(vp, 0, curthread);
116                         vm_page_lock_queues();
117                         return (0);
118                 } else if (object->type == OBJT_SWAP ||
119                            object->type == OBJT_DEFAULT) {
120                         m_tmp = m;
121                         vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC);
122                         VM_OBJECT_UNLOCK(object);
123                         return (0);
124                 }
125         } else if (m->hold_count == 0)
126                 vm_page_cache(m);
127         VM_OBJECT_UNLOCK(object);
128         return (0);
129 }
130
131 static int
132 vm_contig_launder(int queue)
133 {
134         vm_page_t m, next;
135         int error;
136
137         for (m = TAILQ_FIRST(&vm_page_queues[queue].pl); m != NULL; m = next) {
138                 next = TAILQ_NEXT(m, pageq);
139
140                 /* Skip marker pages */
141                 if ((m->flags & PG_MARKER) != 0)
142                         continue;
143
144                 KASSERT(VM_PAGE_INQUEUE2(m, queue),
145                     ("vm_contig_launder: page %p's queue is not %d", m, queue));
146                 error = vm_contig_launder_page(m);
147                 if (error == 0)
148                         return (TRUE);
149                 if (error == EBUSY)
150                         return (FALSE);
151         }
152         return (FALSE);
153 }
154
155 /*
156  * This interface is for merging with malloc() someday.
157  * Even if we never implement compaction so that contiguous allocation
158  * works after initialization time, malloc()'s data structures are good
159  * for statistics and for allocations of less than a page.
160  */
161 static void *
162 contigmalloc1(
163         unsigned long size,     /* should be size_t here and for malloc() */
164         struct malloc_type *type,
165         int flags,
166         vm_paddr_t low,
167         vm_paddr_t high,
168         unsigned long alignment,
169         unsigned long boundary,
170         vm_map_t map)
171 {
172         int i, start;
173         vm_paddr_t phys;
174         vm_object_t object;
175         vm_offset_t addr, tmp_addr;
176         int pass, pqtype;
177         int inactl, actl, inactmax, actmax;
178         vm_page_t pga = vm_page_array;
179
180         size = round_page(size);
181         if (size == 0)
182                 panic("contigmalloc1: size must not be 0");
183         if ((alignment & (alignment - 1)) != 0)
184                 panic("contigmalloc1: alignment must be a power of 2");
185         if ((boundary & (boundary - 1)) != 0)
186                 panic("contigmalloc1: boundary must be a power of 2");
187
188         start = 0;
189         for (pass = 2; pass >= 0; pass--) {
190                 vm_page_lock_queues();
191 again0:
192                 mtx_lock_spin(&vm_page_queue_free_mtx);
193 again:
194                 /*
195                  * Find first page in array that is free, within range,
196                  * aligned, and such that the boundary won't be crossed.
197                  */
198                 for (i = start; i < cnt.v_page_count; i++) {
199                         phys = VM_PAGE_TO_PHYS(&pga[i]);
200                         pqtype = pga[i].queue - pga[i].pc;
201                         if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
202                             (phys >= low) && (phys < high) &&
203                             ((phys & (alignment - 1)) == 0) &&
204                             (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0))
205                                 break;
206                 }
207
208                 /*
209                  * If the above failed or we will exceed the upper bound, fail.
210                  */
211                 if ((i == cnt.v_page_count) ||
212                         ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {
213                         mtx_unlock_spin(&vm_page_queue_free_mtx);
214                         /*
215                          * Instead of racing to empty the inactive/active
216                          * queues, give up, even with more left to free,
217                          * if we try more than the initial amount of pages.
218                          *
219                          * There's no point attempting this on the last pass.
220                          */
221                         if (pass > 0) {
222                                 inactl = actl = 0;
223                                 inactmax = vm_page_queues[PQ_INACTIVE].lcnt;
224                                 actmax = vm_page_queues[PQ_ACTIVE].lcnt;
225 again1:
226                                 if (inactl < inactmax &&
227                                     vm_contig_launder(PQ_INACTIVE)) {
228                                         inactl++;
229                                         goto again1;
230                                 }
231                                 if (actl < actmax &&
232                                     vm_contig_launder(PQ_ACTIVE)) {
233                                         actl++;
234                                         goto again1;
235                                 }
236                         }
237                         vm_page_unlock_queues();
238                         continue;
239                 }
240                 start = i;
241
242                 /*
243                  * Check successive pages for contiguous and free.
244                  */
245                 for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
246                         pqtype = pga[i].queue - pga[i].pc;
247                         if ((VM_PAGE_TO_PHYS(&pga[i]) !=
248                             (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) ||
249                             ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) {
250                                 start++;
251                                 goto again;
252                         }
253                 }
254                 mtx_unlock_spin(&vm_page_queue_free_mtx);
255                 for (i = start; i < (start + size / PAGE_SIZE); i++) {
256                         vm_page_t m = &pga[i];
257
258                         if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) {
259                                 if (m->hold_count != 0) {
260                                         start++;
261                                         goto again0;
262                                 }
263                                 object = m->object;
264                                 if (!VM_OBJECT_TRYLOCK(object)) {
265                                         start++;
266                                         goto again0;
267                                 }
268                                 if ((m->flags & PG_BUSY) || m->busy != 0) {
269                                         VM_OBJECT_UNLOCK(object);
270                                         start++;
271                                         goto again0;
272                                 }
273                                 vm_page_free(m);
274                                 VM_OBJECT_UNLOCK(object);
275                         }
276                 }
277                 mtx_lock_spin(&vm_page_queue_free_mtx);
278                 for (i = start; i < (start + size / PAGE_SIZE); i++) {
279                         pqtype = pga[i].queue - pga[i].pc;
280                         if (pqtype != PQ_FREE) {
281                                 start++;
282                                 goto again;
283                         }
284                 }
285                 for (i = start; i < (start + size / PAGE_SIZE); i++) {
286                         vm_page_t m = &pga[i];
287                         vm_pageq_remove_nowakeup(m);
288                         m->valid = VM_PAGE_BITS_ALL;
289                         if (m->flags & PG_ZERO)
290                                 vm_page_zero_count--;
291                         /* Don't clear the PG_ZERO flag, we'll need it later. */
292                         m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
293                         KASSERT(m->dirty == 0,
294                             ("contigmalloc1: page %p was dirty", m));
295                         m->wire_count = 0;
296                         m->busy = 0;
297                 }
298                 mtx_unlock_spin(&vm_page_queue_free_mtx);
299                 vm_page_unlock_queues();
300                 /*
301                  * We've found a contiguous chunk that meets are requirements.
302                  * Allocate kernel VM, unfree and assign the physical pages to
303                  * it and return kernel VM pointer.
304                  */
305                 vm_map_lock(map);
306                 if (vm_map_findspace(map, vm_map_min(map), size, &addr) !=
307                     KERN_SUCCESS) {
308                         /*
309                          * XXX We almost never run out of kernel virtual
310                          * space, so we don't make the allocated memory
311                          * above available.
312                          */
313                         vm_map_unlock(map);
314                         return (NULL);
315                 }
316                 vm_object_reference(kernel_object);
317                 vm_map_insert(map, kernel_object, addr - VM_MIN_KERNEL_ADDRESS,
318                     addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0);
319                 vm_map_unlock(map);
320
321                 tmp_addr = addr;
322                 VM_OBJECT_LOCK(kernel_object);
323                 for (i = start; i < (start + size / PAGE_SIZE); i++) {
324                         vm_page_t m = &pga[i];
325                         vm_page_insert(m, kernel_object,
326                                 OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
327                         if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
328                                 pmap_zero_page(m);
329                         tmp_addr += PAGE_SIZE;
330                 }
331                 VM_OBJECT_UNLOCK(kernel_object);
332                 vm_map_wire(map, addr, addr + size,
333                     VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
334
335                 return ((void *)addr);
336         }
337         return (NULL);
338 }
339
340 static void
341 vm_page_release_contigl(vm_page_t m, vm_pindex_t count)
342 {
343         while (count--) {
344                 vm_page_free_toq(m);
345                 m++;
346         }
347 }
348
349 void
350 vm_page_release_contig(vm_page_t m, vm_pindex_t count)
351 {
352         vm_page_lock_queues();
353         vm_page_release_contigl(m, count);
354         vm_page_unlock_queues();
355 }
356
357 static int
358 vm_contig_unqueue_free(vm_page_t m)
359 {
360         int error = 0;
361
362         mtx_lock_spin(&vm_page_queue_free_mtx);
363         if ((m->queue - m->pc) == PQ_FREE)
364                 vm_pageq_remove_nowakeup(m);
365         else
366                 error = EAGAIN;
367         mtx_unlock_spin(&vm_page_queue_free_mtx);
368         if (error)
369                 return (error);
370         m->valid = VM_PAGE_BITS_ALL;
371         if (m->flags & PG_ZERO)
372                 vm_page_zero_count--;
373         /* Don't clear the PG_ZERO flag; we'll need it later. */
374         m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
375         KASSERT(m->dirty == 0,
376             ("contigmalloc2: page %p was dirty", m));
377         m->wire_count = 0;
378         m->busy = 0;
379         return (error);
380 }
381
382 vm_page_t
383 vm_page_alloc_contig(vm_pindex_t npages, vm_paddr_t low, vm_paddr_t high,
384             vm_offset_t alignment, vm_offset_t boundary)
385 {
386         vm_object_t object;
387         vm_offset_t size;
388         vm_paddr_t phys;
389         vm_page_t pga = vm_page_array;
390         static vm_pindex_t np = 0;
391         static vm_pindex_t start = 0;
392         int i, pass, pqtype;
393
394         size = npages << PAGE_SHIFT;
395         if (size == 0)
396                 panic("vm_page_alloc_contig: size must not be 0");
397         if ((alignment & (alignment - 1)) != 0)
398                 panic("vm_page_alloc_contig: alignment must be a power of 2");
399         if ((boundary & (boundary - 1)) != 0)
400                 panic("vm_page_alloc_contig: boundary must be a power of 2");
401
402         /*
403          * Two simple optimizations.  First, don't scan high ordered pages
404          * if they are outside of the requested address range.  Second, cache
405          * the starting page index across calls and reuse it instead of
406          * restarting the scan from the top.  This is conditional on the
407          * requested number of pages being the same or greater than the
408          * cached amount.
409          */
410         for (pass = 0; pass < 2; pass++) {
411                 if ((np == 0) || (np > npages)) {
412                         if (atop(high) < vm_page_array_size)
413                                 start = atop(high) - npages + 1;
414                         else
415                                 start = vm_page_array_size - npages + 1;
416                 }
417                 np = 0;
418                 vm_page_lock_queues();
419 retry:
420                 start--;
421                 /*
422                  * Find last page in array that is free, within range,
423                  * aligned, and such that the boundary won't be crossed.
424                  */
425                 for (i = start; i >= 0; i--) {
426                         phys = VM_PAGE_TO_PHYS(&pga[i]);
427                         pqtype = pga[i].queue - pga[i].pc;
428                         if (pass == 0) {
429                                 if (pqtype != PQ_FREE && pqtype != PQ_CACHE)
430                                         continue;
431                         } else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
432                                     pga[i].queue != PQ_ACTIVE &&
433                                     pga[i].queue != PQ_INACTIVE)
434                                 continue;
435                         if (phys >= low && phys + size <= high &&
436                             ((phys & (alignment - 1)) == 0) &&
437                             ((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)
438                                 break;
439                 }
440                 /* There are no candidates at all. */
441                 if (i == -1) {
442                         vm_page_unlock_queues();
443                         continue;
444                 }
445                 start = i;
446                 /*
447                  * Check successive pages for contiguous and free.
448                  */
449                 for (i = start + npages - 1; i > start; i--) {
450                         pqtype = pga[i].queue - pga[i].pc;
451                         if (VM_PAGE_TO_PHYS(&pga[i]) !=
452                             VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE) {
453                                 start = i - npages + 1;
454                                 goto retry;
455                         }
456                         if (pass == 0) {
457                                 if (pqtype != PQ_FREE && pqtype != PQ_CACHE) {
458                                         start = i - npages + 1;
459                                         goto retry;
460                                 }
461                         } else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
462                                     pga[i].queue != PQ_ACTIVE &&
463                                     pga[i].queue != PQ_INACTIVE) {
464                                 start = i - npages + 1;
465                                 goto retry;
466                         }
467                 }
468                 for (i = start + npages - 1; i >= start; i--) {
469                         vm_page_t m = &pga[i];
470
471 retry_page:
472                         pqtype = m->queue - m->pc;
473                         if (pass != 0 && pqtype != PQ_FREE &&
474                             pqtype != PQ_CACHE) {
475                                 if (m->queue == PQ_ACTIVE ||
476                                     m->queue == PQ_INACTIVE) {
477                                         if (vm_contig_launder_page(m) != 0)
478                                                 goto cleanup_freed;
479                                         pqtype = m->queue - m->pc;
480                                         if (pqtype != PQ_FREE &&
481                                             pqtype != PQ_CACHE)
482                                                 goto cleanup_freed;
483                                 } else {
484 cleanup_freed:
485                                         vm_page_release_contigl(&pga[i + 1],
486                                             start + npages - 1 - i);
487                                         start = i - npages + 1;
488                                         goto retry;
489                                 }
490                         }
491                         if (pqtype == PQ_CACHE) {
492                                 if (m->hold_count != 0)
493                                         goto cleanup_freed;
494                                 object = m->object;
495                                 if (!VM_OBJECT_TRYLOCK(object))
496                                         goto cleanup_freed;
497                                 if ((m->flags & PG_BUSY) || m->busy != 0) {
498                                         VM_OBJECT_UNLOCK(object);
499                                         goto cleanup_freed;
500                                 }
501                                 vm_page_free(m);
502                                 VM_OBJECT_UNLOCK(object);
503                         }
504                         /*
505                          * There is no good API for freeing a page
506                          * directly to PQ_NONE on our behalf, so spin.
507                          */
508                         if (vm_contig_unqueue_free(m) != 0)
509                                 goto retry_page;
510                 }
511                 vm_page_unlock_queues();
512                 /*
513                  * We've found a contiguous chunk that meets are requirements.
514                  */
515                 np = npages;
516                 return (&pga[start]);
517         }
518         return (NULL);
519 }
520
521 static void *
522 contigmalloc2(vm_page_t m, vm_pindex_t npages, int flags)
523 {
524         vm_object_t object = kernel_object;
525         vm_map_t map = kernel_map;
526         vm_offset_t addr, tmp_addr;
527         vm_pindex_t i;
528  
529         /*
530          * Allocate kernel VM, unfree and assign the physical pages to
531          * it and return kernel VM pointer.
532          */
533         vm_map_lock(map);
534         if (vm_map_findspace(map, vm_map_min(map), npages << PAGE_SHIFT, &addr)
535             != KERN_SUCCESS) {
536                 vm_map_unlock(map);
537                 return (NULL);
538         }
539         vm_object_reference(object);
540         vm_map_insert(map, object, addr - VM_MIN_KERNEL_ADDRESS,
541             addr, addr + (npages << PAGE_SHIFT), VM_PROT_ALL, VM_PROT_ALL, 0);
542         vm_map_unlock(map);
543         tmp_addr = addr;
544         VM_OBJECT_LOCK(object);
545         for (i = 0; i < npages; i++) {
546                 vm_page_insert(&m[i], object,
547                     OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
548                 if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
549                         pmap_zero_page(&m[i]);
550                 tmp_addr += PAGE_SIZE;
551         }
552         VM_OBJECT_UNLOCK(object);
553         vm_map_wire(map, addr, addr + (npages << PAGE_SHIFT),
554             VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
555         return ((void *)addr);
556 }
557
558 static int vm_old_contigmalloc = 0;
559 SYSCTL_INT(_vm, OID_AUTO, old_contigmalloc,
560     CTLFLAG_RW, &vm_old_contigmalloc, 0, "Use the old contigmalloc algorithm");
561 TUNABLE_INT("vm.old_contigmalloc", &vm_old_contigmalloc);
562
563 void *
564 contigmalloc(
565         unsigned long size,     /* should be size_t here and for malloc() */
566         struct malloc_type *type,
567         int flags,
568         vm_paddr_t low,
569         vm_paddr_t high,
570         unsigned long alignment,
571         unsigned long boundary)
572 {
573         void * ret;
574         vm_page_t pages;
575         vm_pindex_t npgs;
576
577         npgs = round_page(size) >> PAGE_SHIFT;
578         mtx_lock(&Giant);
579         if (vm_old_contigmalloc) {
580                 ret = contigmalloc1(size, type, flags, low, high, alignment,
581                     boundary, kernel_map);
582         } else {
583                 pages = vm_page_alloc_contig(npgs, low, high,
584                     alignment, boundary);
585                 if (pages == NULL) {
586                         ret = NULL;
587                 } else {
588                         ret = contigmalloc2(pages, npgs, flags);
589                         if (ret == NULL)
590                                 vm_page_release_contig(pages, npgs);
591                 }
592                 
593         }
594         mtx_unlock(&Giant);
595         malloc_type_allocated(type, ret == NULL ? 0 : npgs << PAGE_SHIFT);
596         return (ret);
597 }
598
599 void
600 contigfree(void *addr, unsigned long size, struct malloc_type *type)
601 {
602         vm_pindex_t npgs;
603
604         npgs = round_page(size) >> PAGE_SHIFT;
605         kmem_free(kernel_map, (vm_offset_t)addr, size);
606         malloc_type_freed(type, npgs << PAGE_SHIFT);
607 }