[FreeBSD/stable/8.git] / sys / vm / vm_object.c
1 /*-
2  * Copyright (c) 1991, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *      from: @(#)vm_object.c   8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  */
60
61 /*
62  *      Virtual memory object module.
63  */
64
65 #include <sys/cdefs.h>
66 __FBSDID("$FreeBSD$");
67
68 #include "opt_vm.h"
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/lock.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/mutex.h>
78 #include <sys/proc.h>           /* for curproc, pageproc */
79 #include <sys/socket.h>
80 #include <sys/resourcevar.h>
81 #include <sys/vnode.h>
82 #include <sys/vmmeter.h>
83 #include <sys/sx.h>
84
85 #include <vm/vm.h>
86 #include <vm/vm_param.h>
87 #include <vm/pmap.h>
88 #include <vm/vm_map.h>
89 #include <vm/vm_object.h>
90 #include <vm/vm_page.h>
91 #include <vm/vm_pageout.h>
92 #include <vm/vm_pager.h>
93 #include <vm/swap_pager.h>
94 #include <vm/vm_kern.h>
95 #include <vm/vm_extern.h>
96 #include <vm/vm_reserv.h>
97 #include <vm/uma.h>
98
99 #define EASY_SCAN_FACTOR       8
100
101 #define MSYNC_FLUSH_HARDSEQ     0x01
102 #define MSYNC_FLUSH_SOFTSEQ     0x02
103
104 /*
105  * msync / VM object flushing optimizations
106  */
107 static int msync_flush_flags = MSYNC_FLUSH_HARDSEQ | MSYNC_FLUSH_SOFTSEQ;
108 SYSCTL_INT(_vm, OID_AUTO, msync_flush_flags, CTLFLAG_RW, &msync_flush_flags, 0,
109     "Enable sequential iteration optimization");
110
111 static int old_msync;
112 SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0,
113     "Use old (insecure) msync behavior");
114
115 static void     vm_object_qcollapse(vm_object_t object);
116 static int      vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags);
117 static void     vm_object_vndeallocate(vm_object_t object);
118
119 /*
120  *      Virtual memory objects maintain the actual data
121  *      associated with allocated virtual memory.  A given
122  *      page of memory exists within exactly one object.
123  *
124  *      An object is only deallocated when all "references"
125  *      are given up.  Only one "reference" to a given
126  *      region of an object should be writeable.
127  *
128  *      Associated with each object is a list of all resident
129  *      memory pages belonging to that object; this list is
130  *      maintained by the "vm_page" module, and locked by the object's
131  *      lock.
132  *
133  *      Each object also records a "pager" routine which is
134  *      used to retrieve (and store) pages to the proper backing
135  *      storage.  In addition, objects may be backed by other
136  *      objects from which they were virtual-copied.
137  *
138  *      The only items within the object structure which are
139  *      modified after time of creation are:
140  *              reference count         locked by object's lock
141  *              pager routine           locked by object's lock
142  *
143  */
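
/*
 * For orientation, a minimal sketch of the object life cycle implemented by
 * the routines below; "len" is a placeholder byte length and error handling
 * is omitted (illustrative only, not part of the interface contract):
 *
 *      vm_object_t obj;
 *
 *      obj = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(len));
 *      vm_object_reference(obj);       (e.g. a second mapping takes a ref)
 *      ...
 *      vm_object_deallocate(obj);      (drop the extra reference)
 *      vm_object_deallocate(obj);      (last ref; the object is terminated)
 */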
144
145 struct object_q vm_object_list;
146 struct mtx vm_object_list_mtx;  /* lock for object list and count */
147
148 struct vm_object kernel_object_store;
149 struct vm_object kmem_object_store;
150
151 SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats");
152
153 static long object_collapses;
154 SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD,
155     &object_collapses, 0, "VM object collapses");
156
157 static long object_bypasses;
158 SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
159     &object_bypasses, 0, "VM object bypasses");
160
161 static uma_zone_t obj_zone;
162
163 static int vm_object_zinit(void *mem, int size, int flags);
164
165 #ifdef INVARIANTS
166 static void vm_object_zdtor(void *mem, int size, void *arg);
167
168 static void
169 vm_object_zdtor(void *mem, int size, void *arg)
170 {
171         vm_object_t object;
172
173         object = (vm_object_t)mem;
174         KASSERT(TAILQ_EMPTY(&object->memq),
175             ("object %p has resident pages",
176             object));
177 #if VM_NRESERVLEVEL > 0
178         KASSERT(LIST_EMPTY(&object->rvq),
179             ("object %p has reservations",
180             object));
181 #endif
182         KASSERT(object->cache == NULL,
183             ("object %p has cached pages",
184             object));
185         KASSERT(object->paging_in_progress == 0,
186             ("object %p paging_in_progress = %d",
187             object, object->paging_in_progress));
188         KASSERT(object->resident_page_count == 0,
189             ("object %p resident_page_count = %d",
190             object, object->resident_page_count));
191         KASSERT(object->shadow_count == 0,
192             ("object %p shadow_count = %d",
193             object, object->shadow_count));
194 }
195 #endif
196
197 static int
198 vm_object_zinit(void *mem, int size, int flags)
199 {
200         vm_object_t object;
201
202         object = (vm_object_t)mem;
203         bzero(&object->mtx, sizeof(object->mtx));
204         VM_OBJECT_LOCK_INIT(object, "standard object");
205
206         /* These are true for any object that has been freed */
207         object->paging_in_progress = 0;
208         object->resident_page_count = 0;
209         object->shadow_count = 0;
210         return (0);
211 }
212
213 void
214 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
215 {
216
217         TAILQ_INIT(&object->memq);
218         LIST_INIT(&object->shadow_head);
219
220         object->root = NULL;
221         object->type = type;
222         object->size = size;
223         object->generation = 1;
224         object->ref_count = 1;
225         object->memattr = VM_MEMATTR_DEFAULT;
226         object->flags = 0;
227         object->uip = NULL;
228         object->charge = 0;
229         if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
230                 object->flags = OBJ_ONEMAPPING;
231         object->pg_color = 0;
232         object->handle = NULL;
233         object->backing_object = NULL;
234         object->backing_object_offset = (vm_ooffset_t) 0;
235 #if VM_NRESERVLEVEL > 0
236         LIST_INIT(&object->rvq);
237 #endif
238         object->cache = NULL;
239
240         mtx_lock(&vm_object_list_mtx);
241         TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
242         mtx_unlock(&vm_object_list_mtx);
243 }
244
245 /*
246  *      vm_object_init:
247  *
248  *      Initialize the VM objects module.
249  */
250 void
251 vm_object_init(void)
252 {
253         TAILQ_INIT(&vm_object_list);
254         mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
255         
256         VM_OBJECT_LOCK_INIT(&kernel_object_store, "kernel object");
257         _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
258             kernel_object);
259 #if VM_NRESERVLEVEL > 0
260         kernel_object->flags |= OBJ_COLORED;
261         kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
262 #endif
263
264         VM_OBJECT_LOCK_INIT(&kmem_object_store, "kmem object");
265         _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
266             kmem_object);
267 #if VM_NRESERVLEVEL > 0
268         kmem_object->flags |= OBJ_COLORED;
269         kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
270 #endif
271
272         /*
273          * The lock portion of struct vm_object must be type stable due
274          * to vm_pageout_fallback_object_lock locking a vm object
275          * without holding any references to it.
276          */
277         obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
278 #ifdef INVARIANTS
279             vm_object_zdtor,
280 #else
281             NULL,
282 #endif
283             vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
284 }
285
286 void
287 vm_object_clear_flag(vm_object_t object, u_short bits)
288 {
289
290         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
291         object->flags &= ~bits;
292 }
293
294 /*
295  *      Sets the default memory attribute for the specified object.  Pages
296  *      that are allocated to this object are by default assigned this memory
297  *      attribute.
298  *
299  *      Presently, this function must be called before any pages are allocated
300  *      to the object.  In the future, this requirement may be relaxed for
301  *      "default" and "swap" objects.
302  */
303 int
304 vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr)
305 {
306
307         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
308         switch (object->type) {
309         case OBJT_DEFAULT:
310         case OBJT_DEVICE:
311         case OBJT_PHYS:
312         case OBJT_SG:
313         case OBJT_SWAP:
314         case OBJT_VNODE:
315                 if (!TAILQ_EMPTY(&object->memq))
316                         return (KERN_FAILURE);
317                 break;
318         case OBJT_DEAD:
319                 return (KERN_INVALID_ARGUMENT);
320         }
321         object->memattr = memattr;
322         return (KERN_SUCCESS);
323 }
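
/*
 * A minimal usage sketch, assuming a freshly allocated object that has no
 * resident pages yet; VM_MEMATTR_UNCACHEABLE is an assumed machine-dependent
 * example value, not a requirement of this interface:
 *
 *      VM_OBJECT_LOCK(object);
 *      if (vm_object_set_memattr(object, VM_MEMATTR_UNCACHEABLE) !=
 *          KERN_SUCCESS)
 *              ... the object already has pages or is dead ...
 *      VM_OBJECT_UNLOCK(object);
 */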
324
325 void
326 vm_object_pip_add(vm_object_t object, short i)
327 {
328
329         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
330         object->paging_in_progress += i;
331 }
332
333 void
334 vm_object_pip_subtract(vm_object_t object, short i)
335 {
336
337         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
338         object->paging_in_progress -= i;
339 }
340
341 void
342 vm_object_pip_wakeup(vm_object_t object)
343 {
344
345         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
346         object->paging_in_progress--;
347         if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
348                 vm_object_clear_flag(object, OBJ_PIPWNT);
349                 wakeup(object);
350         }
351 }
352
353 void
354 vm_object_pip_wakeupn(vm_object_t object, short i)
355 {
356
357         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
358         if (i)
359                 object->paging_in_progress -= i;
360         if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
361                 vm_object_clear_flag(object, OBJ_PIPWNT);
362                 wakeup(object);
363         }
364 }
365
366 void
367 vm_object_pip_wait(vm_object_t object, char *waitid)
368 {
369
370         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
371         while (object->paging_in_progress) {
372                 object->flags |= OBJ_PIPWNT;
373                 msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0);
374         }
375 }
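
/*
 * A minimal sketch of the paging-in-progress protocol, assuming some pager
 * operation on "object" (the I/O itself is elided).  The pip count taken by
 * vm_object_pip_add() keeps vm_object_pip_wait() callers, such as
 * vm_object_terminate(), blocked until the operation completes:
 *
 *      VM_OBJECT_LOCK(object);
 *      vm_object_pip_add(object, 1);
 *      VM_OBJECT_UNLOCK(object);
 *      ... perform the paging operation ...
 *      VM_OBJECT_LOCK(object);
 *      vm_object_pip_wakeup(object);
 *      VM_OBJECT_UNLOCK(object);
 */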
376
377 /*
378  *      vm_object_allocate:
379  *
380  *      Returns a new object with the given size.
381  */
382 vm_object_t
383 vm_object_allocate(objtype_t type, vm_pindex_t size)
384 {
385         vm_object_t object;
386
387         object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK);
388         _vm_object_allocate(type, size, object);
389         return (object);
390 }
391
392
393 /*
394  *      vm_object_reference:
395  *
396  *      Gets another reference to the given object.  Note: OBJ_DEAD
397  *      objects can be referenced during final cleaning.
398  */
399 void
400 vm_object_reference(vm_object_t object)
401 {
402         if (object == NULL)
403                 return;
404         VM_OBJECT_LOCK(object);
405         vm_object_reference_locked(object);
406         VM_OBJECT_UNLOCK(object);
407 }
408
409 /*
410  *      vm_object_reference_locked:
411  *
412  *      Gets another reference to the given object.
413  *
414  *      The object must be locked.
415  */
416 void
417 vm_object_reference_locked(vm_object_t object)
418 {
419         struct vnode *vp;
420
421         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
422         object->ref_count++;
423         if (object->type == OBJT_VNODE) {
424                 vp = object->handle;
425                 vref(vp);
426         }
427 }
428
429 /*
430  * Handle deallocating an object of type OBJT_VNODE.
431  */
432 static void
433 vm_object_vndeallocate(vm_object_t object)
434 {
435         struct vnode *vp = (struct vnode *) object->handle;
436
437         VFS_ASSERT_GIANT(vp->v_mount);
438         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
439         KASSERT(object->type == OBJT_VNODE,
440             ("vm_object_vndeallocate: not a vnode object"));
441         KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
442 #ifdef INVARIANTS
443         if (object->ref_count == 0) {
444                 vprint("vm_object_vndeallocate", vp);
445                 panic("vm_object_vndeallocate: bad object reference count");
446         }
447 #endif
448
449         object->ref_count--;
450         if (object->ref_count == 0) {
451                 mp_fixme("Unlocked vflag access.");
452                 vp->v_vflag &= ~VV_TEXT;
453         }
454         VM_OBJECT_UNLOCK(object);
455         /*
456          * vrele may need a vop lock
457          */
458         vrele(vp);
459 }
460
461 /*
462  *      vm_object_deallocate:
463  *
464  *      Release a reference to the specified object,
465  *      gained either through a vm_object_allocate
466  *      or a vm_object_reference call.  When all references
467  *      are gone, storage associated with this object
468  *      may be relinquished.
469  *
470  *      No object may be locked.
471  */
472 void
473 vm_object_deallocate(vm_object_t object)
474 {
475         vm_object_t temp;
476
477         while (object != NULL) {
478                 int vfslocked;
479
480                 vfslocked = 0;
481         restart:
482                 VM_OBJECT_LOCK(object);
483                 if (object->type == OBJT_VNODE) {
484                         struct vnode *vp = (struct vnode *) object->handle;
485
486                         /*
487                          * Conditionally acquire Giant for a vnode-backed
488                          * object.  We have to be careful since the type of
489                          * a vnode object can change while the object is
490                          * unlocked.
491                          */
492                         if (VFS_NEEDSGIANT(vp->v_mount) && !vfslocked) {
493                                 vfslocked = 1;
494                                 if (!mtx_trylock(&Giant)) {
495                                         VM_OBJECT_UNLOCK(object);
496                                         mtx_lock(&Giant);
497                                         goto restart;
498                                 }
499                         }
500                         vm_object_vndeallocate(object);
501                         VFS_UNLOCK_GIANT(vfslocked);
502                         return;
503                 } else
504                         /*
505                          * This is to handle the case that the object
506                          * changed type while we dropped its lock to
507                          * obtain Giant.
508                          */
509                         VFS_UNLOCK_GIANT(vfslocked);
510
511                 KASSERT(object->ref_count != 0,
512                         ("vm_object_deallocate: object deallocated too many times: %d", object->type));
513
514                 /*
515                  * If the reference count goes to 0 we start calling
516                  * vm_object_terminate() on the object chain.
517                  * A ref count of 1 may be a special case depending on the
518                  * shadow count being 0 or 1.
519                  */
520                 object->ref_count--;
521                 if (object->ref_count > 1) {
522                         VM_OBJECT_UNLOCK(object);
523                         return;
524                 } else if (object->ref_count == 1) {
525                         if (object->shadow_count == 0 &&
526                             object->handle == NULL &&
527                             (object->type == OBJT_DEFAULT ||
528                              object->type == OBJT_SWAP)) {
529                                 vm_object_set_flag(object, OBJ_ONEMAPPING);
530                         } else if ((object->shadow_count == 1) &&
531                             (object->handle == NULL) &&
532                             (object->type == OBJT_DEFAULT ||
533                              object->type == OBJT_SWAP)) {
534                                 vm_object_t robject;
535
536                                 robject = LIST_FIRST(&object->shadow_head);
537                                 KASSERT(robject != NULL,
538                                     ("vm_object_deallocate: ref_count: %d, shadow_count: %d",
539                                          object->ref_count,
540                                          object->shadow_count));
541                                 if (!VM_OBJECT_TRYLOCK(robject)) {
542                                         /*
543                                          * Avoid a potential deadlock.
544                                          */
545                                         object->ref_count++;
546                                         VM_OBJECT_UNLOCK(object);
547                                         /*
548                                          * More likely than not the thread
549                                          * holding robject's lock has lower
550                                          * priority than the current thread.
551                                          * Let the lower priority thread run.
552                                          */
553                                         pause("vmo_de", 1);
554                                         continue;
555                                 }
556                                 /*
557                                  * Collapse object into its shadow unless its
558                                  * shadow is dead.  In that case, object will
559                                  * be deallocated by the thread that is
560                                  * deallocating its shadow.
561                                  */
562                                 if ((robject->flags & OBJ_DEAD) == 0 &&
563                                     (robject->handle == NULL) &&
564                                     (robject->type == OBJT_DEFAULT ||
565                                      robject->type == OBJT_SWAP)) {
566
567                                         robject->ref_count++;
568 retry:
569                                         if (robject->paging_in_progress) {
570                                                 VM_OBJECT_UNLOCK(object);
571                                                 vm_object_pip_wait(robject,
572                                                     "objde1");
573                                                 temp = robject->backing_object;
574                                                 if (object == temp) {
575                                                         VM_OBJECT_LOCK(object);
576                                                         goto retry;
577                                                 }
578                                         } else if (object->paging_in_progress) {
579                                                 VM_OBJECT_UNLOCK(robject);
580                                                 object->flags |= OBJ_PIPWNT;
581                                                 msleep(object,
582                                                     VM_OBJECT_MTX(object),
583                                                     PDROP | PVM, "objde2", 0);
584                                                 VM_OBJECT_LOCK(robject);
585                                                 temp = robject->backing_object;
586                                                 if (object == temp) {
587                                                         VM_OBJECT_LOCK(object);
588                                                         goto retry;
589                                                 }
590                                         } else
591                                                 VM_OBJECT_UNLOCK(object);
592
593                                         if (robject->ref_count == 1) {
594                                                 robject->ref_count--;
595                                                 object = robject;
596                                                 goto doterm;
597                                         }
598                                         object = robject;
599                                         vm_object_collapse(object);
600                                         VM_OBJECT_UNLOCK(object);
601                                         continue;
602                                 }
603                                 VM_OBJECT_UNLOCK(robject);
604                         }
605                         VM_OBJECT_UNLOCK(object);
606                         return;
607                 }
608 doterm:
609                 temp = object->backing_object;
610                 if (temp != NULL) {
611                         VM_OBJECT_LOCK(temp);
612                         LIST_REMOVE(object, shadow_list);
613                         temp->shadow_count--;
614                         temp->generation++;
615                         VM_OBJECT_UNLOCK(temp);
616                         object->backing_object = NULL;
617                 }
618                 /*
619                  * Don't double-terminate, we could be in a termination
620                  * recursion due to the terminate having to sync data
621                  * to disk.
622                  */
623                 if ((object->flags & OBJ_DEAD) == 0)
624                         vm_object_terminate(object);
625                 else
626                         VM_OBJECT_UNLOCK(object);
627                 object = temp;
628         }
629 }
630
631 /*
632  *      vm_object_destroy removes the object from the global object list
633  *      and frees the space for the object.
634  */
635 void
636 vm_object_destroy(vm_object_t object)
637 {
638
639         /*
640          * Remove the object from the global object list.
641          */
642         mtx_lock(&vm_object_list_mtx);
643         TAILQ_REMOVE(&vm_object_list, object, object_list);
644         mtx_unlock(&vm_object_list_mtx);
645
646         /*
647          * Release the allocation charge.
648          */
649         if (object->uip != NULL) {
650                 KASSERT(object->type == OBJT_DEFAULT ||
651                     object->type == OBJT_SWAP,
652                     ("vm_object_terminate: non-swap obj %p has uip",
653                      object));
654                 swap_release_by_uid(object->charge, object->uip);
655                 object->charge = 0;
656                 uifree(object->uip);
657                 object->uip = NULL;
658         }
659
660         /*
661          * Free the space for the object.
662          */
663         uma_zfree(obj_zone, object);
664 }
665
666 /*
667  *      vm_object_terminate actually destroys the specified object, freeing
668  *      up all previously used resources.
669  *
670  *      The object must be locked.
671  *      This routine may block.
672  */
673 void
674 vm_object_terminate(vm_object_t object)
675 {
676         vm_page_t p;
677
678         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
679
680         /*
681          * Make sure no one uses us.
682          */
683         vm_object_set_flag(object, OBJ_DEAD);
684
685         /*
686          * wait for the pageout daemon to be done with the object
687          */
688         vm_object_pip_wait(object, "objtrm");
689
690         KASSERT(!object->paging_in_progress,
691                 ("vm_object_terminate: pageout in progress"));
692
693         /*
694          * Clean and free the pages, as appropriate. All references to the
695          * object are gone, so we don't need to lock it.
696          */
697         if (object->type == OBJT_VNODE) {
698                 struct vnode *vp = (struct vnode *)object->handle;
699
700                 /*
701                  * Clean pages and flush buffers.
702                  */
703                 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
704                 VM_OBJECT_UNLOCK(object);
705
706                 vinvalbuf(vp, V_SAVE, 0, 0);
707
708                 VM_OBJECT_LOCK(object);
709         }
710
711         KASSERT(object->ref_count == 0, 
712                 ("vm_object_terminate: object with references, ref_count=%d",
713                 object->ref_count));
714
715         /*
716          * Now free any remaining pages. For internal objects, this also
717          * removes them from paging queues. Don't free wired pages, just
718          * remove them from the object. 
719          */
720         vm_page_lock_queues();
721         while ((p = TAILQ_FIRST(&object->memq)) != NULL) {
722                 KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
723                         ("vm_object_terminate: freeing busy page %p "
724                         "p->busy = %d, p->oflags %x\n", p, p->busy, p->oflags));
725                 if (p->wire_count == 0) {
726                         vm_page_free(p);
727                         cnt.v_pfree++;
728                 } else {
729                         vm_page_remove(p);
730                 }
731         }
732         vm_page_unlock_queues();
733
734 #if VM_NRESERVLEVEL > 0
735         if (__predict_false(!LIST_EMPTY(&object->rvq)))
736                 vm_reserv_break_all(object);
737 #endif
738         if (__predict_false(object->cache != NULL))
739                 vm_page_cache_free(object, 0, 0);
740
741         /*
742          * Let the pager know object is dead.
743          */
744         vm_pager_deallocate(object);
745         VM_OBJECT_UNLOCK(object);
746
747         vm_object_destroy(object);
748 }
749
750 /*
751  *      vm_object_page_clean
752  *
753  *      Clean all dirty pages in the specified range of object.  Leaves page 
754  *      on whatever queue it is currently on.   If NOSYNC is set then do not
755  *      write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC),
756  *      leaving the object dirty.
757  *
758  *      When stuffing pages asynchronously, allow clustering.  XXX we need a
759  *      synchronous clustering mode implementation.
760  *
761  *      Odd semantics: if end == 0, we clean from start to the end of the object.
762  *
763  *      The object must be locked.
764  */
765 void
766 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int flags)
767 {
768         vm_page_t p, np;
769         vm_pindex_t tstart, tend;
770         vm_pindex_t pi;
771         int clearobjflags;
772         int pagerflags;
773         int curgeneration;
774
775         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
776         if ((object->flags & OBJ_MIGHTBEDIRTY) == 0)
777                 return;
778         KASSERT(object->type == OBJT_VNODE, ("Not a vnode object"));
779
780         pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
781         pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
782
783         vm_object_set_flag(object, OBJ_CLEANING);
784
785         tstart = start;
786         if (end == 0) {
787                 tend = object->size;
788         } else {
789                 tend = end;
790         }
791
792         vm_page_lock_queues();
793         /*
794          * If the caller is smart and only msync()s a range he knows is
795          * dirty, we may be able to avoid an object scan.  This results in
796  * a phenomenal improvement in performance.  We cannot do this
797  * as a matter of course because the object may be huge - e.g.
798  * the size might be in the gigabytes or terabytes.
799          */
800         if (msync_flush_flags & MSYNC_FLUSH_HARDSEQ) {
801                 vm_pindex_t tscan;
802                 int scanlimit;
803                 int scanreset;
804
805                 scanreset = object->resident_page_count / EASY_SCAN_FACTOR;
806                 if (scanreset < 16)
807                         scanreset = 16;
808                 pagerflags |= VM_PAGER_IGNORE_CLEANCHK;
809
810                 scanlimit = scanreset;
811                 tscan = tstart;
812                 while (tscan < tend) {
813                         curgeneration = object->generation;
814                         p = vm_page_lookup(object, tscan);
815                         if (p == NULL || p->valid == 0) {
816                                 if (--scanlimit == 0)
817                                         break;
818                                 ++tscan;
819                                 continue;
820                         }
821                         vm_page_test_dirty(p);
822                         if (p->dirty == 0) {
823                                 if (--scanlimit == 0)
824                                         break;
825                                 ++tscan;
826                                 continue;
827                         }
828                         /*
829                          * If we have been asked to skip nosync pages and 
830                          * this is a nosync page, we can't continue.
831                          */
832                         if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
833                                 if (--scanlimit == 0)
834                                         break;
835                                 ++tscan;
836                                 continue;
837                         }
838                         scanlimit = scanreset;
839
840                         /*
841                          * This returns 0 if it was unable to busy the first
842                          * page (i.e. had to sleep).
843                          */
844                         tscan += vm_object_page_collect_flush(object, p, curgeneration, pagerflags);
845                 }
846
847                 /*
848                  * If everything was dirty and we flushed it successfully,
849                  * and the requested range is not the entire object, we
850                  * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can
851                  * return immediately.
852                  */
853                 if (tscan >= tend && (tstart || tend < object->size)) {
854                         vm_page_unlock_queues();
855                         vm_object_clear_flag(object, OBJ_CLEANING);
856                         return;
857                 }
858                 pagerflags &= ~VM_PAGER_IGNORE_CLEANCHK;
859         }
860
861         /*
862          * Generally set CLEANCHK interlock and make the page read-only so
863          * we can then clear the object flags.
864          *
865          * However, if this is a nosync mmap then the object is likely to 
866          * stay dirty so do not mess with the page and do not clear the
867          * object flags.
868          */
869         clearobjflags = 1;
870         TAILQ_FOREACH(p, &object->memq, listq) {
871                 p->oflags |= VPO_CLEANCHK;
872                 if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC))
873                         clearobjflags = 0;
874                 else
875                         pmap_remove_write(p);
876         }
877
878         if (clearobjflags && (tstart == 0) && (tend == object->size))
879                 vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY);
880
881 rescan:
882         curgeneration = object->generation;
883
884         for (p = TAILQ_FIRST(&object->memq); p; p = np) {
885                 int n;
886
887                 np = TAILQ_NEXT(p, listq);
888
889 again:
890                 pi = p->pindex;
891                 if ((p->oflags & VPO_CLEANCHK) == 0 ||
892                         (pi < tstart) || (pi >= tend) ||
893                     p->valid == 0) {
894                         p->oflags &= ~VPO_CLEANCHK;
895                         continue;
896                 }
897
898                 vm_page_test_dirty(p);
899                 if (p->dirty == 0) {
900                         p->oflags &= ~VPO_CLEANCHK;
901                         continue;
902                 }
903
904                 /*
905                  * If we have been asked to skip nosync pages and this is a
906                  * nosync page, skip it.  Note that the object flags were
907                  * not cleared in this case so we do not have to set them.
908                  */
909                 if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
910                         p->oflags &= ~VPO_CLEANCHK;
911                         continue;
912                 }
913
914                 n = vm_object_page_collect_flush(object, p,
915                         curgeneration, pagerflags);
916                 if (n == 0)
917                         goto rescan;
918
919                 if (object->generation != curgeneration)
920                         goto rescan;
921
922                 /*
923                  * Try to optimize the next page.  If we can't we pick up
924                  * our (random) scan where we left off.
925                  */
926                 if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQ) {
927                         if ((p = vm_page_lookup(object, pi + n)) != NULL)
928                                 goto again;
929                 }
930         }
931         vm_page_unlock_queues();
932 #if 0
933         VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc);
934 #endif
935
936         vm_object_clear_flag(object, OBJ_CLEANING);
937         return;
938 }
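
/*
 * A minimal calling sketch, assuming the object lock is held and the vnode
 * has been locked by the caller (as vm_object_sync() below arranges).  The
 * start/end arguments are page indices, so byte ranges are converted with
 * OFF_TO_IDX; passing start == end == 0 cleans the entire object:
 *
 *      VM_OBJECT_LOCK(object);
 *      vm_object_page_clean(object, OFF_TO_IDX(off),
 *          OFF_TO_IDX(off + len + PAGE_MASK), OBJPC_SYNC);
 *      VM_OBJECT_UNLOCK(object);
 */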
939
940 static int
941 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags)
942 {
943         int runlen;
944         int maxf;
945         int chkb;
946         int maxb;
947         int i;
948         vm_pindex_t pi;
949         vm_page_t maf[vm_pageout_page_count];
950         vm_page_t mab[vm_pageout_page_count];
951         vm_page_t ma[vm_pageout_page_count];
952
953         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
954         pi = p->pindex;
955         while (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) {
956                 vm_page_lock_queues();
957                 if (object->generation != curgeneration) {
958                         return(0);
959                 }
960         }
961         maxf = 0;
962         for(i = 1; i < vm_pageout_page_count; i++) {
963                 vm_page_t tp;
964
965                 if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
966                         if ((tp->oflags & VPO_BUSY) ||
967                                 ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
968                                  (tp->oflags & VPO_CLEANCHK) == 0) ||
969                                 (tp->busy != 0))
970                                 break;
971                         vm_page_test_dirty(tp);
972                         if (tp->dirty == 0) {
973                                 tp->oflags &= ~VPO_CLEANCHK;
974                                 break;
975                         }
976                         maf[ i - 1 ] = tp;
977                         maxf++;
978                         continue;
979                 }
980                 break;
981         }
982
983         maxb = 0;
984         chkb = vm_pageout_page_count -  maxf;
985         if (chkb) {
986                 for(i = 1; i < chkb;i++) {
987                         vm_page_t tp;
988
989                         if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
990                                 if ((tp->oflags & VPO_BUSY) ||
991                                         ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
992                                          (tp->oflags & VPO_CLEANCHK) == 0) ||
993                                         (tp->busy != 0))
994                                         break;
995                                 vm_page_test_dirty(tp);
996                                 if (tp->dirty == 0) {
997                                         tp->oflags &= ~VPO_CLEANCHK;
998                                         break;
999                                 }
1000                                 mab[ i - 1 ] = tp;
1001                                 maxb++;
1002                                 continue;
1003                         }
1004                         break;
1005                 }
1006         }
1007
1008         for(i = 0; i < maxb; i++) {
1009                 int index = (maxb - i) - 1;
1010                 ma[index] = mab[i];
1011                 ma[index]->oflags &= ~VPO_CLEANCHK;
1012         }
1013         p->oflags &= ~VPO_CLEANCHK;
1014         ma[maxb] = p;
1015         for(i = 0; i < maxf; i++) {
1016                 int index = (maxb + i) + 1;
1017                 ma[index] = maf[i];
1018                 ma[index]->oflags &= ~VPO_CLEANCHK;
1019         }
1020         runlen = maxb + maxf + 1;
1021
1022         vm_pageout_flush(ma, runlen, pagerflags);
1023         for (i = 0; i < runlen; i++) {
1024                 if (ma[i]->dirty) {
1025                         pmap_remove_write(ma[i]);
1026                         ma[i]->oflags |= VPO_CLEANCHK;
1027
1028                         /*
1029                          * maxf will end up being the actual number of pages
1030                          * we wrote out contiguously, non-inclusive of the
1031                          * first page.  We do not count look-behind pages.
1032                          */
1033                         if (i >= maxb + 1 && (maxf > i - maxb - 1))
1034                                 maxf = i - maxb - 1;
1035                 }
1036         }
1037         return(maxf + 1);
1038 }
1039
1040 /*
1041  * Note that there is absolutely no sense in writing out
1042  * anonymous objects, so we track down the vnode object
1043  * to write out.
1044  * We invalidate (remove) all pages from the address space
1045  * for semantic correctness.
1046  *
1047  * Note: certain anonymous maps, such as MAP_NOSYNC maps,
1048  * may start out with a NULL object.
1049  */
1050 void
1051 vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size,
1052     boolean_t syncio, boolean_t invalidate)
1053 {
1054         vm_object_t backing_object;
1055         struct vnode *vp;
1056         struct mount *mp;
1057         int flags;
1058
1059         if (object == NULL)
1060                 return;
1061         VM_OBJECT_LOCK(object);
1062         while ((backing_object = object->backing_object) != NULL) {
1063                 VM_OBJECT_LOCK(backing_object);
1064                 offset += object->backing_object_offset;
1065                 VM_OBJECT_UNLOCK(object);
1066                 object = backing_object;
1067                 if (object->size < OFF_TO_IDX(offset + size))
1068                         size = IDX_TO_OFF(object->size) - offset;
1069         }
1070         /*
1071          * Flush pages if writing is allowed, invalidate them
1072          * if invalidation requested.  Pages undergoing I/O
1073          * will be ignored by vm_object_page_remove().
1074          *
1075          * We cannot lock the vnode and then wait for paging
1076          * to complete without deadlocking against vm_fault.
1077          * Instead we simply call vm_object_page_remove() and
1078          * allow it to block internally on a page-by-page
1079          * basis when it encounters pages undergoing async
1080          * I/O.
1081          */
1082         if (object->type == OBJT_VNODE &&
1083             (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
1084                 int vfslocked;
1085                 vp = object->handle;
1086                 VM_OBJECT_UNLOCK(object);
1087                 (void) vn_start_write(vp, &mp, V_WAIT);
1088                 vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1089                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1090                 flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
1091                 flags |= invalidate ? OBJPC_INVAL : 0;
1092                 VM_OBJECT_LOCK(object);
1093                 vm_object_page_clean(object,
1094                     OFF_TO_IDX(offset),
1095                     OFF_TO_IDX(offset + size + PAGE_MASK),
1096                     flags);
1097                 VM_OBJECT_UNLOCK(object);
1098                 VOP_UNLOCK(vp, 0);
1099                 VFS_UNLOCK_GIANT(vfslocked);
1100                 vn_finished_write(mp);
1101                 VM_OBJECT_LOCK(object);
1102         }
1103         if ((object->type == OBJT_VNODE ||
1104              object->type == OBJT_DEVICE) && invalidate) {
1105                 boolean_t purge;
1106                 purge = old_msync || (object->type == OBJT_DEVICE);
1107                 vm_object_page_remove(object,
1108                     OFF_TO_IDX(offset),
1109                     OFF_TO_IDX(offset + size + PAGE_MASK),
1110                     purge ? FALSE : TRUE);
1111         }
1112         VM_OBJECT_UNLOCK(object);
1113 }
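
/*
 * A minimal sketch of how an msync(2)-style request might reach this
 * function (hypothetical, simplified caller; the real path runs through the
 * vm_map code): MS_SYNC maps to syncio and MS_INVALIDATE to invalidate.
 *
 *      vm_object_sync(entry->object.vm_object,
 *          entry->offset + (addr - entry->start), len,
 *          (flags & MS_SYNC) != 0, (flags & MS_INVALIDATE) != 0);
 */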
1114
1115 /*
1116  *      vm_object_madvise:
1117  *
1118  *      Implements the madvise function at the object/page level.
1119  *
1120  *      MADV_WILLNEED   (any object)
1121  *
1122  *          Activate the specified pages if they are resident.
1123  *
1124  *      MADV_DONTNEED   (any object)
1125  *
1126  *          Deactivate the specified pages if they are resident.
1127  *
1128  *      MADV_FREE       (OBJT_DEFAULT/OBJT_SWAP objects,
1129  *                       OBJ_ONEMAPPING only)
1130  *
1131  *          Deactivate and clean the specified pages if they are
1132  *          resident.  This permits the process to reuse the pages
1133  *          without faulting or the kernel to reclaim the pages
1134  *          without I/O.
1135  */
1136 void
1137 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
1138 {
1139         vm_pindex_t end, tpindex;
1140         vm_object_t backing_object, tobject;
1141         vm_page_t m;
1142
1143         if (object == NULL)
1144                 return;
1145         VM_OBJECT_LOCK(object);
1146         end = pindex + count;
1147         /*
1148          * Locate and adjust resident pages
1149          */
1150         for (; pindex < end; pindex += 1) {
1151 relookup:
1152                 tobject = object;
1153                 tpindex = pindex;
1154 shadowlookup:
1155                 /*
1156                  * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1157                  * and those pages must be OBJ_ONEMAPPING.
1158                  */
1159                 if (advise == MADV_FREE) {
1160                         if ((tobject->type != OBJT_DEFAULT &&
1161                              tobject->type != OBJT_SWAP) ||
1162                             (tobject->flags & OBJ_ONEMAPPING) == 0) {
1163                                 goto unlock_tobject;
1164                         }
1165                 }
1166                 m = vm_page_lookup(tobject, tpindex);
1167                 if (m == NULL && advise == MADV_WILLNEED) {
1168                         /*
1169                          * If the page is cached, reactivate it.
1170                          */
1171                         m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED |
1172                             VM_ALLOC_NOBUSY);
1173                 }
1174                 if (m == NULL) {
1175                         /*
1176                          * There may be swap even if there is no backing page
1177                          */
1178                         if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
1179                                 swap_pager_freespace(tobject, tpindex, 1);
1180                         /*
1181                          * next object
1182                          */
1183                         backing_object = tobject->backing_object;
1184                         if (backing_object == NULL)
1185                                 goto unlock_tobject;
1186                         VM_OBJECT_LOCK(backing_object);
1187                         tpindex += OFF_TO_IDX(tobject->backing_object_offset);
1188                         if (tobject != object)
1189                                 VM_OBJECT_UNLOCK(tobject);
1190                         tobject = backing_object;
1191                         goto shadowlookup;
1192                 }
1193                 /*
1194                  * If the page is busy or not in a normal active state,
1195                  * we skip it.  If the page is not managed there are no
1196                  * page queues to mess with.  Things can break if we mess
1197                  * with pages in any of the below states.
1198                  */
1199                 vm_page_lock_queues();
1200                 if (m->hold_count ||
1201                     m->wire_count ||
1202                     (m->flags & PG_UNMANAGED) ||
1203                     m->valid != VM_PAGE_BITS_ALL) {
1204                         vm_page_unlock_queues();
1205                         goto unlock_tobject;
1206                 }
1207                 if ((m->oflags & VPO_BUSY) || m->busy) {
1208                         if (advise == MADV_WILLNEED)
1209                                 /*
1210                                  * Reference the page before unlocking and
1211                                  * sleeping so that the page daemon is less
1212                                  * likely to reclaim it. 
1213                                  */
1214                                 vm_page_flag_set(m, PG_REFERENCED);
1215                         vm_page_unlock_queues();
1216                         if (object != tobject)
1217                                 VM_OBJECT_UNLOCK(object);
1218                         m->oflags |= VPO_WANTED;
1219                         msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo",
1220                             0);
1221                         VM_OBJECT_LOCK(object);
1222                         goto relookup;
1223                 }
1224                 if (advise == MADV_WILLNEED) {
1225                         vm_page_activate(m);
1226                 } else if (advise == MADV_DONTNEED) {
1227                         vm_page_dontneed(m);
1228                 } else if (advise == MADV_FREE) {
1229                         /*
1230                          * Mark the page clean.  This will allow the page
1231                          * to be freed up by the system.  However, such pages
1232                          * are often reused quickly by malloc()/free()
1233                          * so we do not do anything that would cause
1234                          * a page fault if we can help it.
1235                          *
1236                          * Specifically, we do not try to actually free
1237                          * the page now nor do we try to put it in the
1238                          * cache (which would cause a page fault on reuse).
1239                          *
1240                          * But we do make the page as freeable as we
1241                          * can without actually taking the step of unmapping
1242                          * it.
1243                          */
1244                         pmap_clear_modify(m);
1245                         m->dirty = 0;
1246                         m->act_count = 0;
1247                         vm_page_dontneed(m);
1248                 }
1249                 vm_page_unlock_queues();
1250                 if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
1251                         swap_pager_freespace(tobject, tpindex, 1);
1252 unlock_tobject:
1253                 if (tobject != object)
1254                         VM_OBJECT_UNLOCK(tobject);
1255         }       
1256         VM_OBJECT_UNLOCK(object);
1257 }
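
/*
 * A minimal calling sketch, assuming the advice value has been validated and
 * that object/pindex/count describe the affected portion of a map entry
 * (hypothetical values; the real caller is the vm_map madvise code):
 *
 *      vm_object_madvise(object, pindex, count, MADV_FREE);
 *
 * MADV_WILLNEED activates resident pages, MADV_DONTNEED deactivates them,
 * and MADV_FREE additionally discards the pages' dirty data and any swap
 * backing, as described in the header comment above.
 */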
1258
1259 /*
1260  *      vm_object_shadow:
1261  *
1262  *      Create a new object which is backed by the
1263  *      specified existing object range.  The source
1264  *      object reference is deallocated.
1265  *
1266  *      The new object and offset into that object
1267  *      are returned in the source parameters.
1268  */
1269 void
1270 vm_object_shadow(
1271         vm_object_t *object,    /* IN/OUT */
1272         vm_ooffset_t *offset,   /* IN/OUT */
1273         vm_size_t length)
1274 {
1275         vm_object_t source;
1276         vm_object_t result;
1277
1278         source = *object;
1279
1280         /*
1281          * Don't create the new object if the old object isn't shared.
1282          */
1283         if (source != NULL) {
1284                 VM_OBJECT_LOCK(source);
1285                 if (source->ref_count == 1 &&
1286                     source->handle == NULL &&
1287                     (source->type == OBJT_DEFAULT ||
1288                      source->type == OBJT_SWAP)) {
1289                         VM_OBJECT_UNLOCK(source);
1290                         return;
1291                 }
1292                 VM_OBJECT_UNLOCK(source);
1293         }
1294
1295         /*
1296          * Allocate a new object with the given length.
1297          */
1298         result = vm_object_allocate(OBJT_DEFAULT, length);
1299
1300         /*
1301          * The new object shadows the source object, adding a reference to it.
1302          * Our caller changes his reference to point to the new object,
1303          * removing a reference to the source object.  Net result: no change
1304          * of reference count.
1305          *
1306          * Try to optimize the result object's page color when shadowing
1307          * in order to maintain page coloring consistency in the combined 
1308          * shadowed object.
1309          */
1310         result->backing_object = source;
1311         /*
1312          * Store the offset into the source object, and fix up the offset into
1313          * the new object.
1314          */
1315         result->backing_object_offset = *offset;
1316         if (source != NULL) {
1317                 VM_OBJECT_LOCK(source);
1318                 LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
1319                 source->shadow_count++;
1320                 source->generation++;
1321 #if VM_NRESERVLEVEL > 0
1322                 result->flags |= source->flags & OBJ_COLORED;
1323                 result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) &
1324                     ((1 << (VM_NFREEORDER - 1)) - 1);
1325 #endif
1326                 VM_OBJECT_UNLOCK(source);
1327         }
1328
1329
1330         /*
1331  *      Return the new object and the adjusted offset
1332          */
1333         *offset = 0;
1334         *object = result;
1335 }
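
/*
 * A minimal sketch of the copy-on-write idiom this routine serves, assuming
 * a map entry whose object/offset pair is being shadowed (names illustrative
 * of a typical caller, not taken from this file):
 *
 *      vm_object_shadow(&entry->object.vm_object, &entry->offset,
 *          atop(entry->end - entry->start));
 *
 * On return the entry references the new, initially empty shadow object and
 * its offset has been reset to 0; the original object retains the pages that
 * have already been faulted in.
 */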
1336
1337 /*
1338  *      vm_object_split:
1339  *
1340  * Split the pages in a map entry into a new object.  This affords
1341  * easier removal of unused pages, and keeps object inheritance from
1342  * being a negative impact on memory usage.
1343  */
1344 void
1345 vm_object_split(vm_map_entry_t entry)
1346 {
1347         vm_page_t m, m_next;
1348         vm_object_t orig_object, new_object, source;
1349         vm_pindex_t idx, offidxstart;
1350         vm_size_t size;
1351
1352         orig_object = entry->object.vm_object;
1353         if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
1354                 return;
1355         if (orig_object->ref_count <= 1)
1356                 return;
1357         VM_OBJECT_UNLOCK(orig_object);
1358
1359         offidxstart = OFF_TO_IDX(entry->offset);
1360         size = atop(entry->end - entry->start);
1361
1362         /*
1363          * If swap_pager_copy() is later called, it will convert new_object
1364          * into a swap object.
1365          */
1366         new_object = vm_object_allocate(OBJT_DEFAULT, size);
1367
1368         /*
1369          * At this point, the new object is still private, so the order in
1370          * which the original and new objects are locked does not matter.
1371          */
1372         VM_OBJECT_LOCK(new_object);
1373         VM_OBJECT_LOCK(orig_object);
1374         source = orig_object->backing_object;
1375         if (source != NULL) {
1376                 VM_OBJECT_LOCK(source);
1377                 if ((source->flags & OBJ_DEAD) != 0) {
1378                         VM_OBJECT_UNLOCK(source);
1379                         VM_OBJECT_UNLOCK(orig_object);
1380                         VM_OBJECT_UNLOCK(new_object);
1381                         vm_object_deallocate(new_object);
1382                         VM_OBJECT_LOCK(orig_object);
1383                         return;
1384                 }
1385                 LIST_INSERT_HEAD(&source->shadow_head,
1386                                   new_object, shadow_list);
1387                 source->shadow_count++;
1388                 source->generation++;
1389                 vm_object_reference_locked(source);     /* for new_object */
1390                 vm_object_clear_flag(source, OBJ_ONEMAPPING);
1391                 VM_OBJECT_UNLOCK(source);
1392                 new_object->backing_object_offset = 
1393                         orig_object->backing_object_offset + entry->offset;
1394                 new_object->backing_object = source;
1395         }
1396         if (orig_object->uip != NULL) {
1397                 new_object->uip = orig_object->uip;
1398                 uihold(orig_object->uip);
1399                 new_object->charge = ptoa(size);
1400                 KASSERT(orig_object->charge >= ptoa(size),
1401                     ("orig_object->charge < 0"));
1402                 orig_object->charge -= ptoa(size);
1403         }
1404 retry:
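        /*
         * If the first resident page lies below the split range, splay the
         * page tree about offidxstart so that the loop below starts with
         * the first page at or after that index.
         */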
1405         if ((m = TAILQ_FIRST(&orig_object->memq)) != NULL) {
1406                 if (m->pindex < offidxstart) {
1407                         m = vm_page_splay(offidxstart, orig_object->root);
1408                         if ((orig_object->root = m)->pindex < offidxstart)
1409                                 m = TAILQ_NEXT(m, listq);
1410                 }
1411         }
1412         vm_page_lock_queues();
1413         for (; m != NULL && (idx = m->pindex - offidxstart) < size;
1414             m = m_next) {
1415                 m_next = TAILQ_NEXT(m, listq);
1416
1417                 /*
1418                  * We must wait for pending I/O to complete before we can
1419                  * rename the page.
1420                  *
1421                  * We do not have to VM_PROT_NONE the page as mappings should
1422                  * not be changed by this operation.
1423                  */
1424                 if ((m->oflags & VPO_BUSY) || m->busy) {
1425                         vm_page_unlock_queues();
1426                         VM_OBJECT_UNLOCK(new_object);
1427                         m->oflags |= VPO_WANTED;
1428                         msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
1429                         VM_OBJECT_LOCK(new_object);
1430                         goto retry;
1431                 }
1432                 vm_page_rename(m, new_object, idx);
1433                 /* page automatically made dirty by rename and cache handled */
1434                 vm_page_busy(m);
1435         }
1436         vm_page_unlock_queues();
1437         if (orig_object->type == OBJT_SWAP) {
1438                 /*
1439                  * swap_pager_copy() can sleep, in which case the orig_object's
1440                  * and new_object's locks are released and reacquired. 
1441                  */
1442                 swap_pager_copy(orig_object, new_object, offidxstart, 0);
1443
1444                 /*
1445                  * Transfer any cached pages from orig_object to new_object.
1446                  */
1447                 if (__predict_false(orig_object->cache != NULL))
1448                         vm_page_cache_transfer(orig_object, offidxstart,
1449                             new_object);
1450         }
1451         VM_OBJECT_UNLOCK(orig_object);
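        /*
         * The pages moved into new_object above were left busy; wake them
         * up now that the transfer, including any swap metadata copy, is
         * complete.
         */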
1452         TAILQ_FOREACH(m, &new_object->memq, listq)
1453                 vm_page_wakeup(m);
1454         VM_OBJECT_UNLOCK(new_object);
1455         entry->object.vm_object = new_object;
1456         entry->offset = 0LL;
1457         vm_object_deallocate(orig_object);
1458         VM_OBJECT_LOCK(new_object);
1459 }
1460
1461 #define OBSC_TEST_ALL_SHADOWED  0x0001
1462 #define OBSC_COLLAPSE_NOWAIT    0x0002
1463 #define OBSC_COLLAPSE_WAIT      0x0004
1464
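/*
 *      vm_object_backing_scan:
 *
 *      Scan the resident pages of the given object's backing object.
 *      With OBSC_TEST_ALL_SHADOWED, return 0 if the parent does not
 *      completely shadow the backing object's resident pages, otherwise 1.
 *      With OBSC_COLLAPSE_WAIT, mark the backing object OBJ_DEAD and move
 *      every backing page into the parent or free it, sleeping on busy
 *      pages as needed.  OBSC_COLLAPSE_NOWAIT does the same but skips busy
 *      or invalid pages instead of sleeping.
 *
 *      Both the object and its backing object must be locked.
 */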
1465 static int
1466 vm_object_backing_scan(vm_object_t object, int op)
1467 {
1468         int r = 1;
1469         vm_page_t p;
1470         vm_object_t backing_object;
1471         vm_pindex_t backing_offset_index;
1472
1473         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1474         VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED);
1475
1476         backing_object = object->backing_object;
1477         backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
1478
1479         /*
1480          * Initial conditions
1481          */
1482         if (op & OBSC_TEST_ALL_SHADOWED) {
1483                 /*
1484                  * We do not want to have to test for the existence of cache
1485                  * or swap pages in the backing object.  XXX but with the
1486                  * new swapper this would be pretty easy to do.
1487                  *
1488                  * XXX what about anonymous MAP_SHARED memory that hasn't
1489                  * been ZFOD faulted yet?  If we do not test for this, the
1490                  * shadow test may succeed! XXX
1491                  */
1492                 if (backing_object->type != OBJT_DEFAULT) {
1493                         return (0);
1494                 }
1495         }
1496         if (op & OBSC_COLLAPSE_WAIT) {
1497                 vm_object_set_flag(backing_object, OBJ_DEAD);
1498         }
1499
1500         /*
1501          * Our scan
1502          */
1503         p = TAILQ_FIRST(&backing_object->memq);
1504         while (p) {
1505                 vm_page_t next = TAILQ_NEXT(p, listq);
1506                 vm_pindex_t new_pindex = p->pindex - backing_offset_index;
1507
1508                 if (op & OBSC_TEST_ALL_SHADOWED) {
1509                         vm_page_t pp;
1510
1511                         /*
1512                          * Ignore pages outside the parent object's range
1513                          * and outside the parent object's mapping of the 
1514                          * backing object.
1515                          *
1516                          * note that we do not busy the backing object's
1517                          * page.
1518                          */
1519                         if (
1520                             p->pindex < backing_offset_index ||
1521                             new_pindex >= object->size
1522                         ) {
1523                                 p = next;
1524                                 continue;
1525                         }
1526
1527                         /*
1528                          * See if the parent has the page or if the parent's
1529                          * object pager has the page.  If the parent has the
1530                          * page but the page is not valid, the parent's
1531                          * object pager must have the page.
1532                          *
1533                          * If this fails, the parent does not completely shadow
1534                          * the object and we might as well give up now.
1535                          */
1536
1537                         pp = vm_page_lookup(object, new_pindex);
1538                         if (
1539                             (pp == NULL || pp->valid == 0) &&
1540                             !vm_pager_has_page(object, new_pindex, NULL, NULL)
1541                         ) {
1542                                 r = 0;
1543                                 break;
1544                         }
1545                 }
1546
1547                 /*
1548                  * Check for busy page
1549                  */
1550                 if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
1551                         vm_page_t pp;
1552
1553                         if (op & OBSC_COLLAPSE_NOWAIT) {
1554                                 if ((p->oflags & VPO_BUSY) ||
1555                                     !p->valid || 
1556                                     p->busy) {
1557                                         p = next;
1558                                         continue;
1559                                 }
1560                         } else if (op & OBSC_COLLAPSE_WAIT) {
1561                                 if ((p->oflags & VPO_BUSY) || p->busy) {
1562                                         VM_OBJECT_UNLOCK(object);
1563                                         p->oflags |= VPO_WANTED;
1564                                         msleep(p, VM_OBJECT_MTX(backing_object),
1565                                             PDROP | PVM, "vmocol", 0);
1566                                         VM_OBJECT_LOCK(object);
1567                                         VM_OBJECT_LOCK(backing_object);
1568                                         /*
1569                                          * If we slept, anything could have
1570                                          * happened.  Since the object is
1571                                          * marked dead, the backing offset
1572                                          * should not have changed so we
1573                                          * just restart our scan.
1574                                          */
1575                                         p = TAILQ_FIRST(&backing_object->memq);
1576                                         continue;
1577                                 }
1578                         }
1579
1580                         KASSERT(
1581                             p->object == backing_object,
1582                             ("vm_object_backing_scan: object mismatch")
1583                         );
1584
1585                         /*
1586                          * Destroy any associated swap
1587                          */
1588                         if (backing_object->type == OBJT_SWAP) {
1589                                 swap_pager_freespace(
1590                                     backing_object, 
1591                                     p->pindex,
1592                                     1
1593                                 );
1594                         }
1595
1596                         if (
1597                             p->pindex < backing_offset_index ||
1598                             new_pindex >= object->size
1599                         ) {
1600                                 /*
1601                                  * The page is outside the parent object's range;
1602                                  * we can simply destroy it.
1603                                  */
1604                                 vm_page_lock_queues();
1605                                 KASSERT(!pmap_page_is_mapped(p),
1606                                     ("freeing mapped page %p", p));
1607                                 if (p->wire_count == 0)
1608                                         vm_page_free(p);
1609                                 else
1610                                         vm_page_remove(p);
1611                                 vm_page_unlock_queues();
1612                                 p = next;
1613                                 continue;
1614                         }
1615
1616                         pp = vm_page_lookup(object, new_pindex);
1617                         if (
1618                             pp != NULL ||
1619                             vm_pager_has_page(object, new_pindex, NULL, NULL)
1620                         ) {
1621                                 /*
1622                                  * The page already exists in the parent, or swap
1623                                  * exists for this location in the parent.  Destroy
1624                                  * the original page from the backing object.
1625                                  *
1626                                  * Leave the parent's page alone.
1627                                  */
1628                                 vm_page_lock_queues();
1629                                 KASSERT(!pmap_page_is_mapped(p),
1630                                     ("freeing mapped page %p", p));
1631                                 if (p->wire_count == 0)
1632                                         vm_page_free(p);
1633                                 else
1634                                         vm_page_remove(p);
1635                                 vm_page_unlock_queues();
1636                                 p = next;
1637                                 continue;
1638                         }
1639
1640 #if VM_NRESERVLEVEL > 0
1641                         /*
1642                          * Rename the reservation.
1643                          */
1644                         vm_reserv_rename(p, object, backing_object,
1645                             backing_offset_index);
1646 #endif
1647
1648                         /*
1649                          * Page does not exist in parent, rename the
1650                          * page from the backing object to the main object. 
1651                          *
1652                          * If the page was mapped to a process, it can remain 
1653                          * mapped through the rename.
1654                          */
1655                         vm_page_lock_queues();
1656                         vm_page_rename(p, object, new_pindex);
1657                         vm_page_unlock_queues();
1658                         /* page automatically made dirty by rename */
1659                 }
1660                 p = next;
1661         }
1662         return (r);
1663 }
1664
1665
1666 /*
1667  * This version of collapse allows the operation to occur earlier and
1668  * while paging_in_progress is true for an object.  It is not a complete
1669  * operation, but should plug 99.9% of the rest of the leaks.
1670  */
1671 static void
1672 vm_object_qcollapse(vm_object_t object)
1673 {
1674         vm_object_t backing_object = object->backing_object;
1675
1676         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1677         VM_OBJECT_LOCK_ASSERT(backing_object, MA_OWNED);
1678
1679         if (backing_object->ref_count != 1)
1680                 return;
1681
1682         vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
1683 }
1684
1685 /*
1686  *      vm_object_collapse:
1687  *
1688  *      Collapse an object with the object backing it.
1689  *      Pages in the backing object are moved into the
1690  *      parent, and the backing object is deallocated.
1691  */
1692 void
1693 vm_object_collapse(vm_object_t object)
1694 {
1695         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1696         
1697         while (TRUE) {
1698                 vm_object_t backing_object;
1699
1700                 /*
1701                  * Verify that the conditions are right for collapse:
1702                  *
1703                  * The object exists and the backing object exists.
1704                  */
1705                 if ((backing_object = object->backing_object) == NULL)
1706                         break;
1707
1708                 /*
1709                  * We check the backing object first, because it is most likely
1710                  * not collapsible.
1711                  */
1712                 VM_OBJECT_LOCK(backing_object);
1713                 if (backing_object->handle != NULL ||
1714                     (backing_object->type != OBJT_DEFAULT &&
1715                      backing_object->type != OBJT_SWAP) ||
1716                     (backing_object->flags & OBJ_DEAD) ||
1717                     object->handle != NULL ||
1718                     (object->type != OBJT_DEFAULT &&
1719                      object->type != OBJT_SWAP) ||
1720                     (object->flags & OBJ_DEAD)) {
1721                         VM_OBJECT_UNLOCK(backing_object);
1722                         break;
1723                 }
1724
1725                 if (
1726                     object->paging_in_progress != 0 ||
1727                     backing_object->paging_in_progress != 0
1728                 ) {
1729                         vm_object_qcollapse(object);
1730                         VM_OBJECT_UNLOCK(backing_object);
1731                         break;
1732                 }
1733                 /*
1734                  * We know that we can either collapse the backing object (if
1735                  * the parent is the only reference to it) or (perhaps) have
1736                  * the parent bypass the object if the parent happens to shadow
1737                  * all the resident pages in the entire backing object.
1738                  *
1739                  * This is ignoring pager-backed pages such as swap pages.
1740                  * vm_object_backing_scan fails the shadowing test in this
1741                  * case.
1742                  */
1743                 if (backing_object->ref_count == 1) {
1744                         /*
1745                          * If there is exactly one reference to the backing
1746                          * object, we can collapse it into the parent.  
1747                          */
1748                         vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
1749
1750 #if VM_NRESERVLEVEL > 0
1751                         /*
1752                          * Break any reservations from backing_object.
1753                          */
1754                         if (__predict_false(!LIST_EMPTY(&backing_object->rvq)))
1755                                 vm_reserv_break_all(backing_object);
1756 #endif
1757
1758                         /*
1759                          * Move the pager from backing_object to object.
1760                          */
1761                         if (backing_object->type == OBJT_SWAP) {
1762                                 /*
1763                                  * swap_pager_copy() can sleep, in which case
1764                                  * the backing_object's and object's locks are
1765                                  * released and reacquired.
1766                                  */
1767                                 swap_pager_copy(
1768                                     backing_object,
1769                                     object,
1770                                     OFF_TO_IDX(object->backing_object_offset), TRUE);
1771
1772                                 /*
1773                                  * Free any cached pages from backing_object.
1774                                  */
1775                                 if (__predict_false(backing_object->cache != NULL))
1776                                         vm_page_cache_free(backing_object, 0, 0);
1777                         }
1778                         /*
1779                          * Object now shadows whatever backing_object did.
1780                          * Note that the reference to 
1781                          * backing_object->backing_object moves from within 
1782                          * backing_object to within object.
1783                          */
1784                         LIST_REMOVE(object, shadow_list);
1785                         backing_object->shadow_count--;
1786                         backing_object->generation++;
1787                         if (backing_object->backing_object) {
1788                                 VM_OBJECT_LOCK(backing_object->backing_object);
1789                                 LIST_REMOVE(backing_object, shadow_list);
1790                                 LIST_INSERT_HEAD(
1791                                     &backing_object->backing_object->shadow_head,
1792                                     object, shadow_list);
1793                                 /*
1794                                  * The shadow_count has not changed.
1795                                  */
1796                                 backing_object->backing_object->generation++;
1797                                 VM_OBJECT_UNLOCK(backing_object->backing_object);
1798                         }
1799                         object->backing_object = backing_object->backing_object;
1800                         object->backing_object_offset +=
1801                             backing_object->backing_object_offset;
1802
1803                         /*
1804                          * Discard backing_object.
1805                          *
1806                          * Since the backing object has no pages, no pager left,
1807                          * and no object references within it, all that is
1808                          * necessary is to dispose of it.
1809                          */
1810                         KASSERT(backing_object->ref_count == 1, (
1811 "backing_object %p was somehow re-referenced during collapse!",
1812                             backing_object));
1813                         VM_OBJECT_UNLOCK(backing_object);
1814                         vm_object_destroy(backing_object);
1815
1816                         object_collapses++;
1817                 } else {
1818                         vm_object_t new_backing_object;
1819
1820                         /*
1821                          * If we do not entirely shadow the backing object,
1822                          * there is nothing we can do so we give up.
1823                          */
1824                         if (object->resident_page_count != object->size &&
1825                             vm_object_backing_scan(object,
1826                             OBSC_TEST_ALL_SHADOWED) == 0) {
1827                                 VM_OBJECT_UNLOCK(backing_object);
1828                                 break;
1829                         }
1830
1831                         /*
1832                          * Make the parent shadow the next object in the
1833                          * chain.  Deallocating backing_object will not remove
1834                          * it, since its reference count is at least 2.
1835                          */
1836                         LIST_REMOVE(object, shadow_list);
1837                         backing_object->shadow_count--;
1838                         backing_object->generation++;
1839
1840                         new_backing_object = backing_object->backing_object;
1841                         if ((object->backing_object = new_backing_object) != NULL) {
1842                                 VM_OBJECT_LOCK(new_backing_object);
1843                                 LIST_INSERT_HEAD(
1844                                     &new_backing_object->shadow_head,
1845                                     object,
1846                                     shadow_list
1847                                 );
1848                                 new_backing_object->shadow_count++;
1849                                 new_backing_object->generation++;
1850                                 vm_object_reference_locked(new_backing_object);
1851                                 VM_OBJECT_UNLOCK(new_backing_object);
1852                                 object->backing_object_offset +=
1853                                         backing_object->backing_object_offset;
1854                         }
1855
1856                         /*
1857                          * Drop the reference count on backing_object. Since
1858                          * its ref_count was at least 2, it will not vanish.
1859                          */
1860                         backing_object->ref_count--;
1861                         VM_OBJECT_UNLOCK(backing_object);
1862                         object_bypasses++;
1863                 }
1864
1865                 /*
1866                  * Try again with this object's new backing object.
1867                  */
1868         }
1869 }
1870
1871 /*
1872  *      vm_object_page_remove:
1873  *
1874  *      For the given object, either frees or invalidates each of the
1875  *      specified pages.  In general, a page is freed.  However, if a
1876  *      page is wired for any reason other than the existence of a
1877  *      managed, wired mapping, then it may be invalidated but not
1878  *      removed from the object.  Pages are specified by the given
1879  *      range ["start", "end") and Boolean "clean_only".  As a
1880  *      special case, if "end" is zero, then the range extends from
1881  *      "start" to the end of the object.  If "clean_only" is TRUE,
1882  *      then only the non-dirty pages within the specified range are
1883  *      affected.
1884  *
1885  *      In general, this operation should only be performed on objects
1886  *      that contain managed pages.  There are two exceptions.  First,
1887  *      it may be performed on the kernel and kmem objects.  Second,
1888  *      it may be used by msync(..., MS_INVALIDATE) to invalidate
1889  *      device-backed pages.
1890  *
1891  *      The object must be locked.
1892  */
1893 void
1894 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1895     boolean_t clean_only)
1896 {
1897         vm_page_t p, next;
1898         int wirings;
1899
1900         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1901         if (object->resident_page_count == 0)
1902                 goto skipmemq;
1903
1904         /*
1905          * Since physically-backed objects do not use managed pages, we can't
1906          * remove pages from the object (we must instead remove the page
1907          * references, and then destroy the object).
1908          */
1909         KASSERT(object->type != OBJT_PHYS || object == kernel_object ||
1910             object == kmem_object,
1911             ("attempt to remove pages from a physical object"));
1912
1913         vm_object_pip_add(object, 1);
1914 again:
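        /*
         * If the first resident page lies below "start", splay the page
         * tree about "start" so that the scan begins with the first page
         * at or after that index.
         */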
1915         if ((p = TAILQ_FIRST(&object->memq)) != NULL) {
1916                 if (p->pindex < start) {
1917                         p = vm_page_splay(start, object->root);
1918                         if ((object->root = p)->pindex < start)
1919                                 p = TAILQ_NEXT(p, listq);
1920                 }
1921         }
1922         vm_page_lock_queues();
1923         /*
1924          * Assert: the variable p is either (1) the page with the
1925          * least pindex greater than or equal to the parameter pindex
1926          * or (2) NULL.
1927          */
1928         for (;
1929              p != NULL && (p->pindex < end || end == 0);
1930              p = next) {
1931                 next = TAILQ_NEXT(p, listq);
1932
1933                 /*
1934                  * If the page is wired for any reason besides the
1935                  * existence of managed, wired mappings, then it cannot
1936                  * be freed.  For example, fictitious pages, which
1937                  * represent device memory, are inherently wired and
1938                  * cannot be freed.  They can, however, be invalidated
1939                  * if "clean_only" is FALSE.
1940                  */
1941                 if ((wirings = p->wire_count) != 0 &&
1942                     (wirings = pmap_page_wired_mappings(p)) != p->wire_count) {
1943                         /* Fictitious pages do not have managed mappings. */
1944                         if ((p->flags & PG_FICTITIOUS) == 0)
1945                                 pmap_remove_all(p);
1946                         /* Account for removal of managed, wired mappings. */
1947                         p->wire_count -= wirings;
1948                         if (!clean_only) {
1949                                 p->valid = 0;
1950                                 vm_page_undirty(p);
1951                         }
1952                         continue;
1953                 }
1954                 if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
1955                         goto again;
1956                 KASSERT((p->flags & PG_FICTITIOUS) == 0,
1957                     ("vm_object_page_remove: page %p is fictitious", p));
1958                 if (clean_only && p->valid) {
1959                         pmap_remove_write(p);
1960                         if (p->dirty)
1961                                 continue;
1962                 }
1963                 pmap_remove_all(p);
1964                 /* Account for removal of managed, wired mappings. */
1965                 if (wirings != 0)
1966                         p->wire_count -= wirings;
1967                 vm_page_free(p);
1968         }
1969         vm_page_unlock_queues();
1970         vm_object_pip_wakeup(object);
1971 skipmemq:
1972         if (__predict_false(object->cache != NULL))
1973                 vm_page_cache_free(object, start, end);
1974 }
1975
1976 /*
1977  *      Populate the specified range of the object with valid pages.  Returns
1978  *      TRUE if the range is successfully populated and FALSE otherwise.
1979  *
1980  *      Note: This function should be optimized to pass a larger array of
1981  *      pages to vm_pager_get_pages() before it is applied to a non-
1982  *      OBJT_DEVICE object.
1983  *
1984  *      The object must be locked.
1985  */
1986 boolean_t
1987 vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1988 {
1989         vm_page_t m, ma[1];
1990         vm_pindex_t pindex;
1991         int rv;
1992
1993         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1994         for (pindex = start; pindex < end; pindex++) {
1995                 m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL |
1996                     VM_ALLOC_RETRY);
1997                 if (m->valid != VM_PAGE_BITS_ALL) {
1998                         ma[0] = m;
1999                         rv = vm_pager_get_pages(object, ma, 1, 0);
2000                         m = vm_page_lookup(object, pindex);
2001                         if (m == NULL)
2002                                 break;
2003                         if (rv != VM_PAGER_OK) {
2004                                 vm_page_lock_queues();
2005                                 vm_page_free(m);
2006                                 vm_page_unlock_queues();
2007                                 break;
2008                         }
2009                 }
2010                 /*
2011                  * Keep "m" busy because a subsequent iteration may unlock
2012                  * the object.
2013                  */
2014         }
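        /*
         * The pages grabbed above were left busy; unbusy each page that was
         * successfully populated, even if the full range was not satisfied.
         */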
2015         if (pindex > start) {
2016                 m = vm_page_lookup(object, start);
2017                 while (m != NULL && m->pindex < pindex) {
2018                         vm_page_wakeup(m);
2019                         m = TAILQ_NEXT(m, listq);
2020                 }
2021         }
2022         return (pindex == end);
2023 }
2024
2025 /*
2026  *      Routine:        vm_object_coalesce
2027  *      Function:       Coalesces two objects backing up adjoining
2028  *                      regions of memory into a single object.
2029  *
2030  *      returns TRUE if objects were combined.
2031  *
2032  *      NOTE:   Only works at the moment if the second object is NULL -
2033  *              if it's not, which object do we lock first?
2034  *
2035  *      Parameters:
2036  *              prev_object     First object to coalesce
2037  *              prev_offset     Offset into prev_object
2038  *              prev_size       Size of reference to prev_object
2039  *              next_size       Size of reference to the second object
2040  *              reserved        Indicator that extension region has
2041  *                              swap accounted for
2042  *
2043  *      Conditions:
2044  *      The object must *not* be locked.
2045  */
2046 boolean_t
2047 vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
2048     vm_size_t prev_size, vm_size_t next_size, boolean_t reserved)
2049 {
2050         vm_pindex_t next_pindex;
2051
2052         if (prev_object == NULL)
2053                 return (TRUE);
2054         VM_OBJECT_LOCK(prev_object);
2055         if (prev_object->type != OBJT_DEFAULT &&
2056             prev_object->type != OBJT_SWAP) {
2057                 VM_OBJECT_UNLOCK(prev_object);
2058                 return (FALSE);
2059         }
2060
2061         /*
2062          * Try to collapse the object first
2063          */
2064         vm_object_collapse(prev_object);
2065
2066         /*
2067          * Can't coalesce if: more than one reference, paged out, shadows
2068          * another object, or has a copy elsewhere (any of which mean that
2069          * the pages not mapped to prev_entry may be in use anyway).
2070          */
2071         if (prev_object->backing_object != NULL) {
2072                 VM_OBJECT_UNLOCK(prev_object);
2073                 return (FALSE);
2074         }
2075
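        /*
         * Convert the byte sizes to page counts and compute the page index
         * in prev_object just past the region covered by this mapping.
         */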
2076         prev_size >>= PAGE_SHIFT;
2077         next_size >>= PAGE_SHIFT;
2078         next_pindex = OFF_TO_IDX(prev_offset) + prev_size;
2079
2080         if ((prev_object->ref_count > 1) &&
2081             (prev_object->size != next_pindex)) {
2082                 VM_OBJECT_UNLOCK(prev_object);
2083                 return (FALSE);
2084         }
2085
2086         /*
2087          * Account for the charge.
2088          */
2089         if (prev_object->uip != NULL) {
2090                 /*
2091                  * If prev_object was charged, then this mapping,
2092                  * although not charged now, may become writable
2093                  * later.  A non-NULL uip in the object would prevent
2094                  * swap reservation during enabling of the write
2095                  * access, so reserve swap now.  A failed reservation
2096                  * causes allocation of a separate object for the map
2097                  * entry, and swap reservation for that entry is
2098                  * managed at the appropriate time.
2099                  */
2100                 if (!reserved && !swap_reserve_by_uid(ptoa(next_size),
2101                     prev_object->uip)) {
2102                         VM_OBJECT_UNLOCK(prev_object);
2103                         return (FALSE);
2104                 }
2105                 prev_object->charge += ptoa(next_size);
2106         }
2107
2108         /*
2109          * Remove any pages that may still be in the object from a previous
2110          * deallocation.
2111          */
2112         if (next_pindex < prev_object->size) {
2113                 vm_object_page_remove(prev_object,
2114                                       next_pindex,
2115                                       next_pindex + next_size, FALSE);
2116                 if (prev_object->type == OBJT_SWAP)
2117                         swap_pager_freespace(prev_object,
2118                                              next_pindex, next_size);
2119 #if 0
2120                 if (prev_object->uip != NULL) {
2121                         KASSERT(prev_object->charge >=
2122                             ptoa(prev_object->size - next_pindex),
2123                             ("object %p overcharged 1 %jx %jx", prev_object,
2124                                 (uintmax_t)next_pindex, (uintmax_t)next_size));
2125                         prev_object->charge -= ptoa(prev_object->size -
2126                             next_pindex);
2127                 }
2128 #endif
2129         }
2130
2131         /*
2132          * Extend the object if necessary.
2133          */
2134         if (next_pindex + next_size > prev_object->size)
2135                 prev_object->size = next_pindex + next_size;
2136
2137         VM_OBJECT_UNLOCK(prev_object);
2138         return (TRUE);
2139 }
2140
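/*
 *      vm_object_set_writeable_dirty:
 *
 *      Record that a vnode-backed object may contain dirty pages by
 *      setting OBJ_MIGHTBEDIRTY.  Objects of other types are ignored.
 */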
2141 void
2142 vm_object_set_writeable_dirty(vm_object_t object)
2143 {
2144
2145         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2146         if (object->type != OBJT_VNODE ||
2147             (object->flags & OBJ_MIGHTBEDIRTY) != 0)
2148                 return;
2149         vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
2150 }
2151
2152 #include "opt_ddb.h"
2153 #ifdef DDB
2154 #include <sys/kernel.h>
2155
2156 #include <sys/cons.h>
2157
2158 #include <ddb/ddb.h>
2159
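/*
 * Helper for the DDB commands below: determine whether the given object
 * backs any entry of the given map, recursing into submaps and following
 * backing object chains.
 */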
2160 static int
2161 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
2162 {
2163         vm_map_t tmpm;
2164         vm_map_entry_t tmpe;
2165         vm_object_t obj;
2166         int entcount;
2167
2168         if (map == 0)
2169                 return 0;
2170
2171         if (entry == 0) {
2172                 tmpe = map->header.next;
2173                 entcount = map->nentries;
2174                 while (entcount-- && (tmpe != &map->header)) {
2175                         if (_vm_object_in_map(map, object, tmpe)) {
2176                                 return 1;
2177                         }
2178                         tmpe = tmpe->next;
2179                 }
2180         } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
2181                 tmpm = entry->object.sub_map;
2182                 tmpe = tmpm->header.next;
2183                 entcount = tmpm->nentries;
2184                 while (entcount-- && tmpe != &tmpm->header) {
2185                         if (_vm_object_in_map(tmpm, object, tmpe)) {
2186                                 return 1;
2187                         }
2188                         tmpe = tmpe->next;
2189                 }
2190         } else if ((obj = entry->object.vm_object) != NULL) {
2191                 for (; obj; obj = obj->backing_object)
2192                         if (obj == object) {
2193                                 return 1;
2194                         }
2195         }
2196         return 0;
2197 }
2198
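/*
 * Return non-zero if the object is reachable from any process's vmspace
 * or from one of the kernel's standard maps.
 */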
2199 static int
2200 vm_object_in_map(vm_object_t object)
2201 {
2202         struct proc *p;
2203
2204         /* sx_slock(&allproc_lock); */
2205         FOREACH_PROC_IN_SYSTEM(p) {
2206                 if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
2207                         continue;
2208                 if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) {
2209                         /* sx_sunlock(&allproc_lock); */
2210                         return 1;
2211                 }
2212         }
2213         /* sx_sunlock(&allproc_lock); */
2214         if (_vm_object_in_map(kernel_map, object, 0))
2215                 return 1;
2216         if (_vm_object_in_map(kmem_map, object, 0))
2217                 return 1;
2218         if (_vm_object_in_map(pager_map, object, 0))
2219                 return 1;
2220         if (_vm_object_in_map(buffer_map, object, 0))
2221                 return 1;
2222         return 0;
2223 }
2224
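/*
 * DDB "show vmochk": report any anonymous (unnamed default or swap) object
 * that has a zero reference count or is not reachable from any map.
 */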
2225 DB_SHOW_COMMAND(vmochk, vm_object_check)
2226 {
2227         vm_object_t object;
2228
2229         /*
2230          * make sure that internal objs are in a map somewhere
2231          * and none have zero ref counts.
2232          */
2233         TAILQ_FOREACH(object, &vm_object_list, object_list) {
2234                 if (object->handle == NULL &&
2235                     (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
2236                         if (object->ref_count == 0) {
2237                                 db_printf("vmochk: internal obj has zero ref count: %ld\n",
2238                                         (long)object->size);
2239                         }
2240                         if (!vm_object_in_map(object)) {
2241                                 db_printf(
2242                         "vmochk: internal obj is not in a map: "
2243                         "ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
2244                                     object->ref_count, (u_long)object->size, 
2245                                     (u_long)object->size,
2246                                     (void *)object->backing_object);
2247                         }
2248                 }
2249         }
2250 }
2251
2252 /*
2253  *      vm_object_print:        [ debug ]
2254  */
2255 DB_SHOW_COMMAND(object, vm_object_print_static)
2256 {
2257         /* XXX convert args. */
2258         vm_object_t object = (vm_object_t)addr;
2259         boolean_t full = have_addr;
2260
2261         vm_page_t p;
2262
2263         /* XXX count is an (unused) arg.  Avoid shadowing it. */
2264 #define count   was_count
2265
2266         int count;
2267
2268         if (object == NULL)
2269                 return;
2270
2271         db_iprintf(
2272             "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x uip %d charge %jx\n",
2273             object, (int)object->type, (uintmax_t)object->size,
2274             object->resident_page_count, object->ref_count, object->flags,
2275             object->uip ? object->uip->ui_uid : -1, (uintmax_t)object->charge);
2276         db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n",
2277             object->shadow_count, 
2278             object->backing_object ? object->backing_object->ref_count : 0,
2279             object->backing_object, (uintmax_t)object->backing_object_offset);
2280
2281         if (!full)
2282                 return;
2283
2284         db_indent += 2;
2285         count = 0;
2286         TAILQ_FOREACH(p, &object->memq, listq) {
2287                 if (count == 0)
2288                         db_iprintf("memory:=");
2289                 else if (count == 6) {
2290                         db_printf("\n");
2291                         db_iprintf(" ...");
2292                         count = 0;
2293                 } else
2294                         db_printf(",");
2295                 count++;
2296
2297                 db_printf("(off=0x%jx,page=0x%jx)",
2298                     (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p));
2299         }
2300         if (count != 0)
2301                 db_printf("\n");
2302         db_indent -= 2;
2303 }
2304
2305 /* XXX. */
2306 #undef count
2307
2308 /* XXX need this non-static entry for calling from vm_map_print. */
2309 void
2310 vm_object_print(
2311         /* db_expr_t */ long addr,
2312         boolean_t have_addr,
2313         /* db_expr_t */ long count,
2314         char *modif)
2315 {
2316         vm_object_print_static(addr, have_addr, count, modif);
2317 }
2318
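/*
 * DDB "show vmopag": for each object, print runs of resident pages that are
 * contiguous both by page index and by physical address.
 */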
2319 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
2320 {
2321         vm_object_t object;
2322         vm_pindex_t fidx;
2323         vm_paddr_t pa;
2324         vm_page_t m, prev_m;
2325         int rcount, nl, c;
2326
2327         nl = 0;
2328         TAILQ_FOREACH(object, &vm_object_list, object_list) {
2329                 db_printf("new object: %p\n", (void *)object);
2330                 if (nl > 18) {
2331                         c = cngetc();
2332                         if (c != ' ')
2333                                 return;
2334                         nl = 0;
2335                 }
2336                 nl++;
2337                 rcount = 0;
2338                 fidx = 0;
2339                 pa = -1;
2340                 TAILQ_FOREACH(m, &object->memq, listq) {
2341                         if (m->pindex > 128)
2342                                 break;
2343                         if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL &&
2344                             prev_m->pindex + 1 != m->pindex) {
2345                                 if (rcount) {
2346                                         db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2347                                                 (long)fidx, rcount, (long)pa);
2348                                         if (nl > 18) {
2349                                                 c = cngetc();
2350                                                 if (c != ' ')
2351                                                         return;
2352                                                 nl = 0;
2353                                         }
2354                                         nl++;
2355                                         rcount = 0;
2356                                 }
2357                         }                               
2358                         if (rcount &&
2359                                 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
2360                                 ++rcount;
2361                                 continue;
2362                         }
2363                         if (rcount) {
2364                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2365                                         (long)fidx, rcount, (long)pa);
2366                                 if (nl > 18) {
2367                                         c = cngetc();
2368                                         if (c != ' ')
2369                                                 return;
2370                                         nl = 0;
2371                                 }
2372                                 nl++;
2373                         }
2374                         fidx = m->pindex;
2375                         pa = VM_PAGE_TO_PHYS(m);
2376                         rcount = 1;
2377                 }
2378                 if (rcount) {
2379                         db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2380                                 (long)fidx, rcount, (long)pa);
2381                         if (nl > 18) {
2382                                 c = cngetc();
2383                                 if (c != ' ')
2384                                         return;
2385                                 nl = 0;
2386                         }
2387                         nl++;
2388                 }
2389         }
2390 }
2391 #endif /* DDB */