2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
36 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
37 * $Id: nfs_bio.c,v 1.59 1998/06/14 15:51:59 bde Exp $
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/resourcevar.h>
44 #include <sys/signalvar.h>
47 #include <sys/vnode.h>
48 #include <sys/mount.h>
49 #include <sys/kernel.h>
52 #include <vm/vm_extern.h>
53 #include <vm/vm_prot.h>
54 #include <vm/vm_page.h>
55 #include <vm/vm_object.h>
56 #include <vm/vm_pager.h>
57 #include <vm/vnode_pager.h>
59 #include <nfs/rpcv2.h>
60 #include <nfs/nfsproto.h>
62 #include <nfs/nfsmount.h>
63 #include <nfs/nqnfs.h>
64 #include <nfs/nfsnode.h>
66 static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
68 static void nfs_prot_buf __P((struct buf *bp, int off, int n));
70 extern int nfs_numasync;
71 extern struct nfsstats nfsstats;
/*
 * nfs_getpages (name taken from the printf below).
 *
 * NOTE(review): this listing is a partial extract. The embedded original
 * line numbers skip, so the signature, most declarations, the else
 * branches and the closing braces are not visible. The notes here cover
 * only what the visible lines do.
 *
 * Visible flow:
 *   - use curproc and its credentials, and look up the NFS mount;
 *   - return VM_PAGER_ERROR if the vnode has no VM object;
 *   - on an NFSv3 mount without fsinfo, call nfs_fsinfo();
 *   - map the pages at bp->b_data with pmap_qenter(), read `count` bytes
 *     starting at the file offset of pages[0] with one nfs_readrpc(),
 *     then unmap with pmap_qremove();
 *   - walk the pages and set their valid state from the byte count that
 *     actually arrived.
 * Where bp, pages and count come from is on lines not shown.
 */
74 * Vnode op for VM getpages.
78 struct vop_getpages_args /* {
83 vm_ooffset_t a_offset;
86 int i, error, nextoff, size, toff, npages, count;
99 p = curproc; /* XXX */
100 cred = curproc->p_ucred; /* XXX */
101 nmp = VFSTONFS(vp->v_mount);
105 if (vp->v_object == NULL) {
106 printf("nfs_getpages: called with non-merged cache vnode??\n");
107 return VM_PAGER_ERROR;
/* NFSv3 mount whose fsinfo has not been fetched yet: fetch it now; the result is ignored. */
110 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
111 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
112 (void)nfs_fsinfo(nmp, vp, cred, p);
114 * We use only the kva address for the buffer, but this is extremely
115 * convienient and fast.
119 npages = btoc(count);
120 kva = (vm_offset_t) bp->b_data;
121 pmap_qenter(kva, pages, npages);
123 iov.iov_base = (caddr_t) kva;
127 uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
128 uio.uio_resid = count;
129 uio.uio_segflg = UIO_SYSSPACE;
130 uio.uio_rw = UIO_READ;
133 error = nfs_readrpc(vp, &uio, cred);
134 pmap_qremove(kva, npages);
/* Fail only when the RPC reported an error and nothing was transferred (resid still equals count). */
138 if (error && (uio.uio_resid == count))
139 return VM_PAGER_ERROR;
/* size is the number of bytes the RPC actually delivered. */
141 size = count - uio.uio_resid;
143 for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
145 nextoff = toff + PAGE_SIZE;
148 m->flags &= ~PG_ZERO;
/* The page ends at or before the last byte read: mark it fully valid. */
150 if (nextoff <= size) {
151 m->valid = VM_PAGE_BITS_ALL;
/* Partly filled page: valid length is (size - toff) rounded up to DEV_BSIZE, given that toff is DEV_BSIZE aligned. */
154 int nvalid = ((size + DEV_BSIZE - 1) - toff) & ~(DEV_BSIZE - 1);
155 vm_page_set_validclean(m, 0, nvalid);
158 if (i != ap->a_reqpage) {
160 * Whether or not to leave the page activated is up in
161 * the air, but we should put the page on a page queue
162 * somewhere (it already is in the object). Result:
163 * It appears that emperical results show that
164 * deactivating pages is best.
168 * Just in case someone was asking for this page we
169 * now tell them that it is ok to use.
172 if (m->flags & PG_WANTED)
175 vm_page_deactivate(m);
178 vnode_pager_freepage(m);
/*
 * VM putpages vnode op for NFS (presumably nfs_putpages; the name line is
 * not part of this extract).
 *
 * NOTE(review): partial extract. The embedded original line numbers skip,
 * so the signature, several declarations, an else line and the closing
 * braces are not visible. The notes here cover only the visible lines.
 *
 * Visible flow:
 *   - use curproc and its credentials, and look up the NFS mount;
 *   - on an NFSv3 mount without fsinfo, call nfs_fsinfo();
 *   - preset every rtvals[] slot to VM_PAGER_AGAIN;
 *   - map the pages at bp->b_data, write `count` bytes starting at the
 *     file offset of pages[0] with one nfs_writerpc(), then unmap;
 *   - set rtvals[] to VM_PAGER_OK for the pages covered by the bytes
 *     written;
 *   - call nfs_clearcommit() on the mount (its guarding condition is on a
 *     line not shown; presumably must_commit).
 */
186 * Vnode op for VM putpages.
190 struct vop_putpages_args /* {
196 vm_ooffset_t a_offset;
204 int iomode, must_commit, i, error, npages, count;
209 struct nfsmount *nmp;
213 p = curproc; /* XXX */
214 cred = curproc->p_ucred; /* XXX */
215 nmp = VFSTONFS(vp->v_mount);
218 rtvals = ap->a_rtvals;
219 npages = btoc(count);
221 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
222 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
223 (void)nfs_fsinfo(nmp, vp, cred, p);
/* Start pessimistic: every page is reported as VM_PAGER_AGAIN until the write below covers it. */
225 for (i = 0; i < npages; i++) {
226 rtvals[i] = VM_PAGER_AGAIN;
230 * We use only the kva address for the buffer, but this is extremely
231 * convienient and fast.
235 kva = (vm_offset_t) bp->b_data;
236 pmap_qenter(kva, pages, npages);
238 iov.iov_base = (caddr_t) kva;
242 uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
243 uio.uio_resid = count;
244 uio.uio_segflg = UIO_SYSSPACE;
245 uio.uio_rw = UIO_WRITE;
/* Without VM_PAGER_PUT_SYNC the write is UNSTABLE; the FILESYNC assignment presumably sits under an else on a line not shown. */
248 if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
249 iomode = NFSV3WRITE_UNSTABLE;
251 iomode = NFSV3WRITE_FILESYNC;
253 error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);
255 pmap_qremove(kva, npages);
/* Count of pages touched by the bytes written; a partly written last page is counted as well. */
259 int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
260 for (i = 0; i < nwritten; i++) {
261 rtvals[i] = VM_PAGER_OK;
265 nfs_clearcommit(vp->v_mount);
/*
 * nfs_bioread(vp, uio, ioflag, cred, getpages)
 *
 * NOTE(review): partial extract. The embedded original line numbers skip,
 * so case labels, else lines, returns, brelse calls and closing braces are
 * not visible. The notes here cover only the visible lines.
 *
 * Visible flow:
 *   - panic unless uio_rw is UIO_READ; zero resid and negative offset are
 *     tested (the actions taken are on lines not shown);
 *   - on an NFSv3 mount without fsinfo, call nfs_fsinfo();
 *   - non-NQNFS mounts: NMODIFIED, or a difference between n_mtime and a
 *     fresh VOP_GETATTR(), leads to nfs_vinvalbuf(V_SAVE);
 *   - NQNFS mounts: get a read lease, retrying while NQNFS_EXPIRED, and
 *     invalidate when n_lrev != n_brev, NQNFSNONCACHE is set, or a
 *     directory is modified;
 *   - with NQNFSNONCACHE, nfs_readrpc() / nfs_readlinkrpc() are called
 *     directly, bypassing the cache;
 *   - otherwise the main loop switches on v_type with three paths, counted
 *     as biocache_reads, biocache_readlinks and biocache_readdirs. Each
 *     path gets a block from nfs_getcacheblk(), fills it with nfs_doio()
 *     when B_CACHE is clear, and the data is copied out with uiomove();
 *   - read-ahead blocks are handed to nfs_asyncio();
 *   - the loop repeats while error == 0, uio_resid > 0 and n > 0.
 */
271 * Vnode op for read using bio
274 nfs_bioread(vp, uio, ioflag, cred, getpages)
275 register struct vnode *vp;
276 register struct uio *uio;
281 register struct nfsnode *np = VTONFS(vp);
282 register int biosize, diff, i;
283 struct buf *bp = 0, *rabp;
286 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
289 int nra, error = 0, n = 0, on = 0, not_readin;
292 if (uio->uio_rw != UIO_READ)
293 panic("nfs_read mode");
295 if (uio->uio_resid == 0)
297 if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */
300 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
301 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
302 (void)nfs_fsinfo(nmp, vp, cred, p);
303 if (vp->v_type != VDIR &&
304 (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
306 biosize = vp->v_mount->mnt_stat.f_iosize;
308 * For nfs, cache consistency can only be maintained approximately.
309 * Although RFC1094 does not specify the criteria, the following is
310 * believed to be compatible with the reference port.
311 * For nqnfs, full cache consistency is maintained within the loop.
313 * If the file's modify time on the server has changed since the
314 * last read rpc or you have written to the file,
315 * you may have lost data cache consistency with the
316 * server, so flush all of the file's data out of the cache.
317 * Then force a getattr rpc to ensure that you have up to date
319 * NB: This implies that cache data can be read when up to
320 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
321 * attributes this could be forced by setting n_attrstamp to 0 before
322 * the VOP_GETATTR() call.
324 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
325 if (np->n_flag & NMODIFIED) {
326 if (vp->v_type != VREG) {
327 if (vp->v_type != VDIR)
328 panic("nfs: bioread, not dir");
330 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
335 error = VOP_GETATTR(vp, &vattr, cred, p);
338 np->n_mtime = vattr.va_mtime.tv_sec;
340 error = VOP_GETATTR(vp, &vattr, cred, p);
343 if (np->n_mtime != vattr.va_mtime.tv_sec) {
344 if (vp->v_type == VDIR)
346 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
349 np->n_mtime = vattr.va_mtime.tv_sec;
356 * Get a valid lease. If cached data is stale, flush it.
358 if (nmp->nm_flag & NFSMNT_NQNFS) {
359 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
361 error = nqnfs_getlease(vp, ND_READ, cred, p);
362 } while (error == NQNFS_EXPIRED);
365 if (np->n_lrev != np->n_brev ||
366 (np->n_flag & NQNFSNONCACHE) ||
367 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
368 if (vp->v_type == VDIR)
370 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
373 np->n_brev = np->n_lrev;
375 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
377 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
382 if (np->n_flag & NQNFSNONCACHE) {
383 switch (vp->v_type) {
385 return (nfs_readrpc(vp, uio, cred));
387 return (nfs_readlinkrpc(vp, uio, cred));
391 printf(" NQNFSNONCACHE: type %x unexpected\n",
395 switch (vp->v_type) {
397 nfsstats.biocache_reads++;
/* Logical block number and offset inside it. NOTE(review): the mask on the next line equals a modulo only if biosize is a power of two. */
398 lbn = uio->uio_offset / biosize;
399 on = uio->uio_offset & (biosize - 1);
403 * Start the read ahead(s), as required.
405 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
406 for (nra = 0; nra < nmp->nm_readahead &&
407 (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
408 rabn = lbn + 1 + nra;
409 if (!incore(vp, rabn)) {
410 rabp = nfs_getcacheblk(vp, rabn, biosize, p);
/* Read ahead only into a block that has neither B_CACHE nor B_DELWRI set. */
413 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
414 rabp->b_flags |= (B_READ | B_ASYNC);
415 vfs_busy_pages(rabp, 0);
/* A nonzero return from nfs_asyncio() means the request was not taken: mark the buffer invalid and undo the page busying. */
416 if (nfs_asyncio(rabp, cred)) {
417 rabp->b_flags |= B_INVAL|B_ERROR;
418 vfs_unbusy_pages(rabp);
428 * If the block is in the cache and has the required data
429 * in a valid region, just copy it out.
430 * Otherwise, get the block and write back/read in,
435 if ((off_t)(lbn + 1) * biosize > np->n_size &&
436 (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
437 bufsize = np->n_size - lbn * biosize;
438 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
440 bp = nfs_getcacheblk(vp, lbn, bufsize, p);
444 * If we are being called from nfs_getpages, we must
445 * make sure the buffer is a vmio buffer. The vp will
446 * already be setup for vmio but there may be some old
447 * non-vmio buffers attached to it.
449 if (getpages && !(bp->b_flags & B_VMIO)) {
451 printf("nfs_bioread: non vmio buf found, discarding\n");
453 bp->b_flags |= B_NOCACHE;
454 bp->b_flags |= B_INVAFTERWRITE;
455 if (bp->b_dirtyend > 0) {
456 if ((bp->b_flags & B_DELWRI) == 0)
458 if (VOP_BWRITE(bp) == EINTR)
464 if ((bp->b_flags & B_CACHE) == 0) {
465 bp->b_flags |= B_READ;
466 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
468 vfs_busy_pages(bp, 0);
469 error = nfs_doio(bp, cred, p);
476 n = min((unsigned)(bufsize - on), uio->uio_resid);
480 diff = np->n_size - uio->uio_offset;
483 if (not_readin && n > 0) {
484 if (on < bp->b_validoff || (on + n) > bp->b_validend) {
485 bp->b_flags |= B_NOCACHE;
486 bp->b_flags |= B_INVAFTERWRITE;
487 if (bp->b_dirtyend > 0) {
488 if ((bp->b_flags & B_DELWRI) == 0)
490 if (VOP_BWRITE(bp) == EINTR)
498 diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
503 nfsstats.biocache_readlinks++;
504 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
507 if ((bp->b_flags & B_CACHE) == 0) {
508 bp->b_flags |= B_READ;
509 vfs_busy_pages(bp, 0);
510 error = nfs_doio(bp, cred, p);
512 bp->b_flags |= B_ERROR;
/* Copy at most what the buffer holds: NFS_MAXPATHLEN less the residual recorded in b_resid. */
517 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
521 nfsstats.biocache_readdirs++;
522 if (np->n_direofoffset
523 && uio->uio_offset >= np->n_direofoffset) {
526 lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
527 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
528 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
531 if ((bp->b_flags & B_CACHE) == 0) {
532 bp->b_flags |= B_READ;
533 vfs_busy_pages(bp, 0);
534 error = nfs_doio(bp, cred, p);
538 while (error == NFSERR_BAD_COOKIE) {
540 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
542 * Yuck! The directory has been modified on the
543 * server. The only way to get the block is by
544 * reading from the beginning to get all the
547 for (i = 0; i <= lbn && !error; i++) {
548 if (np->n_direofoffset
549 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
551 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
554 if ((bp->b_flags & B_DONE) == 0) {
555 bp->b_flags |= B_READ;
556 vfs_busy_pages(bp, 0);
557 error = nfs_doio(bp, cred, p);
560 } else if (i < lbn) {
571 * If not eof and read aheads are enabled, start one.
572 * (You need the current block first, so that you have the
573 * directory offset cookie of the next block.)
575 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
576 (np->n_direofoffset == 0 ||
577 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
578 !(np->n_flag & NQNFSNONCACHE) &&
579 !incore(vp, lbn + 1)) {
580 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
582 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
583 rabp->b_flags |= (B_READ | B_ASYNC);
584 vfs_busy_pages(rabp, 0);
585 if (nfs_asyncio(rabp, cred)) {
586 rabp->b_flags |= B_INVAL|B_ERROR;
587 vfs_unbusy_pages(rabp);
596 * Make sure we use a signed variant of min() since
597 * the second term may be negative.
599 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
602 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
607 error = uiomove(bp->b_data + on, (int)n, uio);
609 switch (vp->v_type) {
616 if (np->n_flag & NQNFSNONCACHE)
617 bp->b_flags |= B_INVAL;
620 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
623 } while (error == 0 && uio->uio_resid > 0 && n > 0);
/*
 * nfs_prot_buf(bp, off, n)
 *
 * For every page of buffer bp that overlaps the byte range
 * [off, off + n), call vm_page_protect(page, VM_PROT_NONE).
 * The range is widened to page boundaries with trunc_page() and
 * round_page().
 *
 * NOTE(review): partial extract. The parameter declarations, the
 * statement run for buffers without B_VMIO (presumably an early return)
 * and the closing braces are on lines not shown.
 */
628 nfs_prot_buf(bp, off, n)
633 int pindex, boff, end;
635 if ((bp->b_flags & B_VMIO) == 0)
638 end = round_page(off + n);
639 for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) {
/* Turn the byte offset within the buffer into an index into bp->b_pages[]. */
640 pindex = boff >> PAGE_SHIFT;
641 vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE);
/*
 * NFS write vnode op (presumably nfs_write, going by the panic strings;
 * the name line is not part of this extract).
 *
 * NOTE(review): partial extract. The embedded original line numbers skip,
 * so the signature, several declarations, most return statements, else
 * lines and closing braces are not visible. The notes here cover only the
 * visible lines.
 *
 * Visible flow:
 *   - panic unless uio_rw is UIO_WRITE, and panic when a user-space uio
 *     belongs to a process other than curproc;
 *   - if NWRITEERR is pending, clear it and return the saved n_error;
 *   - on an NFSv3 mount without fsinfo, call nfs_fsinfo();
 *   - for IO_APPEND or IO_SYNC with NMODIFIED set, nfs_vinvalbuf(V_SAVE);
 *     for IO_APPEND, VOP_GETATTR() and then set the offset to n_size;
 *   - test for negative offset, nm_maxfilesize, zero resid and the
 *     RLIMIT_FSIZE limit (the actions taken are on lines not shown);
 *   - on NQNFS mounts get a write lease, retrying while NQNFS_EXPIRED, and
 *     invalidate when n_lrev != n_brev or NQNFSNONCACHE is set;
 *   - with NQNFSNONCACHE and a single iovec, write straight through
 *     nfs_writerpc() in NFSV3WRITE_FILESYNC mode;
 *   - otherwise loop over biosize blocks: grow n_size and call
 *     vnode_pager_setsize() when writing past the end, fetch the block
 *     with nfs_getcacheblk(), push out an old dirty range that the new
 *     write would not touch (VOP_BWRITE), copy the data in with uiomove(),
 *     call nfs_prot_buf(), merge the dirty and valid ranges, clear
 *     B_NEEDCOMMIT, then either VOP_BWRITE() (NQNFSNONCACHE or IO_SYNC) or
 *     an async nfs_writebp() when the write reaches the end of the block
 *     on a non-NQNFS mount;
 *   - the loop repeats while uio_resid > 0 and n > 0.
 */
646 * Vnode op for write using bio
650 struct vop_write_args /* {
654 struct ucred *a_cred;
657 register int biosize;
658 register struct uio *uio = ap->a_uio;
659 struct proc *p = uio->uio_procp;
660 register struct vnode *vp = ap->a_vp;
661 struct nfsnode *np = VTONFS(vp);
662 register struct ucred *cred = ap->a_cred;
663 int ioflag = ap->a_ioflag;
666 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
669 int n, on, error = 0, iomode, must_commit;
672 if (uio->uio_rw != UIO_WRITE)
673 panic("nfs_write mode");
674 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
675 panic("nfs_write proc");
677 if (vp->v_type != VREG)
679 if (np->n_flag & NWRITEERR) {
680 np->n_flag &= ~NWRITEERR;
681 return (np->n_error);
683 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
684 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
685 (void)nfs_fsinfo(nmp, vp, cred, p);
686 if (ioflag & (IO_APPEND | IO_SYNC)) {
687 if (np->n_flag & NMODIFIED) {
689 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
693 if (ioflag & IO_APPEND) {
695 error = VOP_GETATTR(vp, &vattr, cred, p);
698 uio->uio_offset = np->n_size;
701 if (uio->uio_offset < 0)
703 if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
705 if (uio->uio_resid == 0)
708 * Maybe this should be above the vnode op call, but so long as
709 * file servers have no limits, i don't think it matters
711 if (p && uio->uio_offset + uio->uio_resid >
712 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
717 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
718 * will be the same size within a filesystem. nfs_writerpc will
719 * still use nm_wsize when sizing the rpc's.
721 biosize = vp->v_mount->mnt_stat.f_iosize;
724 * Check for a valid write lease.
726 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
727 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
729 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
730 } while (error == NQNFS_EXPIRED);
733 if (np->n_lrev != np->n_brev ||
734 (np->n_flag & NQNFSNONCACHE)) {
735 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
738 np->n_brev = np->n_lrev;
741 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
742 iomode = NFSV3WRITE_FILESYNC;
743 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
745 nfs_clearcommit(vp->v_mount);
748 nfsstats.biocache_writes++;
749 lbn = uio->uio_offset / biosize;
750 on = uio->uio_offset & (biosize-1);
751 n = min((unsigned)(biosize - on), uio->uio_resid);
753 if (uio->uio_offset + n > np->n_size) {
754 np->n_size = uio->uio_offset + n;
755 np->n_flag |= NMODIFIED;
756 vnode_pager_setsize(vp, (u_long)np->n_size);
759 if ((lbn + 1) * biosize > np->n_size) {
760 bufsize = np->n_size - lbn * biosize;
761 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
763 bp = nfs_getcacheblk(vp, lbn, bufsize, p);
766 if (bp->b_wcred == NOCRED) {
770 np->n_flag |= NMODIFIED;
772 if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
773 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
777 * If the new write will leave a contiguous dirty
778 * area, just update the b_dirtyoff and b_dirtyend,
779 * otherwise force a write rpc of the old dirty area.
781 if (bp->b_dirtyend > 0 &&
782 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
784 if (VOP_BWRITE(bp) == EINTR)
790 * Check for valid write lease and get one as required.
791 * In case getblk() and/or bwrite() delayed us.
793 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
794 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
796 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
797 } while (error == NQNFS_EXPIRED);
802 if (np->n_lrev != np->n_brev ||
803 (np->n_flag & NQNFSNONCACHE)) {
805 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
808 np->n_brev = np->n_lrev;
813 error = uiomove((char *)bp->b_data + on, n, uio);
815 bp->b_flags |= B_ERROR;
821 * This will keep the buffer and mmaped regions more coherent.
823 nfs_prot_buf(bp, on, n);
825 if (bp->b_dirtyend > 0) {
826 bp->b_dirtyoff = min(on, bp->b_dirtyoff);
827 bp->b_dirtyend = max((on + n), bp->b_dirtyend);
830 bp->b_dirtyend = on + n;
832 if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
833 bp->b_validoff > bp->b_dirtyend) {
834 bp->b_validoff = bp->b_dirtyoff;
835 bp->b_validend = bp->b_dirtyend;
837 bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
838 bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
842 * Since this block is being modified, it must be written
843 * again and not just committed.
845 bp->b_flags &= ~B_NEEDCOMMIT;
848 * If the lease is non-cachable or IO_SYNC do bwrite().
850 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
852 if (ioflag & IO_INVAL)
853 bp->b_flags |= B_INVAL;
854 error = VOP_BWRITE(bp);
857 if (np->n_flag & NQNFSNONCACHE) {
858 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
862 } else if ((n + on) == biosize &&
863 (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
864 bp->b_proc = (struct proc *)0;
865 bp->b_flags |= B_ASYNC;
866 (void)nfs_writebp(bp, 0);
869 } while (uio->uio_resid > 0 && n > 0);
/*
 * nfs_getcacheblk(vp, bn, size, p)
 *
 * Returns the buffer for block bn of vp, obtained from getblk().
 *
 * NOTE(review): partial extract. The parameter declarations, the
 * assignment of nmp and mp, an else line, the final return and the
 * closing braces are on lines not shown.
 *
 * Visible behaviour:
 *   - on a mount with NFSMNT_INT, getblk() is first called with PCATCH;
 *     while it returns a null pointer, nfs_sigintr() is consulted. A
 *     nonzero result returns a null buffer to the caller, otherwise
 *     getblk() is retried;
 *   - the other getblk() call (presumably the else branch for mounts
 *     without NFSMNT_INT) passes 0, 0;
 *   - for VREG vnodes, b_blkno is set from bn and the mount's f_iosize.
 */
874 * Get an nfs cache block.
875 * Allocate a new one if the block isn't currently in the cache
876 * and return the block marked busy. If the calling process is
877 * interrupted by a signal for an interruptible mount point, return
881 nfs_getcacheblk(vp, bn, size, p)
887 register struct buf *bp;
889 struct nfsmount *nmp;
894 if (nmp->nm_flag & NFSMNT_INT) {
895 bp = getblk(vp, bn, size, PCATCH, 0);
896 while (bp == (struct buf *)0) {
/* No buffer yet: give up if nfs_sigintr() reports an interrupt, otherwise retry with 0 and 2 * hz as the last two getblk() arguments (presumably sleep flag and timeout). */
897 if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
898 return ((struct buf *)0);
899 bp = getblk(vp, bn, size, 0, 2 * hz);
902 bp = getblk(vp, bn, size, 0, 0);
904 if( vp->v_type == VREG) {
906 biosize = mp->mnt_stat.f_iosize;
/* Byte position of the block (bn * biosize) expressed in DEV_BSIZE units. */
907 bp->b_blkno = (bn * biosize) / DEV_BSIZE;
/*
 * nfs_vinvalbuf(vp, flags, cred, p, intrflg)
 *
 * NOTE(review): partial extract. The parameter declarations, the setup of
 * slpflag and slptimeo, the return statements, a retry loop header and the
 * closing braces are on lines not shown. The notes here cover only the
 * visible lines.
 *
 * Visible flow:
 *   - tests VXLOCK on the vnode and NFSMNT_INT on the mount (the actions
 *     taken are not visible);
 *   - while NFLUSHINPROG is set, set NFLUSHWANT and tsleep() on
 *     &np->n_flag; a sleep error together with intrflg and a nonzero
 *     nfs_sigintr() is handled on a line not shown;
 *   - set NFLUSHINPROG and call vinvalbuf();
 *   - on the interrupted path, clear NFLUSHINPROG and wake any waiter;
 *   - retry vinvalbuf() with slptimeo;
 *   - finally clear NMODIFIED and NFLUSHINPROG and wake any waiter.
 */
914 * Flush and invalidate all dirty buffers. If another process is already
915 * doing the flush, just wait for completion.
918 nfs_vinvalbuf(vp, flags, cred, p, intrflg)
925 register struct nfsnode *np = VTONFS(vp);
926 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
927 int error = 0, slpflag, slptimeo;
929 if (vp->v_flag & VXLOCK) {
933 if ((nmp->nm_flag & NFSMNT_INT) == 0)
943 * First wait for any other process doing a flush to complete.
945 while (np->n_flag & NFLUSHINPROG) {
946 np->n_flag |= NFLUSHWANT;
947 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
949 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
954 * Now, flush as required.
956 np->n_flag |= NFLUSHINPROG;
957 error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
959 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
/* Interrupted: drop the in-progress flag and wake any waiter (the return itself is on a line not shown). */
960 np->n_flag &= ~NFLUSHINPROG;
961 if (np->n_flag & NFLUSHWANT) {
962 np->n_flag &= ~NFLUSHWANT;
963 wakeup((caddr_t)&np->n_flag);
967 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
/* Flush done: the node is no longer marked modified or flushing; wake anyone sleeping on n_flag. */
969 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
970 if (np->n_flag & NFLUSHWANT) {
971 np->n_flag &= ~NFLUSHWANT;
972 wakeup((caddr_t)&np->n_flag);
/*
 * nfs_asyncio(bp, cred)
 *
 * NOTE(review): partial extract. The embedded original line numbers skip,
 * so several declarations, the return statements, parts of the NFS_DPF
 * calls and the closing braces are not visible. The notes here cover only
 * the visible lines.
 *
 * Visible flow:
 *   - test nfs_numasync == 0 first (the action is on a line not shown;
 *     bioread treats a nonzero return as "not queued");
 *   - scan nfs_iodwant[0 .. NFS_MAXASYNCDAEMON-1]; for a waiting iod,
 *     clear its slot, record the mount in nfs_iodmount[i] and wakeup() it;
 *   - otherwise look at nm_bufqiods to see whether an iod already serves
 *     this mount;
 *   - while nm_bufqlen >= 2 * nfs_numasync, set nm_bufqwant and tsleep()
 *     on &nmp->nm_bufq, checking nfs_sigintr() and re-checking
 *     nm_bufqiods afterwards;
 *   - examine b_rcred (reads) or set B_WRITEINPROG and examine b_wcred
 *     (writes), then append bp to nm_bufq.
 */
978 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
979 * This is mainly to avoid queueing async I/O requests when the nfsiods
980 * are all hung on a dead server.
983 nfs_asyncio(bp, cred)
984 register struct buf *bp;
987 struct nfsmount *nmp;
994 if (nfs_numasync == 0)
997 nmp = VFSTONFS(bp->b_vp->v_mount);
999 if (nmp->nm_flag & NFSMNT_INT)
1004 * Find a free iod to process this request.
1006 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
1007 if (nfs_iodwant[i]) {
1009 * Found one, so wake it up and tell it which
1013 ("nfs_asyncio: waking iod %d for mount %p\n",
1015 nfs_iodwant[i] = (struct proc *)0;
1016 nfs_iodmount[i] = nmp;
1018 wakeup((caddr_t)&nfs_iodwant[i]);
1024 * If none are free, we may already have an iod working on this mount
1025 * point. If so, it will process our request.
1028 if (nmp->nm_bufqiods > 0) {
1030 ("nfs_asyncio: %d iods are already processing mount %p\n",
1031 nmp->nm_bufqiods, nmp));
1037 * If we have an iod which can process the request, then queue
1042 * Ensure that the queue never grows too large.
1044 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
1046 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1047 nmp->nm_bufqwant = TRUE;
1048 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
1049 "nfsaio", slptimeo);
1051 if (nfs_sigintr(nmp, NULL, bp->b_proc))
1053 if (slpflag == PCATCH) {
1059 * We might have lost our iod while sleeping,
1060 * so check and loop if nescessary.
1062 if (nmp->nm_bufqiods == 0) {
1064 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
/* Reads look at b_rcred and writes at b_wcred; each is examined only while still NOCRED and when the caller passed a credential. What gets stored is on lines not shown. */
1069 if (bp->b_flags & B_READ) {
1070 if (bp->b_rcred == NOCRED && cred != NOCRED) {
1075 bp->b_flags |= B_WRITEINPROG;
1076 if (bp->b_wcred == NOCRED && cred != NOCRED) {
1082 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1088 * All the iods are busy on other mounts, so return EIO to
1089 * force the caller to process the i/o synchronously.
1091 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
/*
 * nfs_doio (name taken from the printf below; bioread calls it as
 * nfs_doio(bp, cred, p)).
 *
 * NOTE(review): partial extract. The embedded original line numbers skip,
 * so the signature, several declarations, case labels, else lines, the
 * final return and the closing braces are not visible. The notes here
 * cover only the visible lines.
 *
 * Visible flow:
 *   - build a one-element, system-space uio for the transfer;
 *   - B_PHYS buffers: transfer b_bcount bytes at b_blkno * DEV_BSIZE with
 *     nfs_readrpc() or nfs_writerpc() (NFSV3WRITE_DATASYNC), and record
 *     B_ERROR and b_error;
 *   - B_READ buffers: switch on v_type. Three paths are visible, counted
 *     as read_bios (nfs_readrpc), readlink_bios (nfs_readlinkrpc) and
 *     readdir_bios (nfs_readdirplusrpc / nfs_readdirrpc). A short read
 *     zero-fills up to the file size and sets b_validend;
 *   - a process running a VTEXT vnode whose lease or mtime check fails is
 *     sent SIGKILL;
 *   - otherwise (write): clip b_dirtyend to n_size and, if a dirty range
 *     remains, send it with nfs_writerpc(), then update B_NEEDCOMMIT,
 *     B_CLUSTEROK, B_DELWRI, B_EINTR or B_ERROR and NWRITEERR from the
 *     result;
 *   - store the residual count in b_resid and call nfs_clearcommit() on
 *     the mount (its guarding condition is on a line not shown).
 */
1096 * Do an I/O operation to/from a cache block. This may be called
1097 * synchronously or from an nfsiod.
1101 register struct buf *bp;
1105 register struct uio *uiop;
1106 register struct vnode *vp;
1108 struct nfsmount *nmp;
1109 int error = 0, diff, len, iomode, must_commit = 0;
1115 nmp = VFSTONFS(vp->v_mount);
1117 uiop->uio_iov = &io;
1118 uiop->uio_iovcnt = 1;
1119 uiop->uio_segflg = UIO_SYSSPACE;
1120 uiop->uio_procp = p;
1123 * Historically, paging was done with physio, but no more.
1125 if (bp->b_flags & B_PHYS) {
1127 * ...though reading /dev/drum still gets us here.
1129 io.iov_len = uiop->uio_resid = bp->b_bcount;
1130 /* mapping was done by vmapbuf() */
1131 io.iov_base = bp->b_data;
1132 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1133 if (bp->b_flags & B_READ) {
1134 uiop->uio_rw = UIO_READ;
1135 nfsstats.read_physios++;
1136 error = nfs_readrpc(vp, uiop, cr);
1140 iomode = NFSV3WRITE_DATASYNC;
1141 uiop->uio_rw = UIO_WRITE;
1142 nfsstats.write_physios++;
1143 error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
1146 bp->b_flags |= B_ERROR;
1147 bp->b_error = error;
1149 } else if (bp->b_flags & B_READ) {
1150 io.iov_len = uiop->uio_resid = bp->b_bcount;
1151 io.iov_base = bp->b_data;
1152 uiop->uio_rw = UIO_READ;
1153 switch (vp->v_type) {
1155 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1156 nfsstats.read_bios++;
1157 error = nfs_readrpc(vp, uiop, cr);
1160 if (uiop->uio_resid) {
1162 * If len > 0, there is a hole in the file and
1163 * no writes after the hole have been pushed to
1165 * Just zero fill the rest of the valid area.
1167 diff = bp->b_bcount - uiop->uio_resid;
1168 len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
1171 len = min(len, uiop->uio_resid);
1172 bzero((char *)bp->b_data + diff, len);
1173 bp->b_validend = diff + len;
1175 bp->b_validend = diff;
1177 bp->b_validend = bp->b_bcount;
1179 if (p && (vp->v_flag & VTEXT) &&
1180 (((nmp->nm_flag & NFSMNT_NQNFS) &&
1181 NQNFS_CKINVALID(vp, np, ND_READ) &&
1182 np->n_lrev != np->n_brev) ||
1183 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
1184 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1185 uprintf("Process killed due to text file modification\n");
1186 psignal(p, SIGKILL);
1187 p->p_flag |= P_NOSWAP;
1191 uiop->uio_offset = (off_t)0;
1192 nfsstats.readlink_bios++;
1193 error = nfs_readlinkrpc(vp, uiop, cr);
1196 nfsstats.readdir_bios++;
1197 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1198 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1199 error = nfs_readdirplusrpc(vp, uiop, cr);
1200 if (error == NFSERR_NOTSUPP)
1201 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
/* Plain readdir is used whenever RDIRPLUS is off, including when it was switched off just above after NFSERR_NOTSUPP. */
1203 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1204 error = nfs_readdirrpc(vp, uiop, cr);
1207 printf("nfs_doio: type %x unexpected\n",vp->v_type);
1211 bp->b_flags |= B_ERROR;
1212 bp->b_error = error;
1215 if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
1216 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
1218 if (bp->b_dirtyend > bp->b_dirtyoff) {
1219 io.iov_len = uiop->uio_resid = bp->b_dirtyend
1221 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
1223 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1224 uiop->uio_rw = UIO_WRITE;
1225 nfsstats.write_bios++;
/* UNSTABLE is chosen only when B_ASYNC is the sole flag set among ASYNC, NEEDCOMMIT, NOCACHE and CLUSTER; the FILESYNC assignment presumably sits under an else on a line not shown. */
1226 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
1227 iomode = NFSV3WRITE_UNSTABLE;
1229 iomode = NFSV3WRITE_FILESYNC;
1230 bp->b_flags |= B_WRITEINPROG;
1231 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
/* A successful write that is still UNSTABLE must be committed later; B_CLUSTEROK is added when the dirty range spans the whole buffer. */
1232 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1233 bp->b_flags |= B_NEEDCOMMIT;
1234 if (bp->b_dirtyoff == 0
1235 && bp->b_dirtyend == bp->b_bufsize)
1236 bp->b_flags |= B_CLUSTEROK;
1238 bp->b_flags &= ~B_NEEDCOMMIT;
1239 bp->b_flags &= ~B_WRITEINPROG;
1242 * For an interrupted write, the buffer is still valid
1243 * and the write hasn't been pushed to the server yet,
1244 * so we can't set B_ERROR and report the interruption
1245 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1246 * is not relevant, so the rpc attempt is essentially
1247 * a noop. For the case of a V3 write rpc not being
1248 * committed to stable storage, the block is still
1249 * dirty and requires either a commit rpc or another
1250 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1251 * the block is reused. This is indicated by setting
1252 * the B_DELWRI and B_NEEDCOMMIT flags.
1255 || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1258 bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1260 bp->b_flags |= B_DELWRI;
1262 reassignbuf(bp, vp);
1264 if ((bp->b_flags & B_ASYNC) == 0)
1265 bp->b_flags |= B_EINTR;
1268 bp->b_flags |= B_ERROR;
1269 bp->b_error = np->n_error = error;
1270 np->n_flag |= NWRITEERR;
1272 bp->b_dirtyoff = bp->b_dirtyend = 0;
1280 bp->b_resid = uiop->uio_resid;
1282 nfs_clearcommit(vp->v_mount);