Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/vfs_subr.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/vfs_subr.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.112 retrieving revision 1.112.2.2 diff -u -p -r1.112 -r1.112.2.2 --- src/sys/kern/vfs_subr.c 1999/10/01 22:03:17 1.112 +++ src/sys/kern/vfs_subr.c 2000/12/08 09:14:01 1.112.2.2 @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_subr.c,v 1.112 1999/10/01 22:03:17 mycroft Exp $ */ +/* $NetBSD: vfs_subr.c,v 1.112.2.2 2000/12/08 09:14:01 bouyer Exp $ */ /*- * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc. @@ -81,12 +81,14 @@ * External virtual filesystem routines */ +#include "opt_ddb.h" #include "opt_compat_netbsd.h" #include "opt_compat_43.h" #include #include #include +#include #include #include #include @@ -103,12 +105,14 @@ #include #include -#include -#include - #include +#include +#include -#include +#include +#include + +#include enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, @@ -122,6 +126,8 @@ int vttoif_tab[9] = { int doforce = 1; /* 1 => permit forcible unmounting */ int prtactive = 0; /* 1 => print out reclaim of active vnodes */ +extern int dovfsusermount; /* 1 => permit any user to mount filesystems */ + /* * Insq/Remq for the vnode usage lists. */ @@ -130,20 +136,22 @@ int prtactive = 0; /* 1 => print out re LIST_REMOVE(bp, b_vnbufs); \ (bp)->b_vnbufs.le_next = NOLIST; \ } -TAILQ_HEAD(freelst, vnode) vnode_free_list = /* vnode free list */ - TAILQ_HEAD_INITIALIZER(vnode_free_list); +/* TAILQ_HEAD(freelst, vnode) vnode_free_list = vnode free list (in vnode.h) */ +struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list); +struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list); + struct mntlist mountlist = /* mounted filesystem list */ CIRCLEQ_HEAD_INITIALIZER(mountlist); struct vfs_list_head vfs_list = /* vfs list */ - LIST_HEAD_INITIALIZER(vfs_list); + LIST_HEAD_INITIALIZER(vfs_list); struct nfs_public nfs_pub; /* publicly exported FS */ -struct simplelock mountlist_slock; -static struct simplelock mntid_slock; -struct simplelock mntvnode_slock; -struct simplelock vnode_free_list_slock; -struct simplelock spechash_slock; +struct simplelock mountlist_slock = SIMPLELOCK_INITIALIZER; +static struct simplelock mntid_slock = SIMPLELOCK_INITIALIZER; +struct simplelock mntvnode_slock = SIMPLELOCK_INITIALIZER; +struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER; +struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER; /* * These define the root filesystem and device. @@ -177,13 +185,13 @@ void vntblinit() { - simple_lock_init(&mntvnode_slock); - simple_lock_init(&mntid_slock); - simple_lock_init(&spechash_slock); - simple_lock_init(&vnode_free_list_slock); - pool_init(&vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl", 0, pool_page_alloc_nointr, pool_page_free_nointr, M_VNODE); + + /* + * Initialize the filesystem syncer. + */ + vn_initialize_syncerd(); } /* @@ -203,6 +211,9 @@ vfs_busy(mp, flags, interlkp) if (flags & LK_NOWAIT) return (ENOENT); + if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL + && mp->mnt_unmounter == curproc) + return (EDEADLK); if (interlkp) simple_unlock(interlkp); /* @@ -215,7 +226,7 @@ vfs_busy(mp, flags, interlkp) * can atomically unlock-and-sleep. 
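 	 * The mount may have gone away while we slept; the MNT_GONE
 	 * check below catches that case.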
*/ mp->mnt_wcnt++; - sleep((caddr_t)mp, PVFS); + tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); mp->mnt_wcnt--; gone = mp->mnt_flag & MNT_GONE; @@ -290,7 +301,7 @@ struct mount * vfs_getvfs(fsid) fsid_t *fsid; { - register struct mount *mp; + struct mount *mp; simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; @@ -309,16 +320,15 @@ vfs_getvfs(fsid) * Get a new unique fsid */ void -vfs_getnewfsid(mp, fstypename) +vfs_getnewfsid(mp) struct mount *mp; - char *fstypename; { static u_short xxxfs_mntid; fsid_t tfsid; int mtype; simple_lock(&mntid_slock); - mtype = makefstype(fstypename); + mtype = makefstype(mp->mnt_op->vfs_name); mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0); mp->mnt_stat.f_fsid.val[1] = mtype; if (xxxfs_mntid == 0) @@ -340,7 +350,7 @@ vfs_getnewfsid(mp, fstypename) */ long makefstype(type) - char *type; + const char *type; { long rv; @@ -357,7 +367,7 @@ makefstype(type) */ void vattr_null(vap) - register struct vattr *vap; + struct vattr *vap; { vap->va_type = VNON; @@ -403,9 +413,13 @@ getnewvnode(tag, mp, vops, vpp) int (**vops) __P((void *)); struct vnode **vpp; { + extern struct uvm_pagerops uvm_vnodeops; + struct uvm_object *uobj; struct proc *p = curproc; /* XXX */ + struct freelst *listhd; + static int toggle; struct vnode *vp; - int error; + int error = 0; #ifdef DIAGNOSTIC int s; #endif @@ -419,23 +433,43 @@ getnewvnode(tag, mp, vops, vpp) * (This puts the per-mount vnode list logically under * the protection of the vfs_busy lock). */ - error = vfs_busy(mp, 0, 0); - if (error) + error = vfs_busy(mp, LK_RECURSEFAIL, 0); + if (error && error != EDEADLK) return error; } + /* + * We must choose whether to allocate a new vnode or recycle an + * existing one. The criterion for allocating a new one is that + * the total number of vnodes is less than the number desired or + * there are no vnodes on either free list. Generally we only + * want to recycle vnodes that have no buffers associated with + * them, so we look first on the vnode_free_list. If it is empty, + * we next consider vnodes with referencing buffers on the + * vnode_hold_list. The toggle ensures that half the time we + * will use a buffer from the vnode_hold_list, and half the time + * we will allocate a new one unless the list has grown to twice + * the desired size. We are reticent to recycle vnodes from the + * vnode_hold_list because we will lose the identity of all its + * referencing buffers. 
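+	 * For example (illustrative numbers): with desiredvnodes at
+	 * 1000 and vnode_free_list empty, a table that has grown to
+	 * 1500 vnodes alternates between recycling from the
+	 * vnode_hold_list and allocating a fresh vnode; past 2000
+	 * vnodes the toggle is forced off and we always recycle.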
+ */ + + toggle ^= 1; + if (numvnodes > 2 * desiredvnodes) + toggle = 0; + simple_lock(&vnode_free_list_slock); - if ((vnode_free_list.tqh_first == NULL && - numvnodes < 2 * desiredvnodes) || - numvnodes < desiredvnodes) { + if (numvnodes < desiredvnodes || + (TAILQ_FIRST(listhd = &vnode_free_list) == NULL && + (TAILQ_FIRST(listhd = &vnode_hold_list) == NULL || toggle))) { simple_unlock(&vnode_free_list_slock); vp = pool_get(&vnode_pool, PR_WAITOK); - memset((char *)vp, 0, sizeof(*vp)); + memset(vp, 0, sizeof(*vp)); simple_lock_init(&vp->v_interlock); numvnodes++; } else { - for (vp = vnode_free_list.tqh_first; - vp != NULLVP; vp = vp->v_freelist.tqe_next) { + for (vp = TAILQ_FIRST(listhd); vp != NULLVP; + vp = TAILQ_NEXT(vp, v_freelist)) { if (simple_lock_try(&vp->v_interlock)) { if ((vp->v_flag & VLAYER) == 0) { break; @@ -453,14 +487,15 @@ getnewvnode(tag, mp, vops, vpp) */ if (vp == NULLVP) { simple_unlock(&vnode_free_list_slock); - if (mp) vfs_unbusy(mp); - tablefull("vnode"); + if (mp && error != EDEADLK) + vfs_unbusy(mp); + tablefull("vnode", "increase kern.maxvnodes or NVNODE"); *vpp = 0; return (ENFILE); } if (vp->v_usecount) - panic("free vnode isn't"); - TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + panic("free vnode isn't, vp %p", vp); + TAILQ_REMOVE(listhd, vp, v_freelist); /* see comment on why 0xdeadb is set at end of vgone (below) */ vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb; simple_unlock(&vnode_free_list_slock); @@ -471,10 +506,10 @@ getnewvnode(tag, mp, vops, vpp) simple_unlock(&vp->v_interlock); #ifdef DIAGNOSTIC if (vp->v_data) - panic("cleaned vnode isn't"); + panic("cleaned vnode isn't, vp %p", vp); s = splbio(); if (vp->v_numoutput) - panic("Clean vnode has pending I/O's"); + panic("clean vnode has pending I/O's, vp %p", vp); splx(s); #endif vp->v_flag = 0; @@ -490,6 +525,7 @@ getnewvnode(tag, mp, vops, vpp) vp->v_type = VNON; vp->v_vnlock = &vp->v_lock; lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0); + lockinit(&vp->v_glock, PVFS, "glock", 0, 0); cache_purge(vp); vp->v_tag = tag; vp->v_op = vops; @@ -498,22 +534,65 @@ getnewvnode(tag, mp, vops, vpp) vp->v_usecount = 1; vp->v_data = 0; simple_lock_init(&vp->v_uvm.u_obj.vmobjlock); - if (mp) vfs_unbusy(mp); + + /* + * initialize uvm_object within vnode. + */ + + uobj = &vp->v_uvm.u_obj; + uobj->pgops = &uvm_vnodeops; + TAILQ_INIT(&uobj->memq); + vp->v_uvm.u_size = VSIZENOTSET; + + if (mp && error != EDEADLK) + vfs_unbusy(mp); return (0); } /* + * This is really just the reverse of getnewvnode(). Needed for + * VFS_VGET functions who may need to push back a vnode in case + * of a locking race. + */ +void +ungetnewvnode(vp) + struct vnode *vp; +{ +#ifdef DIAGNOSTIC + if (vp->v_usecount != 1) + panic("ungetnewvnode: busy vnode"); +#endif + vp->v_usecount--; + insmntque(vp, NULL); + vp->v_type = VBAD; + + simple_lock(&vp->v_interlock); + /* + * Insert at head of LRU list + */ + simple_lock(&vnode_free_list_slock); + if (vp->v_holdcnt > 0) + TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist); + else + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + simple_unlock(&vnode_free_list_slock); + simple_unlock(&vp->v_interlock); +} + +/* * Move a vnode from one mount queue to another. 
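 * Passing a NULL mp simply removes the vnode from its current
 * queue, as ungetnewvnode() above does before putting a vnode
 * back at the head of the free list.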
*/ void insmntque(vp, mp) - register struct vnode *vp; - register struct mount *mp; + struct vnode *vp; + struct mount *mp; { #ifdef DIAGNOSTIC if ((mp != NULL) && - (mp->mnt_flag & MNT_UNMOUNT)) { + (mp->mnt_flag & MNT_UNMOUNT) && + !(mp->mnt_flag & MNT_SOFTDEP) && + vp->v_tag != VT_VFS) { panic("insmntque into dying filesystem"); } #endif @@ -537,14 +616,13 @@ insmntque(vp, mp) */ void vwakeup(bp) - register struct buf *bp; + struct buf *bp; { - register struct vnode *vp; + struct vnode *vp; - bp->b_flags &= ~B_WRITEINPROG; if ((vp = bp->b_vp) != NULL) { if (--vp->v_numoutput < 0) - panic("vwakeup: neg numoutput"); + panic("vwakeup: neg numoutput, vp %p", vp); if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t)&vp->v_numoutput); @@ -554,90 +632,197 @@ vwakeup(bp) /* * Flush out and invalidate all buffers associated with a vnode. - * Called with the underlying object locked. + * Called with the underlying vnode locked, which should prevent new dirty + * buffers from being queued. */ int vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) - register struct vnode *vp; + struct vnode *vp; int flags; struct ucred *cred; struct proc *p; int slpflag, slptimeo; { - register struct buf *bp; - struct buf *nbp, *blist; - int s, error; - + struct uvm_object *uobj = &vp->v_uvm.u_obj; + struct buf *bp, *nbp; + int s, error, rv; + int flushflags = PGO_ALLPAGES|PGO_FREE|PGO_SYNCIO| + (flags & V_SAVE ? PGO_CLEANIT : 0); + + /* XXXUBC this doesn't look at flags or slp* */ + if (vp->v_type == VREG) { + simple_lock(&uobj->vmobjlock); + rv = (uobj->pgops->pgo_flush)(uobj, 0, 0, flushflags); + simple_unlock(&uobj->vmobjlock); + if (!rv) { + return EIO; + } + } if (flags & V_SAVE) { - error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, p); - if (error != 0) - return (error); - if (vp->v_dirtyblkhd.lh_first != NULL) - panic("vinvalbuf: dirty bufs"); + error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p); + if (error) + return (error); +#ifdef DIAGNOSTIC + s = splbio(); + if (vp->v_numoutput > 0 || !LIST_EMPTY(&vp->v_dirtyblkhd)) + panic("vinvalbuf: dirty bufs, vp %p", vp); + splx(s); +#endif } - for (;;) { - if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA) - while (blist && blist->b_lblkno < 0) - blist = blist->b_vnbufs.le_next; - if (!blist && (blist = vp->v_dirtyblkhd.lh_first) && - (flags & V_SAVEMETA)) - while (blist && blist->b_lblkno < 0) - blist = blist->b_vnbufs.le_next; - if (!blist) - break; - for (bp = blist; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if (flags & V_SAVEMETA && bp->b_lblkno < 0) - continue; - s = splbio(); - if (bp->b_flags & B_BUSY) { - bp->b_flags |= B_WANTED; - error = tsleep((caddr_t)bp, - slpflag | (PRIBIO + 1), "vinvalbuf", - slptimeo); + s = splbio(); + +restart: + for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), + "vinvalbuf", slptimeo); + if (error) { splx(s); - if (error) - return (error); - break; + return (error); + } + goto restart; + } + bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; + brelse(bp); + } + + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), + "vinvalbuf", slptimeo); + if (error) { + splx(s); + return (error); } + goto restart; + } + /* + * XXX Since there are no node locks for NFS, I believe + * 
there is a slight chance that a delayed write will + * occur while sleeping just above, so check for it. + */ + if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { +#ifdef DEBUG + printf("buffer still DELWRI\n"); +#endif bp->b_flags |= B_BUSY | B_VFLUSH; + VOP_BWRITE(bp); + goto restart; + } + bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; + brelse(bp); + } + +#ifdef DIAGNOSTIC + if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd)) + panic("vinvalbuf: flush failed, vp %p", vp); +#endif + + splx(s); + + return (0); +} + +/* + * Destroy any in core blocks past the truncation length. + * Called with the underlying vnode locked, which should prevent new dirty + * buffers from being queued. + */ +int +vtruncbuf(vp, lbn, slpflag, slptimeo) + struct vnode *vp; + daddr_t lbn; + int slpflag, slptimeo; +{ + struct uvm_object *uobj = &vp->v_uvm.u_obj; + struct buf *bp, *nbp; + int s, error, rv; + + s = splbio(); + if (vp->v_type == VREG) { + simple_lock(&uobj->vmobjlock); + rv = (uobj->pgops->pgo_flush)(uobj, + round_page(lbn << vp->v_mount->mnt_fs_bshift), + vp->v_uvm.u_size, PGO_FREE); + simple_unlock(&uobj->vmobjlock); + if (!rv) { splx(s); - /* - * XXX Since there are no node locks for NFS, I believe - * there is a slight chance that a delayed write will - * occur while sleeping just above, so check for it. - */ - if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { - (void) VOP_BWRITE(bp); - break; + return EIO; + } + } + +restart: + for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if (bp->b_lblkno < lbn) + continue; + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + error = tsleep(bp, slpflag | (PRIBIO + 1), + "vtruncbuf", slptimeo); + if (error) { + splx(s); + return (error); + } + goto restart; + } + bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; + brelse(bp); + } + + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if (bp->b_lblkno < lbn) + continue; + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + error = tsleep(bp, slpflag | (PRIBIO + 1), + "vtruncbuf", slptimeo); + if (error) { + splx(s); + return (error); } - bp->b_flags |= B_INVAL; - brelse(bp); + goto restart; } + bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; + brelse(bp); } - if (!(flags & V_SAVEMETA) && - (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first)) - panic("vinvalbuf: flush failed"); + + splx(s); + return (0); } void vflushbuf(vp, sync) - register struct vnode *vp; + struct vnode *vp; int sync; { - register struct buf *bp, *nbp; + struct uvm_object *uobj = &vp->v_uvm.u_obj; + struct buf *bp, *nbp; int s; + if (vp->v_type == VREG) { + int flags = PGO_CLEANIT|PGO_ALLPAGES| (sync ? 
PGO_SYNCIO : 0); + + simple_lock(&uobj->vmobjlock); + (uobj->pgops->pgo_flush)(uobj, 0, 0, flags); + simple_unlock(&uobj->vmobjlock); + } + loop: s = splbio(); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); if ((bp->b_flags & B_BUSY)) continue; if ((bp->b_flags & B_DELWRI) == 0) - panic("vflushbuf: not dirty"); + panic("vflushbuf: not dirty, bp %p", bp); bp->b_flags |= B_BUSY | B_VFLUSH; splx(s); /* @@ -659,7 +844,7 @@ loop: tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0); } splx(s); - if (vp->v_dirtyblkhd.lh_first != NULL) { + if (!LIST_EMPTY(&vp->v_dirtyblkhd)) { vprint("vflushbuf: dirty", vp); goto loop; } @@ -670,13 +855,15 @@ loop: */ void bgetvp(vp, bp) - register struct vnode *vp; - register struct buf *bp; + struct vnode *vp; + struct buf *bp; { + int s; if (bp->b_vp) - panic("bgetvp: not free"); + panic("bgetvp: not free, bp %p", bp); VHOLD(vp); + s = splbio(); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; @@ -686,6 +873,7 @@ bgetvp(vp, bp) * Insert onto list for new vnode. */ bufinsvn(bp, &vp->v_cleanblkhd); + splx(s); } /* @@ -693,38 +881,48 @@ bgetvp(vp, bp) */ void brelvp(bp) - register struct buf *bp; + struct buf *bp; { struct vnode *vp; + int s; - if (bp->b_vp == (struct vnode *) 0) - panic("brelvp: NULL"); + if (bp->b_vp == NULL) + panic("brelvp: vp NULL, bp %p", bp); + + s = splbio(); + vp = bp->b_vp; /* * Delete from old vnode list, if on one. */ if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); - vp = bp->b_vp; - bp->b_vp = (struct vnode *) 0; + + if (vp->v_type != VREG && (vp->v_flag & VONWORKLST) && + LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { + vp->v_flag &= ~VONWORKLST; + LIST_REMOVE(vp, v_synclist); + } + + bp->b_vp = NULL; HOLDRELE(vp); + splx(s); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. + * + * This function must be called at splbio(). */ void reassignbuf(bp, newvp) - register struct buf *bp; - register struct vnode *newvp; + struct buf *bp; + struct vnode *newvp; { - register struct buflists *listheadp; + struct buflists *listheadp; + int delay; - if (newvp == NULL) { - printf("reassignbuf: NULL"); - return; - } /* * Delete from old vnode list, if on one. */ @@ -734,10 +932,36 @@ reassignbuf(bp, newvp) * If dirty, put on list of dirty buffers; * otherwise insert onto list of clean buffers. 
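 	 * A vnode receiving its first dirty buffer is also put on the
 	 * syncer worklist with a type-dependent delay (async mounts
 	 * excepted); one whose last dirty buffer moves away comes off
 	 * the worklist.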
*/ - if (bp->b_flags & B_DELWRI) - listheadp = &newvp->v_dirtyblkhd; - else + if ((bp->b_flags & B_DELWRI) == 0) { listheadp = &newvp->v_cleanblkhd; + if (newvp->v_type != VREG && + (newvp->v_flag & VONWORKLST) && + LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) { + newvp->v_flag &= ~VONWORKLST; + LIST_REMOVE(newvp, v_synclist); + } + } else { + listheadp = &newvp->v_dirtyblkhd; + if ((newvp->v_flag & VONWORKLST) == 0) { + switch (newvp->v_type) { + case VDIR: + delay = dirdelay; + break; + case VBLK: + if (newvp->v_specmountpoint != NULL) { + delay = metadelay; + break; + } + /* fall through */ + default: + delay = filedelay; + break; + } + if (!newvp->v_mount || + (newvp->v_mount->mnt_flag & MNT_ASYNC) == 0) + vn_syncer_add_to_worklist(newvp, delay); + } + } bufinsvn(bp, listheadp); } @@ -779,7 +1003,7 @@ getdevvp(dev, vpp, type) struct vnode **vpp; enum vtype type; { - register struct vnode *vp; + struct vnode *vp; struct vnode *nvp; int error; @@ -812,12 +1036,12 @@ getdevvp(dev, vpp, type) */ struct vnode * checkalias(nvp, nvp_rdev, mp) - register struct vnode *nvp; + struct vnode *nvp; dev_t nvp_rdev; struct mount *mp; { struct proc *p = curproc; /* XXX */ - register struct vnode *vp; + struct vnode *vp; struct vnode **vpp; if (nvp->v_type != VBLK && nvp->v_type != VCHR) @@ -850,7 +1074,7 @@ loop: nvp->v_rdev = nvp_rdev; nvp->v_hashchain = vpp; nvp->v_specnext = *vpp; - nvp->v_specflags = 0; + nvp->v_specmountpoint = NULL; simple_unlock(&spechash_slock); nvp->v_speclockf = NULL; *vpp = nvp; @@ -895,29 +1119,62 @@ vget(vp, flags) * return failure. Cleaning is determined by checking that * the VXLOCK flag is set. */ + if ((flags & LK_INTERLOCK) == 0) simple_lock(&vp->v_interlock); if (vp->v_flag & VXLOCK) { + if (flags & LK_NOWAIT) { + return EBUSY; + } vp->v_flag |= VXWANT; - simple_unlock(&vp->v_interlock); - tsleep((caddr_t)vp, PINOD, "vget", 0); + ltsleep((caddr_t)vp, PINOD|PNORELOCK, + "vget", 0, &vp->v_interlock); return (ENOENT); } if (vp->v_usecount == 0) { simple_lock(&vnode_free_list_slock); - TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + if (vp->v_holdcnt > 0) + TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist); + else + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); simple_unlock(&vnode_free_list_slock); } vp->v_usecount++; #ifdef DIAGNOSTIC if (vp->v_usecount == 0) { vprint("vget", vp); - panic("vget: usecount overflow"); + panic("vget: usecount overflow, vp %p", vp); } #endif if (flags & LK_TYPE_MASK) { - if ((error = vn_lock(vp, flags | LK_INTERLOCK))) - vrele(vp); + if ((error = vn_lock(vp, flags | LK_INTERLOCK))) { + /* + * must expand vrele here because we do not want + * to call VOP_INACTIVE if the reference count + * drops back to zero since it was never really + * active. We must remove it from the free list + * before sleeping so that multiple processes do + * not try to recycle it. + */ + simple_lock(&vp->v_interlock); + vp->v_usecount--; + if (vp->v_usecount > 0) { + simple_unlock(&vp->v_interlock); + return (error); + } + /* + * insert at tail of LRU list + */ + simple_lock(&vnode_free_list_slock); + if (vp->v_holdcnt > 0) + TAILQ_INSERT_TAIL(&vnode_hold_list, vp, + v_freelist); + else + TAILQ_INSERT_TAIL(&vnode_free_list, vp, + v_freelist); + simple_unlock(&vnode_free_list_slock); + simple_unlock(&vp->v_interlock); + } return (error); } simple_unlock(&vp->v_interlock); @@ -954,8 +1211,12 @@ vput(vp) * Insert at tail of LRU list. 
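 	 * Vnodes still holding buffers go on the hold list instead,
 	 * so that the free list is preferred for recycling.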
*/ simple_lock(&vnode_free_list_slock); - TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + if (vp->v_holdcnt > 0) + TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist); + else + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~VTEXT; simple_unlock(&vp->v_interlock); VOP_INACTIVE(vp, p); } @@ -983,15 +1244,19 @@ vrele(vp) #ifdef DIAGNOSTIC if (vp->v_usecount < 0 || vp->v_writecount != 0) { vprint("vrele: bad ref count", vp); - panic("vrele: ref cnt"); + panic("vrele: ref cnt vp %p", vp); } #endif /* * Insert at tail of LRU list. */ simple_lock(&vnode_free_list_slock); - TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + if (vp->v_holdcnt > 0) + TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist); + else + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~VTEXT; if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) VOP_INACTIVE(vp, p); } @@ -1002,10 +1267,30 @@ vrele(vp) */ void vhold(vp) - register struct vnode *vp; + struct vnode *vp; { - simple_lock(&vp->v_interlock); + /* + * If it is on the freelist and the hold count is currently + * zero, move it to the hold list. The test of the back + * pointer and the use reference count of zero is because + * it will be removed from a free list by getnewvnode, + * but will not have its reference count incremented until + * after calling vgone. If the reference count were + * incremented first, vgone would (incorrectly) try to + * close the previous instance of the underlying object. + * So, the back pointer is explicitly set to `0xdeadb' in + * getnewvnode after removing it from a freelist to ensure + * that we do not try to move it here. + */ + simple_lock(&vp->v_interlock); + if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && + vp->v_holdcnt == 0 && vp->v_usecount == 0) { + simple_lock(&vnode_free_list_slock); + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist); + simple_unlock(&vnode_free_list_slock); + } vp->v_holdcnt++; simple_unlock(&vp->v_interlock); } @@ -1015,13 +1300,35 @@ vhold(vp) */ void holdrele(vp) - register struct vnode *vp; + struct vnode *vp; { simple_lock(&vp->v_interlock); if (vp->v_holdcnt <= 0) - panic("holdrele: holdcnt"); + panic("holdrele: holdcnt vp %p", vp); vp->v_holdcnt--; + + /* + * If it is on the holdlist and the hold count drops to + * zero, move it to the free list. The test of the back + * pointer and the use reference count of zero is because + * it will be removed from a free list by getnewvnode, + * but will not have its reference count incremented until + * after calling vgone. If the reference count were + * incremented first, vgone would (incorrectly) try to + * close the previous instance of the underlying object. + * So, the back pointer is explicitly set to `0xdeadb' in + * getnewvnode after removing it from a freelist to ensure + * that we do not try to move it here. 
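+	 * (brelvp(), for instance, can drop the last hold reference
+	 * while getnewvnode() is flushing out a vnode it has already
+	 * taken off a free list.)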
+ */ + + if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && + vp->v_holdcnt == 0 && vp->v_usecount == 0) { + simple_lock(&vnode_free_list_slock); + TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + simple_unlock(&vnode_free_list_slock); + } simple_unlock(&vp->v_interlock); } @@ -1035,12 +1342,12 @@ vref(vp) simple_lock(&vp->v_interlock); if (vp->v_usecount <= 0) - panic("vref used where vget required"); + panic("vref used where vget required, vp %p", vp); vp->v_usecount++; #ifdef DIAGNOSTIC if (vp->v_usecount == 0) { vprint("vref", vp); - panic("vref: usecount overflow"); + panic("vref: usecount overflow, vp %p", vp); } #endif simple_unlock(&vp->v_interlock); @@ -1067,7 +1374,7 @@ vflush(mp, skipvp, flags) int flags; { struct proc *p = curproc; /* XXX */ - register struct vnode *vp, *nvp; + struct vnode *vp, *nvp; int busy = 0; simple_lock(&mntvnode_slock); @@ -1143,7 +1450,7 @@ loop: */ void vclean(vp, flags, p) - register struct vnode *vp; + struct vnode *vp; int flags; struct proc *p; { @@ -1171,8 +1478,10 @@ vclean(vp, flags, p) * brought into use while we clean it out. */ if (vp->v_flag & VXLOCK) - panic("vclean: deadlock"); + panic("vclean: deadlock, vp %p", vp); vp->v_flag |= VXLOCK; + vp->v_flag &= ~VTEXT; + /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. The VOP_LOCK @@ -1183,11 +1492,7 @@ vclean(vp, flags, p) VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK); /* - * clean out any VM data associated with the vnode. - */ - uvm_vnp_terminate(vp); - /* - * Clean out any buffers associated with the vnode. + * Clean out any cached data associated with the vnode. */ if (flags & DOCLOSE) vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); @@ -1212,8 +1517,7 @@ vclean(vp, flags, p) * Reclaim the vnode. */ if (VOP_RECLAIM(vp, p)) - panic("vclean: cannot reclaim"); - + panic("vclean: cannot reclaim, vp %p", vp); if (active) { /* * Inline copy of vrele() since VOP_INACTIVE @@ -1230,17 +1534,21 @@ vclean(vp, flags, p) /* * Insert at tail of LRU list. 
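 	 * Since the vnode has been reclaimed, it can go directly to
 	 * the free list: a clean vnode holds no buffers.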
*/ + + simple_unlock(&vp->v_interlock); simple_lock(&vnode_free_list_slock); #ifdef DIAGNOSTIC if (vp->v_vnlock) { if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0) vprint("vclean: lock not drained", vp); } + if (vp->v_holdcnt > 0) + panic("vclean: not clean, vp %p", vp); #endif TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); simple_unlock(&vnode_free_list_slock); - } - simple_unlock(&vp->v_interlock); + } else + simple_unlock(&vp->v_interlock); } cache_purge(vp); @@ -1250,11 +1558,14 @@ vclean(vp, flags, p) */ vp->v_op = dead_vnodeop_p; vp->v_tag = VT_NON; + simple_lock(&vp->v_interlock); vp->v_flag &= ~VXLOCK; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; + simple_unlock(&vp->v_interlock); wakeup((caddr_t)vp); - } + } else + simple_unlock(&vp->v_interlock); } /* @@ -1298,7 +1609,7 @@ vgone(vp) */ void vgonel(vp, p) - register struct vnode *vp; + struct vnode *vp; struct proc *p; { struct vnode *vq; @@ -1310,8 +1621,8 @@ vgonel(vp, p) */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; - simple_unlock(&vp->v_interlock); - tsleep((caddr_t)vp, PINOD, "vgone", 0); + ltsleep((caddr_t)vp, PINOD | PNORELOCK, + "vgone", 0, &vp->v_interlock); return; } /* @@ -1380,8 +1691,10 @@ vgonel(vp, p) */ if (vp->v_usecount == 0) { simple_lock(&vnode_free_list_slock); + if (vp->v_holdcnt > 0) + panic("vgonel: not clean, vp %p", vp); if (vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb && - vnode_free_list.tqh_first != vp) { + TAILQ_FIRST(&vnode_free_list) != vp) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } @@ -1436,9 +1749,9 @@ vdevgone(maj, minl, minh, type) */ int vcount(vp) - register struct vnode *vp; + struct vnode *vp; { - register struct vnode *vq, *vnext; + struct vnode *vq, *vnext; int count; loop: @@ -1472,14 +1785,14 @@ static char *typename[] = void vprint(label, vp) char *label; - register struct vnode *vp; + struct vnode *vp; { char buf[64]; if (label != NULL) printf("%s: ", label); - printf("type %s, usecount %ld, writecount %ld, refcount %ld,", - typename[vp->v_type], vp->v_usecount, vp->v_writecount, + printf("tag %d type %s, usecount %d, writecount %ld, refcount %ld,", + vp->v_tag, typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) @@ -1577,6 +1890,8 @@ vfs_sysctl(name, namelen, oldp, oldlenp, /* The rest are generic vfs sysctls. 
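 	 * VFS_USERMOUNT below controls dovfsusermount, i.e. whether
 	 * unprivileged users may mount file systems.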
*/ switch (name[1]) { + case VFS_USERMOUNT: + return sysctl_int(oldp, oldlenp, newp, newlen, &dovfsusermount); #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) case VFS_MAXTYPENUM: /* @@ -1698,7 +2013,7 @@ vfs_mountedon(vp) struct vnode *vq; int error = 0; - if (vp->v_specflags & SI_MOUNTEDON) + if (vp->v_specmountpoint != NULL) return (EBUSY); if (vp->v_flag & VALIASED) { simple_lock(&spechash_slock); @@ -1706,7 +2021,7 @@ vfs_mountedon(vp) if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; - if (vq->v_specflags & SI_MOUNTEDON) { + if (vq->v_specmountpoint != NULL) { error = EBUSY; break; } @@ -1726,9 +2041,9 @@ vfs_hang_addrlist(mp, nep, argp) struct netexport *nep; struct export_args *argp; { - register struct netcred *np, *enp; - register struct radix_node_head *rnh; - register int i; + struct netcred *np, *enp; + struct radix_node_head *rnh; + int i; struct radix_node *rn; struct sockaddr *saddr, *smask = 0; struct domain *dom; @@ -1817,7 +2132,7 @@ vfs_free_netcred(rn, w) struct radix_node *rn; void *w; { - register struct radix_node_head *rnh = (struct radix_node_head *)w; + struct radix_node_head *rnh = (struct radix_node_head *)w; (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh); free((caddr_t)rn, M_NETADDR); @@ -1831,8 +2146,8 @@ static void vfs_free_addrlist(nep) struct netexport *nep; { - register int i; - register struct radix_node_head *rnh; + int i; + struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) if ((rnh = nep->ne_rtable[i]) != NULL) { @@ -1953,12 +2268,12 @@ vfs_setpublicfs(mp, nep, argp) struct netcred * vfs_export_lookup(mp, nep, nam) - register struct mount *mp; + struct mount *mp; struct netexport *nep; struct mbuf *nam; { - register struct netcred *np; - register struct radix_node_head *rnh; + struct netcred *np; + struct radix_node_head *rnh; struct sockaddr *saddr; np = NULL; @@ -2053,23 +2368,11 @@ vaccess(type, file_mode, uid, gid, acc_m * will avoid needing to worry about dependencies. */ void -vfs_unmountall() +vfs_unmountall(p) + struct proc *p; { - register struct mount *mp, *nmp; + struct mount *mp, *nmp; int allerror, error; - struct proc *p = curproc; /* XXX */ - - /* - * Unmounting a file system blocks the requesting process. - * However, it's possible for this routine to be called when - * curproc is NULL (e.g. panic situation, or via the debugger). - * If we get stuck in this situation, just abort, since any - * attempts to sleep will fault. - */ - if (p == NULL) { - printf("vfs_unmountall: no context, aborting\n"); - return; - } for (allerror = 0, mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { @@ -2096,36 +2399,79 @@ vfs_unmountall() void vfs_shutdown() { - register struct buf *bp; - int iter, nbusy; - + struct buf *bp; + int iter, nbusy, nbusy_prev = 0, dcount, s; + struct proc *p = curproc; + + /* XXX we're certainly not running in proc0's context! */ + if (p == NULL) + p = &proc0; + printf("syncing disks... "); - /* XXX Should suspend scheduling. */ + /* remove user process from run queue */ + suspendsched(); (void) spl0(); - sys_sync(&proc0, (void *)0, (register_t *)0); + /* avoid coming back this way again if we panic. */ + doing_shutdown = 1; + + sys_sync(p, NULL, NULL); /* Wait for sync to finish. 
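 	 * The loop below lengthens its sleep and advances iter only
 	 * when a pass fails to shrink the busy count; soft dependency
 	 * buffers are written out explicitly, at most dcount of them.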
*/ - for (iter = 0; iter < 20; iter++) { + dcount = 10000; + for (iter = 0; iter < 20;) { nbusy = 0; - for (bp = &buf[nbuf]; --bp >= buf; ) - if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY) + for (bp = &buf[nbuf]; --bp >= buf; ) { + if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY) + nbusy++; + /* + * With soft updates, some buffers that are + * written will be remarked as dirty until other + * buffers are written. + */ + if (bp->b_vp && bp->b_vp->v_mount + && (bp->b_vp->v_mount->mnt_flag & MNT_SOFTDEP) + && (bp->b_flags & B_DELWRI)) { + s = splbio(); + bremfree(bp); + bp->b_flags |= B_BUSY; + splx(s); nbusy++; + bawrite(bp); + if (dcount-- <= 0) { + printf("softdep "); + goto fail; + } + } + } if (nbusy == 0) break; + if (nbusy_prev == 0) + nbusy_prev = nbusy; printf("%d ", nbusy); - DELAY(40000 * iter); + tsleep(&nbusy, PRIBIO, "bflush", + (iter == 0) ? 1 : hz / 25 * iter); + if (nbusy >= nbusy_prev) /* we didn't flush anything */ + iter++; + else + nbusy_prev = nbusy; } if (nbusy) { -#ifdef DEBUG +fail: +#if defined(DEBUG) || defined(DEBUG_HALT_BUSY) printf("giving up\nPrinting vnodes for busy buffers\n"); for (bp = &buf[nbuf]; --bp >= buf; ) - if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY) + if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY) vprint(NULL, bp->b_vp); -#else - printf("giving up\n"); + +#if defined(DDB) && defined(DEBUG_HALT_BUSY) + Debugger(); #endif + +#else /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */ + printf("giving up\n"); +#endif /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */ return; } else printf("done\n"); @@ -2142,7 +2488,7 @@ vfs_shutdown() vnshutdown(); #endif /* Unmount file systems. */ - vfs_unmountall(); + vfs_unmountall(p); } /* @@ -2300,8 +2646,139 @@ vfs_detach(vfs) return (ESRCH); /* + * Now run the file system-specific cleanups. + */ + (*vfs->vfs_done)(); + + /* * Free the vnode operations vector. 
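 	 * This happens after the file system's own cleanup hook has
 	 * run, so nothing should be using the descriptors any more.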
*/ vfs_opv_free(vfs->vfs_opv_descs); return (0); } + +#ifdef DDB +const char buf_flagbits[] = + "\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI" + "\11DIRTY\12DONE\13EINTR\14ERROR\15GATHERED\16INVAL\17LOCKED\20NOCACHE" + "\21ORDERED\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED" + "\32XXX\33VFLUSH"; + +void +vfs_buf_print(bp, full, pr) + struct buf *bp; + int full; + void (*pr) __P((const char *, ...)); +{ + char buf[1024]; + + (*pr)(" vp %p lblkno 0x%x blkno 0x%x dev 0x%x\n", + bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_dev); + + bitmask_snprintf(bp->b_flags, buf_flagbits, buf, sizeof(buf)); + (*pr)(" error %d flags 0x%s\n", bp->b_error, buf); + + (*pr)(" bufsize 0x%x bcount 0x%x resid 0x%x\n", + bp->b_bufsize, bp->b_bcount, bp->b_resid); + (*pr)(" data %p saveaddr %p dep %p\n", + bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep)); + (*pr)(" iodone %p\n", bp->b_iodone); +} + + +const char vnode_flagbits[] = + "\20\1ROOT\2TEXT\3SYSTEM\4ISTTY\11XLOCK\12XWANT\13BWAIT\14ALIASED" + "\15DIROP\17DIRTY"; + +const char *vnode_types[] = { + "VNON", + "VREG", + "VDIR", + "VBLK", + "VCHR", + "VLNK", + "VSOCK", + "VFIFO", + "VBAD", +}; + +const char *vnode_tags[] = { + "VT_NON", + "VT_UFS", + "VT_NFS", + "VT_MFS", + "VT_MSDOSFS", + "VT_LFS", + "VT_LOFS", + "VT_FDESC", + "VT_PORTAL", + "VT_NULL", + "VT_UMAP", + "VT_KERNFS", + "VT_PROCFS", + "VT_AFS", + "VT_ISOFS", + "VT_UNION", + "VT_ADOSFS", + "VT_EXT2FS", + "VT_CODA", + "VT_FILECORE", + "VT_NTFS", + "VT_VFS", + "VT_OVERLAY" +}; + +void +vfs_vnode_print(vp, full, pr) + struct vnode *vp; + int full; + void (*pr) __P((const char *, ...)); +{ + char buf[256]; + + const char *vtype, *vtag; + + uvm_object_printit(&vp->v_uvm.u_obj, full, pr); + bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf)); + (*pr)("\nVNODE flags %s\n", buf); + (*pr)("mp %p nio %d size 0x%x rwlock 0x%x glock 0x%x\n", + vp->v_mount, vp->v_uvm.u_nio, (int)vp->v_uvm.u_size, + vp->v_vnlock ? lockstatus(vp->v_vnlock) : 0x999, + lockstatus(&vp->v_glock)); + + (*pr)("data %p usecount %d writecount %d holdcnt %d numoutput %d\n", + vp->v_data, vp->v_usecount, vp->v_writecount, + vp->v_holdcnt, vp->v_numoutput); + + vtype = (vp->v_type >= 0 && + vp->v_type < sizeof(vnode_types) / sizeof(vnode_types[0])) ? + vnode_types[vp->v_type] : "UNKNOWN"; + vtag = (vp->v_tag >= 0 && + vp->v_tag < sizeof(vnode_tags) / sizeof(vnode_tags[0])) ? + vnode_tags[vp->v_tag] : "UNKNOWN"; + + (*pr)("type %s(%d) tag %s(%d) id 0x%x mount %p typedata %p\n", + vtype, vp->v_type, vtag, vp->v_tag, + vp->v_id, vp->v_mount, vp->v_mountedhere); + (*pr)("lastr 0x%x lastw 0x%x lasta 0x%x\n", + vp->v_lastr, vp->v_lastw, vp->v_lasta); + (*pr)("cstart 0x%x clen 0x%x ralen 0x%x maxra 0x%x\n", + vp->v_cstart, vp->v_clen, vp->v_ralen, vp->v_maxra); + + if (full) { + struct buf *bp; + + (*pr)("clean bufs:\n"); + LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) { + (*pr)(" bp %p\n", bp); + vfs_buf_print(bp, full, pr); + } + + (*pr)("dirty bufs:\n"); + LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { + (*pr)(" bp %p\n", bp); + vfs_buf_print(bp, full, pr); + } + } +} +#endif
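For reference, the ungetnewvnode() routine introduced above is intended to be
used from VFS_VGET-style code roughly as follows. This is a minimal sketch,
not code from the diff: xfs_hashget(), xfs_hashins() and xfs_vnodeop_p are
hypothetical stand-ins for a file system's own inode hash and vnode
operations vector.

	int
	xfs_vget(mp, ino, vpp)
		struct mount *mp;
		ino_t ino;
		struct vnode **vpp;
	{
		struct vnode *vp;
		int error;

	loop:
		/* Another thread may already have built this vnode. */
		if ((*vpp = xfs_hashget(mp, ino)) != NULL)
			return (0);

		/* Allocate a fresh vnode; this may sleep. */
		error = getnewvnode(VT_NON, mp, xfs_vnodeop_p, &vp);
		if (error)
			return (error);

		/*
		 * While we slept in getnewvnode(), someone else may have
		 * created and hashed the same inode.  If so, push the new
		 * vnode back onto the free list and retry the lookup.
		 */
		if (xfs_hashins(mp, ino, vp) != 0) {
			ungetnewvnode(vp);
			goto loop;
		}

		*vpp = vp;
		return (0);
	}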