
Annotation of src/sys/kern/vfs_subr.c, Revision 1.336.2.1

1.336.2.1! yamt        1: /*     $NetBSD: vfs_subr.c,v 1.336 2008/04/04 20:13:18 cegger Exp $    */
1.74      thorpej     2:
                      3: /*-
1.315     ad          4:  * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
1.74      thorpej     5:  * All rights reserved.
                      6:  *
                      7:  * This code is derived from software contributed to The NetBSD Foundation
                      8:  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
1.302     ad          9:  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
1.74      thorpej    10:  *
                     11:  * Redistribution and use in source and binary forms, with or without
                     12:  * modification, are permitted provided that the following conditions
                     13:  * are met:
                     14:  * 1. Redistributions of source code must retain the above copyright
                     15:  *    notice, this list of conditions and the following disclaimer.
                     16:  * 2. Redistributions in binary form must reproduce the above copyright
                     17:  *    notice, this list of conditions and the following disclaimer in the
                     18:  *    documentation and/or other materials provided with the distribution.
                     19:  *
                     20:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     21:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     22:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     23:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     24:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     25:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     26:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     27:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     28:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     29:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     30:  * POSSIBILITY OF SUCH DAMAGE.
                     31:  */
1.32      cgd        32:
1.29      cgd        33: /*
1.30      mycroft    34:  * Copyright (c) 1989, 1993
                     35:  *     The Regents of the University of California.  All rights reserved.
1.29      cgd        36:  * (c) UNIX System Laboratories, Inc.
                     37:  * All or some portions of this file are derived from material licensed
                     38:  * to the University of California by American Telephone and Telegraph
                     39:  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
                     40:  * the permission of UNIX System Laboratories, Inc.
                     41:  *
                     42:  * Redistribution and use in source and binary forms, with or without
                     43:  * modification, are permitted provided that the following conditions
                     44:  * are met:
                     45:  * 1. Redistributions of source code must retain the above copyright
                     46:  *    notice, this list of conditions and the following disclaimer.
                     47:  * 2. Redistributions in binary form must reproduce the above copyright
                     48:  *    notice, this list of conditions and the following disclaimer in the
                     49:  *    documentation and/or other materials provided with the distribution.
1.204     agc        50:  * 3. Neither the name of the University nor the names of its contributors
1.29      cgd        51:  *    may be used to endorse or promote products derived from this software
                     52:  *    without specific prior written permission.
                     53:  *
                     54:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
                     55:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     56:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     57:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
                     58:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     59:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     60:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     61:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     62:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     63:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     64:  * SUCH DAMAGE.
                     65:  *
1.32      cgd        66:  *     @(#)vfs_subr.c  8.13 (Berkeley) 4/18/94
1.29      cgd        67:  */
                     68:
                     69: /*
1.296     pooka      70:  * External virtual filesystem routines.
                     71:  *
                      72:  * This file contains vfs subroutines which are heavily dependent on
                      73:  * the kernel and are not suitable for standalone use.  Examples include
                      74:  * routines involved in vnode and mountpoint management.
1.29      cgd        75:  */
1.162     lukem      76:
                     77: #include <sys/cdefs.h>
1.336.2.1! yamt       78: __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.336 2008/04/04 20:13:18 cegger Exp $");
1.78      mrg        79:
1.125     chs        80: #include "opt_ddb.h"
1.95      thorpej    81: #include "opt_compat_netbsd.h"
1.97      christos   82: #include "opt_compat_43.h"
1.29      cgd        83:
                     84: #include <sys/param.h>
1.30      mycroft    85: #include <sys/systm.h>
1.29      cgd        86: #include <sys/proc.h>
1.138     bouyer     87: #include <sys/kernel.h>
1.29      cgd        88: #include <sys/mount.h>
1.46      mycroft    89: #include <sys/fcntl.h>
1.29      cgd        90: #include <sys/vnode.h>
1.30      mycroft    91: #include <sys/stat.h>
1.29      cgd        92: #include <sys/namei.h>
                     93: #include <sys/ucred.h>
                     94: #include <sys/buf.h>
                     95: #include <sys/errno.h>
                     96: #include <sys/malloc.h>
1.51      christos   97: #include <sys/syscallargs.h>
1.58      thorpej    98: #include <sys/device.h>
1.192     christos   99: #include <sys/filedesc.h>
1.266     elad      100: #include <sys/kauth.h>
1.307     ad        101: #include <sys/atomic.h>
1.309     ad        102: #include <sys/kthread.h>
1.50      christos  103:
1.30      mycroft   104: #include <miscfs/specfs/specdev.h>
1.113     fvdl      105: #include <miscfs/syncfs/syncfs.h>
1.30      mycroft   106:
1.125     chs       107: #include <uvm/uvm.h>
1.255     yamt      108: #include <uvm/uvm_readahead.h>
1.125     chs       109: #include <uvm/uvm_ddb.h>
1.129     mrg       110:
                    111: #include <sys/sysctl.h>
1.77      mrg       112:
1.117     fvdl      113: extern int dovfsusermount;     /* 1 => permit any user to mount filesystems */
1.263     chs       114: extern int vfs_magiclinks;     /* 1 => expand "magic" symlinks */
1.117     fvdl      115:
1.309     ad        116: static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
                    117: static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
                    118: static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);
                    119:
                    120: static int vrele_pending;
                    121: static kmutex_t        vrele_lock;
                    122: static kcondvar_t vrele_cv;
                    123: static lwp_t *vrele_lwp;
1.113     fvdl      124:
1.309     ad        125: static pool_cache_t vnode_cache;
1.186     thorpej   126:
                    127: MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
1.93      thorpej   128:
1.89      kleink    129: /*
                    130:  * Local declarations.
                    131:  */
1.276     hannken   132:
1.309     ad        133: static void vrele_thread(void *);
                    134: static void insmntque(vnode_t *, struct mount *);
                    135: static int getdevvp(dev_t, vnode_t **, enum vtype);
                     136: static vnode_t *getcleanvnode(void);
                    137: void vpanic(vnode_t *, const char *);
                    138:
                    139: #ifdef DIAGNOSTIC
                    140: void
                    141: vpanic(vnode_t *vp, const char *msg)
                    142: {
                    143:
                    144:        vprint(NULL, vp);
                    145:        panic("%s\n", msg);
                    146: }
                    147: #else
                    148: #define        vpanic(vp, msg) /* nothing */
                    149: #endif
                    150:
                    151: void
                    152: vn_init1(void)
                    153: {
                    154:
                    155:        vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
                    156:            NULL, IPL_NONE, NULL, NULL, NULL);
                    157:        KASSERT(vnode_cache != NULL);
                    158:
                    159:        /* Create deferred release thread. */
                    160:        mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
                    161:        cv_init(&vrele_cv, "vrele");
                    162:        if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
                    163:            NULL, &vrele_lwp, "vrele"))
                    164:                panic("fork vrele");
                    165: }
1.51      christos  166:
1.202     yamt      167: int
1.256     christos  168: vfs_drainvnodes(long target, struct lwp *l)
1.202     yamt      169: {
                    170:
                    171:        while (numvnodes > target) {
1.309     ad        172:                vnode_t *vp;
1.202     yamt      173:
1.309     ad        174:                mutex_enter(&vnode_free_list_lock);
                    175:                vp = getcleanvnode();
1.202     yamt      176:                if (vp == NULL)
                    177:                        return EBUSY; /* give up */
1.309     ad        178:                ungetnewvnode(vp);
1.202     yamt      179:        }
                    180:
                    181:        return 0;
                    182: }
                    183:
                    184: /*
                    185:  * grab a vnode from freelist and clean it.
                    186:  */
1.309     ad        187: vnode_t *
                    188: getcleanvnode(void)
1.202     yamt      189: {
1.309     ad        190:        vnode_t *vp;
                    191:        vnodelst_t *listhd;
1.202     yamt      192:
1.309     ad        193:        KASSERT(mutex_owned(&vnode_free_list_lock));
1.229     yamt      194:
1.309     ad        195: retry:
1.229     yamt      196:        listhd = &vnode_free_list;
                    197: try_nextlist:
                    198:        TAILQ_FOREACH(vp, listhd, v_freelist) {
1.309     ad        199:                /*
                    200:                 * It's safe to test v_usecount and v_iflag
                    201:                 * without holding the interlock here, since
                     202:                 * vnodes in those states should never appear
                     203:                 * on the free lists.
                    204:                 */
                    205:                if (vp->v_usecount != 0) {
                    206:                        vpanic(vp, "free vnode isn't");
                    207:                }
                    208:                if ((vp->v_iflag & VI_CLEAN) != 0) {
                    209:                        vpanic(vp, "clean vnode on freelist");
                    210:                }
                    211:                if (vp->v_freelisthd != listhd) {
                    212:                        printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd);
                    213:                        vpanic(vp, "list head mismatch");
                    214:                }
                    215:                if (!mutex_tryenter(&vp->v_interlock))
1.208     hannken   216:                        continue;
1.227     yamt      217:                /*
1.309     ad        218:                 * Our lwp might hold the underlying vnode
                    219:                 * locked, so don't try to reclaim a VI_LAYER
                    220:                 * node if it's locked.
1.227     yamt      221:                 */
1.302     ad        222:                if ((vp->v_iflag & VI_XLOCK) == 0 &&
                    223:                    ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
1.285     hannken   224:                        break;
1.202     yamt      225:                }
1.309     ad        226:                mutex_exit(&vp->v_interlock);
1.202     yamt      227:        }
                    228:
1.309     ad        229:        if (vp == NULL) {
1.229     yamt      230:                if (listhd == &vnode_free_list) {
                    231:                        listhd = &vnode_hold_list;
                    232:                        goto try_nextlist;
                    233:                }
1.309     ad        234:                mutex_exit(&vnode_free_list_lock);
                    235:                return NULL;
1.202     yamt      236:        }
                    237:
1.309     ad        238:        /* Remove it from the freelist. */
1.202     yamt      239:        TAILQ_REMOVE(listhd, vp, v_freelist);
1.309     ad        240:        vp->v_freelisthd = NULL;
                    241:        mutex_exit(&vnode_free_list_lock);
                    242:
                    243:        /*
                    244:         * The vnode is still associated with a file system, so we must
                    245:         * clean it out before reusing it.  We need to add a reference
                    246:         * before doing this.  If the vnode gains another reference while
                    247:         * being cleaned out then we lose - retry.
                    248:         */
                    249:        vp->v_usecount++;
                    250:        vclean(vp, DOCLOSE);
                    251:        if (vp->v_usecount == 1) {
                    252:                /* We're about to dirty it. */
                    253:                vp->v_iflag &= ~VI_CLEAN;
                    254:                mutex_exit(&vp->v_interlock);
1.318     ad        255:                if (vp->v_type == VBLK || vp->v_type == VCHR) {
                    256:                        spec_node_destroy(vp);
                    257:                }
                    258:                vp->v_type = VNON;
1.309     ad        259:        } else {
                    260:                /*
                    261:                 * Don't return to freelist - the holder of the last
                    262:                 * reference will destroy it.
                    263:                 */
1.315     ad        264:                KASSERT(vp->v_usecount > 1);
1.309     ad        265:                vp->v_usecount--;
                    266:                mutex_exit(&vp->v_interlock);
                    267:                mutex_enter(&vnode_free_list_lock);
                    268:                goto retry;
                    269:        }
                    270:
                    271:        if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
                    272:            !TAILQ_EMPTY(&vp->v_uobj.memq)) {
                    273:                vpanic(vp, "cleaned vnode isn't");
                    274:        }
                    275:        if (vp->v_numoutput != 0) {
                    276:                vpanic(vp, "clean vnode has pending I/O's");
                    277:        }
                    278:        if ((vp->v_iflag & VI_ONWORKLST) != 0) {
                    279:                vpanic(vp, "clean vnode on syncer list");
                    280:        }
1.202     yamt      281:
                    282:        return vp;
                    283: }
                    284:
1.29      cgd       285: /*
1.327     ad        286:  * Mark a mount point as busy, and gain a new reference to it.  Used to
1.336.2.1! yamt      287:  * prevent the file system from being unmounted during critical sections.
1.327     ad        288:  *
1.336.2.1! yamt      289:  * => The caller must hold a pre-existing reference to the mount.
        !           290:  * => Will fail if the file system is being unmounted, or is unmounted.
1.327     ad        291:  */
                    292: int
1.336.2.1! yamt      293: vfs_busy(struct mount *mp, struct mount **nextp)
1.327     ad        294: {
                    295:
                    296:        KASSERT(mp->mnt_refcnt > 0);
                    297:
1.336.2.1! yamt      298:        if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) {
        !           299:                if (nextp != NULL) {
        !           300:                        KASSERT(mutex_owned(&mountlist_lock));
        !           301:                        *nextp = CIRCLEQ_NEXT(mp, mnt_list);
        !           302:                }
        !           303:                return EBUSY;
        !           304:        }
        !           305:        if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
        !           306:                rw_exit(&mp->mnt_unmounting);
        !           307:                if (nextp != NULL) {
        !           308:                        KASSERT(mutex_owned(&mountlist_lock));
        !           309:                        *nextp = CIRCLEQ_NEXT(mp, mnt_list);
1.327     ad        310:                }
                    311:                return ENOENT;
                    312:        }
1.336.2.1! yamt      313:        if (nextp != NULL) {
        !           314:                mutex_exit(&mountlist_lock);
1.327     ad        315:        }
                    316:        atomic_inc_uint(&mp->mnt_refcnt);
                    317:        return 0;
1.29      cgd       318: }
                    319:
                    320: /*
1.336.2.1! yamt      321:  * Unbusy a busy filesystem.
        !           322:  *
        !           323:  * => If keepref is true, preserve reference added by vfs_busy().
        !           324:  * => If nextp != NULL, acquire mountlist_lock.
1.29      cgd       325:  */
                    326: void
1.336.2.1! yamt      327: vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
1.29      cgd       328: {
                    329:
1.327     ad        330:        KASSERT(mp->mnt_refcnt > 0);
                    331:
1.336.2.1! yamt      332:        if (nextp != NULL) {
        !           333:                mutex_enter(&mountlist_lock);
1.327     ad        334:        }
1.336.2.1! yamt      335:        rw_exit(&mp->mnt_unmounting);
1.327     ad        336:        if (!keepref) {
                    337:                vfs_destroy(mp);
                    338:        }
1.336.2.1! yamt      339:        if (nextp != NULL) {
        !           340:                KASSERT(mutex_owned(&mountlist_lock));
        !           341:                *nextp = CIRCLEQ_NEXT(mp, mnt_list);
        !           342:        }
1.29      cgd       343: }
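
/*
 * Editor's note: a minimal sketch of how vfs_busy()/vfs_unbusy() are
 * typically paired when walking the mount list.  The CIRCLEQ-based
 * "mountlist" and the local variable names are assumptions for
 * illustration only; error handling is deliberately simplified.
 */
#if 0
	struct mount *mp, *nmp;

	mutex_enter(&mountlist_lock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	     mp = nmp) {
		if (vfs_busy(mp, &nmp) != 0) {
			/* Skipped: *nextp was set, mountlist_lock still held. */
			continue;
		}
		/* mountlist_lock was dropped; operate on the busied mount. */
		/* ... per-mount work goes here ... */
		vfs_unbusy(mp, false, &nmp);	/* re-takes mountlist_lock */
	}
	mutex_exit(&mountlist_lock);
#endif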
                    344:
                    345: /*
1.80      fvdl      346:  * Look up a filesystem type, and if found allocate and initialize
                    347:  * a mount structure for it.
                    348:  *
                    349:  * Devname is usually updated by mount(8) after booting.
1.29      cgd       350:  */
1.50      christos  351: int
1.247     thorpej   352: vfs_rootmountalloc(const char *fstypename, const char *devname,
                    353:     struct mount **mpp)
1.29      cgd       354: {
1.80      fvdl      355:        struct vfsops *vfsp = NULL;
                    356:        struct mount *mp;
1.29      cgd       357:
1.309     ad        358:        mutex_enter(&vfs_list_lock);
1.152     jdolecek  359:        LIST_FOREACH(vfsp, &vfs_list, vfs_list)
1.291     christos  360:                if (!strncmp(vfsp->vfs_name, fstypename,
                    361:                    sizeof(mp->mnt_stat.f_fstypename)))
1.80      fvdl      362:                        break;
1.315     ad        363:        if (vfsp == NULL) {
                    364:                mutex_exit(&vfs_list_lock);
1.80      fvdl      365:                return (ENODEV);
1.315     ad        366:        }
1.309     ad        367:        vfsp->vfs_refcount++;
                    368:        mutex_exit(&vfs_list_lock);
                    369:
1.327     ad        370:        mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
                    371:        if (mp == NULL)
                    372:                return ENOMEM;
                    373:        mp->mnt_refcnt = 1;
1.336.2.1! yamt      374:        rw_init(&mp->mnt_unmounting);
        !           375:        mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
1.331     skrll     376:        mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
1.336.2.1! yamt      377:        (void)vfs_busy(mp, NULL);
1.272     reinoud   378:        TAILQ_INIT(&mp->mnt_vnodelist);
1.80      fvdl      379:        mp->mnt_op = vfsp;
                    380:        mp->mnt_flag = MNT_RDONLY;
1.309     ad        381:        mp->mnt_vnodecovered = NULL;
1.291     christos  382:        (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
                    383:            sizeof(mp->mnt_stat.f_fstypename));
1.80      fvdl      384:        mp->mnt_stat.f_mntonname[0] = '/';
1.314     pooka     385:        mp->mnt_stat.f_mntonname[1] = '\0';
1.291     christos  386:        mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
                    387:            '\0';
                    388:        (void)copystr(devname, mp->mnt_stat.f_mntfromname,
                    389:            sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
1.276     hannken   390:        mount_initspecific(mp);
1.80      fvdl      391:        *mpp = mp;
1.29      cgd       392:        return (0);
                    393: }
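
/*
 * Editor's note: an illustrative call, assuming a root-mount helper;
 * the file system name and device string here are placeholders.  On
 * success the mount is returned busied (vfs_busy() was called above)
 * and must eventually be unbusied or destroyed by the caller.
 */
#if 0
	struct mount *mp;
	int error;

	error = vfs_rootmountalloc("ffs", "root_device", &mp);
	if (error != 0)
		return error;
	/* ... VFS_MOUNT()/VFS_STATVFS() the root file system ... */
#endif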
                    394:
1.30      mycroft   395: /*
                    396:  * Routines having to do with the management of the vnode table.
                    397:  */
1.217     junyoung  398: extern int (**dead_vnodeop_p)(void *);
1.30      mycroft   399:
1.29      cgd       400: /*
                    401:  * Return the next vnode from the free list.
                    402:  */
1.50      christos  403: int
1.247     thorpej   404: getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
1.309     ad        405:            vnode_t **vpp)
1.29      cgd       406: {
1.142     chs       407:        struct uvm_object *uobj;
1.113     fvdl      408:        static int toggle;
1.309     ad        409:        vnode_t *vp;
1.153     thorpej   410:        int error = 0, tryalloc;
1.158     chs       411:
1.159     enami     412:  try_again:
1.327     ad        413:        if (mp != NULL) {
1.103     sommerfe  414:                /*
1.327     ad        415:                 * Mark filesystem busy while we're creating a
                    416:                 * vnode.  If unmount is in progress, this will
1.336.2.1! yamt      417:                 * fail.
1.103     sommerfe  418:                 */
1.336.2.1! yamt      419:                error = vfs_busy(mp, NULL);
1.327     ad        420:                if (error)
1.103     sommerfe  421:                        return error;
                    422:        }
1.29      cgd       423:
1.113     fvdl      424:        /*
                    425:         * We must choose whether to allocate a new vnode or recycle an
                    426:         * existing one. The criterion for allocating a new one is that
                    427:         * the total number of vnodes is less than the number desired or
                    428:         * there are no vnodes on either free list. Generally we only
                    429:         * want to recycle vnodes that have no buffers associated with
                    430:         * them, so we look first on the vnode_free_list. If it is empty,
                    431:         * we next consider vnodes with referencing buffers on the
                    432:         * vnode_hold_list. The toggle ensures that half the time we
                     433:         * will recycle a vnode from the vnode_hold_list, and half the time
                     434:         * we will allocate a new one unless the list has grown to twice
                     435:         * the desired size. We are reluctant to recycle vnodes from the
                    436:         * vnode_hold_list because we will lose the identity of all its
                    437:         * referencing buffers.
                    438:         */
1.142     chs       439:
1.153     thorpej   440:        vp = NULL;
                    441:
1.309     ad        442:        mutex_enter(&vnode_free_list_lock);
1.153     thorpej   443:
1.113     fvdl      444:        toggle ^= 1;
                    445:        if (numvnodes > 2 * desiredvnodes)
                    446:                toggle = 0;
                    447:
1.153     thorpej   448:        tryalloc = numvnodes < desiredvnodes ||
1.159     enami     449:            (TAILQ_FIRST(&vnode_free_list) == NULL &&
                    450:             (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
1.153     thorpej   451:
1.309     ad        452:        if (tryalloc) {
1.206     yamt      453:                numvnodes++;
1.309     ad        454:                mutex_exit(&vnode_free_list_lock);
1.310     pooka     455:                if ((vp = vnalloc(NULL)) == NULL) {
1.309     ad        456:                        mutex_enter(&vnode_free_list_lock);
                    457:                        numvnodes--;
                    458:                } else
                    459:                        vp->v_usecount = 1;
                    460:        }
                    461:
                    462:        if (vp == NULL) {
                    463:                vp = getcleanvnode();
                    464:                if (vp == NULL) {
1.327     ad        465:                        if (mp != NULL) {
1.336.2.1! yamt      466:                                vfs_unbusy(mp, false, NULL);
1.327     ad        467:                        }
1.153     thorpej   468:                        if (tryalloc) {
                    469:                                printf("WARNING: unable to allocate new "
                    470:                                    "vnode, retrying...\n");
                    471:                                (void) tsleep(&lbolt, PRIBIO, "newvn", hz);
                    472:                                goto try_again;
                    473:                        }
1.132     jdolecek  474:                        tablefull("vnode", "increase kern.maxvnodes or NVNODE");
1.29      cgd       475:                        *vpp = 0;
                    476:                        return (ENFILE);
                    477:                }
1.302     ad        478:                vp->v_iflag = 0;
                    479:                vp->v_vflag = 0;
                    480:                vp->v_uflag = 0;
1.158     chs       481:                vp->v_socket = NULL;
1.29      cgd       482:        }
1.309     ad        483:
                    484:        KASSERT(vp->v_usecount == 1);
                    485:        KASSERT(vp->v_freelisthd == NULL);
                    486:        KASSERT(LIST_EMPTY(&vp->v_nclist));
                    487:        KASSERT(LIST_EMPTY(&vp->v_dnclist));
                    488:
1.29      cgd       489:        vp->v_type = VNON;
1.104     wrstuden  490:        vp->v_vnlock = &vp->v_lock;
1.29      cgd       491:        vp->v_tag = tag;
                    492:        vp->v_op = vops;
                    493:        insmntque(vp, mp);
1.30      mycroft   494:        *vpp = vp;
                    495:        vp->v_data = 0;
1.142     chs       496:
                    497:        /*
                    498:         * initialize uvm_object within vnode.
                    499:         */
                    500:
1.158     chs       501:        uobj = &vp->v_uobj;
                    502:        KASSERT(uobj->pgops == &uvm_vnodeops);
                    503:        KASSERT(uobj->uo_npages == 0);
                    504:        KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
1.288     yamt      505:        vp->v_size = vp->v_writesize = VSIZENOTSET;
1.142     chs       506:
1.309     ad        507:        if (mp != NULL) {
                    508:                if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
                    509:                        vp->v_vflag |= VV_MPSAFE;
1.336.2.1! yamt      510:                vfs_unbusy(mp, true, NULL);
1.309     ad        511:        }
                    512:
1.29      cgd       513:        return (0);
1.130     fvdl      514: }
                    515:
                    516: /*
                    517:  * This is really just the reverse of getnewvnode(). Needed for
                     518:  * VFS_VGET functions that may need to push back a vnode in case
                    519:  * of a locking race.
                    520:  */
                    521: void
1.309     ad        522: ungetnewvnode(vnode_t *vp)
                    523: {
                    524:
                    525:        KASSERT(vp->v_usecount == 1);
                    526:        KASSERT(vp->v_data == NULL);
                    527:        KASSERT(vp->v_freelisthd == NULL);
                    528:
                    529:        mutex_enter(&vp->v_interlock);
                    530:        vp->v_iflag |= VI_CLEAN;
1.324     pooka     531:        vrelel(vp, 0);
1.309     ad        532: }
                    533:
                    534: /*
                    535:  * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
                    536:  * marker vnode and we are prepared to wait for the allocation.
                    537:  */
                    538: vnode_t *
1.310     pooka     539: vnalloc(struct mount *mp)
1.130     fvdl      540: {
1.309     ad        541:        vnode_t *vp;
                    542:
                    543:        vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
                    544:        if (vp == NULL) {
                    545:                return NULL;
                    546:        }
                    547:
                    548:        memset(vp, 0, sizeof(*vp));
                    549:        UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
                    550:        cv_init(&vp->v_cv, "vnode");
                    551:        /*
                    552:         * done by memset() above.
                    553:         *      LIST_INIT(&vp->v_nclist);
                    554:         *      LIST_INIT(&vp->v_dnclist);
                    555:         */
                    556:
                    557:        if (mp != NULL) {
                    558:                vp->v_mount = mp;
                    559:                vp->v_type = VBAD;
                    560:                vp->v_iflag = VI_MARKER;
                    561:        } else {
1.326     ad        562:                rw_init(&vp->v_lock.vl_lock);
1.309     ad        563:        }
                    564:
                    565:        return vp;
                    566: }
                    567:
                    568: /*
                    569:  * Free an unused, unreferenced vnode.
                    570:  */
                    571: void
1.310     pooka     572: vnfree(vnode_t *vp)
1.309     ad        573: {
                    574:
                    575:        KASSERT(vp->v_usecount == 0);
                    576:
                    577:        if ((vp->v_iflag & VI_MARKER) == 0) {
1.326     ad        578:                rw_destroy(&vp->v_lock.vl_lock);
1.309     ad        579:                mutex_enter(&vnode_free_list_lock);
                    580:                numvnodes--;
                    581:                mutex_exit(&vnode_free_list_lock);
                    582:        }
                    583:
                    584:        UVM_OBJ_DESTROY(&vp->v_uobj);
                    585:        cv_destroy(&vp->v_cv);
                    586:        pool_cache_put(vnode_cache, vp);
                    587: }
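
/*
 * Editor's note: a minimal sketch of the marker-vnode use hinted at
 * above: vnalloc() with a non-NULL mount yields a VI_MARKER vnode that
 * can be threaded onto mnt_vnodelist to remember a position while the
 * list is unlocked.  The insertion/removal details are elided and the
 * variable names are illustrative only.
 */
#if 0
	vnode_t *mvp;

	mvp = vnalloc(mp);		/* marker vnode: may sleep */
	mutex_enter(&mntvnode_lock);
	/*
	 * ... insert mvp after the current vnode on mp->mnt_vnodelist,
	 * drop the lock to do blocking work, re-take the lock, resume
	 * from TAILQ_NEXT(mvp, v_mntvnodes), then remove mvp ...
	 */
	mutex_exit(&mntvnode_lock);
	vnfree(mvp);
#endif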
                    588:
                    589: /*
                    590:  * Remove a vnode from its freelist.
                    591:  */
                    592: static inline void
                    593: vremfree(vnode_t *vp)
                    594: {
                    595:
                    596:        KASSERT(mutex_owned(&vp->v_interlock));
                    597:        KASSERT(vp->v_usecount == 0);
1.130     fvdl      598:
1.217     junyoung  599:        /*
1.309     ad        600:         * Note that the reference count must not change until
                    601:         * the vnode is removed.
1.130     fvdl      602:         */
1.309     ad        603:        mutex_enter(&vnode_free_list_lock);
                    604:        if (vp->v_holdcnt > 0) {
                    605:                KASSERT(vp->v_freelisthd == &vnode_hold_list);
                    606:        } else {
                    607:                KASSERT(vp->v_freelisthd == &vnode_free_list);
                    608:        }
                    609:        TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
                    610:        vp->v_freelisthd = NULL;
                    611:        mutex_exit(&vnode_free_list_lock);
1.29      cgd       612: }
                    613:
                    614: /*
                    615:  * Move a vnode from one mount queue to another.
                    616:  */
1.260     yamt      617: static void
1.309     ad        618: insmntque(vnode_t *vp, struct mount *mp)
1.29      cgd       619: {
1.327     ad        620:        struct mount *omp;
1.29      cgd       621:
1.103     sommerfe  622: #ifdef DIAGNOSTIC
                    623:        if ((mp != NULL) &&
1.207     dbj       624:            (mp->mnt_iflag & IMNT_UNMOUNT) &&
1.113     fvdl      625:            !(mp->mnt_flag & MNT_SOFTDEP) &&
                    626:            vp->v_tag != VT_VFS) {
1.103     sommerfe  627:                panic("insmntque into dying filesystem");
                    628:        }
                    629: #endif
1.217     junyoung  630:
1.309     ad        631:        mutex_enter(&mntvnode_lock);
1.29      cgd       632:        /*
                    633:         * Delete from old mount point vnode list, if on one.
                    634:         */
1.327     ad        635:        if ((omp = vp->v_mount) != NULL)
1.272     reinoud   636:                TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
1.29      cgd       637:        /*
1.327     ad        638:         * Insert into list of vnodes for the new mount point, if
                    639:         * available.  The caller must take a reference on the mount
                    640:         * structure and donate to the vnode.
1.29      cgd       641:         */
1.279     pooka     642:        if ((vp->v_mount = mp) != NULL)
                    643:                TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
1.309     ad        644:        mutex_exit(&mntvnode_lock);
1.327     ad        645:
                    646:        if (omp != NULL) {
                    647:                /* Release reference to old mount. */
                    648:                vfs_destroy(omp);
                    649:        }
1.29      cgd       650: }
                    651:
                    652: /*
                    653:  * Create a vnode for a block device.
1.59      thorpej   654:  * Used for root filesystem and swap areas.
1.29      cgd       655:  * Also used for memory file system special devices.
                    656:  */
1.50      christos  657: int
1.309     ad        658: bdevvp(dev_t dev, vnode_t **vpp)
1.29      cgd       659: {
1.30      mycroft   660:
                    661:        return (getdevvp(dev, vpp, VBLK));
1.29      cgd       662: }
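
/*
 * Editor's note: a hedged sketch of the classic bdevvp() use during
 * root file system setup.  The "rootdev" and "rootvp" globals are
 * assumed from elsewhere in the kernel and are not defined in this
 * file.
 */
#if 0
	if (bdevvp(rootdev, &rootvp) != 0)
		panic("cannot set up root block device vnode");
#endif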
                    663:
                    664: /*
                    665:  * Create a vnode for a character device.
                    666:  * Used for kernfs and some console handling.
                    667:  */
1.50      christos  668: int
1.309     ad        669: cdevvp(dev_t dev, vnode_t **vpp)
1.29      cgd       670: {
1.30      mycroft   671:
                    672:        return (getdevvp(dev, vpp, VCHR));
1.29      cgd       673: }
                    674:
                    675: /*
                    676:  * Create a vnode for a device.
                    677:  * Used by bdevvp (block device) for root file system etc.,
                    678:  * and by cdevvp (character device) for console and kernfs.
                    679:  */
1.260     yamt      680: static int
1.309     ad        681: getdevvp(dev_t dev, vnode_t **vpp, enum vtype type)
1.29      cgd       682: {
1.309     ad        683:        vnode_t *vp;
                    684:        vnode_t *nvp;
1.29      cgd       685:        int error;
                    686:
1.80      fvdl      687:        if (dev == NODEV) {
1.302     ad        688:                *vpp = NULL;
1.29      cgd       689:                return (0);
1.80      fvdl      690:        }
1.50      christos  691:        error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
1.29      cgd       692:        if (error) {
1.302     ad        693:                *vpp = NULL;
1.29      cgd       694:                return (error);
                    695:        }
                    696:        vp = nvp;
                    697:        vp->v_type = type;
1.309     ad        698:        vp->v_vflag |= VV_MPSAFE;
1.297     pooka     699:        uvm_vnp_setsize(vp, 0);
1.318     ad        700:        spec_node_init(vp, dev);
1.29      cgd       701:        *vpp = vp;
                    702:        return (0);
                    703: }
                    704:
                    705: /*
                    706:  * Grab a particular vnode from the free list, increment its
1.83      fvdl      707:  * reference count and lock it. If the vnode lock bit is set, the
                     708:  * vnode is being eliminated in vgone. In that case, we cannot
                    709:  * grab the vnode, so the process is awakened when the transition is
                    710:  * completed, and an error returned to indicate that the vnode is no
                    711:  * longer usable (possibly having been changed to a new file system type).
1.29      cgd       712:  */
1.30      mycroft   713: int
1.309     ad        714: vget(vnode_t *vp, int flags)
1.29      cgd       715: {
1.175     perseant  716:        int error;
1.29      cgd       717:
1.309     ad        718:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                    719:
                    720:        if ((flags & LK_INTERLOCK) == 0)
                    721:                mutex_enter(&vp->v_interlock);
                    722:
                    723:        /*
                    724:         * Before adding a reference, we must remove the vnode
                    725:         * from its freelist.
                    726:         */
                    727:        if (vp->v_usecount == 0) {
                    728:                vremfree(vp);
                    729:        }
                    730:        if (++vp->v_usecount == 0) {
                    731:                vpanic(vp, "vget: usecount overflow");
                    732:        }
                    733:
1.30      mycroft   734:        /*
                    735:         * If the vnode is in the process of being cleaned out for
                    736:         * another use, we wait for the cleaning to finish and then
1.312     ad        737:         * return failure.  Cleaning is determined by checking if
                    738:         * the VI_XLOCK or VI_FREEING flags are set.
1.80      fvdl      739:         */
1.312     ad        740:        if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
1.313     ad        741:                if ((flags & LK_NOWAIT) != 0) {
1.324     pooka     742:                        vrelel(vp, 0);
1.142     chs       743:                        return EBUSY;
                    744:                }
1.312     ad        745:                vwait(vp, VI_XLOCK | VI_FREEING);
1.324     pooka     746:                vrelel(vp, 0);
1.313     ad        747:                return ENOENT;
1.29      cgd       748:        }
1.80      fvdl      749:        if (flags & LK_TYPE_MASK) {
1.313     ad        750:                error = vn_lock(vp, flags | LK_INTERLOCK);
                    751:                if (error != 0) {
1.257     yamt      752:                        vrele(vp);
1.113     fvdl      753:                }
1.313     ad        754:                return error;
1.80      fvdl      755:        }
1.309     ad        756:        mutex_exit(&vp->v_interlock);
1.313     ad        757:        return 0;
1.29      cgd       758: }
                    759:
                    760: /*
                    761:  * vput(), just unlock and vrele()
                    762:  */
                    763: void
1.309     ad        764: vput(vnode_t *vp)
1.29      cgd       765: {
1.30      mycroft   766:
1.309     ad        767:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                    768:
                    769:        VOP_UNLOCK(vp, 0);
                    770:        vrele(vp);
1.29      cgd       771: }
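
/*
 * Editor's note: a small sketch, assuming a caller that already holds
 * a pointer to vp with the interlock held, showing the usual pairing
 * of vget() with vput(): vget() takes the reference and the vnode
 * lock, vput() unlocks and releases in one call.
 */
#if 0
	mutex_enter(&vp->v_interlock);
	error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK);
	if (error != 0)
		return error;	/* vnode was being recycled */
	/* ... operate on the locked, referenced vnode ... */
	vput(vp);
#endif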
                    772:
                    773: /*
1.309     ad        774:  * Vnode release.  If reference count drops to zero, call inactive
                    775:  * routine and either return to freelist or free to the pool.
1.29      cgd       776:  */
1.309     ad        777: void
1.324     pooka     778: vrelel(vnode_t *vp, int flags)
1.29      cgd       779: {
1.309     ad        780:        bool recycle, defer;
                    781:        int error;
                    782:
                    783:        KASSERT(mutex_owned(&vp->v_interlock));
                    784:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
1.315     ad        785:        KASSERT(vp->v_freelisthd == NULL);
1.29      cgd       786:
1.309     ad        787:        if (vp->v_op == dead_vnodeop_p && (vp->v_iflag & VI_CLEAN) == 0) {
                    788:                vpanic(vp, "dead but not clean");
                    789:        }
                    790:
                    791:        /*
                    792:         * If not the last reference, just drop the reference count
                    793:         * and unlock.
                    794:         */
                    795:        if (vp->v_usecount > 1) {
                    796:                vp->v_usecount--;
                    797:                vp->v_iflag |= VI_INACTREDO;
                    798:                mutex_exit(&vp->v_interlock);
1.29      cgd       799:                return;
1.80      fvdl      800:        }
1.309     ad        801:        if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
                    802:                vpanic(vp, "vput: bad ref count");
1.29      cgd       803:        }
1.309     ad        804:
1.30      mycroft   805:        /*
1.309     ad        806:         * If not clean, deactivate the vnode, but preserve
                    807:         * our reference across the call to VOP_INACTIVE().
1.30      mycroft   808:         */
1.309     ad        809:  retry:
                    810:        if ((vp->v_iflag & VI_CLEAN) == 0) {
                    811:                recycle = false;
                    812:                /*
                    813:                 * XXX This ugly block can be largely eliminated if
                    814:                 * locking is pushed down into the file systems.
                    815:                 */
                    816:                if (curlwp == uvm.pagedaemon_lwp) {
                    817:                        /* The pagedaemon can't wait around; defer. */
                    818:                        defer = true;
                    819:                } else if (curlwp == vrele_lwp) {
                    820:                        /* We have to try harder. */
                    821:                        vp->v_iflag &= ~VI_INACTREDO;
                    822:                        error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
                    823:                            LK_RETRY);
                    824:                        if (error != 0) {
                    825:                                /* XXX */
                     826:                                vpanic(vp, "vrele: unable to lock vnode");
                    827:                        }
                    828:                        defer = false;
                    829:                } else if ((vp->v_iflag & VI_LAYER) != 0) {
                    830:                        /*
                    831:                         * Acquiring the stack's lock in vclean() even
                    832:                         * for an honest vput/vrele is dangerous because
                    833:                         * our caller may hold other vnode locks; defer.
                    834:                         */
                    835:                        defer = true;
                    836:                } else {
                    837:                        /* If we can't acquire the lock, then defer. */
                    838:                        vp->v_iflag &= ~VI_INACTREDO;
                    839:                        error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
                    840:                            LK_NOWAIT);
                    841:                        if (error != 0) {
                    842:                                defer = true;
                    843:                                mutex_enter(&vp->v_interlock);
                    844:                        } else {
                    845:                                defer = false;
                    846:                        }
                    847:                }
                    848:
                    849:                if (defer) {
                    850:                        /*
                    851:                         * Defer reclaim to the kthread; it's not safe to
                    852:                         * clean it here.  We donate it our last reference.
                    853:                         */
                    854:                        KASSERT(mutex_owned(&vp->v_interlock));
                    855:                        KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
                    856:                        vp->v_iflag |= VI_INACTPEND;
                    857:                        mutex_enter(&vrele_lock);
                    858:                        TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
                    859:                        if (++vrele_pending > (desiredvnodes >> 8))
                    860:                                cv_signal(&vrele_cv);
                    861:                        mutex_exit(&vrele_lock);
                    862:                        mutex_exit(&vp->v_interlock);
                    863:                        return;
                    864:                }
                    865:
1.318     ad        866: #ifdef DIAGNOSTIC
1.321     ad        867:                if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
                    868:                    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
1.318     ad        869:                        vprint("vrelel: missing VOP_CLOSE()", vp);
                    870:                }
                    871: #endif
                    872:
1.309     ad        873:                /*
1.312     ad        874:                 * The vnode can gain another reference while being
                    875:                 * deactivated.  If VOP_INACTIVE() indicates that
                    876:                 * the described file has been deleted, then recycle
                    877:                 * the vnode irrespective of additional references.
                    878:                 * Another thread may be waiting to re-use the on-disk
                    879:                 * inode.
                    880:                 *
                    881:                 * Note that VOP_INACTIVE() will drop the vnode lock.
1.309     ad        882:                 */
                    883:                VOP_INACTIVE(vp, &recycle);
                    884:                mutex_enter(&vp->v_interlock);
1.312     ad        885:                if (!recycle) {
                    886:                        if (vp->v_usecount > 1) {
                    887:                                vp->v_usecount--;
                    888:                                mutex_exit(&vp->v_interlock);
                    889:                                return;
                    890:                        }
1.309     ad        891:
1.312     ad        892:                        /*
                    893:                         * If we grew another reference while
                    894:                         * VOP_INACTIVE() was underway, retry.
                    895:                         */
                    896:                        if ((vp->v_iflag & VI_INACTREDO) != 0) {
                    897:                                goto retry;
                    898:                        }
1.309     ad        899:                }
                    900:
                    901:                /* Take care of space accounting. */
                    902:                if (vp->v_iflag & VI_EXECMAP) {
                    903:                        atomic_add_int(&uvmexp.execpages,
                    904:                            -vp->v_uobj.uo_npages);
                    905:                        atomic_add_int(&uvmexp.filepages,
                    906:                            vp->v_uobj.uo_npages);
                    907:                }
                    908:                vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
                    909:                vp->v_vflag &= ~VV_MAPPED;
                    910:
                    911:                /*
                    912:                 * Recycle the vnode if the file is now unused (unlinked),
                    913:                 * otherwise just free it.
                    914:                 */
                    915:                if (recycle) {
                    916:                        vclean(vp, DOCLOSE);
                    917:                }
                    918:                KASSERT(vp->v_usecount > 0);
1.298     pooka     919:        }
1.309     ad        920:
                    921:        if (--vp->v_usecount != 0) {
                    922:                /* Gained another reference while being reclaimed. */
                    923:                mutex_exit(&vp->v_interlock);
                    924:                return;
1.147     chs       925:        }
1.298     pooka     926:
1.309     ad        927:        if ((vp->v_iflag & VI_CLEAN) != 0) {
                    928:                /*
                    929:                 * It's clean so destroy it.  It isn't referenced
                    930:                 * anywhere since it has been reclaimed.
                    931:                 */
                    932:                KASSERT(vp->v_holdcnt == 0);
                    933:                KASSERT(vp->v_writecount == 0);
                    934:                mutex_exit(&vp->v_interlock);
                    935:                insmntque(vp, NULL);
1.318     ad        936:                if (vp->v_type == VBLK || vp->v_type == VCHR) {
                    937:                        spec_node_destroy(vp);
                    938:                }
1.310     pooka     939:                vnfree(vp);
1.298     pooka     940:        } else {
1.309     ad        941:                /*
                    942:                 * Otherwise, put it back onto the freelist.  It
                    943:                 * can't be destroyed while still associated with
                    944:                 * a file system.
                    945:                 */
                    946:                mutex_enter(&vnode_free_list_lock);
                    947:                if (vp->v_holdcnt > 0) {
                    948:                        vp->v_freelisthd = &vnode_hold_list;
                    949:                } else {
                    950:                        vp->v_freelisthd = &vnode_free_list;
                    951:                }
                    952:                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                    953:                mutex_exit(&vnode_free_list_lock);
                    954:                mutex_exit(&vp->v_interlock);
1.298     pooka     955:        }
                    956: }
                    957:
                    958: void
1.309     ad        959: vrele(vnode_t *vp)
1.298     pooka     960: {
                    961:
1.309     ad        962:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                    963:
                    964:        mutex_enter(&vp->v_interlock);
1.324     pooka     965:        vrelel(vp, 0);
1.298     pooka     966: }
                    967:
1.309     ad        968: static void
                    969: vrele_thread(void *cookie)
1.298     pooka     970: {
1.309     ad        971:        vnode_t *vp;
1.298     pooka     972:
1.309     ad        973:        for (;;) {
                    974:                mutex_enter(&vrele_lock);
                    975:                while (TAILQ_EMPTY(&vrele_list)) {
                    976:                        cv_timedwait(&vrele_cv, &vrele_lock, hz);
                    977:                }
                    978:                vp = TAILQ_FIRST(&vrele_list);
                    979:                TAILQ_REMOVE(&vrele_list, vp, v_freelist);
                    980:                vrele_pending--;
                    981:                mutex_exit(&vrele_lock);
                    982:
                    983:                /*
                    984:                 * If not the last reference, then ignore the vnode
                    985:                 * and look for more work.
                    986:                 */
                    987:                mutex_enter(&vp->v_interlock);
                    988:                KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
                    989:                vp->v_iflag &= ~VI_INACTPEND;
                    990:                if (vp->v_usecount > 1) {
                    991:                        vp->v_usecount--;
                    992:                        mutex_exit(&vp->v_interlock);
                    993:                        continue;
                    994:                }
1.324     pooka     995:                vrelel(vp, 0);
1.309     ad        996:        }
1.29      cgd       997: }
                    998:
                    999: /*
                   1000:  * Page or buffer structure gets a reference.
1.258     chs      1001:  * Called with v_interlock held.
1.29      cgd      1002:  */
1.30      mycroft  1003: void
1.309     ad       1004: vholdl(vnode_t *vp)
1.29      cgd      1005: {
                   1006:
1.309     ad       1007:        KASSERT(mutex_owned(&vp->v_interlock));
                   1008:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                   1009:
                   1010:        if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
                   1011:                mutex_enter(&vnode_free_list_lock);
                   1012:                KASSERT(vp->v_freelisthd == &vnode_free_list);
                   1013:                TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
                   1014:                vp->v_freelisthd = &vnode_hold_list;
                   1015:                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                   1016:                mutex_exit(&vnode_free_list_lock);
1.113     fvdl     1017:        }
1.29      cgd      1018: }
                   1019:
                   1020: /*
                   1021:  * Page or buffer structure frees a reference.
1.258     chs      1022:  * Called with v_interlock held.
1.29      cgd      1023:  */
1.30      mycroft  1024: void
1.309     ad       1025: holdrelel(vnode_t *vp)
1.29      cgd      1026: {
                   1027:
1.309     ad       1028:        KASSERT(mutex_owned(&vp->v_interlock));
                   1029:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
1.142     chs      1030:
1.309     ad       1031:        if (vp->v_holdcnt <= 0) {
                   1032:                vpanic(vp, "holdrelel: holdcnt vp %p");
                   1033:        }
1.142     chs      1034:
1.309     ad       1035:        vp->v_holdcnt--;
                   1036:        if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
                   1037:                mutex_enter(&vnode_free_list_lock);
                   1038:                KASSERT(vp->v_freelisthd == &vnode_hold_list);
                   1039:                TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
                   1040:                vp->v_freelisthd = &vnode_free_list;
                   1041:                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                   1042:                mutex_exit(&vnode_free_list_lock);
1.113     fvdl     1043:        }
1.81      ross     1044: }
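
An illustrative sketch (not part of this file; the helper name is made up) of how a subsystem pairs vholdl() and holdrelel() around page or buffer activity, taking v_interlock as both routines require:

static void
example_hold_for_io(vnode_t *vp)
{

	/* Take a hold reference while pages or buffers reference vp. */
	mutex_enter(&vp->v_interlock);
	vholdl(vp);
	mutex_exit(&vp->v_interlock);

	/* ... start and wait for the I/O ... */

	/* Drop the hold; vp may move back to vnode_free_list. */
	mutex_enter(&vp->v_interlock);
	holdrelel(vp);
	mutex_exit(&vp->v_interlock);
}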
                   1045:
                   1046: /*
1.309     ad       1047:  * Vnode reference, where a reference is already held by some other
                   1048:  * object (for example, a file structure).
1.81      ross     1049:  */
                   1050: void
1.309     ad       1051: vref(vnode_t *vp)
1.81      ross     1052: {
                   1053:
1.309     ad       1054:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                   1055:
                   1056:        mutex_enter(&vp->v_interlock);
                   1057:        if (vp->v_usecount <= 0) {
                   1058:                vpanic(vp, "vref used where vget required");
                   1059:        }
                   1060:        if (++vp->v_usecount == 0) {
                   1061:                vpanic(vp, "vref: usecount overflow");
1.112     mycroft  1062:        }
1.309     ad       1063:        mutex_exit(&vp->v_interlock);
1.29      cgd      1064: }
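
A hedged usage sketch (the function name is illustrative): vref() is only legal when the caller already owns a reference, for example one recorded in a file structure; a first reference to a vnode found in a cache or hash must instead come from vget(), as vdevgone() later in this file does with LK_INTERLOCK.

static void
example_duplicate_ref(vnode_t *vp)
{

	/*
	 * The caller already holds a reference, so the vnode cannot
	 * be reclaimed under us and vref() is safe.
	 */
	vref(vp);

	/* ... hand the extra reference to some other consumer ... */

	vrele(vp);
}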
                   1065:
                   1066: /*
                   1067:  * Remove any vnodes in the vnode table belonging to mount point mp.
                   1068:  *
1.183     yamt     1069:  * If FORCECLOSE is not specified, there should not be any active vnodes;
1.29      cgd      1070:  * return an error if any are found (nb: this is a user error, not a
1.183     yamt     1071:  * system error). If FORCECLOSE is specified, detach any active vnodes
1.29      cgd      1072:  * that are found.
1.183     yamt     1073:  *
                   1074:  * If WRITECLOSE is set, only flush out regular file vnodes open for
                   1075:  * writing.
                   1076:  *
                   1077:  * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
1.29      cgd      1078:  */
1.30      mycroft  1079: #ifdef DEBUG
                   1080: int busyprt = 0;       /* print out busy vnodes */
                   1081: struct ctldebug debug1 = { "busyprt", &busyprt };
                   1082: #endif
1.29      cgd      1083:
1.334     ad       1084: static vnode_t *
                   1085: vflushnext(vnode_t *mvp, int *when)
                   1086: {
                   1087:
                   1088:        if (hardclock_ticks > *when) {
                   1089:                mutex_exit(&mntvnode_lock);
                   1090:                yield();
                   1091:                mutex_enter(&mntvnode_lock);
                   1092:                *when = hardclock_ticks + hz / 10;
                   1093:        }
                   1094:
                   1095:        return vunmark(mvp);
                   1096: }
                   1097:
1.50      christos 1098: int
1.309     ad       1099: vflush(struct mount *mp, vnode_t *skipvp, int flags)
1.29      cgd      1100: {
1.309     ad       1101:        vnode_t *vp, *mvp;
1.334     ad       1102:        int busy = 0, when = 0;
1.29      cgd      1103:
1.309     ad       1104:        /* Allocate a marker vnode. */
1.310     pooka    1105:        if ((mvp = vnalloc(mp)) == NULL)
1.309     ad       1106:                return (ENOMEM);
                   1107:
                   1108:        mutex_enter(&mntvnode_lock);
1.273     reinoud  1109:        /*
                    1110:         * NOTE: not using TAILQ_FOREACH() here, since vclean() and
                    1111:         * vrele() are called within the loop and can modify the list.
                   1112:         */
1.334     ad       1113:        for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
                   1114:            vp = vflushnext(mvp, &when)) {
1.309     ad       1115:                vmark(mvp, vp);
                   1116:                if (vp->v_mount != mp || vismarker(vp))
                   1117:                        continue;
1.29      cgd      1118:                /*
                   1119:                 * Skip over a selected vnode.
                   1120:                 */
                   1121:                if (vp == skipvp)
                   1122:                        continue;
1.309     ad       1123:                mutex_enter(&vp->v_interlock);
1.29      cgd      1124:                /*
1.315     ad       1125:                 * Ignore clean but still referenced vnodes.
                   1126:                 */
                   1127:                if ((vp->v_iflag & VI_CLEAN) != 0) {
                   1128:                        mutex_exit(&vp->v_interlock);
                   1129:                        continue;
                   1130:                }
                   1131:                /*
1.309     ad       1132:                 * Skip over vnodes marked VV_SYSTEM.
1.29      cgd      1133:                 */
1.302     ad       1134:                if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
1.309     ad       1135:                        mutex_exit(&vp->v_interlock);
1.29      cgd      1136:                        continue;
1.80      fvdl     1137:                }
1.29      cgd      1138:                /*
1.30      mycroft  1139:                 * If WRITECLOSE is set, only flush out regular file
                   1140:                 * vnodes open for writing.
                   1141:                 */
                   1142:                if ((flags & WRITECLOSE) &&
1.92      thorpej  1143:                    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1.309     ad       1144:                        mutex_exit(&vp->v_interlock);
1.30      mycroft  1145:                        continue;
1.92      thorpej  1146:                }
1.30      mycroft  1147:                /*
1.29      cgd      1148:                 * With v_usecount == 0, all we need to do is clear
                   1149:                 * out the vnode data structures and we are done.
                   1150:                 */
                   1151:                if (vp->v_usecount == 0) {
1.309     ad       1152:                        mutex_exit(&mntvnode_lock);
                   1153:                        vremfree(vp);
                   1154:                        vp->v_usecount++;
                   1155:                        vclean(vp, DOCLOSE);
1.324     pooka    1156:                        vrelel(vp, 0);
1.309     ad       1157:                        mutex_enter(&mntvnode_lock);
1.29      cgd      1158:                        continue;
                   1159:                }
                   1160:                /*
1.30      mycroft  1161:                 * If FORCECLOSE is set, forcibly close the vnode.
1.29      cgd      1162:                 * For block or character devices, revert to an
1.318     ad       1163:                 * anonymous device.  For all other files, just
                   1164:                 * kill them.
1.29      cgd      1165:                 */
                   1166:                if (flags & FORCECLOSE) {
1.309     ad       1167:                        mutex_exit(&mntvnode_lock);
                   1168:                        vp->v_usecount++;
1.29      cgd      1169:                        if (vp->v_type != VBLK && vp->v_type != VCHR) {
1.309     ad       1170:                                vclean(vp, DOCLOSE);
1.324     pooka    1171:                                vrelel(vp, 0);
1.29      cgd      1172:                        } else {
1.309     ad       1173:                                vclean(vp, 0);
1.318     ad       1174:                                vp->v_op = spec_vnodeop_p; /* XXXSMP */
1.320     ad       1175:                                mutex_exit(&vp->v_interlock);
                   1176:                                /*
                   1177:                                 * The vnode isn't clean, but still resides
                   1178:                                 * on the mount list.  Remove it. XXX This
                   1179:                                 * is a bit dodgy.
                   1180:                                 */
                   1181:                                insmntque(vp, NULL);
                   1182:                                vrele(vp);
1.29      cgd      1183:                        }
1.309     ad       1184:                        mutex_enter(&mntvnode_lock);
1.29      cgd      1185:                        continue;
                   1186:                }
1.30      mycroft  1187: #ifdef DEBUG
1.29      cgd      1188:                if (busyprt)
                   1189:                        vprint("vflush: busy vnode", vp);
1.30      mycroft  1190: #endif
1.309     ad       1191:                mutex_exit(&vp->v_interlock);
1.29      cgd      1192:                busy++;
                   1193:        }
1.309     ad       1194:        mutex_exit(&mntvnode_lock);
1.310     pooka    1195:        vnfree(mvp);
1.29      cgd      1196:        if (busy)
                   1197:                return (EBUSY);
                   1198:        return (0);
                   1199: }
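
A sketch of how a file system's unmount path typically drives vflush(); the function and its flag handling are illustrative only, not taken from this file:

static int
example_fs_unmount(struct mount *mp, int mntflags)
{
	int flags = 0, error;

	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;

	/* Detach every vnode belonging to this mount. */
	error = vflush(mp, NULL, flags);
	if (error != 0)
		return error;		/* typically EBUSY */

	/* ... tear down file-system private state ... */
	return 0;
}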
                   1200:
                   1201: /*
                   1202:  * Disassociate the underlying file system from a vnode.
1.309     ad       1203:  *
                   1204:  * Must be called with the interlock held, and will return with it held.
1.29      cgd      1205:  */
1.309     ad       1206: void
                   1207: vclean(vnode_t *vp, int flags)
1.29      cgd      1208: {
1.309     ad       1209:        lwp_t *l = curlwp;
                   1210:        bool recycle, active;
1.318     ad       1211:        int error;
1.29      cgd      1212:
1.309     ad       1213:        KASSERT(mutex_owned(&vp->v_interlock));
                   1214:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                   1215:        KASSERT(vp->v_usecount != 0);
1.166     chs      1216:
1.309     ad       1217:        /* If cleaning is already in progress wait until done and return. */
                   1218:        if (vp->v_iflag & VI_XLOCK) {
                   1219:                vwait(vp, VI_XLOCK);
                   1220:                return;
                   1221:        }
1.166     chs      1222:
1.309     ad       1223:        /* If already clean, nothing to do. */
                   1224:        if ((vp->v_iflag & VI_CLEAN) != 0) {
                   1225:                return;
1.112     mycroft  1226:        }
1.87      pk       1227:
1.29      cgd      1228:        /*
1.309     ad       1229:         * Prevent the vnode from being recycled or brought into use
                   1230:         * while we clean it out.
1.29      cgd      1231:         */
1.302     ad       1232:        vp->v_iflag |= VI_XLOCK;
                   1233:        if (vp->v_iflag & VI_EXECMAP) {
1.307     ad       1234:                atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
                   1235:                atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1.147     chs      1236:        }
1.302     ad       1237:        vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1.309     ad       1238:        active = (vp->v_usecount > 1);
1.142     chs      1239:
1.309     ad       1240:        /* XXXAD should not lock vnode under layer */
                   1241:        VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);
1.80      fvdl     1242:
1.98      wrstuden 1243:        /*
1.142     chs      1244:         * Clean out any cached data associated with the vnode.
1.318     ad       1245:         * If purging an active vnode, it must be closed and
                    1246:         * deactivated before being reclaimed. Note that
                    1247:         * VOP_INACTIVE() will unlock the vnode.
1.29      cgd      1248:         */
1.166     chs      1249:        if (flags & DOCLOSE) {
1.256     christos 1250:                error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1.318     ad       1251:                if (error != 0)
1.256     christos 1252:                        error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1.211     dbj      1253:                KASSERT(error == 0);
1.302     ad       1254:                KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1.318     ad       1255:                if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
                   1256:                         spec_node_revoke(vp);
1.231     mycroft  1257:                }
1.166     chs      1258:        }
1.29      cgd      1259:        if (active) {
1.309     ad       1260:                VOP_INACTIVE(vp, &recycle);
1.80      fvdl     1261:        } else {
                   1262:                /*
                   1263:                 * Any other processes trying to obtain this lock must first
1.302     ad       1264:                 * wait for VI_XLOCK to clear, then call the new lock operation.
1.80      fvdl     1265:                 */
                   1266:                VOP_UNLOCK(vp, 0);
1.29      cgd      1267:        }
1.142     chs      1268:
1.309     ad       1269:        /* Disassociate the underlying file system from the vnode. */
                   1270:        if (VOP_RECLAIM(vp)) {
                   1271:                vpanic(vp, "vclean: cannot reclaim");
1.87      pk       1272:        }
1.30      mycroft  1273:
1.169     chs      1274:        KASSERT(vp->v_uobj.uo_npages == 0);
1.255     yamt     1275:        if (vp->v_type == VREG && vp->v_ractx != NULL) {
                   1276:                uvm_ra_freectx(vp->v_ractx);
                   1277:                vp->v_ractx = NULL;
                   1278:        }
1.80      fvdl     1279:        cache_purge(vp);
                   1280:
1.309     ad       1281:        /* Done with purge, notify sleepers of the grim news. */
1.30      mycroft  1282:        vp->v_op = dead_vnodeop_p;
                   1283:        vp->v_tag = VT_NON;
1.309     ad       1284:        mutex_enter(&vp->v_interlock);
                   1285:        vp->v_vnlock = &vp->v_lock;
1.332     ad       1286:        KNOTE(&vp->v_klist, NOTE_REVOKE);
1.312     ad       1287:        vp->v_iflag &= ~(VI_XLOCK | VI_FREEING);
1.304     ad       1288:        vp->v_vflag &= ~VV_LOCKSWORK;
1.319     ad       1289:        if ((flags & DOCLOSE) != 0) {
1.318     ad       1290:                vp->v_iflag |= VI_CLEAN;
                   1291:        }
1.309     ad       1292:        cv_broadcast(&vp->v_cv);
                   1293:
                   1294:        KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1.29      cgd      1295: }
                   1296:
                   1297: /*
1.80      fvdl     1298:  * Recycle an unused vnode to the front of the free list.
                   1299:  * Release the passed interlock if the vnode will be recycled.
1.29      cgd      1300:  */
1.80      fvdl     1301: int
1.309     ad       1302: vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
1.217     junyoung 1303: {
                   1304:
1.309     ad       1305:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                   1306:
                   1307:        mutex_enter(&vp->v_interlock);
                   1308:        if (vp->v_usecount != 0) {
                   1309:                mutex_exit(&vp->v_interlock);
                   1310:                return (0);
1.29      cgd      1311:        }
1.309     ad       1312:        if (inter_lkp)
                   1313:                mutex_exit(inter_lkp);
                   1314:        vremfree(vp);
                   1315:        vp->v_usecount++;
                   1316:        vclean(vp, DOCLOSE);
1.324     pooka    1317:        vrelel(vp, 0);
1.309     ad       1318:        return (1);
1.29      cgd      1319: }
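
A minimal sketch (assuming a hypothetical cache_lock owned by the caller) of the usual vrecycle() pattern, where a file system tries to reclaim an unused vnode found in its inode cache:

static bool
example_try_reclaim(vnode_t *vp, kmutex_t *cache_lock)
{

	/*
	 * If vp has no users, vrecycle() releases *cache_lock, cleans
	 * the vnode and returns non-zero; otherwise it returns zero
	 * and the lock remains held.
	 */
	if (vrecycle(vp, cache_lock, curlwp) != 0) {
		mutex_enter(cache_lock);
		return true;
	}
	return false;
}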
                   1320:
                   1321: /*
1.309     ad       1322:  * Eliminate all activity associated with a vnode in preparation for
                   1323:  * reuse.  Drops a reference from the vnode.
1.29      cgd      1324:  */
                   1325: void
1.309     ad       1326: vgone(vnode_t *vp)
1.80      fvdl     1327: {
1.166     chs      1328:
1.309     ad       1329:        mutex_enter(&vp->v_interlock);
                   1330:        vclean(vp, DOCLOSE);
1.324     pooka    1331:        vrelel(vp, 0);
1.29      cgd      1332: }
                   1333:
                   1334: /*
                   1335:  * Lookup a vnode by device number.
                   1336:  */
1.50      christos 1337: int
1.309     ad       1338: vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
1.29      cgd      1339: {
1.309     ad       1340:        vnode_t *vp;
1.80      fvdl     1341:        int rc = 0;
1.29      cgd      1342:
1.318     ad       1343:        mutex_enter(&specfs_lock);
                   1344:        for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1.29      cgd      1345:                if (dev != vp->v_rdev || type != vp->v_type)
                   1346:                        continue;
                   1347:                *vpp = vp;
1.80      fvdl     1348:                rc = 1;
                   1349:                break;
1.29      cgd      1350:        }
1.318     ad       1351:        mutex_exit(&specfs_lock);
1.80      fvdl     1352:        return (rc);
1.96      thorpej  1353: }
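
An illustrative caller (hypothetical helper name) of vfinddev(); note that no reference is taken on the vnode returned through *vpp, so the answer is advisory only:

static bool
example_device_has_vnode(dev_t dev)
{
	vnode_t *vp;

	return vfinddev(dev, VBLK, &vp) != 0;
}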
                   1354:
                   1355: /*
                   1356:  * Revoke all the vnodes corresponding to the specified minor number
                   1357:  * range (endpoints inclusive) of the specified major.
                   1358:  */
                   1359: void
1.247     thorpej  1360: vdevgone(int maj, int minl, int minh, enum vtype type)
1.96      thorpej  1361: {
1.316     ad       1362:        vnode_t *vp, **vpp;
                   1363:        dev_t dev;
1.96      thorpej  1364:        int mn;
                   1365:
1.274     mrg      1366:        vp = NULL;      /* XXX gcc */
                   1367:
1.318     ad       1368:        mutex_enter(&specfs_lock);
1.316     ad       1369:        for (mn = minl; mn <= minh; mn++) {
                   1370:                dev = makedev(maj, mn);
1.318     ad       1371:                vpp = &specfs_hash[SPECHASH(dev)];
1.316     ad       1372:                for (vp = *vpp; vp != NULL;) {
                   1373:                        mutex_enter(&vp->v_interlock);
                   1374:                        if ((vp->v_iflag & VI_CLEAN) != 0 ||
                   1375:                            dev != vp->v_rdev || type != vp->v_type) {
                   1376:                                mutex_exit(&vp->v_interlock);
                   1377:                                vp = vp->v_specnext;
                   1378:                                continue;
                   1379:                        }
1.318     ad       1380:                        mutex_exit(&specfs_lock);
1.316     ad       1381:                        if (vget(vp, LK_INTERLOCK) == 0) {
                   1382:                                VOP_REVOKE(vp, REVOKEALL);
                   1383:                                vrele(vp);
                   1384:                        }
1.318     ad       1385:                        mutex_enter(&specfs_lock);
1.316     ad       1386:                        vp = *vpp;
                   1387:                }
                   1388:        }
1.318     ad       1389:        mutex_exit(&specfs_lock);
1.29      cgd      1390: }
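
A sketch of the conventional use of vdevgone() from a driver's detach routine, revoking any vnodes that still name the departing unit (bmaj, cmaj and unit are assumptions, not from this file):

static void
example_detach(int bmaj, int cmaj, int unit)
{

	/* Revoke block and character device nodes for this unit. */
	vdevgone(bmaj, unit, unit, VBLK);
	vdevgone(cmaj, unit, unit, VCHR);
}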
                   1391:
                   1392: /*
                   1393:  * Calculate the total number of references to a special device.
                   1394:  */
1.30      mycroft  1395: int
1.309     ad       1396: vcount(vnode_t *vp)
1.29      cgd      1397: {
                   1398:        int count;
                   1399:
1.318     ad       1400:        mutex_enter(&specfs_lock);
1.309     ad       1401:        mutex_enter(&vp->v_interlock);
1.318     ad       1402:        if (vp->v_specnode == NULL) {
1.309     ad       1403:                count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0);
                   1404:                mutex_exit(&vp->v_interlock);
1.318     ad       1405:                mutex_exit(&specfs_lock);
1.309     ad       1406:                return (count);
                   1407:        }
                   1408:        mutex_exit(&vp->v_interlock);
1.318     ad       1409:        count = vp->v_specnode->sn_dev->sd_opencnt;
                   1410:        mutex_exit(&specfs_lock);
1.29      cgd      1411:        return (count);
                   1412: }
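
A hedged example of the classic "last close" test on a special vnode, using vcount() as defined above (the helper name is made up):

static bool
example_is_last_close(vnode_t *vp)
{

	/*
	 * For devices, vcount() reports the open count across all
	 * aliases, so a value of 1 means this is the final close.
	 */
	return vcount(vp) <= 1;
}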
                   1413:
1.101     mrg      1414: /*
1.316     ad       1415:  * Eliminate all activity associated with the requested vnode
                   1416:  * and with all vnodes aliased to the requested vnode.
                   1417:  */
                   1418: void
                   1419: vrevoke(vnode_t *vp)
                   1420: {
                   1421:        vnode_t *vq, **vpp;
                   1422:        enum vtype type;
                   1423:        dev_t dev;
                   1424:
                   1425:        KASSERT(vp->v_usecount > 0);
                   1426:
                   1427:        mutex_enter(&vp->v_interlock);
                   1428:        if ((vp->v_iflag & VI_CLEAN) != 0) {
                   1429:                mutex_exit(&vp->v_interlock);
                   1430:                return;
                   1431:        } else {
                   1432:                dev = vp->v_rdev;
                   1433:                type = vp->v_type;
                   1434:                mutex_exit(&vp->v_interlock);
                   1435:        }
                   1436:
1.318     ad       1437:        vpp = &specfs_hash[SPECHASH(dev)];
                   1438:        mutex_enter(&specfs_lock);
1.316     ad       1439:        for (vq = *vpp; vq != NULL;) {
1.333     ad       1440:                /* If clean or being cleaned, then ignore it. */
                   1441:                mutex_enter(&vq->v_interlock);
                   1442:                if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
1.317     ad       1443:                    vq->v_rdev != dev || vq->v_type != type) {
1.333     ad       1444:                        mutex_exit(&vq->v_interlock);
1.316     ad       1445:                        vq = vq->v_specnext;
                   1446:                        continue;
                   1447:                }
1.318     ad       1448:                mutex_exit(&specfs_lock);
                   1449:                if (vq->v_usecount == 0) {
1.317     ad       1450:                        vremfree(vq);
1.316     ad       1451:                }
1.318     ad       1452:                vq->v_usecount++;
1.316     ad       1453:                vclean(vq, DOCLOSE);
1.324     pooka    1454:                vrelel(vq, 0);
1.318     ad       1455:                mutex_enter(&specfs_lock);
1.316     ad       1456:                vq = *vpp;
                   1457:        }
1.318     ad       1458:        mutex_exit(&specfs_lock);
1.316     ad       1459: }
                   1460:
                   1461: /*
1.220     lukem    1462:  * sysctl helper routine to return list of supported fstypes
                   1463:  */
                   1464: static int
                   1465: sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
                   1466: {
1.291     christos 1467:        char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
1.220     lukem    1468:        char *where = oldp;
                   1469:        struct vfsops *v;
                   1470:        size_t needed, left, slen;
                   1471:        int error, first;
                   1472:
                   1473:        if (newp != NULL)
                   1474:                return (EPERM);
                   1475:        if (namelen != 0)
                   1476:                return (EINVAL);
                   1477:
                   1478:        first = 1;
                   1479:        error = 0;
                   1480:        needed = 0;
                   1481:        left = *oldlenp;
                   1482:
1.311     ad       1483:        sysctl_unlock();
1.302     ad       1484:        mutex_enter(&vfs_list_lock);
1.220     lukem    1485:        LIST_FOREACH(v, &vfs_list, vfs_list) {
                   1486:                if (where == NULL)
                   1487:                        needed += strlen(v->vfs_name) + 1;
                   1488:                else {
1.245     christos 1489:                        memset(bf, 0, sizeof(bf));
1.220     lukem    1490:                        if (first) {
1.245     christos 1491:                                strncpy(bf, v->vfs_name, sizeof(bf));
1.220     lukem    1492:                                first = 0;
                   1493:                        } else {
1.245     christos 1494:                                bf[0] = ' ';
                   1495:                                strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
1.220     lukem    1496:                        }
1.245     christos 1497:                        bf[sizeof(bf)-1] = '\0';
                   1498:                        slen = strlen(bf);
1.220     lukem    1499:                        if (left < slen + 1)
                   1500:                                break;
                   1501:                        /* +1 to copy out the trailing NUL byte */
1.302     ad       1502:                        v->vfs_refcount++;
                   1503:                        mutex_exit(&vfs_list_lock);
1.245     christos 1504:                        error = copyout(bf, where, slen + 1);
1.302     ad       1505:                        mutex_enter(&vfs_list_lock);
                   1506:                        v->vfs_refcount--;
1.220     lukem    1507:                        if (error)
                   1508:                                break;
                   1509:                        where += slen;
                   1510:                        needed += slen;
                   1511:                        left -= slen;
                   1512:                }
                   1513:        }
1.302     ad       1514:        mutex_exit(&vfs_list_lock);
1.311     ad       1515:        sysctl_relock();
1.220     lukem    1516:        *oldlenp = needed;
                   1517:        return (error);
                   1518: }
                   1519:
                   1520: /*
1.80      fvdl     1521:  * Top level filesystem related information gathering.
                   1522:  */
1.212     atatat   1523: SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
1.80      fvdl     1524: {
1.218     atatat   1525:        sysctl_createv(clog, 0, NULL, NULL,
                   1526:                       CTLFLAG_PERMANENT,
1.212     atatat   1527:                       CTLTYPE_NODE, "vfs", NULL,
                   1528:                       NULL, 0, NULL, 0,
                   1529:                       CTL_VFS, CTL_EOL);
1.218     atatat   1530:        sysctl_createv(clog, 0, NULL, NULL,
                   1531:                       CTLFLAG_PERMANENT,
1.226     atatat   1532:                       CTLTYPE_NODE, "generic",
                   1533:                       SYSCTL_DESCR("Non-specific vfs related information"),
1.212     atatat   1534:                       NULL, 0, NULL, 0,
                   1535:                       CTL_VFS, VFS_GENERIC, CTL_EOL);
1.218     atatat   1536:        sysctl_createv(clog, 0, NULL, NULL,
                   1537:                       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1.226     atatat   1538:                       CTLTYPE_INT, "usermount",
                   1539:                       SYSCTL_DESCR("Whether unprivileged users may mount "
                   1540:                                    "filesystems"),
1.212     atatat   1541:                       NULL, 0, &dovfsusermount, 0,
                   1542:                       CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
1.220     lukem    1543:        sysctl_createv(clog, 0, NULL, NULL,
                   1544:                       CTLFLAG_PERMANENT,
                   1545:                       CTLTYPE_STRING, "fstypes",
                   1546:                       SYSCTL_DESCR("List of file systems present"),
                   1547:                       sysctl_vfs_generic_fstypes, 0, NULL, 0,
                   1548:                       CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
1.263     chs      1549:        sysctl_createv(clog, 0, NULL, NULL,
                   1550:                       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
                   1551:                       CTLTYPE_INT, "magiclinks",
                   1552:                       SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
                   1553:                       NULL, 0, &vfs_magiclinks, 0,
                   1554:                       CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
1.80      fvdl     1555: }
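
From userland, the "vfs.generic.fstypes" node registered above can be read with sysctlbyname(3); a minimal sketch with abbreviated error handling:

#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t len;
	char *buf;

	/* First call sizes the buffer, second call fetches the list. */
	if (sysctlbyname("vfs.generic.fstypes", NULL, &len, NULL, 0) == -1)
		return 1;
	if ((buf = malloc(len)) == NULL)
		return 1;
	if (sysctlbyname("vfs.generic.fstypes", buf, &len, NULL, 0) == -1)
		return 1;
	printf("%s\n", buf);
	free(buf);
	return 0;
}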
                   1556:
1.212     atatat   1557:
1.29      cgd      1558: int kinfo_vdebug = 1;
                   1559: int kinfo_vgetfailed;
                   1560: #define KINFO_VNODESLOP        10
                   1561: /*
                   1562:  * Dump vnode list (via sysctl).
                   1563:  * Copyout address of vnode followed by vnode.
                   1564:  */
                   1565: /* ARGSUSED */
1.50      christos 1566: int
1.212     atatat   1567: sysctl_kern_vnode(SYSCTLFN_ARGS)
1.29      cgd      1568: {
1.212     atatat   1569:        char *where = oldp;
                   1570:        size_t *sizep = oldlenp;
1.80      fvdl     1571:        struct mount *mp, *nmp;
1.311     ad       1572:        vnode_t *vp, *mvp, vbuf;
1.80      fvdl     1573:        char *bp = where, *savebp;
1.29      cgd      1574:        char *ewhere;
                   1575:        int error;
1.212     atatat   1576:
                   1577:        if (namelen != 0)
                   1578:                return (EOPNOTSUPP);
                   1579:        if (newp != NULL)
                   1580:                return (EPERM);
1.29      cgd      1581:
1.309     ad       1582: #define VPTRSZ sizeof(vnode_t *)
                   1583: #define VNODESZ        sizeof(vnode_t)
1.29      cgd      1584:        if (where == NULL) {
                   1585:                *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
                   1586:                return (0);
                   1587:        }
                   1588:        ewhere = where + *sizep;
1.80      fvdl     1589:
1.311     ad       1590:        sysctl_unlock();
1.302     ad       1591:        mutex_enter(&mountlist_lock);
1.177     matt     1592:        for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
                   1593:             mp = nmp) {
1.336.2.1! yamt     1594:                if (vfs_busy(mp, &nmp)) {
1.29      cgd      1595:                        continue;
1.80      fvdl     1596:                }
1.29      cgd      1597:                savebp = bp;
1.309     ad       1598:                /* Allocate a marker vnode. */
1.311     ad       1599:                if ((mvp = vnalloc(mp)) == NULL) {
                   1600:                        sysctl_relock();
1.309     ad       1601:                        return (ENOMEM);
1.311     ad       1602:                }
1.309     ad       1603:                mutex_enter(&mntvnode_lock);
                   1604:                for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
                   1605:                        vmark(mvp, vp);
1.29      cgd      1606:                        /*
                   1607:                         * Check that the vp is still associated with
                   1608:                         * this filesystem.  RACE: could have been
                   1609:                         * recycled onto the same filesystem.
                   1610:                         */
1.309     ad       1611:                        if (vp->v_mount != mp || vismarker(vp))
                   1612:                                continue;
1.29      cgd      1613:                        if (bp + VPTRSZ + VNODESZ > ewhere) {
1.309     ad       1614:                                (void)vunmark(mvp);
                   1615:                                mutex_exit(&mntvnode_lock);
1.310     pooka    1616:                                vnfree(mvp);
1.311     ad       1617:                                sysctl_relock();
1.29      cgd      1618:                                *sizep = bp - where;
                   1619:                                return (ENOMEM);
                   1620:                        }
1.311     ad       1621:                        memcpy(&vbuf, vp, VNODESZ);
1.309     ad       1622:                        mutex_exit(&mntvnode_lock);
1.311     ad       1623:                        if ((error = copyout(vp, bp, VPTRSZ)) ||
                   1624:                           (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
1.309     ad       1625:                                mutex_enter(&mntvnode_lock);
                   1626:                                (void)vunmark(mvp);
                   1627:                                mutex_exit(&mntvnode_lock);
1.310     pooka    1628:                                vnfree(mvp);
1.311     ad       1629:                                sysctl_relock();
1.29      cgd      1630:                                return (error);
1.309     ad       1631:                        }
1.29      cgd      1632:                        bp += VPTRSZ + VNODESZ;
1.309     ad       1633:                        mutex_enter(&mntvnode_lock);
1.29      cgd      1634:                }
1.309     ad       1635:                mutex_exit(&mntvnode_lock);
1.310     pooka    1636:                vnfree(mvp);
1.336.2.1! yamt     1637:                vfs_unbusy(mp, false, &nmp);
1.29      cgd      1638:        }
1.302     ad       1639:        mutex_exit(&mountlist_lock);
1.311     ad       1640:        sysctl_relock();
1.29      cgd      1641:
                   1642:        *sizep = bp - where;
                   1643:        return (0);
1.30      mycroft  1644: }
                   1645:
                   1646: /*
1.309     ad       1647:  * Remove clean vnodes from a mountpoint's vnode list.
                   1648:  */
                   1649: void
                   1650: vfs_scrubvnlist(struct mount *mp)
                   1651: {
                   1652:        vnode_t *vp, *nvp;
                   1653:
1.327     ad       1654:  retry:
1.309     ad       1655:        mutex_enter(&mntvnode_lock);
                   1656:        for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
                   1657:                nvp = TAILQ_NEXT(vp, v_mntvnodes);
                   1658:                mutex_enter(&vp->v_interlock);
1.315     ad       1659:                if ((vp->v_iflag & VI_CLEAN) != 0) {
1.309     ad       1660:                        TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes);
1.315     ad       1661:                        vp->v_mount = NULL;
1.327     ad       1662:                        mutex_exit(&mntvnode_lock);
                   1663:                        mutex_exit(&vp->v_interlock);
                   1664:                        vfs_destroy(mp);
                   1665:                        goto retry;
1.315     ad       1666:                }
1.309     ad       1667:                mutex_exit(&vp->v_interlock);
                   1668:        }
                   1669:        mutex_exit(&mntvnode_lock);
                   1670: }
                   1671:
                   1672: /*
1.30      mycroft  1673:  * Check to see if a filesystem is mounted on a block device.
                   1674:  */
                   1675: int
1.309     ad       1676: vfs_mountedon(vnode_t *vp)
1.30      mycroft  1677: {
1.309     ad       1678:        vnode_t *vq;
1.80      fvdl     1679:        int error = 0;
1.30      mycroft  1680:
1.261     reinoud  1681:        if (vp->v_type != VBLK)
                   1682:                return ENOTBLK;
1.113     fvdl     1683:        if (vp->v_specmountpoint != NULL)
1.30      mycroft  1684:                return (EBUSY);
1.318     ad       1685:        mutex_enter(&specfs_lock);
                   1686:        for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL;
                   1687:            vq = vq->v_specnext) {
                   1688:                if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
                   1689:                        continue;
                   1690:                if (vq->v_specmountpoint != NULL) {
                   1691:                        error = EBUSY;
                   1692:                        break;
1.30      mycroft  1693:                }
                   1694:        }
1.318     ad       1695:        mutex_exit(&specfs_lock);
1.80      fvdl     1696:        return (error);
1.30      mycroft  1697: }
                   1698:
1.35      ws       1699: /*
1.39      mycroft  1700:  * Unmount all file systems.
                   1701:  * We traverse the list in reverse order under the assumption that doing so
                   1702:  * will avoid needing to worry about dependencies.
                   1703:  */
                   1704: void
1.256     christos 1705: vfs_unmountall(struct lwp *l)
1.39      mycroft  1706: {
1.123     augustss 1707:        struct mount *mp, *nmp;
1.40      mycroft  1708:        int allerror, error;
1.39      mycroft  1709:
1.235     lukem    1710:        printf("unmounting file systems...");
1.325     dyoung   1711:        for (allerror = 0, mp = CIRCLEQ_LAST(&mountlist);
                   1712:             !CIRCLEQ_EMPTY(&mountlist);
                   1713:             mp = nmp) {
                   1714:                nmp = CIRCLEQ_PREV(mp, mnt_list);
1.54      jtk      1715: #ifdef DEBUG
1.235     lukem    1716:                printf("\nunmounting %s (%s)...",
1.56      christos 1717:                    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
1.54      jtk      1718: #endif
1.336.2.1! yamt     1719:                atomic_inc_uint(&mp->mnt_refcnt);
1.256     christos 1720:                if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
1.57      christos 1721:                        printf("unmount of %s failed with error %d\n",
1.40      mycroft  1722:                            mp->mnt_stat.f_mntonname, error);
                   1723:                        allerror = 1;
                   1724:                }
1.39      mycroft  1725:        }
1.235     lukem    1726:        printf(" done\n");
1.39      mycroft  1727:        if (allerror)
1.57      christos 1728:                printf("WARNING: some file systems would not unmount\n");
1.40      mycroft  1729: }
                   1730:
                   1731: /*
                   1732:  * Sync and unmount file systems before shutting down.
                   1733:  */
                   1734: void
1.247     thorpej  1735: vfs_shutdown(void)
1.40      mycroft  1736: {
1.265     skrll    1737:        struct lwp *l;
1.40      mycroft  1738:
1.265     skrll    1739:        /* XXX we're certainly not running in lwp0's context! */
                   1740:        l = curlwp;
                   1741:        if (l == NULL)
                   1742:                l = &lwp0;
1.185     christos 1743:
1.70      cgd      1744:        printf("syncing disks... ");
                   1745:
1.305     pooka    1746:        /* remove user processes from run queue */
1.138     bouyer   1747:        suspendsched();
1.40      mycroft  1748:        (void) spl0();
                   1749:
1.128     sommerfe 1750:        /* avoid coming back this way again if we panic. */
                   1751:        doing_shutdown = 1;
                   1752:
1.184     thorpej  1753:        sys_sync(l, NULL, NULL);
1.40      mycroft  1754:
                   1755:        /* Wait for sync to finish. */
1.213     pk       1756:        if (buf_syncwait() != 0) {
1.124     augustss 1757: #if defined(DDB) && defined(DEBUG_HALT_BUSY)
                   1758:                Debugger();
                   1759: #endif
1.57      christos 1760:                printf("giving up\n");
1.84      thorpej  1761:                return;
1.73      thorpej  1762:        } else
1.57      christos 1763:                printf("done\n");
1.73      thorpej  1764:
1.84      thorpej  1765:        /*
                   1766:         * If we've panic'd, don't make the situation potentially
                   1767:         * worse by unmounting the file systems.
                   1768:         */
                   1769:        if (panicstr != NULL)
                   1770:                return;
                   1771:
                   1772:        /* Release inodes held by texts before update. */
1.73      thorpej  1773: #ifdef notdef
1.84      thorpej  1774:        vnshutdown();
1.73      thorpej  1775: #endif
1.84      thorpej  1776:        /* Unmount file systems. */
1.256     christos 1777:        vfs_unmountall(l);
1.58      thorpej  1778: }
                   1779:
                   1780: /*
                   1781:  * Mount the root file system.  If the operator didn't specify a
                   1782:  * file system to use, try all possible file systems until one
                   1783:  * succeeds.
                   1784:  */
                   1785: int
1.247     thorpej  1786: vfs_mountroot(void)
1.58      thorpej  1787: {
1.79      thorpej  1788:        struct vfsops *v;
1.239     mycroft  1789:        int error = ENODEV;
1.58      thorpej  1790:
                   1791:        if (root_device == NULL)
                   1792:                panic("vfs_mountroot: root device unknown");
                   1793:
1.264     thorpej  1794:        switch (device_class(root_device)) {
1.58      thorpej  1795:        case DV_IFNET:
                   1796:                if (rootdev != NODEV)
1.173     thorpej  1797:                        panic("vfs_mountroot: rootdev set for DV_IFNET "
                   1798:                            "(0x%08x -> %d,%d)", rootdev,
                   1799:                            major(rootdev), minor(rootdev));
1.58      thorpej  1800:                break;
                   1801:
                   1802:        case DV_DISK:
                   1803:                if (rootdev == NODEV)
                   1804:                        panic("vfs_mountroot: rootdev not set for DV_DISK");
1.239     mycroft  1805:                if (bdevvp(rootdev, &rootvp))
                   1806:                        panic("vfs_mountroot: can't get vnode for rootdev");
1.306     pooka    1807:                error = VOP_OPEN(rootvp, FREAD, FSCRED);
1.239     mycroft  1808:                if (error) {
                   1809:                        printf("vfs_mountroot: can't open root device\n");
                   1810:                        return (error);
                   1811:                }
1.58      thorpej  1812:                break;
                   1813:
                   1814:        default:
                   1815:                printf("%s: inappropriate for root file system\n",
1.336     cegger   1816:                    device_xname(root_device));
1.58      thorpej  1817:                return (ENODEV);
                   1818:        }
                   1819:
                   1820:        /*
                   1821:         * If user specified a file system, use it.
                   1822:         */
1.239     mycroft  1823:        if (mountroot != NULL) {
                   1824:                error = (*mountroot)();
                   1825:                goto done;
                   1826:        }
1.58      thorpej  1827:
                   1828:        /*
                   1829:         * Try each file system currently configured into the kernel.
                   1830:         */
1.302     ad       1831:        mutex_enter(&vfs_list_lock);
1.220     lukem    1832:        LIST_FOREACH(v, &vfs_list, vfs_list) {
1.79      thorpej  1833:                if (v->vfs_mountroot == NULL)
1.58      thorpej  1834:                        continue;
                   1835: #ifdef DEBUG
1.197     thorpej  1836:                aprint_normal("mountroot: trying %s...\n", v->vfs_name);
1.58      thorpej  1837: #endif
1.302     ad       1838:                v->vfs_refcount++;
                   1839:                mutex_exit(&vfs_list_lock);
1.239     mycroft  1840:                error = (*v->vfs_mountroot)();
1.302     ad       1841:                mutex_enter(&vfs_list_lock);
                   1842:                v->vfs_refcount--;
1.239     mycroft  1843:                if (!error) {
1.197     thorpej  1844:                        aprint_normal("root file system type: %s\n",
                   1845:                            v->vfs_name);
1.79      thorpej  1846:                        break;
1.58      thorpej  1847:                }
                   1848:        }
1.302     ad       1849:        mutex_exit(&vfs_list_lock);
1.58      thorpej  1850:
1.79      thorpej  1851:        if (v == NULL) {
1.336     cegger   1852:                printf("no file system for %s", device_xname(root_device));
1.264     thorpej  1853:                if (device_class(root_device) == DV_DISK)
1.79      thorpej  1854:                        printf(" (dev 0x%x)", rootdev);
                   1855:                printf("\n");
1.239     mycroft  1856:                error = EFTYPE;
1.79      thorpej  1857:        }
1.239     mycroft  1858:
                   1859: done:
1.264     thorpej  1860:        if (error && device_class(root_device) == DV_DISK) {
1.306     pooka    1861:                VOP_CLOSE(rootvp, FREAD, FSCRED);
1.239     mycroft  1862:                vrele(rootvp);
                   1863:        }
                   1864:        return (error);
1.58      thorpej  1865: }
1.326     ad       1866:
                   1867: /*
                   1868:  * Sham lock manager for vnodes.  This is a temporary measure.
                   1869:  */
                   1870: int
                   1871: vlockmgr(struct vnlock *vl, int flags)
                   1872: {
                   1873:
                   1874:        KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0);
                   1875:
                   1876:        switch (flags & LK_TYPE_MASK) {
                   1877:        case LK_SHARED:
                   1878:                if (rw_tryenter(&vl->vl_lock, RW_READER)) {
                   1879:                        return 0;
                   1880:                }
                   1881:                if ((flags & LK_NOWAIT) != 0) {
1.328     ad       1882:                        return EBUSY;
1.326     ad       1883:                }
                   1884:                rw_enter(&vl->vl_lock, RW_READER);
                   1885:                return 0;
                   1886:
                   1887:        case LK_EXCLUSIVE:
                   1888:                if (rw_tryenter(&vl->vl_lock, RW_WRITER)) {
                   1889:                        return 0;
                   1890:                }
                   1891:                if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) &&
                   1892:                    rw_write_held(&vl->vl_lock)) {
                   1893:                        vl->vl_recursecnt++;
                   1894:                        return 0;
                   1895:                }
                   1896:                if ((flags & LK_NOWAIT) != 0) {
1.328     ad       1897:                        return EBUSY;
1.326     ad       1898:                }
                   1899:                rw_enter(&vl->vl_lock, RW_WRITER);
                   1900:                return 0;
                   1901:
                   1902:        case LK_RELEASE:
                   1903:                if (vl->vl_recursecnt != 0) {
                   1904:                        KASSERT(rw_write_held(&vl->vl_lock));
                   1905:                        vl->vl_recursecnt--;
                   1906:                        return 0;
                   1907:                }
                   1908:                rw_exit(&vl->vl_lock);
                   1909:                return 0;
                   1910:
                   1911:        default:
                   1912:                panic("vlockmgr: flags %x", flags);
                   1913:        }
                   1914: }
                   1915:
                   1916: int
                   1917: vlockstatus(struct vnlock *vl)
                   1918: {
                   1919:
                   1920:        if (rw_write_held(&vl->vl_lock)) {
                   1921:                return LK_EXCLUSIVE;
                   1922:        }
                   1923:        if (rw_read_held(&vl->vl_lock)) {
                   1924:                return LK_SHARED;
                   1925:        }
                   1926:        return 0;
                   1927: }
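
An illustrative exercise of the sham lock manager defined above; the surrounding function is hypothetical:

static void
example_vnlock(struct vnlock *vl)
{

	/* Shared lock, then release. */
	(void)vlockmgr(vl, LK_SHARED);
	KASSERT(vlockstatus(vl) == LK_SHARED);
	(void)vlockmgr(vl, LK_RELEASE);

	/* Exclusive without sleeping; may fail with EBUSY. */
	if (vlockmgr(vl, LK_EXCLUSIVE | LK_NOWAIT) == 0)
		(void)vlockmgr(vl, LK_RELEASE);
}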
