[BACK]Return to vfs_subr.c CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / sys / kern

Annotation of src/sys/kern/vfs_subr.c, Revision 1.336

1.336   ! cegger      1: /*     $NetBSD: vfs_subr.c,v 1.335 2008/02/24 23:16:24 dholland Exp $  */
1.74      thorpej     2:
                      3: /*-
1.315     ad          4:  * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
1.74      thorpej     5:  * All rights reserved.
                      6:  *
                      7:  * This code is derived from software contributed to The NetBSD Foundation
                      8:  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
1.302     ad          9:  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
1.74      thorpej    10:  *
                     11:  * Redistribution and use in source and binary forms, with or without
                     12:  * modification, are permitted provided that the following conditions
                     13:  * are met:
                     14:  * 1. Redistributions of source code must retain the above copyright
                     15:  *    notice, this list of conditions and the following disclaimer.
                     16:  * 2. Redistributions in binary form must reproduce the above copyright
                     17:  *    notice, this list of conditions and the following disclaimer in the
                     18:  *    documentation and/or other materials provided with the distribution.
                     19:  * 3. All advertising materials mentioning features or use of this software
                     20:  *    must display the following acknowledgement:
                     21:  *     This product includes software developed by the NetBSD
                     22:  *     Foundation, Inc. and its contributors.
                     23:  * 4. Neither the name of The NetBSD Foundation nor the names of its
                     24:  *    contributors may be used to endorse or promote products derived
                     25:  *    from this software without specific prior written permission.
                     26:  *
                     27:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     28:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     29:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     30:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     31:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     32:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     33:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     34:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     35:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     36:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     37:  * POSSIBILITY OF SUCH DAMAGE.
                     38:  */
1.32      cgd        39:
1.29      cgd        40: /*
1.30      mycroft    41:  * Copyright (c) 1989, 1993
                     42:  *     The Regents of the University of California.  All rights reserved.
1.29      cgd        43:  * (c) UNIX System Laboratories, Inc.
                     44:  * All or some portions of this file are derived from material licensed
                     45:  * to the University of California by American Telephone and Telegraph
                     46:  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
                     47:  * the permission of UNIX System Laboratories, Inc.
                     48:  *
                     49:  * Redistribution and use in source and binary forms, with or without
                     50:  * modification, are permitted provided that the following conditions
                     51:  * are met:
                     52:  * 1. Redistributions of source code must retain the above copyright
                     53:  *    notice, this list of conditions and the following disclaimer.
                     54:  * 2. Redistributions in binary form must reproduce the above copyright
                     55:  *    notice, this list of conditions and the following disclaimer in the
                     56:  *    documentation and/or other materials provided with the distribution.
1.204     agc        57:  * 3. Neither the name of the University nor the names of its contributors
1.29      cgd        58:  *    may be used to endorse or promote products derived from this software
                     59:  *    without specific prior written permission.
                     60:  *
                     61:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
                     62:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     63:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     64:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
                     65:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     66:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     67:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     68:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     69:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     70:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     71:  * SUCH DAMAGE.
                     72:  *
1.32      cgd        73:  *     @(#)vfs_subr.c  8.13 (Berkeley) 4/18/94
1.29      cgd        74:  */
                     75:
                     76: /*
1.296     pooka      77:  * External virtual filesystem routines.
                     78:  *
                     79:  * This file contains vfs subroutines which are heavily dependent on
                     80:  * the kernel and are not suitable for standalone use.  Examples include
                     81:  * routines involved in vnode and mountpoint management.
1.29      cgd        82:  */
1.162     lukem      83:
                     84: #include <sys/cdefs.h>
1.336   ! cegger     85: __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.335 2008/02/24 23:16:24 dholland Exp $");
1.78      mrg        86:
1.125     chs        87: #include "opt_ddb.h"
1.95      thorpej    88: #include "opt_compat_netbsd.h"
1.97      christos   89: #include "opt_compat_43.h"
1.29      cgd        90:
                     91: #include <sys/param.h>
1.30      mycroft    92: #include <sys/systm.h>
1.29      cgd        93: #include <sys/proc.h>
1.138     bouyer     94: #include <sys/kernel.h>
1.29      cgd        95: #include <sys/mount.h>
1.46      mycroft    96: #include <sys/fcntl.h>
1.29      cgd        97: #include <sys/vnode.h>
1.30      mycroft    98: #include <sys/stat.h>
1.29      cgd        99: #include <sys/namei.h>
                    100: #include <sys/ucred.h>
                    101: #include <sys/buf.h>
                    102: #include <sys/errno.h>
                    103: #include <sys/malloc.h>
1.51      christos  104: #include <sys/syscallargs.h>
1.58      thorpej   105: #include <sys/device.h>
1.192     christos  106: #include <sys/filedesc.h>
1.266     elad      107: #include <sys/kauth.h>
1.307     ad        108: #include <sys/atomic.h>
1.309     ad        109: #include <sys/kthread.h>
1.50      christos  110:
1.30      mycroft   111: #include <miscfs/specfs/specdev.h>
1.113     fvdl      112: #include <miscfs/syncfs/syncfs.h>
1.30      mycroft   113:
1.125     chs       114: #include <uvm/uvm.h>
1.255     yamt      115: #include <uvm/uvm_readahead.h>
1.125     chs       116: #include <uvm/uvm_ddb.h>
1.129     mrg       117:
                    118: #include <sys/sysctl.h>
1.77      mrg       119:
1.117     fvdl      120: extern int dovfsusermount;     /* 1 => permit any user to mount filesystems */
1.263     chs       121: extern int vfs_magiclinks;     /* 1 => expand "magic" symlinks */
1.117     fvdl      122:
                         /*
                          * Lists of vnodes that getcleanvnode() may recycle.  The free list
                          * is scanned first and the hold list second; getnewvnode() describes
                          * the hold list as carrying vnodes with referencing buffers.
                          * vrele_list is not used in the part of the file reviewed here;
                          * NOTE(review): presumably the queue drained by vrele_thread() -
                          * confirm.
                          */
1.309     ad        123: static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
                    124: static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
                    125: static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);
                    126:
                         /*
                          * State of the deferred release thread created by vn_init1():
                          * vrele_lock and vrele_cv are initialized there and vrele_lwp
                          * receives the lwp of the new thread.
                          * NOTE(review): vrele_pending is presumably the number of vnodes
                          * queued for deferred release - confirm against vrele_thread().
                          */
                    127: static int vrele_pending;
                    128: static kmutex_t        vrele_lock;
                    129: static kcondvar_t vrele_cv;
                    130: static lwp_t *vrele_lwp;
1.113     fvdl      131:
                         /* Pool cache sized for struct vnode; created in vn_init1(). */
1.309     ad        132: static pool_cache_t vnode_cache;
1.186     thorpej   133:
                    134: MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
1.93      thorpej   135:
1.89      kleink    136: /*
                    137:  * Local declarations.
                    138:  */
1.276     hannken   139:
1.309     ad        140: static void vrele_thread(void *);
                    141: static void insmntque(vnode_t *, struct mount *);
                    142: static int getdevvp(dev_t, vnode_t **, enum vtype);
                    143: static vnode_t *getcleanvnode(void);;
                    144: void vpanic(vnode_t *, const char *);
                    145:
                    146: #ifdef DIAGNOSTIC
                    147: void
                    148: vpanic(vnode_t *vp, const char *msg)
                    149: {
                    150:
                    151:        vprint(NULL, vp);
                    152:        panic("%s\n", msg);
                    153: }
                    154: #else
                    155: #define        vpanic(vp, msg) /* nothing */
                    156: #endif
                    157:
                    158: void
                    159: vn_init1(void)
                    160: {
                    161:
                    162:        vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
                    163:            NULL, IPL_NONE, NULL, NULL, NULL);
                    164:        KASSERT(vnode_cache != NULL);
                    165:
                    166:        /* Create deferred release thread. */
                    167:        mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
                    168:        cv_init(&vrele_cv, "vrele");
                    169:        if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
                    170:            NULL, &vrele_lwp, "vrele"))
                    171:                panic("fork vrele");
                    172: }
1.51      christos  173:
1.202     yamt      174: int
1.256     christos  175: vfs_drainvnodes(long target, struct lwp *l)
1.202     yamt      176: {
                    177:
                    178:        while (numvnodes > target) {
1.309     ad        179:                vnode_t *vp;
1.202     yamt      180:
1.309     ad        181:                mutex_enter(&vnode_free_list_lock);
                    182:                vp = getcleanvnode();
1.202     yamt      183:                if (vp == NULL)
                    184:                        return EBUSY; /* give up */
1.309     ad        185:                ungetnewvnode(vp);
1.202     yamt      186:        }
                    187:
                    188:        return 0;
                    189: }
                    190:
                    191: /*
                    192:  * grab a vnode from freelist and clean it.
                          *
                          * vnode_free_list is scanned first, then vnode_hold_list.  The
                          * caller must hold vnode_free_list_lock; it is released before
                          * returning, on the success path as well as on failure.
                          *
                          * Returns a cleaned vnode with v_usecount == 1, v_type == VNON and
                          * its interlock released, or NULL if neither list yields a usable
                          * candidate.
                    193:  */
1.309     ad        194: vnode_t *
                    195: getcleanvnode(void)
1.202     yamt      196: {
1.309     ad        197:        vnode_t *vp;
                    198:        vnodelst_t *listhd;
1.202     yamt      199:
1.309     ad        200:        KASSERT(mutex_owned(&vnode_free_list_lock));
1.229     yamt      201:
1.309     ad        202: retry:
1.229     yamt      203:        listhd = &vnode_free_list;
                    204: try_nextlist:
                    205:        TAILQ_FOREACH(vp, listhd, v_freelist) {
1.309     ad        206:                /*
                    207:                 * It's safe to test v_usecount and v_iflag
                    208:                 * without holding the interlock here, since
                    209:                 * referenced or VI_CLEAN vnodes should never
                    210:                 * appear on the lists.
                    211:                 */
                    212:                if (vp->v_usecount != 0) {
                    213:                        vpanic(vp, "free vnode isn't");
                    214:                }
                    215:                if ((vp->v_iflag & VI_CLEAN) != 0) {
                    216:                        vpanic(vp, "clean vnode on freelist");
                    217:                }
                    218:                if (vp->v_freelisthd != listhd) {
                    219:                        printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd);
                    220:                        vpanic(vp, "list head mismatch");
                    221:                }
                                        /* Never wait for a contended interlock; try the next vnode. */
                    222:                if (!mutex_tryenter(&vp->v_interlock))
1.208     hannken   223:                        continue;
1.227     yamt      224:                /*
1.309     ad        225:                 * Our lwp might hold the underlying vnode
                    226:                 * locked, so don't try to reclaim a VI_LAYER
                    227:                 * node if it's locked.
1.227     yamt      228:                 */
1.302     ad        229:                if ((vp->v_iflag & VI_XLOCK) == 0 &&
                    230:                    ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
                                                /* Candidate found; leave the loop with its interlock held. */
1.285     hannken   231:                        break;
1.202     yamt      232:                }
1.309     ad        233:                mutex_exit(&vp->v_interlock);
1.202     yamt      234:        }
                    235:
                                /*
                                 * vp is NULL when the loop ran off the end of the list:
                                 * fall back to the hold list once, then give up.
                                 */
1.309     ad        236:        if (vp == NULL) {
1.229     yamt      237:                if (listhd == &vnode_free_list) {
                    238:                        listhd = &vnode_hold_list;
                    239:                        goto try_nextlist;
                    240:                }
1.309     ad        241:                mutex_exit(&vnode_free_list_lock);
                    242:                return NULL;
1.202     yamt      243:        }
                    244:
1.309     ad        245:        /* Remove it from the freelist. */
1.202     yamt      246:        TAILQ_REMOVE(listhd, vp, v_freelist);
1.309     ad        247:        vp->v_freelisthd = NULL;
                    248:        mutex_exit(&vnode_free_list_lock);
                    249:
                    250:        /*
                    251:         * The vnode is still associated with a file system, so we must
                    252:         * clean it out before reusing it.  We need to add a reference
                    253:         * before doing this.  If the vnode gains another reference while
                    254:         * being cleaned out then we lose - retry.
                    255:         */
                    256:        vp->v_usecount++;
                    257:        vclean(vp, DOCLOSE);
                    258:        if (vp->v_usecount == 1) {
                    259:                /* We're about to dirty it. */
                    260:                vp->v_iflag &= ~VI_CLEAN;
                    261:                mutex_exit(&vp->v_interlock);
1.318     ad        262:                if (vp->v_type == VBLK || vp->v_type == VCHR) {
                    263:                        spec_node_destroy(vp);
                    264:                }
                    265:                vp->v_type = VNON;
1.309     ad        266:        } else {
                    267:                /*
                    268:                 * Don't return to freelist - the holder of the last
                    269:                 * reference will destroy it.
                    270:                 */
1.315     ad        271:                KASSERT(vp->v_usecount > 1);
1.309     ad        272:                vp->v_usecount--;
                    273:                mutex_exit(&vp->v_interlock);
                                        /* The scan at 'retry' expects the free list lock to be held. */
                    274:                mutex_enter(&vnode_free_list_lock);
                    275:                goto retry;
                    276:        }
                    277:
                                /*
                                 * Sanity checks on the cleaned vnode; vpanic() expands to
                                 * nothing unless the kernel is built with DIAGNOSTIC.
                                 */
                    278:        if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
                    279:            !TAILQ_EMPTY(&vp->v_uobj.memq)) {
                    280:                vpanic(vp, "cleaned vnode isn't");
                    281:        }
                    282:        if (vp->v_numoutput != 0) {
                    283:                vpanic(vp, "clean vnode has pending I/O's");
                    284:        }
                    285:        if ((vp->v_iflag & VI_ONWORKLST) != 0) {
                    286:                vpanic(vp, "clean vnode on syncer list");
                    287:        }
1.202     yamt      288:
                    289:        return vp;
                    290: }
                    291:
1.29      cgd       292: /*
1.327     ad        293:  * Mark a mount point as busy, and gain a new reference to it.  Used to
                    294:  * synchronize access and to delay unmounting.
                    295:  *
                    296:  * => Interlock is not released on failure.
                    297:  * => If no interlock, the caller is expected to already hold a reference
                    298:  *    on the mount.
                    299:  * => If interlocked, the interlock must prevent the last reference to
                    300:  *    the mount from disappearing.
                    301:  */
                    302: int
                    303: vfs_busy(struct mount *mp, const krw_t op, kmutex_t *interlock)
                    304: {
                    305:
                    306:        KASSERT(mp->mnt_refcnt > 0);
                    307:
                    308:        atomic_inc_uint(&mp->mnt_refcnt);
                    309:        if (interlock != NULL) {
                    310:                mutex_exit(interlock);
                    311:        }
                    312:        if (mp->mnt_writer == curlwp) {
                    313:                mp->mnt_recursecnt++;
                    314:        } else {
                    315:                rw_enter(&mp->mnt_lock, op);
                    316:                if (op == RW_WRITER) {
                    317:                        KASSERT(mp->mnt_writer == NULL);
                    318:                        mp->mnt_writer = curlwp;
                    319:                }
                    320:        }
                    321:        if ((mp->mnt_iflag & IMNT_GONE) != 0) {
                    322:                vfs_unbusy(mp, false);
                    323:                if (interlock != NULL) {
                    324:                        mutex_enter(interlock);
                    325:                }
                    326:                return ENOENT;
                    327:        }
                    328:
                    329:        return 0;
                    330: }
                    331:
                    332: /*
1.335     dholland  333:  * As vfs_busy(), but return immediately if the mount cannot be
1.327     ad        334:  * locked without waiting.
1.29      cgd       335:  */
1.50      christos  336: int
1.327     ad        337: vfs_trybusy(struct mount *mp, krw_t op, kmutex_t *interlock)
1.29      cgd       338: {
                    339:
1.327     ad        340:        KASSERT(mp->mnt_refcnt > 0);
1.217     junyoung  341:
1.327     ad        342:        if (mp->mnt_writer == curlwp) {
                    343:                mp->mnt_recursecnt++;
                    344:        } else {
                    345:                if (!rw_tryenter(&mp->mnt_lock, op)) {
                    346:                        return EBUSY;
                    347:                }
                    348:                if (op == RW_WRITER) {
                    349:                        KASSERT(mp->mnt_writer == NULL);
                    350:                        mp->mnt_writer = curlwp;
                    351:                }
                    352:        }
                    353:        atomic_inc_uint(&mp->mnt_refcnt);
                    354:        if ((mp->mnt_iflag & IMNT_GONE) != 0) {
                    355:                vfs_unbusy(mp, false);
                    356:                return ENOENT;
                    357:        }
                    358:        if (interlock != NULL) {
                    359:                mutex_exit(interlock);
                    360:        }
                    361:        return 0;
1.29      cgd       362: }
                    363:
                    364: /*
1.327     ad        365:  * Unlock a busy filesystem and drop reference to it.  If 'keepref' is
                    366:  * true, unlock but preserve the reference.
1.29      cgd       367:  */
                    368: void
1.327     ad        369: vfs_unbusy(struct mount *mp, bool keepref)
1.29      cgd       370: {
                    371:
1.327     ad        372:        KASSERT(mp->mnt_refcnt > 0);
                    373:
                    374:        if (mp->mnt_writer == curlwp) {
                    375:                KASSERT(rw_write_held(&mp->mnt_lock));
                    376:                if (mp->mnt_recursecnt != 0) {
                    377:                        mp->mnt_recursecnt--;
                    378:                } else {
                    379:                        mp->mnt_writer = NULL;
                    380:                        rw_exit(&mp->mnt_lock);
                    381:                }
                    382:        } else {
                    383:                rw_exit(&mp->mnt_lock);
                    384:        }
                    385:        if (!keepref) {
                    386:                vfs_destroy(mp);
                    387:        }
1.29      cgd       388: }
                    389:
                    390: /*
1.80      fvdl      391:  * Lookup a filesystem type, and if found allocate and initialize
                    392:  * a mount structure for it.
                    393:  *
                    394:  * Devname is usually updated by mount(8) after booting.
1.29      cgd       395:  */
1.50      christos  396: int
1.247     thorpej   397: vfs_rootmountalloc(const char *fstypename, const char *devname,
                    398:     struct mount **mpp)
1.29      cgd       399: {
1.80      fvdl      400:        struct vfsops *vfsp = NULL;
                    401:        struct mount *mp;
1.29      cgd       402:
1.309     ad        403:        mutex_enter(&vfs_list_lock);
1.152     jdolecek  404:        LIST_FOREACH(vfsp, &vfs_list, vfs_list)
1.291     christos  405:                if (!strncmp(vfsp->vfs_name, fstypename,
                    406:                    sizeof(mp->mnt_stat.f_fstypename)))
1.80      fvdl      407:                        break;
1.315     ad        408:        if (vfsp == NULL) {
                    409:                mutex_exit(&vfs_list_lock);
1.80      fvdl      410:                return (ENODEV);
1.315     ad        411:        }
1.309     ad        412:        vfsp->vfs_refcount++;
                    413:        mutex_exit(&vfs_list_lock);
                    414:
1.327     ad        415:        mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
                    416:        if (mp == NULL)
                    417:                return ENOMEM;
                    418:        mp->mnt_refcnt = 1;
                    419:        rw_init(&mp->mnt_lock);
1.331     skrll     420:        mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
1.327     ad        421:        (void)vfs_busy(mp, RW_WRITER, NULL);
1.272     reinoud   422:        TAILQ_INIT(&mp->mnt_vnodelist);
1.80      fvdl      423:        mp->mnt_op = vfsp;
                    424:        mp->mnt_flag = MNT_RDONLY;
1.309     ad        425:        mp->mnt_vnodecovered = NULL;
1.291     christos  426:        (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
                    427:            sizeof(mp->mnt_stat.f_fstypename));
1.80      fvdl      428:        mp->mnt_stat.f_mntonname[0] = '/';
1.314     pooka     429:        mp->mnt_stat.f_mntonname[1] = '\0';
1.291     christos  430:        mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
                    431:            '\0';
                    432:        (void)copystr(devname, mp->mnt_stat.f_mntfromname,
                    433:            sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
1.276     hannken   434:        mount_initspecific(mp);
1.80      fvdl      435:        *mpp = mp;
1.29      cgd       436:        return (0);
                    437: }
                    438:
1.30      mycroft   439: /*
                    440:  * Routines having to do with the management of the vnode table.
                    441:  */
                         /*
                          * Pointer to a vnode operations vector that is defined elsewhere.
                          * NOTE(review): presumably the vector installed on vnodes once they
                          * have been cleaned ("dead" vnodes); it is not referenced in the
                          * part of the file reviewed here - confirm.
                          */
1.217     junyoung  442: extern int (**dead_vnodeop_p)(void *);
1.30      mycroft   443:
1.29      cgd       444: /*
                    445:  * Return the next vnode from the free list.
                          *
                          * The vnode is either freshly allocated with vnalloc() or recycled
                          * through getcleanvnode().  On success 0 is returned and *vpp holds
                          * a vnode with v_usecount == 1, v_type == VNON, the given tag and
                          * operations vector, and which has been passed to insmntque() for
                          * 'mp'.
                          *
                          * Errors:
                          *   - whatever vfs_busy() returns if 'mp' cannot be busied;
                          *   - ENFILE (with *vpp cleared) if no allocation was attempted and
                          *     no vnode could be recycled.
                          * If an allocation was attempted and nothing could be recycled
                          * either, the function sleeps and starts over instead of failing.
                    446:  */
1.50      christos  447: int
1.247     thorpej   448: getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
1.309     ad        449:            vnode_t **vpp)
1.29      cgd       450: {
1.142     chs       451:        struct uvm_object *uobj;
                                /* Alternates on every pass; only changed under vnode_free_list_lock. */
1.113     fvdl      452:        static int toggle;
1.309     ad        453:        vnode_t *vp;
1.153     thorpej   454:        int error = 0, tryalloc;
1.158     chs       455:
1.159     enami     456:  try_again:
1.327     ad        457:        if (mp != NULL) {
1.103     sommerfe  458:                /*
1.327     ad        459:                 * Mark filesystem busy while we're creating a
                    460:                 * vnode.  If unmount is in progress, this will
                    461:                 * wait; if the unmount succeeds (only if umount
                    462:                 * -f), this will return an error.  If the
                    463:                 * unmount fails, we'll keep going afterwards.
1.103     sommerfe  464:                 */
1.327     ad        465:                error = vfs_busy(mp, RW_READER, NULL);
                    466:                if (error)
1.103     sommerfe  467:                        return error;
                    468:        }
1.29      cgd       469:
1.113     fvdl      470:        /*
                    471:         * We must choose whether to allocate a new vnode or recycle an
                    472:         * existing one. The criterion for allocating a new one is that
                    473:         * the total number of vnodes is less than the number desired or
                    474:         * there are no vnodes on either free list. Generally we only
                    475:         * want to recycle vnodes that have no buffers associated with
                    476:         * them, so we look first on the vnode_free_list. If it is empty,
                    477:         * we next consider vnodes with referencing buffers on the
                    478:         * vnode_hold_list. The toggle ensures that half the time we
                    479:         * will use a buffer from the vnode_hold_list, and half the time
                    480:         * we will allocate a new one unless the list has grown to twice
                    481:         * the desired size. We are reticent to recycle vnodes from the
                    482:         * vnode_hold_list because we will lose the identity of all its
                    483:         * referencing buffers.
                    484:         */
1.142     chs       485:
1.153     thorpej   486:        vp = NULL;
                    487:
1.309     ad        488:        mutex_enter(&vnode_free_list_lock);
1.153     thorpej   489:
1.113     fvdl      490:        toggle ^= 1;
                    491:        if (numvnodes > 2 * desiredvnodes)
                    492:                toggle = 0;
                    493:
1.153     thorpej   494:        tryalloc = numvnodes < desiredvnodes ||
1.159     enami     495:            (TAILQ_FIRST(&vnode_free_list) == NULL &&
                    496:             (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
1.153     thorpej   497:
                                /*
                                 * Account for the new vnode while the lock is held, then drop
                                 * the lock for the allocation itself.  If the allocation fails
                                 * the lock is retaken and the count undone, so that the
                                 * recycling path below is always entered with the lock held.
                                 */
1.309     ad        498:        if (tryalloc) {
1.206     yamt      499:                numvnodes++;
1.309     ad        500:                mutex_exit(&vnode_free_list_lock);
1.310     pooka     501:                if ((vp = vnalloc(NULL)) == NULL) {
1.309     ad        502:                        mutex_enter(&vnode_free_list_lock);
                    503:                        numvnodes--;
                    504:                } else
                    505:                        vp->v_usecount = 1;
                    506:        }
                    507:
                    508:        if (vp == NULL) {
                                        /* getcleanvnode() releases vnode_free_list_lock in every case. */
                    509:                vp = getcleanvnode();
                    510:                if (vp == NULL) {
1.327     ad        511:                        if (mp != NULL) {
                    512:                                vfs_unbusy(mp, false);
                    513:                        }
                                                /*
                                                 * An allocation was attempted and failed: wait
                                                 * and start over, busying the mount again.
                                                 */
1.153     thorpej   514:                        if (tryalloc) {
                    515:                                printf("WARNING: unable to allocate new "
                    516:                                    "vnode, retrying...\n");
                    517:                                (void) tsleep(&lbolt, PRIBIO, "newvn", hz);
                    518:                                goto try_again;
                    519:                        }
1.132     jdolecek  520:                        tablefull("vnode", "increase kern.maxvnodes or NVNODE");
1.29      cgd       521:                        *vpp = 0;
                    522:                        return (ENFILE);
                    523:                }
                                        /* Recycled vnode: reset fields left over from its previous use. */
1.302     ad        524:                vp->v_iflag = 0;
                    525:                vp->v_vflag = 0;
                    526:                vp->v_uflag = 0;
1.158     chs       527:                vp->v_socket = NULL;
1.29      cgd       528:        }
1.309     ad        529:
                    530:        KASSERT(vp->v_usecount == 1);
                    531:        KASSERT(vp->v_freelisthd == NULL);
                    532:        KASSERT(LIST_EMPTY(&vp->v_nclist));
                    533:        KASSERT(LIST_EMPTY(&vp->v_dnclist));
                    534:
1.29      cgd       535:        vp->v_type = VNON;
1.104     wrstuden  536:        vp->v_vnlock = &vp->v_lock;
1.29      cgd       537:        vp->v_tag = tag;
                    538:        vp->v_op = vops;
                    539:        insmntque(vp, mp);
1.30      mycroft   540:        *vpp = vp;
                    541:        vp->v_data = 0;
1.142     chs       542:
                    543:        /*
                    544:         * initialize uvm_object within vnode.
                    545:         */
                    546:
1.158     chs       547:        uobj = &vp->v_uobj;
                    548:        KASSERT(uobj->pgops == &uvm_vnodeops);
                    549:        KASSERT(uobj->uo_npages == 0);
                    550:        KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
1.288     yamt      551:        vp->v_size = vp->v_writesize = VSIZENOTSET;
1.142     chs       552:
1.309     ad        553:        if (mp != NULL) {
                    554:                if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
                    555:                        vp->v_vflag |= VV_MPSAFE;
                                        /*
                                         * Unlock the mount but keep the reference gained by
                                         * vfs_busy() above.  NOTE(review): presumably that
                                         * reference now belongs to the vnode - confirm
                                         * against insmntque().
                                         */
1.327     ad        556:                vfs_unbusy(mp, true);
1.309     ad        557:        }
                    558:
1.29      cgd       559:        return (0);
1.130     fvdl      560: }
                    561:
                    562: /*
                    563:  * This is really just the reverse of getnewvnode(). Needed for
                    564:  * VFS_VGET functions that may need to push back a vnode in case
                    565:  * of a locking race.
                          *
                          * The vnode must hold exactly one reference, have no v_data attached
                          * and must not be on a freelist; all three are asserted below.
                    566:  */
                    567: void
1.309     ad        568: ungetnewvnode(vnode_t *vp)
                    569: {
                    570:
                    571:        KASSERT(vp->v_usecount == 1);
                    572:        KASSERT(vp->v_data == NULL);
                    573:        KASSERT(vp->v_freelisthd == NULL);
                    574:
                                /*
                                 * vrelel() expects v_interlock to be held on entry.  With
                                 * VI_CLEAN set it skips deactivation, drops the last
                                 * reference and frees the vnode instead of putting it back
                                 * on a freelist.
                                 */
                    575:        mutex_enter(&vp->v_interlock);
                    576:        vp->v_iflag |= VI_CLEAN;
1.324     pooka     577:        vrelel(vp, 0);
1.309     ad        578: }
                    579:
                    580: /*
                    581:  * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
                    582:  * marker vnode and we are prepared to wait for the allocation.
                          *
                          * Returns the zero-filled vnode, or NULL if pool_cache_get() returns
                          * NULL.  The UVM object and condition variable are always
                          * initialized; the vnode rwlock is initialized only for non-marker
                          * vnodes.
                    583:  */
                    584: vnode_t *
1.310     pooka     585: vnalloc(struct mount *mp)
1.130     fvdl      586: {
1.309     ad        587:        vnode_t *vp;
                    588:
                    589:        vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
                    590:        if (vp == NULL) {
                    591:                return NULL;
                    592:        }
                    593:
                    594:        memset(vp, 0, sizeof(*vp));
                    595:        UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
                    596:        cv_init(&vp->v_cv, "vnode");
                    597:        /*
                    598:         * done by memset() above.
                    599:         *      LIST_INIT(&vp->v_nclist);
                    600:         *      LIST_INIT(&vp->v_dnclist);
                    601:         */
                    602:
                    603:        if (mp != NULL) {
                                        /*
                                         * Marker vnode: tie it to the mount and flag it.
                                         * No rwlock is set up here; vnfree() likewise skips
                                         * rw_destroy() when VI_MARKER is set.
                                         */
                    604:                vp->v_mount = mp;
                    605:                vp->v_type = VBAD;
                    606:                vp->v_iflag = VI_MARKER;
                    607:        } else {
1.326     ad        608:                rw_init(&vp->v_lock.vl_lock);
1.309     ad        609:        }
                    610:
                    611:        return vp;
                    612: }
                    613:
                    614: /*
                    615:  * Free an unused, unreferenced vnode.
                          *
                          * The use count must already be zero (asserted).  For non-marker
                          * vnodes the vnode rwlock is destroyed and numvnodes is decremented
                          * under vnode_free_list_lock.  In all cases the UVM object and the
                          * condition variable are destroyed and the memory is returned to
                          * vnode_cache.
                    616:  */
                    617: void
1.310     pooka     618: vnfree(vnode_t *vp)
1.309     ad        619: {
                    620:
                    621:        KASSERT(vp->v_usecount == 0);
                    622:
                                /* Markers never had their rwlock initialized by vnalloc(). */
                    623:        if ((vp->v_iflag & VI_MARKER) == 0) {
1.326     ad        624:                rw_destroy(&vp->v_lock.vl_lock);
1.309     ad        625:                mutex_enter(&vnode_free_list_lock);
                    626:                numvnodes--;
                    627:                mutex_exit(&vnode_free_list_lock);
                    628:        }
                    629:
                    630:        UVM_OBJ_DESTROY(&vp->v_uobj);
                    631:        cv_destroy(&vp->v_cv);
                    632:        pool_cache_put(vnode_cache, vp);
                    633: }
                    634:
                    635: /*
                    636:  * Remove a vnode from its freelist.
                          *
                          * The caller must hold v_interlock and the use count must be zero
                          * (both asserted).  The list the vnode is expected to be on depends
                          * on v_holdcnt: vnode_hold_list if it is positive, vnode_free_list
                          * otherwise.  On return v_freelisthd is NULL.
                    637:  */
                    638: static inline void
                    639: vremfree(vnode_t *vp)
                    640: {
                    641:
                    642:        KASSERT(mutex_owned(&vp->v_interlock));
                    643:        KASSERT(vp->v_usecount == 0);
1.130     fvdl      644:
1.217     junyoung  645:        /*
1.309     ad        646:         * Note that the reference count must not change until
                    647:         * the vnode is removed.
1.130     fvdl      648:         */
1.309     ad        649:        mutex_enter(&vnode_free_list_lock);
                    650:        if (vp->v_holdcnt > 0) {
                    651:                KASSERT(vp->v_freelisthd == &vnode_hold_list);
                    652:        } else {
                    653:                KASSERT(vp->v_freelisthd == &vnode_free_list);
                    654:        }
                    655:        TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
                    656:        vp->v_freelisthd = NULL;
                    657:        mutex_exit(&vnode_free_list_lock);
1.29      cgd       658: }
                    659:
                    660: /*
                    661:  * Move a vnode from one mount queue to another.
                          *
                          * 'mp' may be NULL, in which case the vnode is only removed from its
                          * current mount's vnode list.  The list manipulation and the update
                          * of vp->v_mount happen under mntvnode_lock; the reference on the
                          * old mount (if any) is released with vfs_destroy() after that lock
                          * has been dropped.
                    662:  */
1.260     yamt      663: static void
1.309     ad        664: insmntque(vnode_t *vp, struct mount *mp)
1.29      cgd       665: {
1.327     ad        666:        struct mount *omp;
1.29      cgd       667:
1.103     sommerfe  668: #ifdef DIAGNOSTIC
                                /*
                                 * Refuse to add vnodes to a mount that is being unmounted,
                                 * unless the mount has MNT_SOFTDEP set or the vnode is
                                 * tagged VT_VFS.
                                 */
                    669:        if ((mp != NULL) &&
1.207     dbj       670:            (mp->mnt_iflag & IMNT_UNMOUNT) &&
1.113     fvdl      671:            !(mp->mnt_flag & MNT_SOFTDEP) &&
                    672:            vp->v_tag != VT_VFS) {
1.103     sommerfe  673:                panic("insmntque into dying filesystem");
                    674:        }
                    675: #endif
1.217     junyoung  676:
1.309     ad        677:        mutex_enter(&mntvnode_lock);
1.29      cgd       678:        /*
                    679:         * Delete from old mount point vnode list, if on one.
                    680:         */
1.327     ad        681:        if ((omp = vp->v_mount) != NULL)
1.272     reinoud   682:                TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
1.29      cgd       683:        /*
1.327     ad        684:         * Insert into list of vnodes for the new mount point, if
                    685:         * available.  The caller must take a reference on the mount
                    686:         * structure and donate to the vnode.
1.29      cgd       687:         */
1.279     pooka     688:        if ((vp->v_mount = mp) != NULL)
                    689:                TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
1.309     ad        690:        mutex_exit(&mntvnode_lock);
1.327     ad        691:
                    692:        if (omp != NULL) {
                    693:                /* Release reference to old mount. */
                    694:                vfs_destroy(omp);
                    695:        }
1.29      cgd       696: }
                    697:
                    698: /*
                    699:  * Create a vnode for a block device.
1.59      thorpej   700:  * Used for root filesystem and swap areas.
1.29      cgd       701:  * Also used for memory file system special devices.
                          *
                          * Thin wrapper: calls getdevvp() with type VBLK and returns its
                          * result unchanged.
                    702:  */
1.50      christos  703: int
1.309     ad        704: bdevvp(dev_t dev, vnode_t **vpp)
1.29      cgd       705: {
1.30      mycroft   706:
                    707:        return (getdevvp(dev, vpp, VBLK));
1.29      cgd       708: }
                    709:
                    710: /*
                    711:  * Create a vnode for a character device.
                    712:  * Used for kernfs and some console handling.
                          *
                          * Thin wrapper: calls getdevvp() with type VCHR and returns its
                          * result unchanged.
                    713:  */
1.50      christos  714: int
1.309     ad        715: cdevvp(dev_t dev, vnode_t **vpp)
1.29      cgd       716: {
1.30      mycroft   717:
                    718:        return (getdevvp(dev, vpp, VCHR));
1.29      cgd       719: }
                    720:
                    721: /*
                    722:  * Create a vnode for a device.
                    723:  * Used by bdevvp (block device) for root file system etc.,
                    724:  * and by cdevvp (character device) for console and kernfs.
                          *
                          * Returns 0 and stores the new vnode in *vpp on success.  If 'dev'
                          * is NODEV the function also returns 0 but stores NULL in *vpp, so
                          * callers have to check *vpp as well as the return value.  If
                          * getnewvnode() fails, its error is returned and *vpp is NULL.
                    725:  */
1.260     yamt      726: static int
1.309     ad        727: getdevvp(dev_t dev, vnode_t **vpp, enum vtype type)
1.29      cgd       728: {
1.309     ad        729:        vnode_t *vp;
                    730:        vnode_t *nvp;
1.29      cgd       731:        int error;
                    732:
1.80      fvdl      733:        if (dev == NODEV) {
1.302     ad        734:                *vpp = NULL;
1.29      cgd       735:                return (0);
1.80      fvdl      736:        }
                                /* The vnode is not attached to any mount (mp == NULL). */
1.50      christos  737:        error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
1.29      cgd       738:        if (error) {
1.302     ad        739:                *vpp = NULL;
1.29      cgd       740:                return (error);
                    741:        }
                    742:        vp = nvp;
                    743:        vp->v_type = type;
1.309     ad        744:        vp->v_vflag |= VV_MPSAFE;
1.297     pooka     745:        uvm_vnp_setsize(vp, 0);
1.318     ad        746:        spec_node_init(vp, dev);
1.29      cgd       747:        *vpp = vp;
                    748:        return (0);
                    749: }
                    750:
                    751: /*
                    752:  * Grab a particular vnode from the free list, increment its
1.83      fvdl      753:  * reference count and lock it. If the vnode lock bit is set the
                    754:  * vnode is being eliminated in vgone. In that case, we can not
                    755:  * grab the vnode, so the process is awakened when the transition is
                    756:  * completed, and an error returned to indicate that the vnode is no
                    757:  * longer usable (possibly having been changed to a new file system type).
                          *
                          * Flags, as used below:
                          *      LK_INTERLOCK    the caller already holds v_interlock;
                          *                      otherwise it is taken here.
                          *      LK_NOWAIT       if the vnode is being cleaned, fail with
                          *                      EBUSY instead of waiting.
                          *      LK_TYPE_MASK    if any of these bits is set, the vnode lock
                          *                      is taken through vn_lock().
                          *
                          * Returns 0 on success; EBUSY or ENOENT if VI_XLOCK or VI_FREEING
                          * is set on the vnode; otherwise the error from vn_lock().  On every
                          * failure the reference taken here is dropped again.
1.29      cgd       758:  */
1.30      mycroft   759: int
1.309     ad        760: vget(vnode_t *vp, int flags)
1.29      cgd       761: {
1.175     perseant  762:        int error;
1.29      cgd       763:
1.309     ad        764:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                    765:
                    766:        if ((flags & LK_INTERLOCK) == 0)
                    767:                mutex_enter(&vp->v_interlock);
                    768:
                    769:        /*
                    770:         * Before adding a reference, we must remove the vnode
                    771:         * from its freelist.
                    772:         */
                    773:        if (vp->v_usecount == 0) {
                    774:                vremfree(vp);
                    775:        }
                    776:        if (++vp->v_usecount == 0) {
                    777:                vpanic(vp, "vget: usecount overflow");
                    778:        }
                    779:
1.30      mycroft   780:        /*
                    781:         * If the vnode is in the process of being cleaned out for
                    782:         * another use, we wait for the cleaning to finish and then
1.312     ad        783:         * return failure.  Cleaning is determined by checking if
                    784:         * the VI_XLOCK or VI_FREEING flags are set.
1.80      fvdl      785:         */
1.312     ad        786:        if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
1.313     ad        787:                if ((flags & LK_NOWAIT) != 0) {
1.324     pooka     788:                        vrelel(vp, 0);
1.142     chs       789:                        return EBUSY;
                    790:                }
1.312     ad        791:                vwait(vp, VI_XLOCK | VI_FREEING);
1.324     pooka     792:                vrelel(vp, 0);
1.313     ad        793:                return ENOENT;
1.29      cgd       794:        }
1.80      fvdl      795:        if (flags & LK_TYPE_MASK) {
                                        /*
                                         * v_interlock is held at this point, so LK_INTERLOCK
                                         * is passed on; presumably vn_lock() releases it
                                         * (not visible here) -- the failure path below uses
                                         * vrele(), which takes the interlock itself.
                                         */
1.313     ad        796:                error = vn_lock(vp, flags | LK_INTERLOCK);
                    797:                if (error != 0) {
1.257     yamt      798:                        vrele(vp);
1.113     fvdl      799:                }
1.313     ad        800:                return error;
1.80      fvdl      801:        }
1.309     ad        802:        mutex_exit(&vp->v_interlock);
1.313     ad        803:        return 0;
1.29      cgd       804: }
                    805:
                    806: /*
                    807:  * vput(), just unlock and vrele()
                          *
                          * Must not be called on a marker vnode (asserted).  The vnode lock
                          * is released with VOP_UNLOCK() before the reference is dropped.
                    808:  */
                    809: void
1.309     ad        810: vput(vnode_t *vp)
1.29      cgd       811: {
1.30      mycroft   812:
1.309     ad        813:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                    814:
                    815:        VOP_UNLOCK(vp, 0);
                    816:        vrele(vp);
1.29      cgd       817: }
                    818:
                    819: /*
1.309     ad        820:  * Vnode release.  If reference count drops to zero, call inactive
                    821:  * routine and either return to freelist or free to the pool.
                          *
                          * Called with v_interlock held, on a non-marker vnode that is not on
                          * a freelist (all asserted).  Every return path below ends with the
                          * interlock released.  The 'flags' argument is not referenced in
                          * this function body.
                          *
                          * When the last reference is dropped on a vnode that is not
                          * VI_CLEAN, the work is either handed to the vrele kthread via
                          * vrele_list (pagedaemon caller, VI_LAYER vnode, or the vnode lock
                          * could not be taken without waiting) or VOP_INACTIVE() is called
                          * here with the vnode lock held exclusively.
                    822:  */
1.309     ad        823: void
1.324     pooka     824: vrelel(vnode_t *vp, int flags)
1.29      cgd       825: {
1.309     ad        826:        bool recycle, defer;
                    827:        int error;
                    828:
                    829:        KASSERT(mutex_owned(&vp->v_interlock));
                    830:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
1.315     ad        831:        KASSERT(vp->v_freelisthd == NULL);
1.29      cgd       832:
1.309     ad        833:        if (vp->v_op == dead_vnodeop_p && (vp->v_iflag & VI_CLEAN) == 0) {
                    834:                vpanic(vp, "dead but not clean");
                    835:        }
                    836:
                    837:        /*
                    838:         * If not the last reference, just drop the reference count
                    839:         * and unlock.
                    840:         */
                    841:        if (vp->v_usecount > 1) {
                    842:                vp->v_usecount--;
                                        /*
                                         * Tell a VOP_INACTIVE() that may be in progress in
                                         * another thread that it has to retry; see the
                                         * VI_INACTREDO check after VOP_INACTIVE() below.
                                         */
                    843:                vp->v_iflag |= VI_INACTREDO;
                    844:                mutex_exit(&vp->v_interlock);
1.29      cgd       845:                return;
1.80      fvdl      846:        }
1.309     ad        847:        if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
                                        /* NOTE(review): message says "vput" but this is vrelel(). */
                    848:                vpanic(vp, "vput: bad ref count");
1.29      cgd       849:        }
1.309     ad        850:
1.30      mycroft   851:        /*
1.309     ad        852:         * If not clean, deactivate the vnode, but preserve
                    853:         * our reference across the call to VOP_INACTIVE().
1.30      mycroft   854:         */
1.309     ad        855:  retry:
                    856:        if ((vp->v_iflag & VI_CLEAN) == 0) {
                    857:                recycle = false;
                    858:                /*
                    859:                 * XXX This ugly block can be largely eliminated if
                    860:                 * locking is pushed down into the file systems.
                    861:                 */
                    862:                if (curlwp == uvm.pagedaemon_lwp) {
                    863:                        /* The pagedaemon can't wait around; defer. */
                    864:                        defer = true;
                    865:                } else if (curlwp == vrele_lwp) {
                    866:                        /* We have to try harder. */
                    867:                        vp->v_iflag &= ~VI_INACTREDO;
                    868:                        error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
                    869:                            LK_RETRY);
                    870:                        if (error != 0) {
                    871:                                /* XXX */
                                                        /*
                                                         * NOTE(review): the message contains
                                                         * "%p" but no pointer argument is
                                                         * passed; confirm how vpanic() treats
                                                         * its message string.
                                                         */
                    872:                                vpanic(vp, "vrele: unable to lock %p");
                    873:                        }
                    874:                        defer = false;
                    875:                } else if ((vp->v_iflag & VI_LAYER) != 0) {
                    876:                        /*
                    877:                         * Acquiring the stack's lock in vclean() even
                    878:                         * for an honest vput/vrele is dangerous because
                    879:                         * our caller may hold other vnode locks; defer.
                    880:                         */
                    881:                        defer = true;
                    882:                } else {
                    883:                        /* If we can't acquire the lock, then defer. */
                    884:                        vp->v_iflag &= ~VI_INACTREDO;
                    885:                        error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
                    886:                            LK_NOWAIT);
                    887:                        if (error != 0) {
                    888:                                defer = true;
                                                        /*
                                                         * The defer block below asserts that
                                                         * the interlock is held, so take it
                                                         * again after the failed vn_lock().
                                                         */
                    889:                                mutex_enter(&vp->v_interlock);
                    890:                        } else {
                    891:                                defer = false;
                    892:                        }
                    893:                }
                    894:
                    895:                if (defer) {
                    896:                        /*
                    897:                         * Defer reclaim to the kthread; it's not safe to
                    898:                         * clean it here.  We donate it our last reference.
                    899:                         */
                    900:                        KASSERT(mutex_owned(&vp->v_interlock));
                    901:                        KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
                    902:                        vp->v_iflag |= VI_INACTPEND;
                    903:                        mutex_enter(&vrele_lock);
                    904:                        TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
                                                /*
                                                 * Only signal the consumer once the backlog
                                                 * exceeds desiredvnodes / 256.
                                                 */
                    905:                        if (++vrele_pending > (desiredvnodes >> 8))
                    906:                                cv_signal(&vrele_cv);
                    907:                        mutex_exit(&vrele_lock);
                    908:                        mutex_exit(&vp->v_interlock);
                    909:                        return;
                    910:                }
                    911:
1.318     ad        912: #ifdef DIAGNOSTIC
1.321     ad        913:                if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
                    914:                    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
1.318     ad        915:                        vprint("vrelel: missing VOP_CLOSE()", vp);
                    916:                }
                    917: #endif
                    918:
1.309     ad        919:                /*
1.312     ad        920:                 * The vnode can gain another reference while being
                    921:                 * deactivated.  If VOP_INACTIVE() indicates that
                    922:                 * the described file has been deleted, then recycle
                    923:                 * the vnode irrespective of additional references.
                    924:                 * Another thread may be waiting to re-use the on-disk
                    925:                 * inode.
                    926:                 *
                    927:                 * Note that VOP_INACTIVE() will drop the vnode lock.
1.309     ad        928:                 */
                    929:                VOP_INACTIVE(vp, &recycle);
                    930:                mutex_enter(&vp->v_interlock);
1.312     ad        931:                if (!recycle) {
                    932:                        if (vp->v_usecount > 1) {
                    933:                                vp->v_usecount--;
                    934:                                mutex_exit(&vp->v_interlock);
                    935:                                return;
                    936:                        }
1.309     ad        937:
1.312     ad        938:                        /*
                    939:                         * If we grew another reference while
                    940:                         * VOP_INACTIVE() was underway, retry.
                    941:                         */
                    942:                        if ((vp->v_iflag & VI_INACTREDO) != 0) {
                    943:                                goto retry;
                    944:                        }
1.309     ad        945:                }
                    946:
                    947:                /* Take care of space accounting. */
                    948:                if (vp->v_iflag & VI_EXECMAP) {
                                                /* Move this vnode's pages from exec to file. */
                    949:                        atomic_add_int(&uvmexp.execpages,
                    950:                            -vp->v_uobj.uo_npages);
                    951:                        atomic_add_int(&uvmexp.filepages,
                    952:                            vp->v_uobj.uo_npages);
                    953:                }
                    954:                vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
                    955:                vp->v_vflag &= ~VV_MAPPED;
                    956:
                    957:                /*
                    958:                 * Recycle the vnode if the file is now unused (unlinked),
                    959:                 * otherwise just free it.
                    960:                 */
                    961:                if (recycle) {
                    962:                        vclean(vp, DOCLOSE);
                    963:                }
                    964:                KASSERT(vp->v_usecount > 0);
1.298     pooka     965:        }
1.309     ad        966:
                    967:        if (--vp->v_usecount != 0) {
                    968:                /* Gained another reference while being reclaimed. */
                    969:                mutex_exit(&vp->v_interlock);
                    970:                return;
1.147     chs       971:        }
1.298     pooka     972:
1.309     ad        973:        if ((vp->v_iflag & VI_CLEAN) != 0) {
                    974:                /*
                    975:                 * It's clean so destroy it.  It isn't referenced
                    976:                 * anywhere since it has been reclaimed.
                    977:                 */
                    978:                KASSERT(vp->v_holdcnt == 0);
                    979:                KASSERT(vp->v_writecount == 0);
                    980:                mutex_exit(&vp->v_interlock);
                    981:                insmntque(vp, NULL);
1.318     ad        982:                if (vp->v_type == VBLK || vp->v_type == VCHR) {
                    983:                        spec_node_destroy(vp);
                    984:                }
1.310     pooka     985:                vnfree(vp);
1.298     pooka     986:        } else {
1.309     ad        987:                /*
                    988:                 * Otherwise, put it back onto the freelist.  It
                    989:                 * can't be destroyed while still associated with
                    990:                 * a file system.
                    991:                 */
                    992:                mutex_enter(&vnode_free_list_lock);
                    993:                if (vp->v_holdcnt > 0) {
                    994:                        vp->v_freelisthd = &vnode_hold_list;
                    995:                } else {
                    996:                        vp->v_freelisthd = &vnode_free_list;
                    997:                }
                    998:                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                    999:                mutex_exit(&vnode_free_list_lock);
                   1000:                mutex_exit(&vp->v_interlock);
1.298     pooka    1001:        }
                   1002: }
                   1003:
                   1004: void
1.309     ad       1005: vrele(vnode_t *vp)
1.298     pooka    1006: {
                                /*
                                 * Drop one reference to a non-marker vnode.  Takes
                                 * v_interlock and hands over to vrelel(), which releases
                                 * the interlock before it returns.
                                 */
                   1007:
1.309     ad       1008:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                   1009:
                   1010:        mutex_enter(&vp->v_interlock);
1.324     pooka    1011:        vrelel(vp, 0);
1.298     pooka    1012: }
                   1013:
1.309     ad       1014: static void
                   1015: vrele_thread(void *cookie)
1.298     pooka    1016: {
                                /*
                                 * Consumer for vrele_list, the queue filled by the defer
                                 * path of vrelel().  Never returns.  'cookie' is not used.
                                 */
1.309     ad       1017:        vnode_t *vp;
1.298     pooka    1018:
1.309     ad       1019:        for (;;) {
                   1020:                mutex_enter(&vrele_lock);
                                        /*
                                         * Wait with a timeout of hz ticks; the empty check
                                         * is repeated after every wakeup.
                                         */
                   1021:                while (TAILQ_EMPTY(&vrele_list)) {
                   1022:                        cv_timedwait(&vrele_cv, &vrele_lock, hz);
                   1023:                }
                   1024:                vp = TAILQ_FIRST(&vrele_list);
                   1025:                TAILQ_REMOVE(&vrele_list, vp, v_freelist);
                   1026:                vrele_pending--;
                   1027:                mutex_exit(&vrele_lock);
                   1028:
                   1029:                /*
                   1030:                 * If not the last reference, then ignore the vnode
                   1031:                 * and look for more work.
                   1032:                 */
                   1033:                mutex_enter(&vp->v_interlock);
                   1034:                KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
                   1035:                vp->v_iflag &= ~VI_INACTPEND;
                   1036:                if (vp->v_usecount > 1) {
                                                /* Just give back the donated reference. */
                   1037:                        vp->v_usecount--;
                   1038:                        mutex_exit(&vp->v_interlock);
                   1039:                        continue;
                   1040:                }
1.324     pooka    1041:                vrelel(vp, 0);
1.309     ad       1042:        }
1.29      cgd      1043: }
                   1044:
                   1045: /*
                   1046:  * Page or buffer structure gets a reference.
1.258     chs      1047:  * Called with v_interlock held.
                          *
                          * Increments v_holdcnt.  If this is the first hold and the vnode
                          * has no use references, it is moved from vnode_free_list to the
                          * tail of vnode_hold_list under vnode_free_list_lock.
1.29      cgd      1048:  */
1.30      mycroft  1049: void
1.309     ad       1050: vholdl(vnode_t *vp)
1.29      cgd      1051: {
                   1052:
1.309     ad       1053:        KASSERT(mutex_owned(&vp->v_interlock));
                   1054:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                   1055:
                   1056:        if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
                   1057:                mutex_enter(&vnode_free_list_lock);
                   1058:                KASSERT(vp->v_freelisthd == &vnode_free_list);
                   1059:                TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
                   1060:                vp->v_freelisthd = &vnode_hold_list;
                   1061:                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                   1062:                mutex_exit(&vnode_free_list_lock);
1.113     fvdl     1063:        }
1.29      cgd      1064: }
                   1065:
                   1066: /*
                   1067:  * Page or buffer structure frees a reference.
1.258     chs      1068:  * Called with v_interlock held.
                          *
                          * Decrements v_holdcnt, panicking if it is not positive on entry.
                          * If the last hold goes away and the vnode has no use references,
                          * it is moved from vnode_hold_list to the tail of vnode_free_list
                          * under vnode_free_list_lock.
1.29      cgd      1069:  */
1.30      mycroft  1070: void
1.309     ad       1071: holdrelel(vnode_t *vp)
1.29      cgd      1072: {
                   1073:
1.309     ad       1074:        KASSERT(mutex_owned(&vp->v_interlock));
                   1075:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
1.142     chs      1076:
1.309     ad       1077:        if (vp->v_holdcnt <= 0) {
                                        /*
                                         * NOTE(review): the message contains "%p" but no
                                         * pointer argument is passed; confirm how vpanic()
                                         * treats its message string.
                                         */
                   1078:                vpanic(vp, "holdrelel: holdcnt vp %p");
                   1079:        }
1.142     chs      1080:
1.309     ad       1081:        vp->v_holdcnt--;
                   1082:        if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
                   1083:                mutex_enter(&vnode_free_list_lock);
                   1084:                KASSERT(vp->v_freelisthd == &vnode_hold_list);
                   1085:                TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
                   1086:                vp->v_freelisthd = &vnode_free_list;
                   1087:                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                   1088:                mutex_exit(&vnode_free_list_lock);
1.113     fvdl     1089:        }
1.81      ross     1090: }
                   1091:
                   1092: /*
1.309     ad       1093:  * Vnode reference, where a reference is already held by some other
                   1094:  * object (for example, a file structure).
                          *
                          * Takes and releases v_interlock itself.  Panics if the use count
                          * is not already positive (vget() is the function that handles a
                          * zero use count by removing the vnode from its freelist first) or
                          * if the increment wraps to zero.
1.81      ross     1095:  */
                   1096: void
1.309     ad       1097: vref(vnode_t *vp)
1.81      ross     1098: {
                   1099:
1.309     ad       1100:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                   1101:
                   1102:        mutex_enter(&vp->v_interlock);
                   1103:        if (vp->v_usecount <= 0) {
                   1104:                vpanic(vp, "vref used where vget required");
                   1105:        }
                   1106:        if (++vp->v_usecount == 0) {
                   1107:                vpanic(vp, "vref: usecount overflow");
1.112     mycroft  1108:        }
1.309     ad       1109:        mutex_exit(&vp->v_interlock);
1.29      cgd      1110: }
                   1111:
                   1112: /*
                   1113:  * Remove any vnodes in the vnode table belonging to mount point mp.
                   1114:  *
1.183     yamt     1115:  * If FORCECLOSE is not specified, there should not be any active ones,
1.29      cgd      1116:  * return error if any are found (nb: this is a user error, not a
1.183     yamt     1117:  * system error). If FORCECLOSE is specified, detach any active vnodes
1.29      cgd      1118:  * that are found.
1.183     yamt     1119:  *
                   1120:  * If WRITECLOSE is set, only flush out regular file vnodes open for
                   1121:  * writing.
                   1122:  *
                   1123:  * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
1.29      cgd      1124:  */
1.30      mycroft  1125: #ifdef DEBUG
                         /*
                          * DEBUG-only knob, exported by name through a struct ctldebug
                          * entry.  Its consumer is not in this part of the file.
                          */
                   1126: int busyprt = 0;       /* print out busy vnodes */
                   1127: struct ctldebug debug1 = { "busyprt", &busyprt };
                   1128: #endif
1.29      cgd      1129:
1.334     ad       1130: static vnode_t *
                   1131: vflushnext(vnode_t *mvp, int *when)
                   1132: {
                   1133:
                   1134:        if (hardclock_ticks > *when) {
                   1135:                mutex_exit(&mntvnode_lock);
                   1136:                yield();
                   1137:                mutex_enter(&mntvnode_lock);
                   1138:                *when = hardclock_ticks + hz / 10;
                   1139:        }
                   1140:
                   1141:        return vunmark(mvp);
                   1142: }
                   1143:
                         /*
                          * Walk the vnode list of mount point mp and clean out its vnodes.
                          *
                          * skipvp, if it matches a vnode on the list, is left untouched.  flags
                          * may contain SKIPSYSTEM, WRITECLOSE and FORCECLOSE.  A marker vnode
                          * (mvp) is threaded through the list so the scan can continue after
                          * mntvnode_lock has been dropped.
                          *
                          * Returns ENOMEM if the marker cannot be allocated, EBUSY if at least
                          * one referenced vnode was left alone, and 0 otherwise.
                          */
1.50      christos 1144: int
1.309     ad       1145: vflush(struct mount *mp, vnode_t *skipvp, int flags)
1.29      cgd      1146: {
1.309     ad       1147:        vnode_t *vp, *mvp;
1.334     ad       1148:        int busy = 0, when = 0;
1.29      cgd      1149:
1.309     ad       1150:        /* Allocate a marker vnode. */
1.310     pooka    1151:        if ((mvp = vnalloc(mp)) == NULL)
1.309     ad       1152:                return (ENOMEM);
                   1153:
                   1154:        mutex_enter(&mntvnode_lock);
1.273     reinoud  1155:        /*
                   1156:         * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
                   1157:         * and vclean() are called
                   1158:         */
1.334     ad       1159:        for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
                   1160:            vp = vflushnext(mvp, &when)) {
1.309     ad       1161:                vmark(mvp, vp);
                                        /* Ignore vnodes of other mounts and other scans' markers. */
                   1162:                if (vp->v_mount != mp || vismarker(vp))
                   1163:                        continue;
1.29      cgd      1164:                /*
                   1165:                 * Skip over a selected vnode.
                   1166:                 */
                   1167:                if (vp == skipvp)
                   1168:                        continue;
1.309     ad       1169:                mutex_enter(&vp->v_interlock);
1.29      cgd      1170:                /*
1.315     ad       1171:                 * Ignore clean but still referenced vnodes.
                   1172:                 */
                   1173:                if ((vp->v_iflag & VI_CLEAN) != 0) {
                   1174:                        mutex_exit(&vp->v_interlock);
                   1175:                        continue;
                   1176:                }
                   1177:                /*
1.309     ad       1178:                 * Skip over vnodes marked VV_SYSTEM.
1.29      cgd      1179:                 */
1.302     ad       1180:                if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
1.309     ad       1181:                        mutex_exit(&vp->v_interlock);
1.29      cgd      1182:                        continue;
1.80      fvdl     1183:                }
1.29      cgd      1184:                /*
1.30      mycroft  1185:                 * If WRITECLOSE is set, only flush out regular file
                   1186:                 * vnodes open for writing.
                   1187:                 */
                   1188:                if ((flags & WRITECLOSE) &&
1.92      thorpej  1189:                    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1.309     ad       1190:                        mutex_exit(&vp->v_interlock);
1.30      mycroft  1191:                        continue;
1.92      thorpej  1192:                }
1.30      mycroft  1193:                /*
1.29      cgd      1194:                 * With v_usecount == 0, all we need to do is clear
                   1195:                 * out the vnode data structures and we are done.
                   1196:                 */
                   1197:                if (vp->v_usecount == 0) {
1.309     ad       1198:                        mutex_exit(&mntvnode_lock);
                   1199:                        vremfree(vp);
                                                /* vclean() asserts a nonzero use count; take one. */
                   1200:                        vp->v_usecount++;
                   1201:                        vclean(vp, DOCLOSE);
                                                /* NOTE(review): vrelel() presumably drops the reference and the interlock - confirm. */
1.324     pooka    1202:                        vrelel(vp, 0);
1.309     ad       1203:                        mutex_enter(&mntvnode_lock);
1.29      cgd      1204:                        continue;
                   1205:                }
                   1206:                /*
1.30      mycroft  1207:                 * If FORCECLOSE is set, forcibly close the vnode.
1.29      cgd      1208:                 * For block or character devices, revert to an
1.318     ad       1209:                 * anonymous device.  For all other files, just
                   1210:                 * kill them.
1.29      cgd      1211:                 */
                   1212:                if (flags & FORCECLOSE) {
1.309     ad       1213:                        mutex_exit(&mntvnode_lock);
                   1214:                        vp->v_usecount++;
1.29      cgd      1215:                        if (vp->v_type != VBLK && vp->v_type != VCHR) {
1.309     ad       1216:                                vclean(vp, DOCLOSE);
1.324     pooka    1217:                                vrelel(vp, 0);
1.29      cgd      1218:                        } else {
                                                        /* Without DOCLOSE vclean() does not set VI_CLEAN. */
1.309     ad       1219:                                vclean(vp, 0);
1.318     ad       1220:                                vp->v_op = spec_vnodeop_p; /* XXXSMP */
1.320     ad       1221:                                mutex_exit(&vp->v_interlock);
                   1222:                                /*
                   1223:                                 * The vnode isn't clean, but still resides
                   1224:                                 * on the mount list.  Remove it. XXX This
                   1225:                                 * is a bit dodgy.
                   1226:                                 */
                   1227:                                insmntque(vp, NULL);
                   1228:                                vrele(vp);
1.29      cgd      1229:                        }
1.309     ad       1230:                        mutex_enter(&mntvnode_lock);
1.29      cgd      1231:                        continue;
                   1232:                }
                                        /* Referenced and no FORCECLOSE: count it as busy. */
1.30      mycroft  1233: #ifdef DEBUG
1.29      cgd      1234:                if (busyprt)
                   1235:                        vprint("vflush: busy vnode", vp);
1.30      mycroft  1236: #endif
1.309     ad       1237:                mutex_exit(&vp->v_interlock);
1.29      cgd      1238:                busy++;
                   1239:        }
1.309     ad       1240:        mutex_exit(&mntvnode_lock);
1.310     pooka    1241:        vnfree(mvp);
1.29      cgd      1242:        if (busy)
                   1243:                return (EBUSY);
                   1244:        return (0);
                   1245: }
                   1246:
                   1247: /*
                   1248:  * Disassociate the underlying file system from a vnode.
1.309     ad       1249:  *
                   1250:  * Must be called with the interlock held, and will return with it held.
                          *
                          * The caller must hold a reference (v_usecount != 0 is asserted).  If
                          * another cleaning is in progress (VI_XLOCK) this waits for it and
                          * returns; if the vnode is already VI_CLEAN it returns at once.
                          *
                          * With DOCLOSE in flags the vnode's buffers are invalidated, an active
                          * block/character device node is revoked, and VI_CLEAN is set when
                          * done.  Without DOCLOSE those steps are skipped and VI_CLEAN is not
                          * set.
1.29      cgd      1251:  */
1.309     ad       1252: void
                   1253: vclean(vnode_t *vp, int flags)
1.29      cgd      1254: {
1.309     ad       1255:        lwp_t *l = curlwp;
                   1256:        bool recycle, active;
1.318     ad       1257:        int error;
1.29      cgd      1258:
1.309     ad       1259:        KASSERT(mutex_owned(&vp->v_interlock));
                   1260:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                   1261:        KASSERT(vp->v_usecount != 0);
1.166     chs      1262:
1.309     ad       1263:        /* If cleaning is already in progress wait until done and return. */
                   1264:        if (vp->v_iflag & VI_XLOCK) {
                   1265:                vwait(vp, VI_XLOCK);
                   1266:                return;
                   1267:        }
1.166     chs      1268:
1.309     ad       1269:        /* If already clean, nothing to do. */
                   1270:        if ((vp->v_iflag & VI_CLEAN) != 0) {
                   1271:                return;
1.112     mycroft  1272:        }
1.87      pk       1273:
1.29      cgd      1274:        /*
1.309     ad       1275:         * Prevent the vnode from being recycled or brought into use
                   1276:         * while we clean it out.
1.29      cgd      1277:         */
1.302     ad       1278:        vp->v_iflag |= VI_XLOCK;
                                /* Move this vnode's pages from the exec page count to the file page count. */
                   1279:        if (vp->v_iflag & VI_EXECMAP) {
1.307     ad       1280:                atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
                   1281:                atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1.147     chs      1282:        }
1.302     ad       1283:        vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
                                /*
                                 * Callers in this file take one reference before calling, so a
                                 * count above one means somebody else is using the vnode too.
                                 */
1.309     ad       1284:        active = (vp->v_usecount > 1);
1.142     chs      1285:
1.309     ad       1286:        /* XXXAD should not lock vnode under layer */
                                /* NOTE(review): LK_INTERLOCK presumably releases v_interlock; it is re-taken below. */
                   1287:        VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);
1.80      fvdl     1288:
1.98      wrstuden 1289:        /*
1.142     chs      1290:         * Clean out any cached data associated with the vnode.
1.318     ad       1291:         * If purging an active vnode, it must be closed and
                   1292:         * deactivated before being reclaimed. Note that the
                   1293:         * VOP_INACTIVE will unlock the vnode.
1.29      cgd      1294:         */
1.166     chs      1295:        if (flags & DOCLOSE) {
                                        /* First try with V_SAVE; if that fails, retry without it. */
1.256     christos 1296:                error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1.318     ad       1297:                if (error != 0)
1.256     christos 1298:                        error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1.211     dbj      1299:                KASSERT(error == 0);
1.302     ad       1300:                KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1.318     ad       1301:                if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
                   1302:                         spec_node_revoke(vp);
1.231     mycroft  1303:                }
1.166     chs      1304:        }
1.29      cgd      1305:        if (active) {
                                        /* The value returned through recycle is not used here. */
1.309     ad       1306:                VOP_INACTIVE(vp, &recycle);
1.80      fvdl     1307:        } else {
                   1308:                /*
                   1309:                 * Any other processes trying to obtain this lock must first
1.302     ad       1310:                 * wait for VI_XLOCK to clear, then call the new lock operation.
1.80      fvdl     1311:                 */
                   1312:                VOP_UNLOCK(vp, 0);
1.29      cgd      1313:        }
1.142     chs      1314:
1.309     ad       1315:        /* Disassociate the underlying file system from the vnode. */
                   1316:        if (VOP_RECLAIM(vp)) {
                   1317:                vpanic(vp, "vclean: cannot reclaim");
1.87      pk       1318:        }
1.30      mycroft  1319:
1.169     chs      1320:        KASSERT(vp->v_uobj.uo_npages == 0);
                                /* Release the read-ahead context of a regular file, if any. */
1.255     yamt     1321:        if (vp->v_type == VREG && vp->v_ractx != NULL) {
                   1322:                uvm_ra_freectx(vp->v_ractx);
                   1323:                vp->v_ractx = NULL;
                   1324:        }
1.80      fvdl     1325:        cache_purge(vp);
                   1326:
1.309     ad       1327:        /* Done with purge, notify sleepers of the grim news. */
1.30      mycroft  1328:        vp->v_op = dead_vnodeop_p;
                   1329:        vp->v_tag = VT_NON;
                                /* Re-take the interlock: the function returns with it held. */
1.309     ad       1330:        mutex_enter(&vp->v_interlock);
                   1331:        vp->v_vnlock = &vp->v_lock;
1.332     ad       1332:        KNOTE(&vp->v_klist, NOTE_REVOKE);
1.312     ad       1333:        vp->v_iflag &= ~(VI_XLOCK | VI_FREEING);
1.304     ad       1334:        vp->v_vflag &= ~VV_LOCKSWORK;
1.319     ad       1335:        if ((flags & DOCLOSE) != 0) {
1.318     ad       1336:                vp->v_iflag |= VI_CLEAN;
                   1337:        }
                                /* Wake anyone waiting on v_cv (e.g. in the VI_XLOCK wait above). */
1.309     ad       1338:        cv_broadcast(&vp->v_cv);
                   1339:
                   1340:        KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1.29      cgd      1341: }
                   1342:
                   1343: /*
1.80      fvdl     1344:  * Recycle an unused vnode to the front of the free list.
                   1345:  * Release the passed interlock if the vnode will be recycled.
1.29      cgd      1346:  */
1.80      fvdl     1347: int
1.309     ad       1348: vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
1.217     junyoung 1349: {
                   1350:
1.309     ad       1351:        KASSERT((vp->v_iflag & VI_MARKER) == 0);
                   1352:
                   1353:        mutex_enter(&vp->v_interlock);
                   1354:        if (vp->v_usecount != 0) {
                   1355:                mutex_exit(&vp->v_interlock);
                   1356:                return (0);
1.29      cgd      1357:        }
1.309     ad       1358:        if (inter_lkp)
                   1359:                mutex_exit(inter_lkp);
                   1360:        vremfree(vp);
                   1361:        vp->v_usecount++;
                   1362:        vclean(vp, DOCLOSE);
1.324     pooka    1363:        vrelel(vp, 0);
1.309     ad       1364:        return (1);
1.29      cgd      1365: }
                   1366:
                   1367: /*
1.309     ad       1368:  * Eliminate all activity associated with a vnode in preparation for
                   1369:  * reuse.  Drops a reference from the vnode.
1.29      cgd      1370:  */
                   1371: void
1.309     ad       1372: vgone(vnode_t *vp)
1.80      fvdl     1373: {
1.166     chs      1374:
1.309     ad       1375:        mutex_enter(&vp->v_interlock);
                   1376:        vclean(vp, DOCLOSE);
1.324     pooka    1377:        vrelel(vp, 0);
1.29      cgd      1378: }
                   1379:
                   1380: /*
                   1381:  * Lookup a vnode by device number.
                   1382:  */
1.50      christos 1383: int
1.309     ad       1384: vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
1.29      cgd      1385: {
1.309     ad       1386:        vnode_t *vp;
1.80      fvdl     1387:        int rc = 0;
1.29      cgd      1388:
1.318     ad       1389:        mutex_enter(&specfs_lock);
                   1390:        for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1.29      cgd      1391:                if (dev != vp->v_rdev || type != vp->v_type)
                   1392:                        continue;
                   1393:                *vpp = vp;
1.80      fvdl     1394:                rc = 1;
                   1395:                break;
1.29      cgd      1396:        }
1.318     ad       1397:        mutex_exit(&specfs_lock);
1.80      fvdl     1398:        return (rc);
1.96      thorpej  1399: }
                   1400:
                   1401: /*
                   1402:  * Revoke all the vnodes corresponding to the specified minor number
                   1403:  * range (endpoints inclusive) of the specified major.
                   1404:  */
                   1405: void
1.247     thorpej  1406: vdevgone(int maj, int minl, int minh, enum vtype type)
1.96      thorpej  1407: {
1.316     ad       1408:        vnode_t *vp, **vpp;
                   1409:        dev_t dev;
1.96      thorpej  1410:        int mn;
                   1411:
1.274     mrg      1412:        vp = NULL;      /* XXX gcc */
                   1413:
1.318     ad       1414:        mutex_enter(&specfs_lock);
1.316     ad       1415:        for (mn = minl; mn <= minh; mn++) {
                   1416:                dev = makedev(maj, mn);
1.318     ad       1417:                vpp = &specfs_hash[SPECHASH(dev)];
1.316     ad       1418:                for (vp = *vpp; vp != NULL;) {
                   1419:                        mutex_enter(&vp->v_interlock);
                   1420:                        if ((vp->v_iflag & VI_CLEAN) != 0 ||
                   1421:                            dev != vp->v_rdev || type != vp->v_type) {
                   1422:                                mutex_exit(&vp->v_interlock);
                   1423:                                vp = vp->v_specnext;
                   1424:                                continue;
                   1425:                        }
1.318     ad       1426:                        mutex_exit(&specfs_lock);
1.316     ad       1427:                        if (vget(vp, LK_INTERLOCK) == 0) {
                   1428:                                VOP_REVOKE(vp, REVOKEALL);
                   1429:                                vrele(vp);
                   1430:                        }
1.318     ad       1431:                        mutex_enter(&specfs_lock);
1.316     ad       1432:                        vp = *vpp;
                   1433:                }
                   1434:        }
1.318     ad       1435:        mutex_exit(&specfs_lock);
1.29      cgd      1436: }
                   1437:
                   1438: /*
                   1439:  * Calculate the total number of references to a special device.
                   1440:  */
1.30      mycroft  1441: int
1.309     ad       1442: vcount(vnode_t *vp)
1.29      cgd      1443: {
                   1444:        int count;
                   1445:
1.318     ad       1446:        mutex_enter(&specfs_lock);
1.309     ad       1447:        mutex_enter(&vp->v_interlock);
1.318     ad       1448:        if (vp->v_specnode == NULL) {
1.309     ad       1449:                count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0);
                   1450:                mutex_exit(&vp->v_interlock);
1.318     ad       1451:                mutex_exit(&specfs_lock);
1.309     ad       1452:                return (count);
                   1453:        }
                   1454:        mutex_exit(&vp->v_interlock);
1.318     ad       1455:        count = vp->v_specnode->sn_dev->sd_opencnt;
                   1456:        mutex_exit(&specfs_lock);
1.29      cgd      1457:        return (count);
                   1458: }
                   1459:
1.101     mrg      1460: /*
1.316     ad       1461:  * Eliminate all activity associated with the requested vnode
                   1462:  * and with all vnodes aliased to the requested vnode.
                   1463:  */
                   1464: void
                   1465: vrevoke(vnode_t *vp)
                   1466: {
                   1467:        vnode_t *vq, **vpp;
                   1468:        enum vtype type;
                   1469:        dev_t dev;
                   1470:
                   1471:        KASSERT(vp->v_usecount > 0);
                   1472:
                   1473:        mutex_enter(&vp->v_interlock);
                   1474:        if ((vp->v_iflag & VI_CLEAN) != 0) {
                   1475:                mutex_exit(&vp->v_interlock);
                   1476:                return;
                   1477:        } else {
                   1478:                dev = vp->v_rdev;
                   1479:                type = vp->v_type;
                   1480:                mutex_exit(&vp->v_interlock);
                   1481:        }
                   1482:
1.318     ad       1483:        vpp = &specfs_hash[SPECHASH(dev)];
                   1484:        mutex_enter(&specfs_lock);
1.316     ad       1485:        for (vq = *vpp; vq != NULL;) {
1.333     ad       1486:                /* If clean or being cleaned, then ignore it. */
                   1487:                mutex_enter(&vq->v_interlock);
                   1488:                if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
1.317     ad       1489:                    vq->v_rdev != dev || vq->v_type != type) {
1.333     ad       1490:                        mutex_exit(&vq->v_interlock);
1.316     ad       1491:                        vq = vq->v_specnext;
                   1492:                        continue;
                   1493:                }
1.318     ad       1494:                mutex_exit(&specfs_lock);
                   1495:                if (vq->v_usecount == 0) {
1.317     ad       1496:                        vremfree(vq);
1.316     ad       1497:                }
1.318     ad       1498:                vq->v_usecount++;
1.316     ad       1499:                vclean(vq, DOCLOSE);
1.324     pooka    1500:                vrelel(vq, 0);
1.318     ad       1501:                mutex_enter(&specfs_lock);
1.316     ad       1502:                vq = *vpp;
                   1503:        }
1.318     ad       1504:        mutex_exit(&specfs_lock);
1.316     ad       1505: }
                   1506:
                   1507: /*
1.220     lukem    1508:  * sysctl helper routine to return list of supported fstypes
                          *
                          * Produces the names on vfs_list as one space-separated string.
                          * With a NULL oldp only the required length is computed.  Returns
                          * EPERM if a new value is supplied, EINVAL if a sub-name is given,
                          * otherwise 0 or the error from copyout().  If the user buffer runs
                          * out, the listing simply stops early and no error is reported.
                   1509:  */
                   1510: static int
                   1511: sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
                   1512: {
                                /* bf holds one entry; sized like statvfs's f_fstypename field. */
1.291     christos 1513:        char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
1.220     lukem    1514:        char *where = oldp;
                   1515:        struct vfsops *v;
                   1516:        size_t needed, left, slen;
                   1517:        int error, first;
                   1518:
                   1519:        if (newp != NULL)
                   1520:                return (EPERM);
                   1521:        if (namelen != 0)
                   1522:                return (EINVAL);
                   1523:
                   1524:        first = 1;
                   1525:        error = 0;
                   1526:        needed = 0;
                   1527:        left = *oldlenp;
                   1528:
1.311     ad       1529:        sysctl_unlock();
1.302     ad       1530:        mutex_enter(&vfs_list_lock);
1.220     lukem    1531:        LIST_FOREACH(v, &vfs_list, vfs_list) {
                   1532:                if (where == NULL)
                                                /* Size query: name plus one separator/terminator byte. */
                   1533:                        needed += strlen(v->vfs_name) + 1;
                   1534:                else {
1.245     christos 1535:                        memset(bf, 0, sizeof(bf));
                                                /* Every entry after the first is prefixed with a space. */
1.220     lukem    1536:                        if (first) {
1.245     christos 1537:                                strncpy(bf, v->vfs_name, sizeof(bf));
1.220     lukem    1538:                                first = 0;
                   1539:                        } else {
1.245     christos 1540:                                bf[0] = ' ';
                   1541:                                strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
1.220     lukem    1542:                        }
                                                /* strncpy() may leave bf unterminated; force a NUL. */
1.245     christos 1543:                        bf[sizeof(bf)-1] = '\0';
                   1544:                        slen = strlen(bf);
                   1545:                        if (left < slen + 1)
                   1546:                                break;
                   1547:                        /* +1 to copy out the trailing NUL byte */
                                                /*
                                                 * NOTE(review): the reference count is raised while
                                                 * vfs_list_lock is dropped for copyout(), presumably
                                                 * to keep v on the list - confirm.
                                                 */
1.302     ad       1548:                        v->vfs_refcount++;
                   1549:                        mutex_exit(&vfs_list_lock);
1.245     christos 1550:                        error = copyout(bf, where, slen + 1);
1.302     ad       1551:                        mutex_enter(&vfs_list_lock);
                   1552:                        v->vfs_refcount--;
1.220     lukem    1553:                        if (error)
                   1554:                                break;
                                                /* Advance by slen only: the next entry overwrites the NUL. */
                   1555:                        where += slen;
                   1556:                        needed += slen;
                   1557:                        left -= slen;
                   1558:                }
                   1559:        }
1.302     ad       1560:        mutex_exit(&vfs_list_lock);
1.311     ad       1561:        sysctl_relock();
                   1562:        *oldlenp = needed;
                   1563:        return (error);
                   1564: }
                   1565:
                   1566: /*
1.80      fvdl     1567:  * Top level filesystem related information gathering.
                          *
                          * Registers the "vfs" node, its "generic" child, and the generic
                          * leaves "usermount", "fstypes" and "magiclinks".  The return values
                          * of sysctl_createv() are not checked.
                   1568:  */
1.212     atatat   1569: SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
1.80      fvdl     1570: {
                                /* vfs: top-level node. */
1.218     atatat   1571:        sysctl_createv(clog, 0, NULL, NULL,
                   1572:                       CTLFLAG_PERMANENT,
1.212     atatat   1573:                       CTLTYPE_NODE, "vfs", NULL,
                   1574:                       NULL, 0, NULL, 0,
                   1575:                       CTL_VFS, CTL_EOL);
                                /* vfs.generic: container for the file-system-independent leaves. */
1.218     atatat   1576:        sysctl_createv(clog, 0, NULL, NULL,
                   1577:                       CTLFLAG_PERMANENT,
1.226     atatat   1578:                       CTLTYPE_NODE, "generic",
                   1579:                       SYSCTL_DESCR("Non-specific vfs related information"),
1.212     atatat   1580:                       NULL, 0, NULL, 0,
                   1581:                       CTL_VFS, VFS_GENERIC, CTL_EOL);
                                /* vfs.generic.usermount: read-write int backed by dovfsusermount. */
1.218     atatat   1582:        sysctl_createv(clog, 0, NULL, NULL,
                   1583:                       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1.226     atatat   1584:                       CTLTYPE_INT, "usermount",
                   1585:                       SYSCTL_DESCR("Whether unprivileged users may mount "
                   1586:                                    "filesystems"),
1.212     atatat   1587:                       NULL, 0, &dovfsusermount, 0,
                   1588:                       CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
                                /*
                                 * vfs.generic.fstypes: string served by
                                 * sysctl_vfs_generic_fstypes().  NOTE(review): CTL_CREATE
                                 * presumably requests a dynamically assigned number - confirm.
                                 */
1.220     lukem    1589:        sysctl_createv(clog, 0, NULL, NULL,
                   1590:                       CTLFLAG_PERMANENT,
                   1591:                       CTLTYPE_STRING, "fstypes",
                   1592:                       SYSCTL_DESCR("List of file systems present"),
                   1593:                       sysctl_vfs_generic_fstypes, 0, NULL, 0,
                   1594:                       CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
                                /* vfs.generic.magiclinks: read-write int backed by vfs_magiclinks. */
1.263     chs      1595:        sysctl_createv(clog, 0, NULL, NULL,
                   1596:                       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
                   1597:                       CTLTYPE_INT, "magiclinks",
                   1598:                       SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
                   1599:                       NULL, 0, &vfs_magiclinks, 0,
                   1600:                       CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
1.80      fvdl     1601: }
                   1602:
1.212     atatat   1603:
                         /*
                          * NOTE(review): kinfo_vdebug and kinfo_vgetfailed are not referenced by
                          * sysctl_kern_vnode() below; check for other users before touching them.
                          */
1.29      cgd      1604: int kinfo_vdebug = 1;
                   1605: int kinfo_vgetfailed;
                         /* Spare entries added to the size estimate returned by sysctl_kern_vnode(). */
                   1606: #define KINFO_VNODESLOP        10
                   1607: /*
                   1608:  * Dump vnode list (via sysctl).
                   1609:  * Copyout address of vnode followed by vnode.
                   1610:  */
                   1611: /* ARGSUSED */
1.50      christos 1612: int
1.212     atatat   1613: sysctl_kern_vnode(SYSCTLFN_ARGS)
1.29      cgd      1614: {
1.212     atatat   1615:        char *where = oldp;
                   1616:        size_t *sizep = oldlenp;
1.80      fvdl     1617:        struct mount *mp, *nmp;
1.311     ad       1618:        vnode_t *vp, *mvp, vbuf;
1.80      fvdl     1619:        char *bp = where, *savebp;
1.29      cgd      1620:        char *ewhere;
                   1621:        int error;
1.212     atatat   1622:
                   1623:        if (namelen != 0)
                   1624:                return (EOPNOTSUPP);
                   1625:        if (newp != NULL)
                   1626:                return (EPERM);
1.29      cgd      1627:
1.309     ad       1628: #define VPTRSZ sizeof(vnode_t *)
                   1629: #define VNODESZ        sizeof(vnode_t)
1.29      cgd      1630:        if (where == NULL) {
                   1631:                *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
                   1632:                return (0);
                   1633:        }
                   1634:        ewhere = where + *sizep;
1.80      fvdl     1635:
1.311     ad       1636:        sysctl_unlock();
1.302     ad       1637:        mutex_enter(&mountlist_lock);
1.177     matt     1638:        for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
                   1639:             mp = nmp) {
1.327     ad       1640:                if (vfs_trybusy(mp, RW_READER, &mountlist_lock)) {
1.177     matt     1641:                        nmp = CIRCLEQ_NEXT(mp, mnt_list);
1.29      cgd      1642:                        continue;
1.80      fvdl     1643:                }
1.29      cgd      1644:                savebp = bp;
1.309     ad       1645:                /* Allocate a marker vnode. */
1.311     ad       1646:                if ((mvp = vnalloc(mp)) == NULL) {
                   1647:                        sysctl_relock();
1.309     ad       1648:                        return (ENOMEM);
1.311     ad       1649:                }
1.309     ad       1650:                mutex_enter(&mntvnode_lock);
                   1651:                for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
                   1652:                        vmark(mvp, vp);
1.29      cgd      1653:                        /*
                   1654:                         * Check that the vp is still associated with
                   1655:                         * this filesystem.  RACE: could have been
                   1656:                         * recycled onto the same filesystem.
                   1657:                         */
1.309     ad       1658:                        if (vp->v_mount != mp || vismarker(vp))
                   1659:                                continue;
1.29      cgd      1660:                        if (bp + VPTRSZ + VNODESZ > ewhere) {
1.309     ad       1661:                                (void)vunmark(mvp);
                   1662:                                mutex_exit(&mntvnode_lock);
1.310     pooka    1663:                                vnfree(mvp);
1.311     ad       1664:                                sysctl_relock();
1.29      cgd      1665:                                *sizep = bp - where;
                   1666:                                return (ENOMEM);
                   1667:                        }
1.311     ad       1668:                        memcpy(&vbuf, vp, VNODESZ);
1.309     ad       1669:                        mutex_exit(&mntvnode_lock);
1.311     ad       1670:                        if ((error = copyout(vp, bp, VPTRSZ)) ||
                   1671:                           (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
1.309     ad       1672:                                mutex_enter(&mntvnode_lock);
                   1673:                                (void)vunmark(mvp);
                   1674:                                mutex_exit(&mntvnode_lock);
1.310     pooka    1675:                                vnfree(mvp);
1.311     ad       1676:                                sysctl_relock();
1.29      cgd      1677:                                return (error);
1.309     ad       1678:                        }
1.29      cgd      1679:                        bp += VPTRSZ + VNODESZ;
1.309     ad       1680:                        mutex_enter(&mntvnode_lock);
1.29      cgd      1681:                }
1.309     ad       1682:                mutex_exit(&mntvnode_lock);
1.302     ad       1683:                mutex_enter(&mountlist_lock);
1.177     matt     1684:                nmp = CIRCLEQ_NEXT(mp, mnt_list);
1.327     ad       1685:                vfs_unbusy(mp, false);
1.310     pooka    1686:                vnfree(mvp);
1.29      cgd      1687:        }
1.302     ad       1688:        mutex_exit(&mountlist_lock);
1.311     ad       1689:        sysctl_relock();
1.29      cgd      1690:
                   1691:        *sizep = bp - where;
                   1692:        return (0);
1.30      mycroft  1693: }
                   1694:
                   1695: /*
1.309     ad       1696:  * Remove clean vnodes from a mountpoint's vnode list.
                   1697:  */
                   1698: void
                   1699: vfs_scrubvnlist(struct mount *mp)
                   1700: {
                   1701:        vnode_t *vp, *nvp;
                   1702:
1.327     ad       1703:  retry:
1.309     ad       1704:        mutex_enter(&mntvnode_lock);
                   1705:        for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
                   1706:                nvp = TAILQ_NEXT(vp, v_mntvnodes);
                   1707:                mutex_enter(&vp->v_interlock);
1.315     ad       1708:                if ((vp->v_iflag & VI_CLEAN) != 0) {
1.309     ad       1709:                        TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes);
1.315     ad       1710:                        vp->v_mount = NULL;
1.327     ad       1711:                        mutex_exit(&mntvnode_lock);
                   1712:                        mutex_exit(&vp->v_interlock);
                   1713:                        vfs_destroy(mp);
                   1714:                        goto retry;
1.315     ad       1715:                }
1.309     ad       1716:                mutex_exit(&vp->v_interlock);
                   1717:        }
                   1718:        mutex_exit(&mntvnode_lock);
                   1719: }
                   1720:
                   1721: /*
1.30      mycroft  1722:  * Check to see if a filesystem is mounted on a block device.
                   1723:  */
                   1724: int
1.309     ad       1725: vfs_mountedon(vnode_t *vp)
1.30      mycroft  1726: {
1.309     ad       1727:        vnode_t *vq;
1.80      fvdl     1728:        int error = 0;
1.30      mycroft  1729:
1.261     reinoud  1730:        if (vp->v_type != VBLK)
                   1731:                return ENOTBLK;
1.113     fvdl     1732:        if (vp->v_specmountpoint != NULL)
1.30      mycroft  1733:                return (EBUSY);
1.318     ad       1734:        mutex_enter(&specfs_lock);
                   1735:        for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL;
                   1736:            vq = vq->v_specnext) {
                   1737:                if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
                   1738:                        continue;
                   1739:                if (vq->v_specmountpoint != NULL) {
                   1740:                        error = EBUSY;
                   1741:                        break;
1.30      mycroft  1742:                }
                   1743:        }
1.318     ad       1744:        mutex_exit(&specfs_lock);
1.80      fvdl     1745:        return (error);
1.30      mycroft  1746: }
                   1747:
1.35      ws       1748: /*
1.39      mycroft  1749:  * Unmount all file systems.
                   1750:  * We traverse the list in reverse order under the assumption that doing so
                   1751:  * will avoid needing to worry about dependencies.
                   1752:  */
                   1753: void
1.256     christos 1754: vfs_unmountall(struct lwp *l)
1.39      mycroft  1755: {
1.123     augustss 1756:        struct mount *mp, *nmp;
1.40      mycroft  1757:        int allerror, error;
1.39      mycroft  1758:
1.235     lukem    1759:        printf("unmounting file systems...");
1.325     dyoung   1760:        for (allerror = 0, mp = CIRCLEQ_LAST(&mountlist);
                   1761:             !CIRCLEQ_EMPTY(&mountlist);
                   1762:             mp = nmp) {
                   1763:                nmp = CIRCLEQ_PREV(mp, mnt_list);
1.54      jtk      1764: #ifdef DEBUG
1.235     lukem    1765:                printf("\nunmounting %s (%s)...",
1.56      christos 1766:                    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
1.54      jtk      1767: #endif
1.149     thorpej  1768:                /*
                   1769:                 * XXX Freeze syncer.  Must do this before locking the
                   1770:                 * mount point.  See dounmount() for details.
                   1771:                 */
1.281     ad       1772:                mutex_enter(&syncer_mutex);
1.327     ad       1773:                if (vfs_busy(mp, RW_WRITER, NULL)) {
1.281     ad       1774:                        mutex_exit(&syncer_mutex);
1.60      fvdl     1775:                        continue;
1.149     thorpej  1776:                }
1.256     christos 1777:                if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
1.57      christos 1778:                        printf("unmount of %s failed with error %d\n",
1.40      mycroft  1779:                            mp->mnt_stat.f_mntonname, error);
                   1780:                        allerror = 1;
                   1781:                }
1.39      mycroft  1782:        }
1.235     lukem    1783:        printf(" done\n");
1.39      mycroft  1784:        if (allerror)
1.57      christos 1785:                printf("WARNING: some file systems would not unmount\n");
1.40      mycroft  1786: }
                   1787:
                   1788: /*
                   1789:  * Sync and unmount file systems before shutting down.
                   1790:  */
                   1791: void
1.247     thorpej  1792: vfs_shutdown(void)
1.40      mycroft  1793: {
1.265     skrll    1794:        struct lwp *l;
1.40      mycroft  1795:
1.265     skrll    1796:        /* XXX we're certainly not running in lwp0's context! */
                   1797:        l = curlwp;
                   1798:        if (l == NULL)
                   1799:                l = &lwp0;
1.185     christos 1800:
1.70      cgd      1801:        printf("syncing disks... ");
                   1802:
1.305     pooka    1803:        /* remove user processes from run queue */
1.138     bouyer   1804:        suspendsched();
1.40      mycroft  1805:        (void) spl0();
                   1806:
1.128     sommerfe 1807:        /* avoid coming back this way again if we panic. */
                   1808:        doing_shutdown = 1;
                   1809:
1.184     thorpej  1810:        sys_sync(l, NULL, NULL);
1.40      mycroft  1811:
                   1812:        /* Wait for sync to finish. */
1.213     pk       1813:        if (buf_syncwait() != 0) {
1.124     augustss 1814: #if defined(DDB) && defined(DEBUG_HALT_BUSY)
                   1815:                Debugger();
                   1816: #endif
1.57      christos 1817:                printf("giving up\n");
1.84      thorpej  1818:                return;
1.73      thorpej  1819:        } else
1.57      christos 1820:                printf("done\n");
1.73      thorpej  1821:
1.84      thorpej  1822:        /*
                   1823:         * If we've panic'd, don't make the situation potentially
                   1824:         * worse by unmounting the file systems.
                   1825:         */
                   1826:        if (panicstr != NULL)
                   1827:                return;
                   1828:
                   1829:        /* Release inodes held by texts before update. */
1.73      thorpej  1830: #ifdef notdef
1.84      thorpej  1831:        vnshutdown();
1.73      thorpej  1832: #endif
1.84      thorpej  1833:        /* Unmount file systems. */
1.256     christos 1834:        vfs_unmountall(l);
1.58      thorpej  1835: }
                   1836:
                   1837: /*
                   1838:  * Mount the root file system.  If the operator didn't specify a
                   1839:  * file system to use, try all possible file systems until one
                   1840:  * succeeds.
                   1841:  */
                   1842: int
1.247     thorpej  1843: vfs_mountroot(void)
1.58      thorpej  1844: {
1.79      thorpej  1845:        struct vfsops *v;
1.239     mycroft  1846:        int error = ENODEV;
1.58      thorpej  1847:
                   1848:        if (root_device == NULL)
                   1849:                panic("vfs_mountroot: root device unknown");
                   1850:
1.264     thorpej  1851:        switch (device_class(root_device)) {
1.58      thorpej  1852:        case DV_IFNET:
                   1853:                if (rootdev != NODEV)
1.173     thorpej  1854:                        panic("vfs_mountroot: rootdev set for DV_IFNET "
                   1855:                            "(0x%08x -> %d,%d)", rootdev,
                   1856:                            major(rootdev), minor(rootdev));
1.58      thorpej  1857:                break;
                   1858:
                   1859:        case DV_DISK:
                   1860:                if (rootdev == NODEV)
                   1861:                        panic("vfs_mountroot: rootdev not set for DV_DISK");
1.239     mycroft  1862:                if (bdevvp(rootdev, &rootvp))
                   1863:                        panic("vfs_mountroot: can't get vnode for rootdev");
1.306     pooka    1864:                error = VOP_OPEN(rootvp, FREAD, FSCRED);
1.239     mycroft  1865:                if (error) {
                   1866:                        printf("vfs_mountroot: can't open root device\n");
                   1867:                        return (error);
                   1868:                }
1.58      thorpej  1869:                break;
                   1870:
                   1871:        default:
                   1872:                printf("%s: inappropriate for root file system\n",
1.336   ! cegger   1873:                    device_xname(root_device));
1.58      thorpej  1874:                return (ENODEV);
                   1875:        }
                   1876:
                   1877:        /*
                   1878:         * If user specified a file system, use it.
                   1879:         */
1.239     mycroft  1880:        if (mountroot != NULL) {
                   1881:                error = (*mountroot)();
                   1882:                goto done;
                   1883:        }
1.58      thorpej  1884:
                   1885:        /*
                   1886:         * Try each file system currently configured into the kernel.
                   1887:         */
1.302     ad       1888:        mutex_enter(&vfs_list_lock);
1.220     lukem    1889:        LIST_FOREACH(v, &vfs_list, vfs_list) {
1.79      thorpej  1890:                if (v->vfs_mountroot == NULL)
1.58      thorpej  1891:                        continue;
                   1892: #ifdef DEBUG
1.197     thorpej  1893:                aprint_normal("mountroot: trying %s...\n", v->vfs_name);
1.58      thorpej  1894: #endif
1.302     ad       1895:                v->vfs_refcount++;
                   1896:                mutex_exit(&vfs_list_lock);
1.239     mycroft  1897:                error = (*v->vfs_mountroot)();
1.302     ad       1898:                mutex_enter(&vfs_list_lock);
                   1899:                v->vfs_refcount--;
1.239     mycroft  1900:                if (!error) {
1.197     thorpej  1901:                        aprint_normal("root file system type: %s\n",
                   1902:                            v->vfs_name);
1.79      thorpej  1903:                        break;
1.58      thorpej  1904:                }
                   1905:        }
1.302     ad       1906:        mutex_exit(&vfs_list_lock);
1.58      thorpej  1907:
1.79      thorpej  1908:        if (v == NULL) {
1.336   ! cegger   1909:                printf("no file system for %s", device_xname(root_device));
1.264     thorpej  1910:                if (device_class(root_device) == DV_DISK)
1.79      thorpej  1911:                        printf(" (dev 0x%x)", rootdev);
                   1912:                printf("\n");
1.239     mycroft  1913:                error = EFTYPE;
1.79      thorpej  1914:        }
1.239     mycroft  1915:
                   1916: done:
1.264     thorpej  1917:        if (error && device_class(root_device) == DV_DISK) {
1.306     pooka    1918:                VOP_CLOSE(rootvp, FREAD, FSCRED);
1.239     mycroft  1919:                vrele(rootvp);
                   1920:        }
                   1921:        return (error);
1.58      thorpej  1922: }
1.326     ad       1923:
                   1924: /*
                   1925:  * Sham lock manager for vnodes.  This is a temporary measure.
                   1926:  */
                   1927: int
                   1928: vlockmgr(struct vnlock *vl, int flags)
                   1929: {
                   1930:
                   1931:        KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0);
                   1932:
                   1933:        switch (flags & LK_TYPE_MASK) {
                   1934:        case LK_SHARED:
                   1935:                if (rw_tryenter(&vl->vl_lock, RW_READER)) {
                   1936:                        return 0;
                   1937:                }
                   1938:                if ((flags & LK_NOWAIT) != 0) {
1.328     ad       1939:                        return EBUSY;
1.326     ad       1940:                }
                   1941:                rw_enter(&vl->vl_lock, RW_READER);
                   1942:                return 0;
                   1943:
                   1944:        case LK_EXCLUSIVE:
                   1945:                if (rw_tryenter(&vl->vl_lock, RW_WRITER)) {
                   1946:                        return 0;
                   1947:                }
                   1948:                if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) &&
                   1949:                    rw_write_held(&vl->vl_lock)) {
                   1950:                        vl->vl_recursecnt++;
                   1951:                        return 0;
                   1952:                }
                   1953:                if ((flags & LK_NOWAIT) != 0) {
1.328     ad       1954:                        return EBUSY;
1.326     ad       1955:                }
                   1956:                rw_enter(&vl->vl_lock, RW_WRITER);
                   1957:                return 0;
                   1958:
                   1959:        case LK_RELEASE:
                   1960:                if (vl->vl_recursecnt != 0) {
                   1961:                        KASSERT(rw_write_held(&vl->vl_lock));
                   1962:                        vl->vl_recursecnt--;
                   1963:                        return 0;
                   1964:                }
                   1965:                rw_exit(&vl->vl_lock);
                   1966:                return 0;
                   1967:
                   1968:        default:
                   1969:                panic("vlockmgr: flags %x", flags);
                   1970:        }
                   1971: }
                   1972:
                   1973: int
                   1974: vlockstatus(struct vnlock *vl)
                   1975: {
                   1976:
                   1977:        if (rw_write_held(&vl->vl_lock)) {
                   1978:                return LK_EXCLUSIVE;
                   1979:        }
                   1980:        if (rw_read_held(&vl->vl_lock)) {
                   1981:                return LK_SHARED;
                   1982:        }
                   1983:        return 0;
                   1984: }

CVSweb <webmaster@jp.NetBSD.org>