Annotation of src/sys/kern/vfs_vnode.c, Revision 1.23
1.23 ! hannken 1: /* $NetBSD: vfs_vnode.c,v 1.22 2013/10/25 20:39:40 martin Exp $ */
1.1 rmind 2:
3: /*-
1.2 rmind 4: * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
1.1 rmind 5: * All rights reserved.
6: *
7: * This code is derived from software contributed to The NetBSD Foundation
8: * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9: * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10: *
11: * Redistribution and use in source and binary forms, with or without
12: * modification, are permitted provided that the following conditions
13: * are met:
14: * 1. Redistributions of source code must retain the above copyright
15: * notice, this list of conditions and the following disclaimer.
16: * 2. Redistributions in binary form must reproduce the above copyright
17: * notice, this list of conditions and the following disclaimer in the
18: * documentation and/or other materials provided with the distribution.
19: *
20: * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30: * POSSIBILITY OF SUCH DAMAGE.
31: */
32:
33: /*
34: * Copyright (c) 1989, 1993
35: * The Regents of the University of California. All rights reserved.
36: * (c) UNIX System Laboratories, Inc.
37: * All or some portions of this file are derived from material licensed
38: * to the University of California by American Telephone and Telegraph
39: * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40: * the permission of UNIX System Laboratories, Inc.
41: *
42: * Redistribution and use in source and binary forms, with or without
43: * modification, are permitted provided that the following conditions
44: * are met:
45: * 1. Redistributions of source code must retain the above copyright
46: * notice, this list of conditions and the following disclaimer.
47: * 2. Redistributions in binary form must reproduce the above copyright
48: * notice, this list of conditions and the following disclaimer in the
49: * documentation and/or other materials provided with the distribution.
50: * 3. Neither the name of the University nor the names of its contributors
51: * may be used to endorse or promote products derived from this software
52: * without specific prior written permission.
53: *
54: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64: * SUCH DAMAGE.
65: *
66: * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67: */
68:
69: /*
1.8 rmind 70: * The vnode cache subsystem.
1.1 rmind 71: *
1.8 rmind 72: * Life-cycle
1.1 rmind 73: *
1.8 rmind 74: * Normally, there are two points where new vnodes are created:
75: * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
76: * starts in one of the following ways:
77: *
78: * - Allocation, via getnewvnode(9) and/or vnalloc(9).
79: * - Reclamation of inactive vnode, via vget(9).
80: *
1.16 rmind 81: * Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9)
82: * was another, traditional way. Currently, only the draining thread
83: * recycles the vnodes. This behaviour might be revisited.
84: *
1.8 rmind 85: * The life-cycle ends when the last reference is dropped, usually
86: * in VOP_REMOVE(9). In such case, VOP_INACTIVE(9) is called to inform
87: * the file system that vnode is inactive. Via this call, file system
1.16 rmind 88: * indicates whether vnode can be recycled (usually, it checks its own
89: * references, e.g. count of links, whether the file was removed).
1.8 rmind 90: *
91: * Depending on indication, vnode can be put into a free list (cache),
92: * or cleaned via vclean(9), which calls VOP_RECLAIM(9) to disassociate
93: * underlying file system from the vnode, and finally destroyed.
94: *
95: * Reference counting
96: *
97: * Vnode is considered active, if reference count (vnode_t::v_usecount)
98: * is non-zero. It is maintained using: vref(9) and vrele(9), as well
99: * as vput(9), routines. Common points holding references are e.g.
100: * file openings, current working directory, mount points, etc.
101: *
102: * Note on v_usecount and its locking
103: *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, the vnode_t::v_interlock will be held.  To change v_usecount
 *	away from zero, the interlock must be held.  To change from a
 *	non-zero value to zero, again the interlock must be held.
108: *
109: * There is a flag bit, VC_XLOCK, embedded in v_usecount. To raise
110: * v_usecount, if the VC_XLOCK bit is set in it, the interlock must
111: * be held. To modify the VC_XLOCK bit, the interlock must be held.
112: * We always keep the usecount (v_usecount & VC_MASK) non-zero while
113: * the VC_XLOCK bit is set.
114: *
115: * Unless the VC_XLOCK bit is set, changing the usecount from a non-zero
116: * value to a non-zero value can safely be done using atomic operations,
117: * without the interlock held.
118: *
119: * Even if the VC_XLOCK bit is set, decreasing the usecount to a non-zero
120: * value can be done using atomic operations, without the interlock held.
121: *
122: * Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
123: * mntvnode_lock is still held.
1.20 dholland 124: *
125: * See PR 41374.
1.1 rmind 126: */
127:
128: #include <sys/cdefs.h>
1.23 ! hannken 129: __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.22 2013/10/25 20:39:40 martin Exp $");
! 130:
! 131: #define _VFS_VNODE_PRIVATE
1.1 rmind 132:
133: #include <sys/param.h>
134: #include <sys/kernel.h>
135:
136: #include <sys/atomic.h>
137: #include <sys/buf.h>
138: #include <sys/conf.h>
139: #include <sys/device.h>
140: #include <sys/kauth.h>
141: #include <sys/kmem.h>
142: #include <sys/kthread.h>
143: #include <sys/module.h>
144: #include <sys/mount.h>
145: #include <sys/namei.h>
146: #include <sys/syscallargs.h>
147: #include <sys/sysctl.h>
148: #include <sys/systm.h>
149: #include <sys/vnode.h>
150: #include <sys/wapbl.h>
151:
152: #include <uvm/uvm.h>
153: #include <uvm/uvm_readahead.h>
154:
1.23 ! hannken 155: /* v_usecount; see the comment near the top of vfs_vnode.c */
! 156: #define VC_XLOCK 0x80000000
! 157: #define VC_MASK 0x7fffffff
! 158:
! 159: #define DOCLOSE 0x0008 /* vclean: close active files */
! 160:
! 161: /* Flags to vrelel. */
! 162: #define VRELEL_ASYNC_RELE 0x0001 /* Always defer to vrele thread. */
! 163:
1.6 rmind 164: u_int numvnodes __cacheline_aligned;
1.1 rmind 165:
1.6 rmind 166: static pool_cache_t vnode_cache __read_mostly;
1.16 rmind 167:
168: /*
169: * There are two free lists: one is for vnodes which have no buffer/page
170: * references and one for those which do (i.e. v_holdcnt is non-zero).
171: * Vnode recycling mechanism first attempts to look into the former list.
172: */
1.6 rmind 173: static kmutex_t vnode_free_list_lock __cacheline_aligned;
174: static vnodelst_t vnode_free_list __cacheline_aligned;
175: static vnodelst_t vnode_hold_list __cacheline_aligned;
1.16 rmind 176: static kcondvar_t vdrain_cv __cacheline_aligned;
177:
1.6 rmind 178: static vnodelst_t vrele_list __cacheline_aligned;
179: static kmutex_t vrele_lock __cacheline_aligned;
180: static kcondvar_t vrele_cv __cacheline_aligned;
181: static lwp_t * vrele_lwp __cacheline_aligned;
182: static int vrele_pending __cacheline_aligned;
183: static int vrele_gen __cacheline_aligned;
1.1 rmind 184:
1.12 hannken 185: static int cleanvnode(void);
1.23 ! hannken 186: static void vrelel(vnode_t *, int);
1.12 hannken 187: static void vdrain_thread(void *);
1.1 rmind 188: static void vrele_thread(void *);
1.11 christos 189: static void vnpanic(vnode_t *, const char *, ...)
1.18 christos 190: __printflike(2, 3);
1.1 rmind 191:
192: /* Routines having to do with the management of the vnode table. */
193: extern int (**dead_vnodeop_p)(void *);
194:
/*
 * vfs_vnode_sysinit: initialize the vnode cache subsystem.
 *
 * Creates the vnode pool cache, the free/hold/release lists and their
 * locks, and spawns the two helper kernel threads: "vdrain" (keeps
 * numvnodes below desiredvnodes) and "vrele" (performs deferred
 * releases from vrele_list).
 */
void
vfs_vnode_sysinit(void)
{
	int error __diagused;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	/* vrele_lwp is recorded so vrelel() can detect the vrele thread. */
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERT(error == 0);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}
219:
220: /*
221: * Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a
1.13 hannken 222: * marker vnode.
1.1 rmind 223: */
224: vnode_t *
225: vnalloc(struct mount *mp)
226: {
227: vnode_t *vp;
228:
1.13 hannken 229: vp = pool_cache_get(vnode_cache, PR_WAITOK);
230: KASSERT(vp != NULL);
1.1 rmind 231:
232: memset(vp, 0, sizeof(*vp));
1.9 rmind 233: uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
1.1 rmind 234: cv_init(&vp->v_cv, "vnode");
235: /*
236: * Done by memset() above.
237: * LIST_INIT(&vp->v_nclist);
238: * LIST_INIT(&vp->v_dnclist);
239: */
240:
241: if (mp != NULL) {
242: vp->v_mount = mp;
243: vp->v_type = VBAD;
244: vp->v_iflag = VI_MARKER;
245: } else {
246: rw_init(&vp->v_lock);
247: }
248:
249: return vp;
250: }
251:
252: /*
253: * Free an unused, unreferenced vnode.
254: */
255: void
256: vnfree(vnode_t *vp)
257: {
258:
259: KASSERT(vp->v_usecount == 0);
260:
261: if ((vp->v_iflag & VI_MARKER) == 0) {
262: rw_destroy(&vp->v_lock);
263: mutex_enter(&vnode_free_list_lock);
264: numvnodes--;
265: mutex_exit(&vnode_free_list_lock);
266: }
267:
1.9 rmind 268: /*
269: * Note: the vnode interlock will either be freed, of reference
270: * dropped (if VI_LOCKSHARE was in use).
271: */
272: uvm_obj_destroy(&vp->v_uobj, true);
1.1 rmind 273: cv_destroy(&vp->v_cv);
274: pool_cache_put(vnode_cache, vp);
275: }
276:
277: /*
1.12 hannken 278: * cleanvnode: grab a vnode from freelist, clean and free it.
1.5 rmind 279: *
280: * => Releases vnode_free_list_lock.
1.1 rmind 281: */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;

	KASSERT(mutex_owned(&vnode_free_list_lock));
retry:
	/* Prefer vnodes without buffer/page references (see list comment). */
	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		/* Avoid lock-order trouble: never block on the interlock. */
		if (!mutex_tryenter(vp->v_interlock))
			continue;
		/* Skip vnodes that are already being cleaned out. */
		if ((vp->v_iflag & VI_XLOCK) == 0)
			break;
		mutex_exit(vp->v_interlock);
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		/* Both lists exhausted; caller must retry later. */
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.  If the vnode gains another reference while
	 * being cleaned out then we lose - retry.
	 *
	 * Adding VC_XLOCK forces racing vtryrele()/vref() paths through
	 * the interlock (see the v_usecount notes at the top of the file).
	 */
	atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK);
	vclean(vp, DOCLOSE);
	KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
	atomic_add_int(&vp->v_usecount, -VC_XLOCK);
	if (vp->v_usecount > 1) {
		/*
		 * Don't return to freelist - the holder of the last
		 * reference will destroy it.
		 */
		vrelel(vp, 0); /* releases vp->v_interlock */
		mutex_enter(&vnode_free_list_lock);
		goto retry;
	}

	KASSERT((vp->v_iflag & VI_CLEAN) == VI_CLEAN);
	mutex_exit(vp->v_interlock);
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		spec_node_destroy(vp);
	}
	vp->v_type = VNON;

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);
	KASSERT(TAILQ_EMPTY(&vp->v_uobj.memq));
	KASSERT(vp->v_numoutput == 0);
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

	/* Drop our (now last) reference; vrele() will free the vnode. */
	vrele(vp);

	return 0;
}
363:
364: /*
1.12 hannken 365: * getnewvnode: return a fresh vnode.
1.5 rmind 366: *
367: * => Returns referenced vnode, moved into the mount queue.
1.9 rmind 368: * => Shares the interlock specified by 'slock', if it is not NULL.
1.1 rmind 369: */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
	struct uvm_object *uobj __diagused;
	vnode_t *vp;
	int error = 0;

	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = NULL;

	/* Allocate a new vnode. */
	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	/* Wake the drain thread once we are 10% over the target. */
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);
	vp = vnalloc(NULL);

	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));

	/* Initialize vnode.  The caller receives this single reference. */
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	vp->v_data = NULL;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	/* Share the vnode_t::v_interlock, if requested. */
	if (slock) {
		/* Set the interlock and mark that it is shared. */
		KASSERT(vp->v_mount == NULL);
		mutex_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
		KASSERT(vp->v_interlock == slock);
		vp->v_iflag |= VI_LOCKSHARE;
	}

	/* Finally, move vnode into the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}
437:
438: /*
439: * This is really just the reverse of getnewvnode(). Needed for
440: * VFS_VGET functions who may need to push back a vnode in case
441: * of a locking race.
442: */
void
ungetnewvnode(vnode_t *vp)
{

	/* Must still hold the single reference handed out by getnewvnode(). */
	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(vp->v_interlock);
	/*
	 * Mark it clean so vrelel() destroys it directly instead of
	 * deactivating it.  vrelel() releases vp->v_interlock.
	 */
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}
455:
456: /*
1.12 hannken 457: * Helper thread to keep the number of vnodes below desiredvnodes.
458: */
459: static void
460: vdrain_thread(void *cookie)
461: {
462: int error;
463:
464: mutex_enter(&vnode_free_list_lock);
465:
466: for (;;) {
467: cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
468: while (numvnodes > desiredvnodes) {
469: error = cleanvnode();
470: if (error)
471: kpause("vndsbusy", false, hz, NULL);
472: mutex_enter(&vnode_free_list_lock);
473: if (error)
474: break;
475: }
476: }
477: }
478:
479: /*
1.1 rmind 480: * Remove a vnode from its freelist.
481: */
482: void
483: vremfree(vnode_t *vp)
484: {
485:
1.9 rmind 486: KASSERT(mutex_owned(vp->v_interlock));
1.1 rmind 487: KASSERT(vp->v_usecount == 0);
488:
489: /*
490: * Note that the reference count must not change until
491: * the vnode is removed.
492: */
493: mutex_enter(&vnode_free_list_lock);
494: if (vp->v_holdcnt > 0) {
495: KASSERT(vp->v_freelisthd == &vnode_hold_list);
496: } else {
497: KASSERT(vp->v_freelisthd == &vnode_free_list);
498: }
499: TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
500: vp->v_freelisthd = NULL;
501: mutex_exit(&vnode_free_list_lock);
502: }
503:
504: /*
1.4 rmind 505: * vget: get a particular vnode from the free list, increment its reference
506: * count and lock it.
507: *
508: * => Should be called with v_interlock held.
509: *
510: * If VI_XLOCK is set, the vnode is being eliminated in vgone()/vclean().
511: * In that case, we cannot grab the vnode, so the process is awakened when
512: * the transition is completed, and an error returned to indicate that the
513: * vnode is no longer usable (e.g. changed to a new file system type).
1.1 rmind 514: */
int
vget(vnode_t *vp, int flags)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking if
	 * the VI_XLOCK flag is set.  On every error path vrelel()
	 * drops the reference we took above and releases v_interlock.
	 */
	if ((vp->v_iflag & VI_XLOCK) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_XLOCK);
		vrelel(vp, 0);
		return ENOENT;
	}

	if ((vp->v_iflag & VI_INACTNOW) != 0) {
		/*
		 * If it is being deactivated, wait for deactivation to
		 * complete.  Make sure to not return a clean vnode.
		 */
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_INACTNOW);
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			vrelel(vp, 0);
			return ENOENT;
		}
	}

	/*
	 * Ok, we got it in good shape.  Just locking left.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(vp->v_interlock);
	if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
		error = vn_lock(vp, flags);
		if (error != 0) {
			vrele(vp);
		}
	}
	return error;
}
580:
581: /*
1.4 rmind 582: * vput: unlock and release the reference.
1.1 rmind 583: */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	/* Drop the vnode lock first; vrele() takes v_interlock itself. */
	VOP_UNLOCK(vp);
	vrele(vp);
}
593:
594: /*
595: * Try to drop reference on a vnode. Abort if we are releasing the
596: * last reference. Note: this _must_ succeed if not the last reference.
597: */
598: static inline bool
599: vtryrele(vnode_t *vp)
600: {
601: u_int use, next;
602:
603: for (use = vp->v_usecount;; use = next) {
604: if (use == 1) {
605: return false;
606: }
607: KASSERT((use & VC_MASK) > 1);
608: next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
609: if (__predict_true(next == use)) {
610: return true;
611: }
612: }
613: }
614:
615: /*
616: * Vnode release. If reference count drops to zero, call inactive
617: * routine and either return to freelist or free to the pool.
618: */
/*
 * vrelel: release a reference with vp->v_interlock held.  If this is the
 * last reference, deactivate the vnode via VOP_INACTIVE() and either
 * return it to a freelist or destroy it.  Releases vp->v_interlock on
 * every return path.  'flags' is 0 or VRELEL_ASYNC_RELE (defer to the
 * vrele thread).
 */
static void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.  VI_INACTREDO records the race so a concurrent
	 * deactivation knows to retry (see the retry check below).
	 */
	if (vtryrele(vp)) {
		vp->v_iflag |= VI_INACTREDO;
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
retry:
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;
		vp->v_iflag |= VI_INACTNOW;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			/* The pagedaemon can't wait around; defer. */
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/*
			 * We have to try harder.  But we can't sleep
			 * with VI_INACTNOW as vget() may be waiting on it.
			 */
			vp->v_iflag &= ~(VI_INACTREDO|VI_INACTNOW);
			cv_broadcast(&vp->v_cv);
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE);
			if (error != 0) {
				/* XXX */
				vnpanic(vp, "%s: unable to lock %p",
				    __func__, vp);
			}
			mutex_enter(vp->v_interlock);
			/*
			 * if we did get another reference while
			 * sleeping, don't try to inactivate it yet.
			 */
			if (__predict_false(vtryrele(vp))) {
				VOP_UNLOCK(vp);
				mutex_exit(vp->v_interlock);
				return;
			}
			vp->v_iflag |= VI_INACTNOW;
			mutex_exit(vp->v_interlock);
			defer = false;
		} else if ((vp->v_iflag & VI_LAYER) != 0) {
			/*
			 * Acquiring the stack's lock in vclean() even
			 * for an honest vput/vrele is dangerous because
			 * our caller may hold other vnode locks; defer.
			 */
			defer = true;
		} else {
			/* If we can't acquire the lock, then defer. */
			vp->v_iflag &= ~VI_INACTREDO;
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
			if (error != 0) {
				defer = true;
				mutex_enter(vp->v_interlock);
			} else {
				defer = false;
			}
		}

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			KASSERT(mutex_owned(vp->v_interlock));
			vp->v_iflag &= ~VI_INACTNOW;
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			/* Wake any vget() waiting on VI_INACTNOW. */
			cv_broadcast(&vp->v_cv);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		vp->v_iflag &= ~VI_INACTNOW;
		cv_broadcast(&vp->v_cv);
		if (!recycle) {
			if (vtryrele(vp)) {
				mutex_exit(vp->v_interlock);
				return;
			}

			/*
			 * If we grew another reference while
			 * VOP_INACTIVE() was underway, retry.
			 */
			if ((vp->v_iflag & VI_INACTREDO) != 0) {
				goto retry;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp, DOCLOSE);
		}
		KASSERT(vp->v_usecount > 0);
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		mutex_exit(vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		mutex_exit(vp->v_interlock);
	}
}
825:
826: void
827: vrele(vnode_t *vp)
828: {
829:
830: KASSERT((vp->v_iflag & VI_MARKER) == 0);
831:
832: if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
833: return;
834: }
1.9 rmind 835: mutex_enter(vp->v_interlock);
1.1 rmind 836: vrelel(vp, 0);
837: }
838:
839: /*
840: * Asynchronous vnode release, vnode is released in different context.
841: */
842: void
843: vrele_async(vnode_t *vp)
844: {
845:
846: KASSERT((vp->v_iflag & VI_MARKER) == 0);
847:
848: if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
849: return;
850: }
1.9 rmind 851: mutex_enter(vp->v_interlock);
1.1 rmind 852: vrelel(vp, VRELEL_ASYNC_RELE);
853: }
854:
/*
 * vrele_thread: worker that performs deferred releases queued on
 * vrele_list by vrelel().
 */
static void
vrele_thread(void *cookie)
{
	vnode_t *vp;

	for (;;) {
		mutex_enter(&vrele_lock);
		while (TAILQ_EMPTY(&vrele_list)) {
			/* Bump the generation so vrele_flush() can finish. */
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
		}
		vp = TAILQ_FIRST(&vrele_list);
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		vrelel(vp, 0);
	}
}
880:
1.2 rmind 881: void
882: vrele_flush(void)
883: {
884: int gen;
885:
886: mutex_enter(&vrele_lock);
887: gen = vrele_gen;
888: while (vrele_pending && gen == vrele_gen) {
889: cv_broadcast(&vrele_cv);
890: cv_wait(&vrele_cv, &vrele_lock);
891: }
892: mutex_exit(&vrele_lock);
893: }
894:
1.1 rmind 895: /*
896: * Vnode reference, where a reference is already held by some other
897: * object (for example, a file structure).
898: */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/*
	 * The caller already holds a reference, so the count is non-zero
	 * and a plain atomic increment suffices (see the v_usecount
	 * locking notes at the top of the file).
	 */
	atomic_inc_uint(&vp->v_usecount);
}
908:
909: /*
910: * Page or buffer structure gets a reference.
911: * Called with v_interlock held.
912: */
913: void
914: vholdl(vnode_t *vp)
915: {
916:
1.9 rmind 917: KASSERT(mutex_owned(vp->v_interlock));
1.1 rmind 918: KASSERT((vp->v_iflag & VI_MARKER) == 0);
919:
920: if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
921: mutex_enter(&vnode_free_list_lock);
922: KASSERT(vp->v_freelisthd == &vnode_free_list);
923: TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
924: vp->v_freelisthd = &vnode_hold_list;
925: TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
926: mutex_exit(&vnode_free_list_lock);
927: }
928: }
929:
930: /*
931: * Page or buffer structure frees a reference.
932: * Called with v_interlock held.
933: */
934: void
935: holdrelel(vnode_t *vp)
936: {
937:
1.9 rmind 938: KASSERT(mutex_owned(vp->v_interlock));
1.1 rmind 939: KASSERT((vp->v_iflag & VI_MARKER) == 0);
940:
941: if (vp->v_holdcnt <= 0) {
1.11 christos 942: vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
1.1 rmind 943: }
944:
945: vp->v_holdcnt--;
946: if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
947: mutex_enter(&vnode_free_list_lock);
948: KASSERT(vp->v_freelisthd == &vnode_hold_list);
949: TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
950: vp->v_freelisthd = &vnode_free_list;
951: TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
952: mutex_exit(&vnode_free_list_lock);
953: }
954: }
955:
/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 *
 * => Caller must hold a reference (v_usecount != 0).
 * => If another thread is already cleaning (VI_XLOCK set), wait for it
 *    to finish and return without doing anything.
 * => An active block device with a file system still mounted on it is
 *    not closed; it is turned into an anonymous spec device instead
 *    (make_anon path, and VI_CLEAN is then left unset because DOCLOSE
 *    is stripped from flags).
 */
void
vclean(vnode_t *vp, int flags)
{
	lwp_t *l = curlwp;
	bool recycle, active, make_anon;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If cleaning is already in progress wait until done and return. */
	if (vp->v_iflag & VI_XLOCK) {
		vwait(vp, VI_XLOCK);
		return;
	}

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.  The pages are no longer executable
	 * once the vnode dies, so move the execpages accounting back
	 * to filepages.
	 */
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	/* "active": references beyond the one the caller holds. */
	active = (vp->v_usecount & VC_MASK) > 1;

	/* XXXAD should not lock vnode under layer */
	mutex_exit(vp->v_interlock);
	VOP_LOCK(vp, LK_EXCLUSIVE);

	/*
	 * An active block device with a mounted file system must stay
	 * usable as an anonymous device node; suppress the close.
	 */
	make_anon = (active && vp->v_type == VBLK &&
	    spec_node_getmountedfs(vp) != NULL);
	if (make_anon)
		flags &= ~DOCLOSE;

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (flags & DOCLOSE) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0) {
			/* XXX, fix vn_start_write's grab of mp and use that. */

			/*
			 * Flushing with V_SAVE failed; discard any
			 * journal and retry without saving dirty data.
			 */
			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		}
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	/* Free any read-ahead context attached to a regular file. */
	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/*
	 * The vnode isn't clean, but still resides on the mount list.  Remove
	 * it. XXX This is a bit dodgy.
	 */
	if (make_anon)
		vfs_insmntque(vp, NULL);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	if (make_anon) {
		/* Keep it usable as an anonymous device node. */
		vp->v_op = spec_vnodeop_p;
	} else {
		vp->v_op = dead_vnodeop_p;
	}
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	vp->v_vflag &= ~VV_LOCKSWORK;
	if ((flags & DOCLOSE) != 0) {
		vp->v_iflag |= VI_CLEAN;
	}
	/* Wake anyone sleeping in vwait() on VI_XLOCK. */
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}
1076:
/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 *
 * => Returns 1 if the vnode was recycled (cleaned out and released),
 *    0 if it was still referenced and left untouched.
 * => inter_lkp, if non-NULL, is a caller-held lock protecting the
 *    caller's pointer to this vnode; it is dropped only after the
 *    vnode's own interlock has been taken, so there is no window in
 *    which the vnode can be freed out from under us.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(vp->v_interlock);
	if (vp->v_usecount != 0) {
		/* Still in use: cannot recycle. */
		mutex_exit(vp->v_interlock);
		return 0;
	}
	if (inter_lkp) {
		mutex_exit(inter_lkp);
	}
	/* Pull it off the free list and take a reference for cleaning. */
	vremfree(vp);
	vp->v_usecount = 1;
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
	return 1;
}
1101:
1102: /*
1103: * Eliminate all activity associated with the requested vnode
1104: * and with all vnodes aliased to the requested vnode.
1105: */
1106: void
1107: vrevoke(vnode_t *vp)
1108: {
1.19 hannken 1109: vnode_t *vq;
1.1 rmind 1110: enum vtype type;
1111: dev_t dev;
1112:
1113: KASSERT(vp->v_usecount > 0);
1114:
1.9 rmind 1115: mutex_enter(vp->v_interlock);
1.1 rmind 1116: if ((vp->v_iflag & VI_CLEAN) != 0) {
1.9 rmind 1117: mutex_exit(vp->v_interlock);
1.1 rmind 1118: return;
1119: } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1120: atomic_inc_uint(&vp->v_usecount);
1121: vclean(vp, DOCLOSE);
1122: vrelel(vp, 0);
1123: return;
1124: } else {
1125: dev = vp->v_rdev;
1126: type = vp->v_type;
1.9 rmind 1127: mutex_exit(vp->v_interlock);
1.1 rmind 1128: }
1129:
1.19 hannken 1130: while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
1.9 rmind 1131: mutex_enter(vq->v_interlock);
1.1 rmind 1132: vclean(vq, DOCLOSE);
1133: vrelel(vq, 0);
1134: }
1135: }
1136:
/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	/* vclean() returns with the interlock still held. */
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
}
1149:
1150: /*
1151: * Update outstanding I/O count and do wakeup if requested.
1152: */
1153: void
1154: vwakeup(struct buf *bp)
1155: {
1156: vnode_t *vp;
1157:
1158: if ((vp = bp->b_vp) == NULL)
1159: return;
1160:
1.9 rmind 1161: KASSERT(bp->b_objlock == vp->v_interlock);
1.1 rmind 1162: KASSERT(mutex_owned(bp->b_objlock));
1163:
1164: if (--vp->v_numoutput < 0)
1.11 christos 1165: vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
1.1 rmind 1166: if (vp->v_numoutput == 0)
1167: cv_broadcast(&vp->v_cv);
1168: }
1169:
1170: /*
1171: * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
1172: * recycled.
1173: */
1174: void
1175: vwait(vnode_t *vp, int flags)
1176: {
1177:
1.9 rmind 1178: KASSERT(mutex_owned(vp->v_interlock));
1.1 rmind 1179: KASSERT(vp->v_usecount != 0);
1180:
1181: while ((vp->v_iflag & flags) != 0)
1.9 rmind 1182: cv_wait(&vp->v_cv, vp->v_interlock);
1.1 rmind 1183: }
1184:
/*
 * Shrink the pool of vnodes until at most `target' remain.
 *
 * => Returns 0 on success, or the error from cleanvnode() if a vnode
 *    could not be reclaimed.
 * => NOTE(review): cleanvnode() appears to be entered with
 *    vnode_free_list_lock held and to release it before returning --
 *    the lock is re-entered each iteration and the error path returns
 *    without unlocking.  Confirm against cleanvnode()'s definition.
 */
int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		/* cleanvnode() dropped the lock; take it again. */
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	return 0;
}
1203:
/*
 * Panic with a vnode-related message.
 *
 * => fmt and the variadic arguments are forwarded to vpanic().
 * => Under DIAGNOSTIC the vnode's state is printed first so it
 *    appears in the panic output.
 * => Presumably does not return (vpanic() panics); va_end() is kept
 *    for formal correctness.
 */
void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}
CVSweb <webmaster@jp.NetBSD.org>