Annotation of src/sys/kern/vfs_wapbl.c, Revision 1.53
1.53 ! hannken 1: /* $NetBSD: vfs_wapbl.c,v 1.52 2012/04/29 22:55:11 chs Exp $ */
1.2 simonb 2:
3: /*-
1.23 ad 4: * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
1.2 simonb 5: * All rights reserved.
6: *
7: * This code is derived from software contributed to The NetBSD Foundation
8: * by Wasabi Systems, Inc.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
18: *
19: * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29: * POSSIBILITY OF SUCH DAMAGE.
30: */
31:
32: /*
33: * This implements file system independent write ahead filesystem logging.
34: */
1.4 joerg 35:
36: #define WAPBL_INTERNAL
37:
1.2 simonb 38: #include <sys/cdefs.h>
1.53 ! hannken 39: __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.52 2012/04/29 22:55:11 chs Exp $");
1.2 simonb 40:
41: #include <sys/param.h>
1.31 mlelstv 42: #include <sys/bitops.h>
1.2 simonb 43:
44: #ifdef _KERNEL
45: #include <sys/param.h>
46: #include <sys/namei.h>
47: #include <sys/proc.h>
1.39 christos 48: #include <sys/sysctl.h>
1.2 simonb 49: #include <sys/uio.h>
50: #include <sys/vnode.h>
51: #include <sys/file.h>
1.35 pooka 52: #include <sys/module.h>
1.2 simonb 53: #include <sys/resourcevar.h>
54: #include <sys/conf.h>
55: #include <sys/mount.h>
56: #include <sys/kernel.h>
57: #include <sys/kauth.h>
58: #include <sys/mutex.h>
59: #include <sys/atomic.h>
60: #include <sys/wapbl.h>
1.16 joerg 61: #include <sys/wapbl_replay.h>
1.2 simonb 62:
63: #include <miscfs/specfs/specdev.h>
64:
1.51 para 65: #define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
66: #define wapbl_free(a, s) kmem_free((a), (s))
67: #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
1.2 simonb 68:
1.39 christos 69: static struct sysctllog *wapbl_sysctl;
70: static int wapbl_flush_disk_cache = 1;
71: static int wapbl_verbose_commit = 0;
72:
1.2 simonb 73: #else /* !_KERNEL */
74: #include <assert.h>
75: #include <errno.h>
76: #include <stdio.h>
77: #include <stdbool.h>
78: #include <stdlib.h>
79: #include <string.h>
80:
81: #include <sys/time.h>
82: #include <sys/wapbl.h>
1.16 joerg 83: #include <sys/wapbl_replay.h>
1.2 simonb 84:
85: #define KDASSERT(x) assert(x)
86: #define KASSERT(x) assert(x)
1.51 para 87: #define wapbl_alloc(s) malloc(s)
1.18 yamt 88: #define wapbl_free(a, s) free(a)
1.2 simonb 89: #define wapbl_calloc(n, s) calloc((n), (s))
90:
91: #endif /* !_KERNEL */
92:
93: /*
94: * INTERNAL DATA STRUCTURES
95: */
96:
97: /*
98: * This structure holds per-mount log information.
99: *
100: * Legend: a = atomic access only
101: * r = read-only after init
102: * l = rwlock held
103: * m = mutex held
1.38 hannken 104: * lm = rwlock held writing or mutex held
1.2 simonb 105: * u = unlocked access ok
106: * b = bufcache_lock held
107: */
108: struct wapbl {
109: struct vnode *wl_logvp; /* r: log here */
110: struct vnode *wl_devvp; /* r: log on this device */
111: struct mount *wl_mount; /* r: mountpoint wl is associated with */
112: daddr_t wl_logpbn; /* r: Physical block number of start of log */
113: int wl_log_dev_bshift; /* r: logarithm of device block size of log
114: device */
115: int wl_fs_dev_bshift; /* r: logarithm of device block size of
116: filesystem device */
117:
1.3 yamt 118: unsigned wl_lock_count; /* m: Count of transactions in progress */
1.2 simonb 119:
120: size_t wl_circ_size; /* r: Number of bytes in buffer of log */
121: size_t wl_circ_off; /* r: Number of bytes reserved at start */
122:
123: size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
124: size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
125:
126: off_t wl_head; /* l: Byte offset of log head */
127: off_t wl_tail; /* l: Byte offset of log tail */
128: /*
129: * head == tail == 0 means log is empty
130: * head == tail != 0 means log is full
131: * see assertions in wapbl_advance() for other boundary conditions.
132: * only truncate moves the tail, except when flush sets it to
133: * wl_header_size only flush moves the head, except when truncate
134: * sets it to 0.
135: */
136:
137: struct wapbl_wc_header *wl_wc_header; /* l */
138: void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */
139:
140: kmutex_t wl_mtx; /* u: short-term lock */
141: krwlock_t wl_rwlock; /* u: File system transaction lock */
142:
143: /*
144: * Must be held while accessing
145: * wl_count or wl_bufs or head or tail
146: */
147:
148: /*
149: * Callback called from within the flush routine to flush any extra
150: * bits. Note that flush may be skipped without calling this if
151: * there are no outstanding buffers in the transaction.
152: */
1.5 joerg 153: #if _KERNEL
1.2 simonb 154: wapbl_flush_fn_t wl_flush; /* r */
155: wapbl_flush_fn_t wl_flush_abort;/* r */
1.5 joerg 156: #endif
1.2 simonb 157:
158: size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
159: size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
160: size_t wl_bcount; /* m: Total bcount of wl_bufs */
161:
162: LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
163:
164: kcondvar_t wl_reclaimable_cv; /* m (obviously) */
165: size_t wl_reclaimable_bytes; /* m: Amount of space available for
166: reclamation by truncate */
167: int wl_error_count; /* m: # of wl_entries with errors */
168: size_t wl_reserved_bytes; /* never truncate log smaller than this */
169:
170: #ifdef WAPBL_DEBUG_BUFBYTES
171: size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
172: #endif
173:
1.38 hannken 174: daddr_t *wl_deallocblks;/* lm: address of block */
175: int *wl_dealloclens; /* lm: size of block */
176: int wl_dealloccnt; /* lm: total count */
1.2 simonb 177: int wl_dealloclim; /* l: max count */
178:
179: /* hashtable of inode numbers for allocated but unlinked inodes */
180: /* synch ??? */
181: LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
182: u_long wl_inohashmask;
183: int wl_inohashcnt;
184:
185: SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
186: accounting */
187: };
188:
189: #ifdef WAPBL_DEBUG_PRINT
190: int wapbl_debug_print = WAPBL_DEBUG_PRINT;
191: #endif
192:
193: /****************************************************************/
194: #ifdef _KERNEL
195:
196: #ifdef WAPBL_DEBUG
197: struct wapbl *wapbl_debug_wl;
198: #endif
199:
200: static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
201: static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
202: static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
203: static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
204: #endif /* _KERNEL */
205:
1.14 joerg 206: static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
1.2 simonb 207:
1.30 uebayasi 208: static inline size_t wapbl_space_free(size_t avail, off_t head,
1.2 simonb 209: off_t tail);
1.30 uebayasi 210: static inline size_t wapbl_space_used(size_t avail, off_t head,
1.2 simonb 211: off_t tail);
212:
213: #ifdef _KERNEL
214:
1.51 para 215: static struct pool wapbl_entry_pool;
216:
1.2 simonb 217: #define WAPBL_INODETRK_SIZE 83
218: static int wapbl_ino_pool_refcount;
219: static struct pool wapbl_ino_pool;
220: struct wapbl_ino {
221: LIST_ENTRY(wapbl_ino) wi_hash;
222: ino_t wi_ino;
223: mode_t wi_mode;
224: };
225:
226: static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
227: static void wapbl_inodetrk_free(struct wapbl *wl);
228: static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
229:
230: static size_t wapbl_transaction_len(struct wapbl *wl);
1.30 uebayasi 231: static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
1.2 simonb 232:
1.13 joerg 233: #if 0
1.4 joerg 234: int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
235: #endif
236:
237: static int wapbl_replay_isopen1(struct wapbl_replay *);
238:
1.2 simonb 239: /*
240: * This is useful for debugging. If set, the log will
241: * only be truncated when necessary.
242: */
243: int wapbl_lazy_truncate = 0;
244:
245: struct wapbl_ops wapbl_ops = {
246: .wo_wapbl_discard = wapbl_discard,
247: .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
1.6 joerg 248: .wo_wapbl_replay_can_read = wapbl_replay_can_read,
1.2 simonb 249: .wo_wapbl_replay_read = wapbl_replay_read,
250: .wo_wapbl_add_buf = wapbl_add_buf,
251: .wo_wapbl_remove_buf = wapbl_remove_buf,
252: .wo_wapbl_resize_buf = wapbl_resize_buf,
253: .wo_wapbl_begin = wapbl_begin,
254: .wo_wapbl_end = wapbl_end,
255: .wo_wapbl_junlock_assert= wapbl_junlock_assert,
256:
257: /* XXX: the following is only used to say "this is a wapbl buf" */
258: .wo_wapbl_biodone = wapbl_biodone,
259: };
260:
1.21 yamt 261: static int
1.39 christos 262: wapbl_sysctl_init(void)
263: {
264: int rv;
265: const struct sysctlnode *rnode, *cnode;
266:
267: wapbl_sysctl = NULL;
268:
269: rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
270: CTLFLAG_PERMANENT,
271: CTLTYPE_NODE, "vfs", NULL,
272: NULL, 0, NULL, 0,
273: CTL_VFS, CTL_EOL);
274: if (rv)
275: return rv;
276:
277: rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode,
278: CTLFLAG_PERMANENT,
279: CTLTYPE_NODE, "wapbl",
280: SYSCTL_DESCR("WAPBL journaling options"),
281: NULL, 0, NULL, 0,
282: CTL_CREATE, CTL_EOL);
283: if (rv)
284: return rv;
285:
286: rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
287: CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
288: CTLTYPE_INT, "flush_disk_cache",
289: SYSCTL_DESCR("flush disk cache"),
290: NULL, 0, &wapbl_flush_disk_cache, 0,
291: CTL_CREATE, CTL_EOL);
292: if (rv)
293: return rv;
294:
295: rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
296: CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
297: CTLTYPE_INT, "verbose_commit",
298: SYSCTL_DESCR("show time and size of wapbl log commits"),
299: NULL, 0, &wapbl_verbose_commit, 0,
300: CTL_CREATE, CTL_EOL);
301: return rv;
302: }
303:
304: static void
305: wapbl_init(void)
306: {
1.51 para 307:
308: pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
309: "wapblentrypl", &pool_allocator_kmem, IPL_VM);
310:
1.39 christos 311: wapbl_sysctl_init();
312: }
313:
#ifdef notyet
/*
 * Subsystem teardown, counterpart of wapbl_init(): remove the sysctl
 * nodes and destroy the transaction entry pool.
 *
 * Bug fix: this referenced "aio_sysctl", which does not exist in this
 * file (a copy/paste slip from sys_aio.c that went unnoticed because
 * the block is compiled out under "notyet").  It must tear down
 * wapbl_sysctl, the log created by wapbl_sysctl_init().
 */
static int
wapbl_fini(bool interface)
{

	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);

	pool_destroy(&wapbl_entry_pool);

	return 0;
}
#endif
327:
328: static int
1.15 joerg 329: wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
330: {
331: int error, i;
332:
333: WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
334: ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
335:
336: /*
337: * Its only valid to reuse the replay log if its
338: * the same as the new log we just opened.
339: */
340: KDASSERT(!wapbl_replay_isopen(wr));
1.47 christos 341: KASSERT(wl->wl_devvp->v_type == VBLK);
342: KASSERT(wr->wr_devvp->v_type == VBLK);
1.15 joerg 343: KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
344: KASSERT(wl->wl_logpbn == wr->wr_logpbn);
345: KASSERT(wl->wl_circ_size == wr->wr_circ_size);
346: KASSERT(wl->wl_circ_off == wr->wr_circ_off);
347: KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
348: KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
349:
350: wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
351:
352: for (i = 0; i < wr->wr_inodescnt; i++)
353: wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
354: wr->wr_inodes[i].wr_imode);
355:
356: /* Make sure new transaction won't overwrite old inodes list */
357: KDASSERT(wapbl_transaction_len(wl) <=
358: wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
359: wr->wr_inodestail));
360:
361: wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
362: wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
363: wapbl_transaction_len(wl);
364:
365: error = wapbl_write_inodes(wl, &wl->wl_head);
366: if (error)
367: return error;
368:
369: KASSERT(wl->wl_head != wl->wl_tail);
370: KASSERT(wl->wl_head != 0);
371:
372: return 0;
373: }
374:
1.2 simonb 375: int
376: wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
377: daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
378: wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
379: {
380: struct wapbl *wl;
381: struct vnode *devvp;
382: daddr_t logpbn;
383: int error;
1.31 mlelstv 384: int log_dev_bshift = ilog2(blksize);
1.32 mlelstv 385: int fs_dev_bshift = log_dev_bshift;
1.2 simonb 386: int run;
387:
388: WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
389: " count=%zu blksize=%zu\n", vp, off, count, blksize));
390:
391: if (log_dev_bshift > fs_dev_bshift) {
392: WAPBL_PRINTF(WAPBL_PRINT_OPEN,
393: ("wapbl: log device's block size cannot be larger "
394: "than filesystem's\n"));
395: /*
396: * Not currently implemented, although it could be if
397: * needed someday.
398: */
399: return ENOSYS;
400: }
401:
402: if (off < 0)
403: return EINVAL;
404:
405: if (blksize < DEV_BSIZE)
406: return EINVAL;
407: if (blksize % DEV_BSIZE)
408: return EINVAL;
409:
410: /* XXXTODO: verify that the full load is writable */
411:
412: /*
413: * XXX check for minimum log size
414: * minimum is governed by minimum amount of space
415: * to complete a transaction. (probably truncate)
416: */
417: /* XXX for now pick something minimal */
418: if ((count * blksize) < MAXPHYS) {
419: return ENOSPC;
420: }
421:
422: if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
423: return error;
424: }
425:
426: wl = wapbl_calloc(1, sizeof(*wl));
427: rw_init(&wl->wl_rwlock);
428: mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
429: cv_init(&wl->wl_reclaimable_cv, "wapblrec");
430: LIST_INIT(&wl->wl_bufs);
431: SIMPLEQ_INIT(&wl->wl_entries);
432:
433: wl->wl_logvp = vp;
434: wl->wl_devvp = devvp;
435: wl->wl_mount = mp;
436: wl->wl_logpbn = logpbn;
437: wl->wl_log_dev_bshift = log_dev_bshift;
438: wl->wl_fs_dev_bshift = fs_dev_bshift;
439:
440: wl->wl_flush = flushfn;
441: wl->wl_flush_abort = flushabortfn;
442:
443: /* Reserve two log device blocks for the commit headers */
444: wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
1.34 mlelstv 445: wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
1.2 simonb 446: /* truncate the log usage to a multiple of log_dev_bshift */
447: wl->wl_circ_size >>= wl->wl_log_dev_bshift;
448: wl->wl_circ_size <<= wl->wl_log_dev_bshift;
449:
450: /*
451: * wl_bufbytes_max limits the size of the in memory transaction space.
452: * - Since buffers are allocated and accounted for in units of
453: * PAGE_SIZE it is required to be a multiple of PAGE_SIZE
454: * (i.e. 1<<PAGE_SHIFT)
455: * - Since the log device has to be written in units of
456: * 1<<wl_log_dev_bshift it is required to be a mulitple of
457: * 1<<wl_log_dev_bshift.
458: * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
459: * it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
460: * Therefore it must be multiple of the least common multiple of those
461: * three quantities. Fortunately, all of those quantities are
462: * guaranteed to be a power of two, and the least common multiple of
463: * a set of numbers which are all powers of two is simply the maximum
464: * of those numbers. Finally, the maximum logarithm of a power of two
465: * is the same as the log of the maximum power of two. So we can do
466: * the following operations to size wl_bufbytes_max:
467: */
468:
469: /* XXX fix actual number of pages reserved per filesystem. */
470: wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
471:
472: /* Round wl_bufbytes_max to the largest power of two constraint */
473: wl->wl_bufbytes_max >>= PAGE_SHIFT;
474: wl->wl_bufbytes_max <<= PAGE_SHIFT;
475: wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
476: wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
477: wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
478: wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
479:
480: /* XXX maybe use filesystem fragment size instead of 1024 */
481: /* XXX fix actual number of buffers reserved per filesystem. */
482: wl->wl_bufcount_max = (nbuf / 2) * 1024;
483:
484: /* XXX tie this into resource estimation */
1.41 hannken 485: wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
1.2 simonb 486:
1.51 para 487: wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
1.2 simonb 488: wl->wl_dealloclim);
1.51 para 489: wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
1.2 simonb 490: wl->wl_dealloclim);
491:
492: wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
493:
494: /* Initialize the commit header */
495: {
496: struct wapbl_wc_header *wc;
1.14 joerg 497: size_t len = 1 << wl->wl_log_dev_bshift;
1.2 simonb 498: wc = wapbl_calloc(1, len);
499: wc->wc_type = WAPBL_WC_HEADER;
500: wc->wc_len = len;
501: wc->wc_circ_off = wl->wl_circ_off;
502: wc->wc_circ_size = wl->wl_circ_size;
503: /* XXX wc->wc_fsid */
504: wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
505: wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
506: wl->wl_wc_header = wc;
1.51 para 507: wl->wl_wc_scratch = wapbl_alloc(len);
1.2 simonb 508: }
509:
510: /*
511: * if there was an existing set of unlinked but
512: * allocated inodes, preserve it in the new
513: * log.
514: */
515: if (wr && wr->wr_inodescnt) {
1.15 joerg 516: error = wapbl_start_flush_inodes(wl, wr);
1.2 simonb 517: if (error)
518: goto errout;
519: }
520:
521: error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
522: if (error) {
523: goto errout;
524: }
525:
526: *wlp = wl;
527: #if defined(WAPBL_DEBUG)
528: wapbl_debug_wl = wl;
529: #endif
530:
531: return 0;
532: errout:
533: wapbl_discard(wl);
1.18 yamt 534: wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
535: wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
536: wapbl_free(wl->wl_deallocblks,
537: sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
538: wapbl_free(wl->wl_dealloclens,
539: sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
1.2 simonb 540: wapbl_inodetrk_free(wl);
1.18 yamt 541: wapbl_free(wl, sizeof(*wl));
1.2 simonb 542:
543: return error;
544: }
545:
546: /*
547: * Like wapbl_flush, only discards the transaction
548: * completely
549: */
550:
551: void
552: wapbl_discard(struct wapbl *wl)
553: {
554: struct wapbl_entry *we;
555: struct buf *bp;
556: int i;
557:
558: /*
559: * XXX we may consider using upgrade here
560: * if we want to call flush from inside a transaction
561: */
562: rw_enter(&wl->wl_rwlock, RW_WRITER);
563: wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
564: wl->wl_dealloccnt);
565:
566: #ifdef WAPBL_DEBUG_PRINT
567: {
568: pid_t pid = -1;
569: lwpid_t lid = -1;
570: if (curproc)
571: pid = curproc->p_pid;
572: if (curlwp)
573: lid = curlwp->l_lid;
574: #ifdef WAPBL_DEBUG_BUFBYTES
575: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
576: ("wapbl_discard: thread %d.%d discarding "
577: "transaction\n"
578: "\tbufcount=%zu bufbytes=%zu bcount=%zu "
579: "deallocs=%d inodes=%d\n"
580: "\terrcnt = %u, reclaimable=%zu reserved=%zu "
581: "unsynced=%zu\n",
582: pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
583: wl->wl_bcount, wl->wl_dealloccnt,
584: wl->wl_inohashcnt, wl->wl_error_count,
585: wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
586: wl->wl_unsynced_bufbytes));
587: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
588: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
589: ("\tentry: bufcount = %zu, reclaimable = %zu, "
590: "error = %d, unsynced = %zu\n",
591: we->we_bufcount, we->we_reclaimable_bytes,
592: we->we_error, we->we_unsynced_bufbytes));
593: }
594: #else /* !WAPBL_DEBUG_BUFBYTES */
595: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
596: ("wapbl_discard: thread %d.%d discarding transaction\n"
597: "\tbufcount=%zu bufbytes=%zu bcount=%zu "
598: "deallocs=%d inodes=%d\n"
599: "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
600: pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
601: wl->wl_bcount, wl->wl_dealloccnt,
602: wl->wl_inohashcnt, wl->wl_error_count,
603: wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
604: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
605: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
606: ("\tentry: bufcount = %zu, reclaimable = %zu, "
607: "error = %d\n",
608: we->we_bufcount, we->we_reclaimable_bytes,
609: we->we_error));
610: }
611: #endif /* !WAPBL_DEBUG_BUFBYTES */
612: }
613: #endif /* WAPBL_DEBUG_PRINT */
614:
615: for (i = 0; i <= wl->wl_inohashmask; i++) {
616: struct wapbl_ino_head *wih;
617: struct wapbl_ino *wi;
618:
619: wih = &wl->wl_inohash[i];
620: while ((wi = LIST_FIRST(wih)) != NULL) {
621: LIST_REMOVE(wi, wi_hash);
622: pool_put(&wapbl_ino_pool, wi);
623: KASSERT(wl->wl_inohashcnt > 0);
624: wl->wl_inohashcnt--;
625: }
626: }
627:
628: /*
629: * clean buffer list
630: */
631: mutex_enter(&bufcache_lock);
632: mutex_enter(&wl->wl_mtx);
633: while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
634: if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
635: /*
636: * The buffer will be unlocked and
637: * removed from the transaction in brelse
638: */
639: mutex_exit(&wl->wl_mtx);
640: brelsel(bp, 0);
641: mutex_enter(&wl->wl_mtx);
642: }
643: }
644: mutex_exit(&wl->wl_mtx);
645: mutex_exit(&bufcache_lock);
646:
647: /*
648: * Remove references to this wl from wl_entries, free any which
649: * no longer have buffers, others will be freed in wapbl_biodone
650: * when they no longer have any buffers.
651: */
652: while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
653: SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
654: /* XXX should we be accumulating wl_error_count
655: * and increasing reclaimable bytes ? */
656: we->we_wapbl = NULL;
657: if (we->we_bufcount == 0) {
658: #ifdef WAPBL_DEBUG_BUFBYTES
659: KASSERT(we->we_unsynced_bufbytes == 0);
660: #endif
1.51 para 661: pool_put(&wapbl_entry_pool, we);
1.2 simonb 662: }
663: }
664:
665: /* Discard list of deallocs */
666: wl->wl_dealloccnt = 0;
667: /* XXX should we clear wl_reserved_bytes? */
668:
669: KASSERT(wl->wl_bufbytes == 0);
670: KASSERT(wl->wl_bcount == 0);
671: KASSERT(wl->wl_bufcount == 0);
672: KASSERT(LIST_EMPTY(&wl->wl_bufs));
673: KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
674: KASSERT(wl->wl_inohashcnt == 0);
675:
676: rw_exit(&wl->wl_rwlock);
677: }
678:
679: int
680: wapbl_stop(struct wapbl *wl, int force)
681: {
682: struct vnode *vp;
683: int error;
684:
685: WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
686: error = wapbl_flush(wl, 1);
687: if (error) {
688: if (force)
689: wapbl_discard(wl);
690: else
691: return error;
692: }
693:
694: /* Unlinked inodes persist after a flush */
695: if (wl->wl_inohashcnt) {
696: if (force) {
697: wapbl_discard(wl);
698: } else {
699: return EBUSY;
700: }
701: }
702:
703: KASSERT(wl->wl_bufbytes == 0);
704: KASSERT(wl->wl_bcount == 0);
705: KASSERT(wl->wl_bufcount == 0);
706: KASSERT(LIST_EMPTY(&wl->wl_bufs));
707: KASSERT(wl->wl_dealloccnt == 0);
708: KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
709: KASSERT(wl->wl_inohashcnt == 0);
710:
711: vp = wl->wl_logvp;
712:
1.18 yamt 713: wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
714: wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
715: wapbl_free(wl->wl_deallocblks,
716: sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
717: wapbl_free(wl->wl_dealloclens,
718: sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
1.2 simonb 719: wapbl_inodetrk_free(wl);
720:
721: cv_destroy(&wl->wl_reclaimable_cv);
722: mutex_destroy(&wl->wl_mtx);
723: rw_destroy(&wl->wl_rwlock);
1.18 yamt 724: wapbl_free(wl, sizeof(*wl));
1.2 simonb 725:
726: return 0;
727: }
728:
729: static int
730: wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
731: {
732: struct pstats *pstats = curlwp->l_proc->p_stats;
733: struct buf *bp;
734: int error;
735:
736: KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
737: KASSERT(devvp->v_type == VBLK);
738:
739: if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
1.45 rmind 740: mutex_enter(devvp->v_interlock);
1.2 simonb 741: devvp->v_numoutput++;
1.45 rmind 742: mutex_exit(devvp->v_interlock);
1.2 simonb 743: pstats->p_ru.ru_oublock++;
744: } else {
745: pstats->p_ru.ru_inblock++;
746: }
747:
748: bp = getiobuf(devvp, true);
749: bp->b_flags = flags;
750: bp->b_cflags = BC_BUSY; /* silly & dubious */
751: bp->b_dev = devvp->v_rdev;
752: bp->b_data = data;
753: bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
754: bp->b_blkno = pbn;
1.52 chs 755: BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1.2 simonb 756:
757: WAPBL_PRINTF(WAPBL_PRINT_IO,
1.29 pooka 758: ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
1.2 simonb 759: BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
760: bp->b_blkno, bp->b_dev));
761:
762: VOP_STRATEGY(devvp, bp);
763:
764: error = biowait(bp);
765: putiobuf(bp);
766:
767: if (error) {
768: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
769: ("wapbl_doio: %s %zu bytes at block %" PRId64
1.29 pooka 770: " on dev 0x%"PRIx64" failed with error %d\n",
1.2 simonb 771: (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
772: "write" : "read"),
773: len, pbn, devvp->v_rdev, error));
774: }
775:
776: return error;
777: }
778:
779: int
780: wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
781: {
782:
783: return wapbl_doio(data, len, devvp, pbn, B_WRITE);
784: }
785:
786: int
787: wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
788: {
789:
790: return wapbl_doio(data, len, devvp, pbn, B_READ);
791: }
792:
793: /*
794: * Off is byte offset returns new offset for next write
795: * handles log wraparound
796: */
797: static int
798: wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
799: {
800: size_t slen;
801: off_t off = *offp;
802: int error;
1.34 mlelstv 803: daddr_t pbn;
1.2 simonb 804:
805: KDASSERT(((len >> wl->wl_log_dev_bshift) <<
806: wl->wl_log_dev_bshift) == len);
807:
808: if (off < wl->wl_circ_off)
809: off = wl->wl_circ_off;
810: slen = wl->wl_circ_off + wl->wl_circ_size - off;
811: if (slen < len) {
1.34 mlelstv 812: pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
813: #ifdef _KERNEL
814: pbn = btodb(pbn << wl->wl_log_dev_bshift);
815: #endif
816: error = wapbl_write(data, slen, wl->wl_devvp, pbn);
1.2 simonb 817: if (error)
818: return error;
819: data = (uint8_t *)data + slen;
820: len -= slen;
821: off = wl->wl_circ_off;
822: }
1.34 mlelstv 823: pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
824: #ifdef _KERNEL
825: pbn = btodb(pbn << wl->wl_log_dev_bshift);
826: #endif
827: error = wapbl_write(data, len, wl->wl_devvp, pbn);
1.2 simonb 828: if (error)
829: return error;
830: off += len;
831: if (off >= wl->wl_circ_off + wl->wl_circ_size)
832: off = wl->wl_circ_off;
833: *offp = off;
834: return 0;
835: }
836:
837: /****************************************************************/
838:
839: int
840: wapbl_begin(struct wapbl *wl, const char *file, int line)
841: {
842: int doflush;
843: unsigned lockcount;
844:
845: KDASSERT(wl);
846:
847: /*
848: * XXX this needs to be made much more sophisticated.
849: * perhaps each wapbl_begin could reserve a specified
850: * number of buffers and bytes.
851: */
852: mutex_enter(&wl->wl_mtx);
853: lockcount = wl->wl_lock_count;
854: doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
855: wl->wl_bufbytes_max / 2) ||
856: ((wl->wl_bufcount + (lockcount * 10)) >
857: wl->wl_bufcount_max / 2) ||
1.28 pooka 858: (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
1.42 hannken 859: (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
1.2 simonb 860: mutex_exit(&wl->wl_mtx);
861:
862: if (doflush) {
863: WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
864: ("force flush lockcnt=%d bufbytes=%zu "
1.28 pooka 865: "(max=%zu) bufcount=%zu (max=%zu) "
866: "dealloccnt %d (lim=%d)\n",
1.2 simonb 867: lockcount, wl->wl_bufbytes,
868: wl->wl_bufbytes_max, wl->wl_bufcount,
1.28 pooka 869: wl->wl_bufcount_max,
870: wl->wl_dealloccnt, wl->wl_dealloclim));
1.2 simonb 871: }
872:
873: if (doflush) {
874: int error = wapbl_flush(wl, 0);
875: if (error)
876: return error;
877: }
878:
1.23 ad 879: rw_enter(&wl->wl_rwlock, RW_READER);
1.2 simonb 880: mutex_enter(&wl->wl_mtx);
881: wl->wl_lock_count++;
882: mutex_exit(&wl->wl_mtx);
883:
1.23 ad 884: #if defined(WAPBL_DEBUG_PRINT)
1.2 simonb 885: WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
886: ("wapbl_begin thread %d.%d with bufcount=%zu "
887: "bufbytes=%zu bcount=%zu at %s:%d\n",
888: curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
889: wl->wl_bufbytes, wl->wl_bcount, file, line));
890: #endif
891:
892: return 0;
893: }
894:
895: void
896: wapbl_end(struct wapbl *wl)
897: {
898:
1.23 ad 899: #if defined(WAPBL_DEBUG_PRINT)
1.2 simonb 900: WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
901: ("wapbl_end thread %d.%d with bufcount=%zu "
902: "bufbytes=%zu bcount=%zu\n",
903: curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
904: wl->wl_bufbytes, wl->wl_bcount));
905: #endif
906:
1.40 bouyer 907: #ifdef DIAGNOSTIC
908: size_t flushsize = wapbl_transaction_len(wl);
909: if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
910: /*
911: * XXX this could be handled more gracefully, perhaps place
912: * only a partial transaction in the log and allow the
913: * remaining to flush without the protection of the journal.
914: */
915: panic("wapbl_end: current transaction too big to flush\n");
916: }
917: #endif
918:
1.2 simonb 919: mutex_enter(&wl->wl_mtx);
920: KASSERT(wl->wl_lock_count > 0);
921: wl->wl_lock_count--;
922: mutex_exit(&wl->wl_mtx);
923:
924: rw_exit(&wl->wl_rwlock);
925: }
926:
927: void
928: wapbl_add_buf(struct wapbl *wl, struct buf * bp)
929: {
930:
931: KASSERT(bp->b_cflags & BC_BUSY);
932: KASSERT(bp->b_vp);
933:
934: wapbl_jlock_assert(wl);
935:
936: #if 0
937: /*
938: * XXX this might be an issue for swapfiles.
939: * see uvm_swap.c:1702
940: *
941: * XXX2 why require it then? leap of semantics?
942: */
943: KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
944: #endif
945:
946: mutex_enter(&wl->wl_mtx);
947: if (bp->b_flags & B_LOCKED) {
948: LIST_REMOVE(bp, b_wapbllist);
949: WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
950: ("wapbl_add_buf thread %d.%d re-adding buf %p "
951: "with %d bytes %d bcount\n",
952: curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
953: bp->b_bcount));
954: } else {
955: /* unlocked by dirty buffers shouldn't exist */
956: KASSERT(!(bp->b_oflags & BO_DELWRI));
957: wl->wl_bufbytes += bp->b_bufsize;
958: wl->wl_bcount += bp->b_bcount;
959: wl->wl_bufcount++;
960: WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
961: ("wapbl_add_buf thread %d.%d adding buf %p "
962: "with %d bytes %d bcount\n",
963: curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
964: bp->b_bcount));
965: }
966: LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
967: mutex_exit(&wl->wl_mtx);
968:
969: bp->b_flags |= B_LOCKED;
970: }
971:
972: static void
973: wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
974: {
975:
976: KASSERT(mutex_owned(&wl->wl_mtx));
977: KASSERT(bp->b_cflags & BC_BUSY);
978: wapbl_jlock_assert(wl);
979:
980: #if 0
981: /*
982: * XXX this might be an issue for swapfiles.
983: * see uvm_swap.c:1725
984: *
985: * XXXdeux: see above
986: */
987: KASSERT((bp->b_flags & BC_NOCACHE) == 0);
988: #endif
989: KASSERT(bp->b_flags & B_LOCKED);
990:
991: WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
992: ("wapbl_remove_buf thread %d.%d removing buf %p with "
993: "%d bytes %d bcount\n",
994: curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
995:
996: KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
997: wl->wl_bufbytes -= bp->b_bufsize;
998: KASSERT(wl->wl_bcount >= bp->b_bcount);
999: wl->wl_bcount -= bp->b_bcount;
1000: KASSERT(wl->wl_bufcount > 0);
1001: wl->wl_bufcount--;
1002: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1003: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1004: LIST_REMOVE(bp, b_wapbllist);
1005:
1006: bp->b_flags &= ~B_LOCKED;
1007: }
1008:
/* called from brelsel() in vfs_bio among other places */
/*
 * Locked wrapper for wapbl_remove_buf_locked(): detach bp from the
 * current transaction under wl->wl_mtx.
 */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}
1018:
/*
 * Adjust the transaction's space accounting after buffer bp was
 * resized.  oldsz and oldcnt are the previous b_bufsize and b_bcount;
 * the deltas against the current values are folded into the log
 * totals.  Buffers without B_LOCKED are not part of a transaction and
 * are left alone.
 */
void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}
1037:
1038: #endif /* _KERNEL */
1039:
1040: /****************************************************************/
1041: /* Some utility inlines */
1042:
/*
 * Advance a circular-log offset by delta bytes and return the new
 * offset.  The log occupies [off, off + size); the value 0 is the
 * reserved "empty" marker, so advancing from 0 by a nonzero delta
 * starts at the log base, and a wrap never lands on 0.
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t nextoff;

	/* Input sanity: delta fits in the log; old is 0 or in range. */
	KASSERT(delta <= (size_t)size);
	KASSERT((old == 0) || ((size_t)old >= off));
	KASSERT(old < (off_t)(size + off));

	if (old == 0 && delta != 0) {
		/* Leaving the empty marker: begin at the log base. */
		nextoff = off + delta;
	} else {
		nextoff = old + delta;
		if (nextoff >= (off_t)(size + off))
			nextoff -= size;	/* wrap around */
	}

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (nextoff == old));
	KASSERT((delta == 0) || (nextoff != 0));
	KASSERT((delta != (size)) || (nextoff == old));

	/* Output sanity: 0, or within [off, off + size). */
	KASSERT((nextoff == 0) || ((size_t)nextoff >= off));
	KASSERT((size_t)nextoff < (size + off));
	return nextoff;
}
1071:
/*
 * Bytes in use in a circular log of avail bytes, given head and tail.
 * head == tail == 0 is the reserved "empty" encoding and yields 0.
 */
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{
	size_t used;

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	/* Modular distance from tail to head, mapped into 1..avail. */
	used = ((head + (avail - 1) - tail) % avail) + 1;
	return used;
}
1082:
/* Bytes still available in a circular log of avail bytes. */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{
	const size_t used = wapbl_space_used(avail, head, tail);

	return avail - used;
}
1089:
/*
 * Advance the log head by delta bytes, updating *headp/*tailp.
 * When the log transitions from empty, the tail is pinned to the
 * log base (off).  delta must not exceed the free space.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t newhead = *headp;
	off_t newtail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, newhead, newtail));
	newhead = wapbl_advance(size, off, newhead, delta);
	if (newtail == 0 && newhead != 0) {
		/* Log was empty: tail now starts at the log base. */
		newtail = off;
	}
	*headp = newhead;
	*tailp = newtail;
}
1104:
/*
 * Advance the log tail by delta bytes, updating *headp/*tailp.
 * If the tail catches up with the head the log becomes empty and
 * both are reset to the reserved 0 encoding.  delta must not exceed
 * the space currently in use.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t newhead = *headp;
	off_t newtail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, newhead, newtail));
	newtail = wapbl_advance(size, off, newtail, delta);
	if (newhead == newtail) {
		/* Tail caught up with head: the log is now empty. */
		newhead = newtail = 0;
	}
	*headp = newhead;
	*tailp = newtail;
}
1120:
1121: #ifdef _KERNEL
1122:
1123: /****************************************************************/
1124:
/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 *
 * If waitonly is set, only wait for the space to become reclaimable;
 * do not write a new commit record.  Returns 0 on success or EIO once
 * the log has recorded I/O errors and cannot free enough space.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	/*
	 * Sleep until enough bytes become reclaimable (completed I/O
	 * reported by wapbl_biodone) or an I/O error is recorded.
	 */
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		     "minfree=%zd\n",
		     &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		     minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}
1219:
1220: /****************************************************************/
1221:
/*
 * I/O completion callback for buffers written out by wapbl_flush().
 * Decrements the owning transaction's outstanding-buffer count; once
 * the oldest transactions on wl_entries have no outstanding buffers,
 * their log space is added to wl_reclaimable_bytes and waiters on
 * wl_reclaimable_cv (see wapbl_truncate) are woken.
 */
void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;
#ifdef WAPBL_DEBUG_BUFBYTES
	/* b_bufsize is sampled before brelse() may recycle the buffer. */
	const int bufsize = bp->b_bufsize;
#endif

	/*
	 * Handle possible flushing of buffers after log has been
	 * decomissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	/*
	 * Release the buffer here. wapbl_flush() may wait for the
	 * log to become empty and we better unbusy the buffer before
	 * wapbl_flush() returns.
	 */
	brelse(bp, 0);

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bufsize);
	we->we_unsynced_bufbytes -= bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
	wl->wl_unsynced_bufbytes -= bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
}
1359:
/*
 * Write transactions to disk + start I/O for contents
 *
 * Writes the current transaction (journalled blocks, revocations and
 * registered inodes) into the circular log, commits it, then starts
 * asynchronous writes (bawrite) of the journalled buffers to their
 * real locations with wapbl_biodone as completion handler.  Takes
 * wl_rwlock as writer.  If waitfor is set, additionally truncates the
 * log until it is empty before returning.  Returns 0 or an error from
 * the log writes/truncation; on error the fs flush-abort callback is
 * invoked for the pending deallocations.
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are fully locked and flushed,
	 * do another check for nothing to do.
	 */
	if (wl->wl_bufcount == 0) {
		goto out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		     ("wapbl_flush thread %d.%d flushing entries with "
		      "bufcount=%zu bufbytes=%zu\n",
		      curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
		      wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	/* Make room in the log for the whole transaction. */
	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	/* Append blocks, revocations and inodes at the log head. */
	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	                      (off < wl->wl_circ_off + wl->wl_circ_size)));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		      " off=%"PRIdMAX" flush=%zu\n",
		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		      flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	/* New transaction-tracking entry for the buffers about to go out. */
	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 " unsynced=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
		 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
		 wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
		 wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * this flushes bufs in reverse order than they were queued
	 * it shouldn't matter, but if we care we could use TAILQ instead.
	 * XXX Note they will get put on the lru queue when they flush
	 * so we might actually want to change this to preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		/* bbusy() may drop wl_mtx while sleeping; retry the list. */
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		     ("wapbl_flush thread %d.%d done flushing entries...\n",
		     curproc->p_pid, curlwp->l_lid));
#endif

 out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
			wl->wl_reserved_bytes, wapbl_lazy_truncate);
	}

 out2:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		     "error = %d\n"
		     "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		     "deallocs=%d inodes=%d\n"
		     "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		     pid, lid, error, wl->wl_bufcount,
		     wl->wl_bufbytes, wl->wl_bcount,
		     wl->wl_dealloccnt, wl->wl_inohashcnt,
		     wl->wl_error_count, wl->wl_reclaimable_bytes,
		     wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			     "error = %d\n", we->we_bufcount,
			     we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}
1634:
1635: /****************************************************************/
1636:
/* Assert that the journal lock is held (as reader or writer). */
void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}
1643:
/* Assert that this thread does not hold the journal lock as writer. */
void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}
1650:
1651: /****************************************************************/
1652:
/* locks missing */
/*
 * Dump the state of log wl through pr (e.g. printf, or the ddb
 * printer via wapbl_dump).  With full set, also list the queued
 * buffers, the pending deallocations and the registered inodes.
 * Accesses wl without locking, so only safe for debugging use.
 */
void
wapbl_print(struct wapbl *wl,
		int full,
		void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
	      wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	      wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
	      wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		      "unsynced = %zu\n",
		      we->we_bufcount, we->we_reclaimable_bytes,
		      we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i < wl->wl_dealloccnt; i++) {
				(*pr)(" %"PRId64":%d,",
				      wl->wl_deallocblks[i],
				      wl->wl_dealloclens[i]);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRId32"/0%06"PRIo32",",
					    wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}
1749:
#if defined(WAPBL_DEBUG) || defined(DDB)
/*
 * Print the full state of a log.  With a NULL argument, fall back to
 * the global debug log (WAPBL_DEBUG kernels only); if there is still
 * no log, do nothing.
 */
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (wl == NULL)
		wl = wapbl_debug_wl;
#endif
	if (wl == NULL)
		return;
	wapbl_print(wl, 1, printf);
}
#endif
1763:
1764: /****************************************************************/
1765:
/*
 * Record the deallocation of the range described by blk/len so it is
 * journalled with the next commit.  Panics when the per-log
 * deallocation table (wl_dealloclim entries) overflows.  Caller must
 * hold the journal lock.
 */
void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);
	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
	mutex_exit(&wl->wl_mtx);
}
1789:
1790: /****************************************************************/
1791:
/*
 * Set up the inode-tracking hash table for log wl.  The wapbl_ino
 * pool is shared by all logs and is created by the first user
 * (refcounted via wapbl_ino_pool_refcount).
 */
static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}
1802:
/*
 * Tear down the inode-tracking hash (which must be empty); the shared
 * wapbl_ino pool is destroyed when the last log drops its reference.
 */
static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		pool_destroy(&wapbl_ino_pool);
	}
}
1814:
1815: static struct wapbl_ino *
1816: wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1817: {
1818: struct wapbl_ino_head *wih;
1819: struct wapbl_ino *wi;
1820:
1821: KASSERT(mutex_owned(&wl->wl_mtx));
1822:
1823: wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1824: LIST_FOREACH(wi, wih, wi_hash) {
1825: if (ino == wi->wi_ino)
1826: return wi;
1827: }
1828: return 0;
1829: }
1830:
/*
 * Register inode ino (with its mode) in the log's inode table so it
 * is included in the inodelist written at commit time.  A duplicate
 * registration is a no-op.
 */
void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	/* Allocate before taking the mutex: PR_WAITOK may sleep. */
	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		/* Already registered: return the preallocated entry. */
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}
1854:
/*
 * Remove inode ino from the log's inode table, if registered.
 * The mode argument is not used by this function.
 */
void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino *wi;

	mutex_enter(&wl->wl_mtx);
	wi = wapbl_inodetrk_get(wl, ino);
	if (wi) {
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
		KASSERT(wl->wl_inohashcnt > 0);
		wl->wl_inohashcnt--;
		LIST_REMOVE(wi, wi_hash);
		mutex_exit(&wl->wl_mtx);

		/* Free outside the mutex. */
		pool_put(&wapbl_ino_pool, wi);
	} else {
		mutex_exit(&wl->wl_mtx);
	}
}
1875:
1876: /****************************************************************/
1877:
/*
 * Log space, in bytes, needed to journal the currently registered
 * inodes.  At least one inodelist block is always accounted for,
 * even when no inodes are registered.
 */
static inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int iph;

	/* Calculate number of inodes described in a inodelist header */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	KASSERT(iph > 0);

	return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
}
1892:
1893:
/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	size_t len;
	int bph;

	/* Calculate number of blocks described in a blocklist header */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	KASSERT(bph > 0);

	/*
	 * Sum of the buffer payload, the blocklist headers describing
	 * the buffers, the deallocation blocklists, and the
	 * registered-inode lists.
	 */
	len = wl->wl_bcount;
	len += howmany(wl->wl_bufcount, bph) * blocklen;
	len += howmany(wl->wl_dealloccnt, bph) * blocklen;
	len += wapbl_transaction_inodes_len(wl);

	return len;
}
1915:
/*
 * wapbl_cache_sync: issue DIOCCACHESYNC
 *
 * Flush the write cache of the underlying log device so previously
 * issued writes reach stable storage.  A no-op when the
 * wapbl_flush_disk_cache knob is off.  With wapbl_verbose_commit >= 2
 * the elapsed time is printed, tagged with msg.  Returns the ioctl's
 * error, if any.
 */
static int
wapbl_cache_sync(struct wapbl *wl, const char *msg)
{
	const bool verbose = wapbl_verbose_commit >= 2;
	struct bintime start_time;
	int force = 1;
	int error;

	if (!wapbl_flush_disk_cache) {
		return 0;
	}
	if (verbose) {
		bintime(&start_time);
	}
	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
	    FWRITE, FSCRED);
	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%x "
		    "returned %d\n", wl->wl_devvp->v_rdev, error));
	}
	if (verbose) {
		struct bintime d;
		struct timespec ts;

		bintime(&d);
		bintime_sub(&d, &start_time);
		bintime2timespec(&d, &ts);
		printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
		    msg, (uintmax_t)wl->wl_devvp->v_rdev,
		    (uintmax_t)ts.tv_sec, ts.tv_nsec);
	}
	return error;
}
1953:
/*
 * Perform commit operation
 *
 * Note that generation number incrementation needs to
 * be protected against racing with other invocations
 * of wapbl_write_commit.  This is ok since this routine
 * is only invoked from wapbl_flush
 *
 * Issues a cache sync before and after writing the commit header so
 * that (a) the journalled blocks are stable before the commit record
 * and (b) the commit record is stable before the metadata writes that
 * follow.  Writes the header into one of two on-disk slots, alternated
 * by generation number.
 */
static int
wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
{
	struct wapbl_wc_header *wc = wl->wl_wc_header;
	struct timespec ts;
	int error;
	daddr_t pbn;

	/*
	 * flush disk cache to ensure that blocks we've written are actually
	 * written to the stable storage before the commit header.
	 *
	 * XXX Calc checksum here, instead we do this for now
	 */
	wapbl_cache_sync(wl, "1");

	/* Fill in the commit record: new head/tail plus a timestamp. */
	wc->wc_head = head;
	wc->wc_tail = tail;
	wc->wc_checksum = 0;
	wc->wc_version = 1;
	getnanotime(&ts);
	wc->wc_time = ts.tv_sec;
	wc->wc_timensec = ts.tv_nsec;

	WAPBL_PRINTF(WAPBL_PRINT_WRITE,
	    ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
	    (intmax_t)head, (intmax_t)tail));

	/*
	 * write the commit header.
	 *
	 * XXX if generation will rollover, then first zero
	 * over second commit header before trying to write both headers.
	 */

	/* Alternate between the two header slots by generation parity. */
	pbn = wl->wl_logpbn + (wc->wc_generation % 2);
#ifdef _KERNEL
	pbn = btodb(pbn << wc->wc_log_dev_bshift);
#endif
	error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
	if (error)
		return error;

	/*
	 * flush disk cache to ensure that the commit header is actually
	 * written before meta data blocks.
	 */
	wapbl_cache_sync(wl, "2");

	/*
	 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover
	 */
	if (wc->wc_generation++ == 0) {
		error = wapbl_write_commit(wl, head, tail);
		/*
		 * This panic should be able to be removed if we do the
		 * zero'ing mentioned above, and we are certain to roll
		 * back generation number on failure.
		 */
		if (error)
			panic("wapbl_write_commit: error writing duplicate "
			      "log header: %d\n", error);
	}
	return 0;
}
2028:
2029: /* Returns new offset value */
/*
 * Write the buffers queued on wl_bufs into the circular log.  Each
 * group of up to "bph" buffers is preceded by a WAPBL_WC_BLOCKS
 * descriptor record holding their physical block numbers and lengths,
 * followed by the buffer data, padded to a whole log block.  On
 * success *offp is advanced to the new log head.
 */
static int
wapbl_write_blocks(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int bph;
	struct buf *bp;
	off_t off = *offp;
	int error;
	size_t padding;

	KASSERT(rw_write_held(&wl->wl_rwlock));

	/* Number of block entries that fit in one descriptor block. */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	bp = LIST_FIRST(&wl->wl_bufs);

	while (bp) {
		int cnt;
		struct buf *obp = bp;	/* remember start for the data pass */

		KASSERT(bp->b_flags & B_LOCKED);

		wc->wc_type = WAPBL_WC_BLOCKS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		/* First pass: fill the descriptor with up to bph entries. */
		while (bp && (wc->wc_blkcount < bph)) {
			/*
			 * Make sure all the physical block numbers are up to
			 * date.  If this is not always true on a given
			 * filesystem, then VOP_BMAP must be called.  We
			 * could call VOP_BMAP here, or else in the filesystem
			 * specific flush callback, although neither of those
			 * solutions allow us to take the vnode lock.  If a
			 * filesystem requires that we must take the vnode lock
			 * to call VOP_BMAP, then we can probably do it in
			 * bwrite when the vnode lock should already be held
			 * by the invoking code.
			 */
			KASSERT((bp->b_vp->v_type == VBLK) ||
			    (bp->b_blkno != bp->b_lblkno));
			KASSERT(bp->b_blkno > 0);

			wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
			wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
			wc->wc_len += bp->b_bcount;
			wc->wc_blkcount++;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		/* Round the record length up to a whole log block. */
		if (wc->wc_len % blocklen != 0) {
			padding = blocklen - wc->wc_len % blocklen;
			wc->wc_len += padding;
		} else {
			padding = 0;
		}

		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
		    wc->wc_len, padding, (intmax_t)off));

		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
		/* Second pass: write the data of the same buffers. */
		bp = obp;
		cnt = 0;
		while (bp && (cnt++ < bph)) {
			error = wapbl_circ_write(wl, bp->b_data,
			    bp->b_bcount, &off);
			if (error)
				return error;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		if (padding) {
			void *zero;

			zero = wapbl_alloc(padding);
			memset(zero, 0, padding);
			error = wapbl_circ_write(wl, zero, padding, &off);
			wapbl_free(zero, padding);
			if (error)
				return error;
		}
	}
	*offp = off;
	return 0;
}
2118:
2119: static int
2120: wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2121: {
2122: struct wapbl_wc_blocklist *wc =
2123: (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2124: int i;
2125: int blocklen = 1<<wl->wl_log_dev_bshift;
2126: int bph;
2127: off_t off = *offp;
2128: int error;
2129:
2130: if (wl->wl_dealloccnt == 0)
2131: return 0;
2132:
2133: bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2134: sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2135:
2136: i = 0;
2137: while (i < wl->wl_dealloccnt) {
2138: wc->wc_type = WAPBL_WC_REVOCATIONS;
2139: wc->wc_len = blocklen;
2140: wc->wc_blkcount = 0;
2141: while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2142: wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2143: wl->wl_deallocblks[i];
2144: wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2145: wl->wl_dealloclens[i];
2146: wc->wc_blkcount++;
2147: i++;
2148: }
2149: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2150: ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2151: wc->wc_len, (intmax_t)off));
2152: error = wapbl_circ_write(wl, wc, blocklen, &off);
2153: if (error)
2154: return error;
2155: }
2156: *offp = off;
2157: return 0;
2158: }
2159:
/*
 * Write the table of tracked in-use inodes into the circular log as
 * WAPBL_WC_INODES records.  Only the first record has wc_clear set,
 * so replay restarts its inode list there and appends from the rest.
 * Advances *offp past what was written.
 */
static int
wapbl_write_inodes(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1 << wl->wl_log_dev_bshift;
	off_t off = *offp;
	int error;

	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;
	int iph;

	/* Number of inode entries that fit in one log block. */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	i = 0;
	wih = &wl->wl_inohash[0];
	wi = 0;
	do {
		wc->wc_type = WAPBL_WC_INODES;
		wc->wc_len = blocklen;
		wc->wc_inocnt = 0;
		wc->wc_clear = (i == 0);	/* only the first record clears */
		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
			/* Skip ahead to the next non-empty hash chain. */
			while (!wi) {
				KASSERT((wih - &wl->wl_inohash[0])
				    <= wl->wl_inohashmask);
				wi = LIST_FIRST(wih++);
			}
			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
			wc->wc_inocnt++;
			i++;
			wi = LIST_NEXT(wi, wi_hash);
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	} while (i < wl->wl_inohashcnt);

	*offp = off;
	return 0;
}
2208:
2209: #endif /* _KERNEL */
2210:
2211: /****************************************************************/
2212:
/*
 * Replay hash table entry: maps a physical disk block number to the
 * offset in the log holding its most recently logged contents.
 */
struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;	/* hash chain linkage */
	daddr_t wb_blk;			/* physical block number */
	off_t wb_off; /* Offset of this block in the log */
};
/* Minimum number of buckets for the replay block hash table. */
#define WAPBL_BLKPOOL_MIN 83
2219:
/*
 * Allocate and initialize the replay block hash table with at least
 * "size" buckets (minimum WAPBL_BLKPOOL_MIN; the userland path rounds
 * up to a power of two, matching what kernel hashinit() provides).
 */
static void
wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
{
	if (size < WAPBL_BLKPOOL_MIN)
		size = WAPBL_BLKPOOL_MIN;
	KASSERT(wr->wr_blkhash == 0);
#ifdef _KERNEL
	wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
#else /* ! _KERNEL */
	/* Manually implement hashinit */
	{
		unsigned long i, hashsize;
		/* Round the bucket count up to a power of two. */
		for (hashsize = 1; hashsize < size; hashsize <<= 1)
			continue;
		wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
		for (i = 0; i < hashsize; i++)
			LIST_INIT(&wr->wr_blkhash[i]);
		wr->wr_blkhashmask = hashsize - 1;
	}
#endif /* ! _KERNEL */
}
2241:
/*
 * Free the replay block hash table.  All entries must already have
 * been removed (see wapbl_blkhash_clear()).
 */
static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
	KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
	wapbl_free(wr->wr_blkhash,
	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* ! _KERNEL */
}
2253:
2254: static struct wapbl_blk *
2255: wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2256: {
2257: struct wapbl_blk_head *wbh;
2258: struct wapbl_blk *wb;
2259: wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2260: LIST_FOREACH(wb, wbh, wb_hash) {
2261: if (blk == wb->wb_blk)
2262: return wb;
2263: }
2264: return 0;
2265: }
2266:
2267: static void
2268: wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2269: {
2270: struct wapbl_blk_head *wbh;
2271: struct wapbl_blk *wb;
2272: wb = wapbl_blkhash_get(wr, blk);
2273: if (wb) {
2274: KASSERT(wb->wb_blk == blk);
2275: wb->wb_off = off;
2276: } else {
1.51 para 2277: wb = wapbl_alloc(sizeof(*wb));
1.2 simonb 2278: wb->wb_blk = blk;
2279: wb->wb_off = off;
2280: wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2281: LIST_INSERT_HEAD(wbh, wb, wb_hash);
2282: wr->wr_blkhashcnt++;
2283: }
2284: }
2285:
2286: static void
2287: wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2288: {
2289: struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2290: if (wb) {
2291: KASSERT(wr->wr_blkhashcnt > 0);
2292: wr->wr_blkhashcnt--;
2293: LIST_REMOVE(wb, wb_hash);
1.18 yamt 2294: wapbl_free(wb, sizeof(*wb));
1.2 simonb 2295: }
2296: }
2297:
2298: static void
2299: wapbl_blkhash_clear(struct wapbl_replay *wr)
2300: {
1.25 lukem 2301: unsigned long i;
1.2 simonb 2302: for (i = 0; i <= wr->wr_blkhashmask; i++) {
2303: struct wapbl_blk *wb;
2304:
2305: while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2306: KASSERT(wr->wr_blkhashcnt > 0);
2307: wr->wr_blkhashcnt--;
2308: LIST_REMOVE(wb, wb_hash);
1.18 yamt 2309: wapbl_free(wb, sizeof(*wb));
1.2 simonb 2310: }
2311: }
2312: KASSERT(wr->wr_blkhashcnt == 0);
2313: }
2314:
2315: /****************************************************************/
2316:
/*
 * Read "len" bytes of the circular log area into "data", starting at
 * *offp and wrapping past the end of the region if needed.  On success
 * *offp is advanced past the bytes read.  "len" must be a multiple of
 * the log device block size.
 */
static int
wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	/* Bytes available before the end of the circular region. */
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		/* Read wraps: fetch the tail part of the region first. */
		pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
		error = wapbl_read(data, slen, wr->wr_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wr->wr_circ_off;
	}
	pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
	error = wapbl_read(data, len, wr->wr_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
	return 0;
}
2356:
2357: static void
2358: wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2359: {
2360: size_t slen;
2361: off_t off = *offp;
2362:
1.14 joerg 2363: KASSERT(((len >> wr->wr_log_dev_bshift) <<
2364: wr->wr_log_dev_bshift) == len);
1.2 simonb 2365:
1.14 joerg 2366: if (off < wr->wr_circ_off)
2367: off = wr->wr_circ_off;
2368: slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2 simonb 2369: if (slen < len) {
2370: len -= slen;
1.14 joerg 2371: off = wr->wr_circ_off;
1.2 simonb 2372: }
2373: off += len;
1.14 joerg 2374: if (off >= wr->wr_circ_off + wr->wr_circ_size)
2375: off = wr->wr_circ_off;
1.2 simonb 2376: *offp = off;
2377: }
2378:
2379: /****************************************************************/
2380:
/*
 * Open a log for replay: read and sanity-check the two commit headers,
 * pick the newer generation, allocate and fill the replay descriptor,
 * and scan the log to build the block hash table and inode list.  On
 * success *wrp holds the new replay handle; on failure all resources
 * are released.
 */
int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = ilog2(blksize);
	size_t used;
	daddr_t pbn;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/* XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_alloc(MAXBSIZE);

	/* Read both commit headers (two adjacent log blocks). */
	pbn = logpbn;
#ifdef _KERNEL
	pbn = btodb(pbn << log_dev_bshift);
#endif
	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 =
	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	/* Use whichever header carries the newer generation. */
	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	wr->wr_scratch = scratch;

	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	/* Size the hash for roughly one entry per logged fs block. */
	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

 errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}
2486:
/*
 * Stop an open replay: release the scratch buffer, drop the log vnode
 * reference, and tear down the block hash table.  A no-op when the
 * replay is not open.
 */
void
wapbl_replay_stop(struct wapbl_replay *wr)
{

	if (!wapbl_replay_isopen(wr))
		return;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));

	wapbl_free(wr->wr_scratch, MAXBSIZE);
	wr->wr_scratch = NULL;

	/* Clearing wr_logvp marks the replay as no longer open. */
	wr->wr_logvp = NULL;

	wapbl_blkhash_clear(wr);
	wapbl_blkhash_free(wr);
}
2504:
2505: void
2506: wapbl_replay_free(struct wapbl_replay *wr)
2507: {
2508:
2509: KDASSERT(!wapbl_replay_isopen(wr));
2510:
2511: if (wr->wr_inodes)
1.18 yamt 2512: wapbl_free(wr->wr_inodes,
2513: wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2514: wapbl_free(wr, sizeof(*wr));
1.2 simonb 2515: }
2516:
#ifdef _KERNEL
/*
 * Out-of-line wrapper around wapbl_replay_isopen() — presumably a
 * macro or inline in the wapbl header (TODO confirm) — for callers
 * that need a linkable function.
 */
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{

	return wapbl_replay_isopen(wr);
}
#endif
1.2 simonb 2525:
/*
 * Process a WAPBL_WC_BLOCKS record during replay: enter each logged
 * fs-level block into the replay hash table, keyed by its physical
 * block number.  *offp points just past the descriptor on entry and
 * is advanced over the logged data.
 */
static void
wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Enter each physical block into the hashtable independently.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++) {
			wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
			    *offp);
			wapbl_circ_advance(wr, fsblklen, offp);
		}
	}
}
2546:
/*
 * Process a WAPBL_WC_REVOCATIONS record during replay: drop any hash
 * table entries for the revoked blocks so they will not be replayed.
 */
static void
wapbl_replay_process_revocations(struct wapbl_replay *wr)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Remove any blocks found from the hashtable.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++)
			wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
	}
}
2564:
/*
 * Process a WAPBL_WC_INODES record during replay: maintain the saved
 * in-use inode table.  A record with wc_clear set restarts the table.
 * oldoff/newoff are the log offsets of the record's start and end,
 * remembered so this log region will not be overwritten.
 */
static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Keep track of where we found this so location won't be
	 * overwritten.
	 */
	if (wc->wc_clear) {
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	/* Grow the array by hand and append this record's inodes. */
	new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}
2602:
/*
 * Scan the log from tail to head, dispatching each record to the
 * matching handler to rebuild the replay state.  The block hash table
 * is cleared before the scan and again on error.
 */
static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
	off_t off;
	int error;

	int logblklen = 1 << wr->wr_log_dev_bshift;

	wapbl_blkhash_clear(wr);

	off = tail;
	while (off != head) {
		struct wapbl_wc_null *wcn;
		off_t saveoff = off;
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto errout;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			wapbl_replay_process_blocks(wr, &off);
			break;

		case WAPBL_WC_REVOCATIONS:
			wapbl_replay_process_revocations(wr);
			break;

		case WAPBL_WC_INODES:
			wapbl_replay_process_inodes(wr, saveoff, off);
			break;

		default:
			printf("Unrecognized wapbl type: 0x%08x\n",
			       wcn->wc_type);
			error = EFTYPE;
			goto errout;
		}
		/*
		 * Cross-check: advancing by the record's stated length
		 * must land exactly where the handler left off.
		 */
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		if (off != saveoff) {
			printf("wapbl_replay: corrupted records\n");
			error = EFTYPE;
			goto errout;
		}
	}
	return 0;

 errout:
	wapbl_blkhash_clear(wr);
	return error;
}
2653:
#if 0
/*
 * NOTE(review): compiled out.  As written this would not build: it
 * references "wch", which is not declared in this function (the
 * commit header fields would need to come from "wr").  Kept for
 * reference: it compares logged block contents against the
 * filesystem device and counts mismatches.
 */
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	void *scratch1 = wapbl_alloc(MAXBSIZE);
	void *scratch2 = wapbl_alloc(MAXBSIZE);
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
#ifdef DEBUG
		off_t saveoff = off;
#endif
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
		{
			struct wapbl_wc_blocklist *wc =
			    (struct wapbl_wc_blocklist *)wr->wr_scratch;
			int i;
			for (i = 0; i < wc->wc_blkcount; i++) {
				int foundcnt = 0;
				int dirtycnt = 0;
				int j, n;
				/*
				 * Check each physical block into the
				 * hashtable independently
				 */
				n = wc->wc_blocks[i].wc_dlen >>
				    wch->wc_fs_dev_bshift;
				for (j = 0; j < n; j++) {
					struct wapbl_blk *wb =
					    wapbl_blkhash_get(wr,
					    wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
					if (wb && (wb->wb_off == off)) {
						foundcnt++;
						error =
						    wapbl_circ_read(wr,
						    scratch1, fsblklen,
						    &off);
						if (error)
							goto out;
						error =
						    wapbl_read(scratch2,
						    fsblklen, fsdevvp,
						    wb->wb_blk);
						if (error)
							goto out;
						if (memcmp(scratch1,
						    scratch2,
						    fsblklen)) {
							printf(
		"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
							    wb->wb_blk, (intmax_t)off);
							dirtycnt++;
							mismatchcnt++;
						}
					} else {
						wapbl_circ_advance(wr,
						    fsblklen, &off);
					}
				}
#if 0
				/*
				 * If all of the blocks in an entry
				 * are clean, then remove all of its
				 * blocks from the hashtable since they
				 * never will need replay.
				 */
				if ((foundcnt != 0) &&
				    (dirtycnt == 0)) {
					off = saveoff;
					wapbl_circ_advance(wr,
					    logblklen, &off);
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						    wapbl_blkhash_get(wr,
						    wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
						if (wb &&
						    (wb->wb_off == off)) {
							wapbl_blkhash_rem(wr, wb->wb_blk);
						}
						wapbl_circ_advance(wr,
						    fsblklen, &off);
					}
				}
#endif
			}
		}
			break;
		case WAPBL_WC_REVOCATIONS:
		case WAPBL_WC_INODES:
			break;
		default:
			KASSERT(0);
		}
#ifdef DEBUG
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		KASSERT(off == saveoff);
#endif
	}
 out:
	wapbl_free(scratch1, MAXBSIZE);
	wapbl_free(scratch2, MAXBSIZE);
	if (!error && mismatchcnt)
		error = EFTYPE;
	return error;
}
#endif
2773:
2774: int
2775: wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2776: {
1.9 joerg 2777: struct wapbl_blk *wb;
2778: size_t i;
1.2 simonb 2779: off_t off;
1.9 joerg 2780: void *scratch;
1.2 simonb 2781: int error = 0;
1.14 joerg 2782: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2 simonb 2783:
2784: KDASSERT(wapbl_replay_isopen(wr));
2785:
1.51 para 2786: scratch = wapbl_alloc(MAXBSIZE);
1.2 simonb 2787:
1.37 drochner 2788: for (i = 0; i <= wr->wr_blkhashmask; ++i) {
1.9 joerg 2789: LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2790: off = wb->wb_off;
2791: error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2792: if (error)
2793: break;
2794: error = wapbl_write(scratch, fsblklen, fsdevvp,
2795: wb->wb_blk);
2796: if (error)
2797: break;
1.2 simonb 2798: }
2799: }
1.9 joerg 2800:
1.18 yamt 2801: wapbl_free(scratch, MAXBSIZE);
1.2 simonb 2802: return error;
2803: }
2804:
2805: int
1.6 joerg 2806: wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2807: {
1.14 joerg 2808: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.6 joerg 2809:
2810: KDASSERT(wapbl_replay_isopen(wr));
2811: KASSERT((len % fsblklen) == 0);
2812:
2813: while (len != 0) {
2814: struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2815: if (wb)
2816: return 1;
2817: len -= fsblklen;
2818: }
2819: return 0;
2820: }
2821:
/*
 * Substitute logged contents into a buffer of fs-level blocks: for
 * each block in the range starting at "blk" that has an entry in the
 * replay hash table, overwrite the corresponding fsblklen bytes of
 * "data" with the version saved in the log.  "len" must be a multiple
 * of the fs device block size.
 */
int
wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb) {
			off_t off = wb->wb_off;
			int error;
			error = wapbl_circ_read(wr, data, fsblklen, &off);
			if (error)
				return error;
		}
		data = (uint8_t *)data + fsblklen;
		len -= fsblklen;
		blk++;
	}
	return 0;
}
1.35 pooka 2846:
#ifdef _KERNEL
/*
 * This is not really a module now, but maybe on it's way to
 * being one some day.
 */
MODULE(MODULE_CLASS_VFS, wapbl, NULL);

/*
 * Module control entry point: initialize wapbl on load.  Unloading is
 * not supported yet (wapbl_fini is behind "notyet").
 */
static int
wapbl_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		wapbl_init();
		return 0;
	case MODULE_CMD_FINI:
#ifdef notyet
		return wapbl_fini(true);
#endif
		return EOPNOTSUPP;
	default:
		return ENOTTY;
	}
}
#endif /* _KERNEL */
CVSweb <webmaster@jp.NetBSD.org>