Annotation of src/sys/kern/vfs_wapbl.c, Revision 1.78
1.78 ! riastrad 1: /* $NetBSD: vfs_wapbl.c,v 1.77 2016/05/07 22:12:29 riastradh Exp $ */
1.2 simonb 2:
3: /*-
1.23 ad 4: * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
1.2 simonb 5: * All rights reserved.
6: *
7: * This code is derived from software contributed to The NetBSD Foundation
8: * by Wasabi Systems, Inc.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
18: *
19: * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29: * POSSIBILITY OF SUCH DAMAGE.
30: */
31:
/*
 * This implements file-system-independent write-ahead logging.
 */
1.4 joerg 35:
36: #define WAPBL_INTERNAL
37:
1.2 simonb 38: #include <sys/cdefs.h>
1.78 ! riastrad 39: __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.77 2016/05/07 22:12:29 riastradh Exp $");
1.2 simonb 40:
41: #include <sys/param.h>
1.31 mlelstv 42: #include <sys/bitops.h>
1.68 riastrad 43: #include <sys/time.h>
44: #include <sys/wapbl.h>
45: #include <sys/wapbl_replay.h>
1.2 simonb 46:
47: #ifdef _KERNEL
1.68 riastrad 48:
49: #include <sys/atomic.h>
50: #include <sys/conf.h>
51: #include <sys/file.h>
52: #include <sys/kauth.h>
53: #include <sys/kernel.h>
54: #include <sys/module.h>
55: #include <sys/mount.h>
56: #include <sys/mutex.h>
1.2 simonb 57: #include <sys/namei.h>
58: #include <sys/proc.h>
1.68 riastrad 59: #include <sys/resourcevar.h>
1.39 christos 60: #include <sys/sysctl.h>
1.2 simonb 61: #include <sys/uio.h>
62: #include <sys/vnode.h>
63:
64: #include <miscfs/specfs/specdev.h>
65:
1.51 para 66: #define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
67: #define wapbl_free(a, s) kmem_free((a), (s))
68: #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
1.2 simonb 69:
1.39 christos 70: static struct sysctllog *wapbl_sysctl;
71: static int wapbl_flush_disk_cache = 1;
72: static int wapbl_verbose_commit = 0;
73:
1.57 joerg 74: static inline size_t wapbl_space_free(size_t, off_t, off_t);
75:
1.2 simonb 76: #else /* !_KERNEL */
1.68 riastrad 77:
1.2 simonb 78: #include <assert.h>
79: #include <errno.h>
1.68 riastrad 80: #include <stdbool.h>
1.2 simonb 81: #include <stdio.h>
82: #include <stdlib.h>
83: #include <string.h>
84:
85: #define KDASSERT(x) assert(x)
86: #define KASSERT(x) assert(x)
1.51 para 87: #define wapbl_alloc(s) malloc(s)
1.18 yamt 88: #define wapbl_free(a, s) free(a)
1.2 simonb 89: #define wapbl_calloc(n, s) calloc((n), (s))
90:
91: #endif /* !_KERNEL */
92:
93: /*
94: * INTERNAL DATA STRUCTURES
95: */
96:
97: /*
98: * This structure holds per-mount log information.
99: *
100: * Legend: a = atomic access only
101: * r = read-only after init
102: * l = rwlock held
103: * m = mutex held
1.38 hannken 104: * lm = rwlock held writing or mutex held
1.2 simonb 105: * u = unlocked access ok
106: * b = bufcache_lock held
107: */
1.60 matt 108: LIST_HEAD(wapbl_ino_head, wapbl_ino);
1.2 simonb 109: struct wapbl {
110: struct vnode *wl_logvp; /* r: log here */
111: struct vnode *wl_devvp; /* r: log on this device */
112: struct mount *wl_mount; /* r: mountpoint wl is associated with */
113: daddr_t wl_logpbn; /* r: Physical block number of start of log */
114: int wl_log_dev_bshift; /* r: logarithm of device block size of log
115: device */
116: int wl_fs_dev_bshift; /* r: logarithm of device block size of
117: filesystem device */
118:
1.3 yamt 119: unsigned wl_lock_count; /* m: Count of transactions in progress */
1.2 simonb 120:
121: size_t wl_circ_size; /* r: Number of bytes in buffer of log */
122: size_t wl_circ_off; /* r: Number of bytes reserved at start */
123:
124: size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
125: size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
126:
127: off_t wl_head; /* l: Byte offset of log head */
128: off_t wl_tail; /* l: Byte offset of log tail */
129: /*
1.71 riastrad 130: * WAPBL log layout, stored on wl_devvp at wl_logpbn:
131: *
132: * ___________________ wl_circ_size __________________
133: * / \
134: * +---------+---------+-------+--------------+--------+
135: * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
136: * +---------+---------+-------+--------------+--------+
137: * wl_circ_off --^ ^-- wl_head ^-- wl_tail
138: *
139: * commit0 and commit1 are commit headers. A commit header has
140: * a generation number, indicating which of the two headers is
141: * more recent, and an assignment of head and tail pointers.
142: * The rest is a circular queue of log records, starting at
143: * the byte offset wl_circ_off.
144: *
145: * E marks empty space for records.
146: * W marks records for block writes issued but waiting.
147: * C marks completed records.
148: *
149: * wapbl_flush writes new records to empty `E' spaces after
150: * wl_head from the current transaction in memory.
151: *
152: * wapbl_truncate advances wl_tail past any completed `C'
153: * records, freeing them up for use.
154: *
155: * head == tail == 0 means log is empty.
156: * head == tail != 0 means log is full.
157: *
158: * See assertions in wapbl_advance() for other boundary
159: * conditions.
160: *
161: * Only wapbl_flush moves the head, except when wapbl_truncate
162: * sets it to 0 to indicate that the log is empty.
163: *
164: * Only wapbl_truncate moves the tail, except when wapbl_flush
165: * sets it to wl_circ_off to indicate that the log is full.
1.2 simonb 166: */
167:
168: struct wapbl_wc_header *wl_wc_header; /* l */
169: void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */
170:
171: kmutex_t wl_mtx; /* u: short-term lock */
172: krwlock_t wl_rwlock; /* u: File system transaction lock */
173:
174: /*
175: * Must be held while accessing
176: * wl_count or wl_bufs or head or tail
177: */
178:
179: /*
180: * Callback called from within the flush routine to flush any extra
181: * bits. Note that flush may be skipped without calling this if
182: * there are no outstanding buffers in the transaction.
183: */
1.5 joerg 184: #if _KERNEL
1.2 simonb 185: wapbl_flush_fn_t wl_flush; /* r */
186: wapbl_flush_fn_t wl_flush_abort;/* r */
1.5 joerg 187: #endif
1.2 simonb 188:
189: size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
190: size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
191: size_t wl_bcount; /* m: Total bcount of wl_bufs */
192:
193: LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
194:
195: kcondvar_t wl_reclaimable_cv; /* m (obviously) */
196: size_t wl_reclaimable_bytes; /* m: Amount of space available for
197: reclamation by truncate */
198: int wl_error_count; /* m: # of wl_entries with errors */
199: size_t wl_reserved_bytes; /* never truncate log smaller than this */
200:
201: #ifdef WAPBL_DEBUG_BUFBYTES
202: size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
203: #endif
204:
1.38 hannken 205: daddr_t *wl_deallocblks;/* lm: address of block */
206: int *wl_dealloclens; /* lm: size of block */
207: int wl_dealloccnt; /* lm: total count */
1.2 simonb 208: int wl_dealloclim; /* l: max count */
209:
210: /* hashtable of inode numbers for allocated but unlinked inodes */
211: /* synch ??? */
1.60 matt 212: struct wapbl_ino_head *wl_inohash;
1.2 simonb 213: u_long wl_inohashmask;
214: int wl_inohashcnt;
215:
216: SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
217: accounting */
1.54 hannken 218:
219: u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */
220: daddr_t wl_buffer_dblk; /* l: buffer disk block address */
221: size_t wl_buffer_used; /* l: buffer current use */
1.2 simonb 222: };
223:
224: #ifdef WAPBL_DEBUG_PRINT
225: int wapbl_debug_print = WAPBL_DEBUG_PRINT;
226: #endif
227:
228: /****************************************************************/
229: #ifdef _KERNEL
230:
231: #ifdef WAPBL_DEBUG
232: struct wapbl *wapbl_debug_wl;
233: #endif
234:
235: static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
236: static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
237: static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
238: static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
239: #endif /* _KERNEL */
240:
1.14 joerg 241: static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
1.2 simonb 242:
1.30 uebayasi 243: static inline size_t wapbl_space_used(size_t avail, off_t head,
1.2 simonb 244: off_t tail);
245:
246: #ifdef _KERNEL
247:
1.51 para 248: static struct pool wapbl_entry_pool;
249:
1.2 simonb 250: #define WAPBL_INODETRK_SIZE 83
251: static int wapbl_ino_pool_refcount;
252: static struct pool wapbl_ino_pool;
253: struct wapbl_ino {
254: LIST_ENTRY(wapbl_ino) wi_hash;
255: ino_t wi_ino;
256: mode_t wi_mode;
257: };
258:
259: static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
260: static void wapbl_inodetrk_free(struct wapbl *wl);
261: static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
262:
263: static size_t wapbl_transaction_len(struct wapbl *wl);
1.30 uebayasi 264: static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
1.2 simonb 265:
1.13 joerg 266: #if 0
1.4 joerg 267: int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
268: #endif
269:
270: static int wapbl_replay_isopen1(struct wapbl_replay *);
271:
1.2 simonb 272: struct wapbl_ops wapbl_ops = {
273: .wo_wapbl_discard = wapbl_discard,
274: .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
1.6 joerg 275: .wo_wapbl_replay_can_read = wapbl_replay_can_read,
1.2 simonb 276: .wo_wapbl_replay_read = wapbl_replay_read,
277: .wo_wapbl_add_buf = wapbl_add_buf,
278: .wo_wapbl_remove_buf = wapbl_remove_buf,
279: .wo_wapbl_resize_buf = wapbl_resize_buf,
280: .wo_wapbl_begin = wapbl_begin,
281: .wo_wapbl_end = wapbl_end,
282: .wo_wapbl_junlock_assert= wapbl_junlock_assert,
283:
284: /* XXX: the following is only used to say "this is a wapbl buf" */
285: .wo_wapbl_biodone = wapbl_biodone,
286: };
287:
1.21 yamt 288: static int
1.39 christos 289: wapbl_sysctl_init(void)
290: {
291: int rv;
292: const struct sysctlnode *rnode, *cnode;
293:
294: wapbl_sysctl = NULL;
295:
296: rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
297: CTLFLAG_PERMANENT,
298: CTLTYPE_NODE, "wapbl",
299: SYSCTL_DESCR("WAPBL journaling options"),
300: NULL, 0, NULL, 0,
1.59 pooka 301: CTL_VFS, CTL_CREATE, CTL_EOL);
1.39 christos 302: if (rv)
303: return rv;
304:
305: rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
306: CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
307: CTLTYPE_INT, "flush_disk_cache",
308: SYSCTL_DESCR("flush disk cache"),
309: NULL, 0, &wapbl_flush_disk_cache, 0,
310: CTL_CREATE, CTL_EOL);
311: if (rv)
312: return rv;
313:
314: rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
315: CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
316: CTLTYPE_INT, "verbose_commit",
317: SYSCTL_DESCR("show time and size of wapbl log commits"),
318: NULL, 0, &wapbl_verbose_commit, 0,
319: CTL_CREATE, CTL_EOL);
320: return rv;
321: }
322:
323: static void
324: wapbl_init(void)
325: {
1.51 para 326:
327: pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
328: "wapblentrypl", &pool_allocator_kmem, IPL_VM);
329:
1.39 christos 330: wapbl_sysctl_init();
331: }
332:
333: static int
1.74 riastrad 334: wapbl_fini(void)
1.39 christos 335: {
1.51 para 336:
1.63 pgoyette 337: if (wapbl_sysctl != NULL)
338: sysctl_teardown(&wapbl_sysctl);
1.51 para 339:
340: pool_destroy(&wapbl_entry_pool);
341:
1.39 christos 342: return 0;
343: }
344:
/*
 * wapbl_start_flush_inodes(wl, wr)
 *
 * Carry the allocated-but-unlinked inode list from a replayed log wr
 * over into the freshly opened log wl: bump the commit generation past
 * the replay's, register each replayed inode in wl's tracking hash, and
 * write the inode records as the first (reserved) content of the new log.
 * Returns 0 on success or the error from wapbl_write_inodes().
 *
 * Called from wapbl_start() only, before the first commit is written.
 */
static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	/* Make the new log supersede the one we replayed from. */
	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	/* Start head == tail at the replayed inode head; the inode
	 * records written below advance wl_head past this point. */
	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
		wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}
391:
/*
 * wapbl_start(wlp, mp, vp, off, count, blksize, wr, flushfn, flushabortfn)
 *
 * Create and initialize a write-ahead log over `count' blocks of size
 * `blksize' starting at file offset `off' of log vnode vp (mapped to the
 * underlying block device with VOP_BMAP).  If wr carries replayed
 * unlinked-inode state, it is preserved into the new log.  On success,
 * the initial commit header is written, *wlp is set to the new handle,
 * and 0 is returned; on failure an errno is returned and all partial
 * state is torn down via errout.
 *
 * flushfn/flushabortfn are the filesystem callbacks invoked during
 * flush/abort to process the dealloc lists.
 */
int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	/* Translate the log's file offset to a physical device block. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be multiple of the least common multiple of those
	 * three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

	wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	/* Coalescing buffer for wapbl_buffered_write(). */
	wl->wl_buffer = wapbl_alloc(MAXPHYS);
	wl->wl_buffer_used = 0;

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	/* Write the initial commit header; the log is now live. */
	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
 errout:
	/* NOTE: only reachable after wl_wc_header and all allocations
	 * above have been set up, so unconditional frees are safe here. */
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}
566:
/*
 * wapbl_discard(wl)
 *
 * Like wapbl_flush, only discards the current in-memory transaction
 * completely: invokes the fs flush callback on the pending dealloc
 * list, empties the unlinked-inode hash, releases every buffer held
 * by the transaction, and detaches all on-disk entries.  Takes and
 * releases wl_rwlock as writer.
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	/* Drain the unlinked-inode hash. */
	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 * (lock order: bufcache_lock before wl_mtx; wl_mtx is dropped
	 * around brelsel because brelse re-enters the transaction code)
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}
699:
/*
 * wapbl_stop(wl, force)
 *
 * Shut down the log: flush the final transaction, verify the log is
 * quiescent, then free all of wl's resources.  If the flush fails or
 * unlinked inodes remain and `force' is zero, the corresponding error
 * (flush errno, or EBUSY) is returned and wl stays usable; with
 * `force' set, the remaining state is discarded instead.  Returns 0
 * once wl has been freed — the handle must not be used afterwards.
 */
int
wapbl_stop(struct wapbl *wl, int force)
{
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	/* The log must be empty and idle at this point. */
	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}
747:
1.71 riastrad 748: /****************************************************************/
749: /*
750: * Unbuffered disk I/O
751: */
752:
1.2 simonb 753: static int
754: wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
755: {
756: struct pstats *pstats = curlwp->l_proc->p_stats;
757: struct buf *bp;
758: int error;
759:
760: KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
761: KASSERT(devvp->v_type == VBLK);
762:
763: if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
1.45 rmind 764: mutex_enter(devvp->v_interlock);
1.2 simonb 765: devvp->v_numoutput++;
1.45 rmind 766: mutex_exit(devvp->v_interlock);
1.2 simonb 767: pstats->p_ru.ru_oublock++;
768: } else {
769: pstats->p_ru.ru_inblock++;
770: }
771:
772: bp = getiobuf(devvp, true);
773: bp->b_flags = flags;
774: bp->b_cflags = BC_BUSY; /* silly & dubious */
775: bp->b_dev = devvp->v_rdev;
776: bp->b_data = data;
777: bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
778: bp->b_blkno = pbn;
1.52 chs 779: BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1.2 simonb 780:
781: WAPBL_PRINTF(WAPBL_PRINT_IO,
1.29 pooka 782: ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
1.2 simonb 783: BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
784: bp->b_blkno, bp->b_dev));
785:
786: VOP_STRATEGY(devvp, bp);
787:
788: error = biowait(bp);
789: putiobuf(bp);
790:
791: if (error) {
792: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
793: ("wapbl_doio: %s %zu bytes at block %" PRId64
1.29 pooka 794: " on dev 0x%"PRIx64" failed with error %d\n",
1.2 simonb 795: (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
796: "write" : "read"),
797: len, pbn, devvp->v_rdev, error));
798: }
799:
800: return error;
801: }
802:
1.71 riastrad 803: /*
804: * wapbl_write(data, len, devvp, pbn)
805: *
806: * Synchronously write len bytes from data to physical block pbn
807: * on devvp.
808: */
1.2 simonb 809: int
810: wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
811: {
812:
813: return wapbl_doio(data, len, devvp, pbn, B_WRITE);
814: }
815:
1.71 riastrad 816: /*
817: * wapbl_read(data, len, devvp, pbn)
818: *
819: * Synchronously read len bytes into data from physical block pbn
820: * on devvp.
821: */
1.2 simonb 822: int
823: wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
824: {
825:
826: return wapbl_doio(data, len, devvp, pbn, B_READ);
827: }
828:
1.71 riastrad 829: /****************************************************************/
830: /*
831: * Buffered disk writes -- try to coalesce writes and emit
832: * MAXPHYS-aligned blocks.
833: */
834:
1.2 simonb 835: /*
1.71 riastrad 836: * wapbl_buffered_flush(wl)
837: *
838: * Flush any buffered writes from wapbl_buffered_write.
1.54 hannken 839: */
840: static int
841: wapbl_buffered_flush(struct wapbl *wl)
842: {
843: int error;
844:
845: if (wl->wl_buffer_used == 0)
846: return 0;
847:
848: error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
849: wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
850: wl->wl_buffer_used = 0;
851:
852: return error;
853: }
854:
/*
 * wapbl_buffered_write(data, len, wl, pbn)
 *
 *	Write len bytes from data to physical block pbn on
 *	wl->wl_devvp.  The write may not complete until
 *	wapbl_buffered_flush.  Data is coalesced in wl->wl_buffer so
 *	that device writes are MAXPHYS-sized and MAXPHYS-aligned
 *	whenever possible.
 */
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
{
	int error;
	size_t resid;

	/*
	 * If not adjacent to buffered data flush first.  Disk block
	 * address is always valid for non-empty buffer.
	 */
	if (wl->wl_buffer_used > 0 &&
	    pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
		error = wapbl_buffered_flush(wl);
		if (error)
			return error;
	}
	/*
	 * If this write goes to an empty buffer we have to
	 * save the disk block address first.
	 */
	if (wl->wl_buffer_used == 0)
		wl->wl_buffer_dblk = pbn;
	/*
	 * Remaining space so this buffer ends on a MAXPHYS boundary.
	 *
	 * Cannot become less or equal zero as the buffer would have been
	 * flushed on the last call then.
	 */
	resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
	    wl->wl_buffer_used;
	KASSERT(resid > 0);
	KASSERT(dbtob(btodb(resid)) == resid);
	if (len >= resid) {
		/* Fill the buffer up to the MAXPHYS boundary and write
		 * it out synchronously; the remainder (< MAXPHYS, now
		 * MAXPHYS-aligned) is re-buffered below. */
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
		wl->wl_buffer_used += resid;
		error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
		    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
		data = (uint8_t *)data + resid;
		len -= resid;
		wl->wl_buffer_dblk = pbn + btodb(resid);
		wl->wl_buffer_used = 0;
		if (error)
			return error;
	}
	KASSERT(len < MAXPHYS);
	if (len > 0) {
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
		wl->wl_buffer_used += len;
	}

	return 0;
}
914:
/*
 * wapbl_circ_write(wl, data, len, offp)
 *
 *	Write len bytes from data to the circular queue of wl, starting
 *	at linear byte offset *offp, and returning the new linear byte
 *	offset in *offp.
 *
 *	If the starting linear byte offset precedes wl->wl_circ_off,
 *	the write instead begins at wl->wl_circ_off.  XXX WTF?  This
 *	should be a KASSERT, not a conditional.
 *
 *	The write is buffered in wl and must be flushed with
 *	wapbl_buffered_flush before it will be submitted to the disk.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	/* len must be a whole number of log device blocks. */
	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	/* Bytes remaining before the queue wraps back to wl_circ_off. */
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		/* Write the tail-end piece first, then wrap. */
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
968:
969: /****************************************************************/
1.71 riastrad 970: /*
971: * WAPBL transactions: entering, adding/removing bufs, and exiting
972: */
1.2 simonb 973:
/*
 * wapbl_begin(wl, file, line)
 *
 *	Start a transaction: if the accumulated log state looks too
 *	large, force a flush first, then take a reader hold on the
 *	journal lock so multiple transactions may proceed
 *	concurrently.  file and line identify the caller for debug
 *	output.  Returns 0 on success or an error from the forced
 *	flush.
 */
int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	/*
	 * Heuristic: flush if the current buffers plus a worst-case
	 * estimate for the in-progress transactions could exceed half
	 * of any resource limit (bufbytes, bufcount, log space, or
	 * the deallocation table).
	 */
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
	    wl->wl_bufbytes_max / 2) ||
	    ((wl->wl_bufcount + (lockcount * 10)) >
	    wl->wl_bufcount_max / 2) ||
	    (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
	    (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	/* Reader hold: transactions run concurrently; flush excludes them. */
	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}
1029:
/*
 * wapbl_end(wl)
 *
 *	End a transaction started with wapbl_begin(): drop the
 *	transaction count and release the reader hold on the journal
 *	lock.
 */
void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

	/*
	 * XXX this could be handled more gracefully, perhaps place
	 * only a partial transaction in the log and allow the
	 * remaining to flush without the protection of the journal.
	 */
	KASSERTMSG((wapbl_transaction_len(wl) <=
		(wl->wl_circ_size - wl->wl_reserved_bytes)),
	    "wapbl_end: current transaction too big to flush");

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}
1058:
/*
 * wapbl_add_buf(wl, bp)
 *
 *	Record the buffer bp as part of the current transaction and
 *	mark it B_LOCKED so it stays in memory until the transaction
 *	is flushed to the log.  A buffer that is already part of the
 *	transaction is moved to the head of the list without changing
 *	the byte/buffer accounting.
 */
void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		/* Already in the transaction: unlink before re-inserting. */
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		   ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked by dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		   ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}
1103:
1104: static void
1105: wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
1106: {
1107:
1108: KASSERT(mutex_owned(&wl->wl_mtx));
1109: KASSERT(bp->b_cflags & BC_BUSY);
1110: wapbl_jlock_assert(wl);
1111:
1112: #if 0
1113: /*
1114: * XXX this might be an issue for swapfiles.
1115: * see uvm_swap.c:1725
1116: *
1117: * XXXdeux: see above
1118: */
1119: KASSERT((bp->b_flags & BC_NOCACHE) == 0);
1120: #endif
1121: KASSERT(bp->b_flags & B_LOCKED);
1122:
1123: WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
1124: ("wapbl_remove_buf thread %d.%d removing buf %p with "
1125: "%d bytes %d bcount\n",
1126: curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
1127:
1128: KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
1129: wl->wl_bufbytes -= bp->b_bufsize;
1130: KASSERT(wl->wl_bcount >= bp->b_bcount);
1131: wl->wl_bcount -= bp->b_bcount;
1132: KASSERT(wl->wl_bufcount > 0);
1133: wl->wl_bufcount--;
1134: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1135: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1136: LIST_REMOVE(bp, b_wapbllist);
1137:
1138: bp->b_flags &= ~B_LOCKED;
1139: }
1140:
/*
 * wapbl_remove_buf(wl, bp)
 *
 *	Mutex-taking wrapper around wapbl_remove_buf_locked().
 *	Called from brelsel() in vfs_bio among other places.
 */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}
1150:
1151: void
1152: wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
1153: {
1154:
1155: KASSERT(bp->b_cflags & BC_BUSY);
1156:
1157: /*
1158: * XXX: why does this depend on B_LOCKED? otherwise the buf
1159: * is not for a transaction? if so, why is this called in the
1160: * first place?
1161: */
1162: if (bp->b_flags & B_LOCKED) {
1163: mutex_enter(&wl->wl_mtx);
1164: wl->wl_bufbytes += bp->b_bufsize - oldsz;
1165: wl->wl_bcount += bp->b_bcount - oldcnt;
1166: mutex_exit(&wl->wl_mtx);
1167: }
1168: }
1169:
1170: #endif /* _KERNEL */
1171:
1172: /****************************************************************/
1173: /* Some utility inlines */
1174:
1.71 riastrad 1175: /*
1176: * wapbl_space_used(avail, head, tail)
1177: *
1178: * Number of bytes used in a circular queue of avail total bytes,
1179: * from tail to head.
1180: */
1.56 joerg 1181: static inline size_t
1182: wapbl_space_used(size_t avail, off_t head, off_t tail)
1183: {
1184:
1185: if (tail == 0) {
1186: KASSERT(head == 0);
1187: return 0;
1188: }
1189: return ((head + (avail - 1) - tail) % avail) + 1;
1190: }
1191:
1192: #ifdef _KERNEL
1.71 riastrad 1193: /*
1194: * wapbl_advance(size, off, oldoff, delta)
1195: *
1196: * Given a byte offset oldoff into a circular queue of size bytes
1197: * starting at off, return a new byte offset oldoff + delta into
1198: * the circular queue.
1199: */
1.30 uebayasi 1200: static inline off_t
1.60 matt 1201: wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
1.2 simonb 1202: {
1.60 matt 1203: off_t newoff;
1.2 simonb 1204:
1205: /* Define acceptable ranges for inputs. */
1.46 christos 1206: KASSERT(delta <= (size_t)size);
1.60 matt 1207: KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
1208: KASSERT(oldoff < (off_t)(size + off));
1.2 simonb 1209:
1.60 matt 1210: if ((oldoff == 0) && (delta != 0))
1211: newoff = off + delta;
1212: else if ((oldoff + delta) < (size + off))
1213: newoff = oldoff + delta;
1.2 simonb 1214: else
1.60 matt 1215: newoff = (oldoff + delta) - size;
1.2 simonb 1216:
1217: /* Note some interesting axioms */
1.60 matt 1218: KASSERT((delta != 0) || (newoff == oldoff));
1219: KASSERT((delta == 0) || (newoff != 0));
1220: KASSERT((delta != (size)) || (newoff == oldoff));
1.2 simonb 1221:
1222: /* Define acceptable ranges for output. */
1.60 matt 1223: KASSERT((newoff == 0) || ((size_t)newoff >= off));
1224: KASSERT((size_t)newoff < (size + off));
1225: return newoff;
1.2 simonb 1226: }
1227:
1.71 riastrad 1228: /*
1229: * wapbl_space_free(avail, head, tail)
1230: *
1231: * Number of bytes free in a circular queue of avail total bytes,
1232: * in which everything from tail to head is used.
1233: */
1.30 uebayasi 1234: static inline size_t
1.2 simonb 1235: wapbl_space_free(size_t avail, off_t head, off_t tail)
1236: {
1237:
1238: return avail - wapbl_space_used(avail, head, tail);
1239: }
1240:
1.71 riastrad 1241: /*
1242: * wapbl_advance_head(size, off, delta, headp, tailp)
1243: *
1244: * In a circular queue of size bytes starting at off, given the
1245: * old head and tail offsets *headp and *tailp, store the new head
1246: * and tail offsets in *headp and *tailp resulting from adding
1247: * delta bytes of data to the head.
1248: */
1.30 uebayasi 1249: static inline void
1.2 simonb 1250: wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
1251: off_t *tailp)
1252: {
1253: off_t head = *headp;
1254: off_t tail = *tailp;
1255:
1256: KASSERT(delta <= wapbl_space_free(size, head, tail));
1257: head = wapbl_advance(size, off, head, delta);
1258: if ((tail == 0) && (head != 0))
1259: tail = off;
1260: *headp = head;
1261: *tailp = tail;
1262: }
1263:
1.71 riastrad 1264: /*
1265: * wapbl_advance_tail(size, off, delta, headp, tailp)
1266: *
1267: * In a circular queue of size bytes starting at off, given the
1268: * old head and tail offsets *headp and *tailp, store the new head
1269: * and tail offsets in *headp and *tailp resulting from removing
1270: * delta bytes of data from the tail.
1271: */
1.30 uebayasi 1272: static inline void
1.2 simonb 1273: wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
1274: off_t *tailp)
1275: {
1276: off_t head = *headp;
1277: off_t tail = *tailp;
1278:
1279: KASSERT(delta <= wapbl_space_used(size, head, tail));
1280: tail = wapbl_advance(size, off, tail, delta);
1281: if (head == tail) {
1282: head = tail = 0;
1283: }
1284: *headp = head;
1285: *tailp = tail;
1286: }
1287:
1288:
1289: /****************************************************************/
1290:
1291: /*
1.73 riastrad 1292: * wapbl_truncate(wl, minfree)
1.71 riastrad 1293: *
1294: * Wait until at least minfree bytes are available in the log.
1295: *
1.73 riastrad 1296: * If it was necessary to wait for writes to complete,
1297: * advance the circular queue tail to reflect the new write
1298: * completions and issue a write commit to the log.
1.71 riastrad 1299: *
1300: * => Caller must hold wl->wl_rwlock writer lock.
1.2 simonb 1301: */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	/*
	 * Sleep until enough bytes of completed log writes become
	 * reclaimable (wapbl_biodone broadcasts the cv), or an I/O
	 * error is posted on the log.
	 */
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	/* Commit is on disk; publish the new queue pointers. */
	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}
1387:
1388: /****************************************************************/
1389:
/*
 * wapbl_biodone(bp)
 *
 *	I/O completion callback installed on transaction buffers by
 *	wapbl_flush().  Updates the per-entry and per-log accounting
 *	and reclaims log space for any leading run of fully-written
 *	entries, waking waiters in wapbl_truncate().  If the log has
 *	already been decommissioned (we->we_wapbl == NULL), only the
 *	entry accounting is updated.
 */
void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;
#ifdef WAPBL_DEBUG_BUFBYTES
	const int bufsize = bp->b_bufsize;
#endif

	/*
	 * Handle possible flushing of buffers after log has been
	 * decomissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			/* Last buffer of the entry: free the entry itself. */
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
		/*
		 * If an error occurs, it would be nice to leave the buffer
		 * as a delayed write on the LRU queue so that we can retry
		 * it later. But buffercache(9) can't handle dirty buffer
		 * reuse, so just mark the log permanently errored out.
		 */
		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
	}

	/*
	 * Release the buffer here. wapbl_flush() may wait for the
	 * log to become empty and we better unbusy the buffer before
	 * wapbl_flush() returns.
	 */
	brelse(bp, 0);

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bufsize);
	we->we_unsynced_bufbytes -= bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
	wl->wl_unsynced_bufbytes -= bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		    (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			/* Wake wapbl_truncate() sleepers. */
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
}
1501:
1502: /*
1.71 riastrad 1503: * wapbl_flush(wl, wait)
1504: *
1505: * Flush pending block writes, deallocations, and inodes from
1506: * the current transaction in memory to the log on disk:
1507: *
1508: * 1. Call the file system's wl_flush callback to flush any
1509: * per-file-system pending updates.
1510: * 2. Wait for enough space in the log for the current transaction.
1511: * 3. Synchronously write the new log records, advancing the
1512: * circular queue head.
1.77 riastrad 1513: * 4. Issue the pending block writes asynchronously, now that they
1514: * are recorded in the log and can be replayed after crash.
1515: * 5. If wait is true, wait for all writes to complete and for the
1516: * log to become empty.
1.71 riastrad 1517: *
1518: * On failure, call the file system's wl_flush_abort callback.
1.2 simonb 1519: */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	/* Step 1: let the file system issue its deferred updates. */
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are exclusively locked and the file system has
	 * issued any deferred block writes for this transaction, check
	 * whether there are any blocks to write to the log.  If not,
	 * skip waiting for space or writing any log entries.
	 *
	 * XXX Shouldn't this also check wl_dealloccnt and
	 * wl_inohashcnt?  Perhaps wl_dealloccnt doesn't matter if the
	 * file system didn't produce any blocks as a consequence of
	 * it, but the same does not seem to be so of wl_inohashcnt.
	 */
	if (wl->wl_bufcount == 0) {
		goto wait_out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	      "bufcount=%zu bufbytes=%zu\n",
	       curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	       wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush");
	}

	/* Step 2: wait for enough free space in the log. */
	error = wapbl_truncate(wl, flushsize);
	if (error)
		goto out;

	/* Step 3: write blocks, revocations, and inodes to the log. */
	off = wl->wl_head;
	KASSERT((off == 0) || (off >= wl->wl_circ_off));
	KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out;

	/* Keep space reserved to rewrite the inode records later. */
	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);

	/* The advanced head must agree with the write offset. */
	KASSERTMSG(head == off,
	    "lost head! head=%"PRIdMAX" tail=%" PRIdMAX
	    " off=%"PRIdMAX" flush=%zu",
	    (intmax_t)head, (intmax_t)tail, (intmax_t)off,
	    flushsize);

	/* Opportunistically move the tail forward if we can */
	mutex_enter(&wl->wl_mtx);
	delta = wl->wl_reclaimable_bytes;
	mutex_exit(&wl->wl_mtx);
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
	    &head, &tail);

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out;

	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 " unsynced=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
		 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
		 wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
		 wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	/* Commit is on disk: publish the new state and set up the entry. */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * Step 4: issue the buffers asynchronously, with
	 * wapbl_biodone() as the completion callback.
	 *
	 * this flushes bufs in reverse order than they were queued
	 * it shouldn't matter, but if we care we could use TAILQ instead.
	 * XXX Note they will get put on the lru queue when they flush
	 * so we might actually want to change this to preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		/* Drop the locks across the actual I/O submission. */
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		     ("wapbl_flush thread %d.%d done flushing entries...\n",
		     curproc->p_pid, curlwp->l_lid));
#endif

 wait_out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
			wl->wl_reserved_bytes);
	}

 out:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n", we->we_bufcount,
			    we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}
1792:
1793: /****************************************************************/
1794:
/*
 * wapbl_jlock_assert(wl)
 *
 *	Assert that the journal lock is held, as reader or writer.
 */
void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}
1801:
/*
 * wapbl_junlock_assert(wl)
 *
 *	Assert that the journal lock is not write-held.
 */
void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}
1808:
1809: /****************************************************************/
1810:
1811: /* locks missing */
1812: void
1813: wapbl_print(struct wapbl *wl,
1814: int full,
1815: void (*pr)(const char *, ...))
1816: {
1817: struct buf *bp;
1818: struct wapbl_entry *we;
1819: (*pr)("wapbl %p", wl);
1820: (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
1821: wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
1822: (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1823: wl->wl_circ_size, wl->wl_circ_off,
1824: (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1825: (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1826: wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
1827: #ifdef WAPBL_DEBUG_BUFBYTES
1828: (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1829: "reserved = %zu errcnt = %d unsynced = %zu\n",
1830: wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1831: wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1832: wl->wl_error_count, wl->wl_unsynced_bufbytes);
1833: #else
1834: (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1835: "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
1836: wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1837: wl->wl_error_count);
1838: #endif
1839: (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
1840: wl->wl_dealloccnt, wl->wl_dealloclim);
1841: (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1842: wl->wl_inohashcnt, wl->wl_inohashmask);
1843: (*pr)("entries:\n");
1844: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1845: #ifdef WAPBL_DEBUG_BUFBYTES
1846: (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1847: "unsynced = %zu\n",
1848: we->we_bufcount, we->we_reclaimable_bytes,
1849: we->we_error, we->we_unsynced_bufbytes);
1850: #else
1851: (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1852: we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1853: #endif
1854: }
1855: if (full) {
1856: int cnt = 0;
1857: (*pr)("bufs =");
1858: LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1859: if (!LIST_NEXT(bp, b_wapbllist)) {
1860: (*pr)(" %p", bp);
1861: } else if ((++cnt % 6) == 0) {
1862: (*pr)(" %p,\n\t", bp);
1863: } else {
1864: (*pr)(" %p,", bp);
1865: }
1866: }
1867: (*pr)("\n");
1868:
1869: (*pr)("dealloced blks = ");
1870: {
1871: int i;
1872: cnt = 0;
1873: for (i = 0; i < wl->wl_dealloccnt; i++) {
1874: (*pr)(" %"PRId64":%d,",
1875: wl->wl_deallocblks[i],
1876: wl->wl_dealloclens[i]);
1877: if ((++cnt % 4) == 0) {
1878: (*pr)("\n\t");
1879: }
1880: }
1881: }
1882: (*pr)("\n");
1883:
1884: (*pr)("registered inodes = ");
1885: {
1886: int i;
1887: cnt = 0;
1888: for (i = 0; i <= wl->wl_inohashmask; i++) {
1889: struct wapbl_ino_head *wih;
1890: struct wapbl_ino *wi;
1891:
1892: wih = &wl->wl_inohash[i];
1893: LIST_FOREACH(wi, wih, wi_hash) {
1894: if (wi->wi_ino == 0)
1895: continue;
1.55 christos 1896: (*pr)(" %"PRIu64"/0%06"PRIo32",",
1.2 simonb 1897: wi->wi_ino, wi->wi_mode);
1898: if ((++cnt % 4) == 0) {
1899: (*pr)("\n\t");
1900: }
1901: }
1902: }
1903: (*pr)("\n");
1904: }
1905: }
1906: }
1907:
#if defined(WAPBL_DEBUG) || defined(DDB)
/*
 * wapbl_dump(wl)
 *
 *	Debugger helper: print a full dump of the log state of wl.
 *	Under WAPBL_DEBUG, a NULL wl falls back to wapbl_debug_wl.
 */
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif
1921:
1922: /****************************************************************/
1923:
/*
 * wapbl_register_deallocation(wl, blk, len)
 *
 *	Record the deallocation of len bytes at block blk as part of
 *	the current transaction.  Panics if the deallocation table is
 *	full (see XXX comments below).
 */
void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);
	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
	mutex_exit(&wl->wl_mtx);
}
1947:
1948: /****************************************************************/
1949:
1950: static void
1951: wapbl_inodetrk_init(struct wapbl *wl, u_int size)
1952: {
1953:
1954: wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
1955: if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
1956: pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
1957: "wapblinopl", &pool_allocator_nointr, IPL_NONE);
1958: }
1959: }
1960:
1961: static void
1962: wapbl_inodetrk_free(struct wapbl *wl)
1963: {
1964:
1965: /* XXX this KASSERT needs locking/mutex analysis */
1966: KASSERT(wl->wl_inohashcnt == 0);
1967: hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
1968: if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
1969: pool_destroy(&wapbl_ino_pool);
1970: }
1971: }
1972:
1973: static struct wapbl_ino *
1974: wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1975: {
1976: struct wapbl_ino_head *wih;
1977: struct wapbl_ino *wi;
1978:
1979: KASSERT(mutex_owned(&wl->wl_mtx));
1980:
1981: wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1982: LIST_FOREACH(wi, wih, wi_hash) {
1983: if (ino == wi->wi_ino)
1984: return wi;
1985: }
1986: return 0;
1987: }
1988:
1989: void
1990: wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1991: {
1992: struct wapbl_ino_head *wih;
1993: struct wapbl_ino *wi;
1994:
1995: wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
1996:
1997: mutex_enter(&wl->wl_mtx);
1998: if (wapbl_inodetrk_get(wl, ino) == NULL) {
1999: wi->wi_ino = ino;
2000: wi->wi_mode = mode;
2001: wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2002: LIST_INSERT_HEAD(wih, wi, wi_hash);
2003: wl->wl_inohashcnt++;
2004: WAPBL_PRINTF(WAPBL_PRINT_INODE,
2005: ("wapbl_register_inode: ino=%"PRId64"\n", ino));
2006: mutex_exit(&wl->wl_mtx);
2007: } else {
2008: mutex_exit(&wl->wl_mtx);
2009: pool_put(&wapbl_ino_pool, wi);
2010: }
2011: }
2012:
2013: void
2014: wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2015: {
2016: struct wapbl_ino *wi;
2017:
2018: mutex_enter(&wl->wl_mtx);
2019: wi = wapbl_inodetrk_get(wl, ino);
2020: if (wi) {
2021: WAPBL_PRINTF(WAPBL_PRINT_INODE,
2022: ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
2023: KASSERT(wl->wl_inohashcnt > 0);
2024: wl->wl_inohashcnt--;
2025: LIST_REMOVE(wi, wi_hash);
2026: mutex_exit(&wl->wl_mtx);
2027:
2028: pool_put(&wapbl_ino_pool, wi);
2029: } else {
2030: mutex_exit(&wl->wl_mtx);
2031: }
2032: }
2033:
2034: /****************************************************************/
2035:
1.71 riastrad 2036: /*
2037: * wapbl_transaction_inodes_len(wl)
2038: *
2039: * Calculate the number of bytes required for inode registration
2040: * log records in wl.
2041: */
1.30 uebayasi 2042: static inline size_t
1.2 simonb 2043: wapbl_transaction_inodes_len(struct wapbl *wl)
2044: {
2045: int blocklen = 1<<wl->wl_log_dev_bshift;
2046: int iph;
2047:
2048: /* Calculate number of inodes described in a inodelist header */
2049: iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2050: sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2051:
2052: KASSERT(iph > 0);
2053:
1.39 christos 2054: return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
1.2 simonb 2055: }
2056:
2057:
1.71 riastrad 2058: /*
2059: * wapbl_transaction_len(wl)
2060: *
2061: * Calculate number of bytes required for all log records in wl.
2062: */
1.2 simonb 2063: static size_t
2064: wapbl_transaction_len(struct wapbl *wl)
2065: {
2066: int blocklen = 1<<wl->wl_log_dev_bshift;
2067: size_t len;
2068: int bph;
2069:
2070: /* Calculate number of blocks described in a blocklist header */
2071: bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2072: sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2073:
2074: KASSERT(bph > 0);
2075:
2076: len = wl->wl_bcount;
1.39 christos 2077: len += howmany(wl->wl_bufcount, bph) * blocklen;
2078: len += howmany(wl->wl_dealloccnt, bph) * blocklen;
1.2 simonb 2079: len += wapbl_transaction_inodes_len(wl);
2080:
2081: return len;
2082: }
2083:
2084: /*
1.71 riastrad 2085: * wapbl_cache_sync(wl, msg)
2086: *
2087: * Issue DIOCCACHESYNC to wl->wl_devvp.
2088: *
2089: * If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
2090: * including msg about the duration of the cache sync.
1.48 yamt 2091: */
2092: static int
2093: wapbl_cache_sync(struct wapbl *wl, const char *msg)
2094: {
2095: const bool verbose = wapbl_verbose_commit >= 2;
2096: struct bintime start_time;
2097: int force = 1;
2098: int error;
2099:
2100: if (!wapbl_flush_disk_cache) {
2101: return 0;
2102: }
2103: if (verbose) {
2104: bintime(&start_time);
2105: }
2106: error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2107: FWRITE, FSCRED);
2108: if (error) {
2109: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1.76 riastrad 2110: ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
2111: "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
1.48 yamt 2112: }
2113: if (verbose) {
2114: struct bintime d;
2115: struct timespec ts;
2116:
2117: bintime(&d);
2118: bintime_sub(&d, &start_time);
2119: bintime2timespec(&d, &ts);
2120: printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2121: msg, (uintmax_t)wl->wl_devvp->v_rdev,
2122: (uintmax_t)ts.tv_sec, ts.tv_nsec);
2123: }
2124: return error;
2125: }
2126:
2127: /*
1.71 riastrad 2128: * wapbl_write_commit(wl, head, tail)
2129: *
2130: * Issue a disk cache sync to wait for all pending writes to the
2131: * log to complete, and then synchronously commit the current
2132: * circular queue head and tail to the log, in the next of two
2133: * locations for commit headers on disk.
1.2 simonb 2134: *
1.71 riastrad 2135: * Increment the generation number. If the generation number
2136: * rolls over to zero, then a subsequent commit would appear to
2137: * have an older generation than this one -- in that case, issue a
2138: * duplicate commit to avoid this.
2139: *
2140: * => Caller must have exclusive access to wl, either by holding
2141: * wl->wl_rwlock for writer or by being wapbl_start before anyone
2142: * else has seen wl.
1.2 simonb 2143: */
2144: static int
2145: wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2146: {
2147: struct wapbl_wc_header *wc = wl->wl_wc_header;
2148: struct timespec ts;
2149: int error;
1.34 mlelstv 2150: daddr_t pbn;
1.2 simonb 2151:
1.54 hannken 2152: error = wapbl_buffered_flush(wl);
2153: if (error)
2154: return error;
1.49 yamt 2155: /*
2156: * flush disk cache to ensure that blocks we've written are actually
2157: * written to the stable storage before the commit header.
2158: *
2159: * XXX Calc checksum here, instead we do this for now
2160: */
1.48 yamt 2161: wapbl_cache_sync(wl, "1");
1.2 simonb 2162:
2163: wc->wc_head = head;
2164: wc->wc_tail = tail;
2165: wc->wc_checksum = 0;
2166: wc->wc_version = 1;
2167: getnanotime(&ts);
1.17 yamt 2168: wc->wc_time = ts.tv_sec;
1.2 simonb 2169: wc->wc_timensec = ts.tv_nsec;
2170:
2171: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2172: ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
2173: (intmax_t)head, (intmax_t)tail));
2174:
2175: /*
1.49 yamt 2176: * write the commit header.
2177: *
1.2 simonb 2178: * XXX if generation will rollover, then first zero
2179: * over second commit header before trying to write both headers.
2180: */
2181:
1.34 mlelstv 2182: pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2183: #ifdef _KERNEL
2184: pbn = btodb(pbn << wc->wc_log_dev_bshift);
2185: #endif
1.54 hannken 2186: error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2187: if (error)
2188: return error;
2189: error = wapbl_buffered_flush(wl);
1.2 simonb 2190: if (error)
2191: return error;
2192:
1.49 yamt 2193: /*
2194: * flush disk cache to ensure that the commit header is actually
2195: * written before meta data blocks.
2196: */
1.48 yamt 2197: wapbl_cache_sync(wl, "2");
1.2 simonb 2198:
2199: /*
2200: * If the generation number was zero, write it out a second time.
2201: * This handles initialization and generation number rollover
2202: */
2203: if (wc->wc_generation++ == 0) {
2204: error = wapbl_write_commit(wl, head, tail);
2205: /*
2206: * This panic should be able to be removed if we do the
2207: * zero'ing mentioned above, and we are certain to roll
2208: * back generation number on failure.
2209: */
2210: if (error)
2211: panic("wapbl_write_commit: error writing duplicate "
1.66 riastrad 2212: "log header: %d", error);
1.2 simonb 2213: }
2214: return 0;
2215: }
2216:
1.71 riastrad 2217: /*
2218: * wapbl_write_blocks(wl, offp)
2219: *
2220: * Write all pending physical blocks in the current transaction
2221: * from wapbl_add_buf to the log on disk, adding to the circular
2222: * queue head at byte offset *offp, and returning the new head's
2223: * byte offset in *offp.
2224: */
1.2 simonb 2225: static int
2226: wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2227: {
2228: struct wapbl_wc_blocklist *wc =
2229: (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2230: int blocklen = 1<<wl->wl_log_dev_bshift;
2231: int bph;
2232: struct buf *bp;
2233: off_t off = *offp;
2234: int error;
1.7 joerg 2235: size_t padding;
1.2 simonb 2236:
2237: KASSERT(rw_write_held(&wl->wl_rwlock));
2238:
2239: bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2240: sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2241:
2242: bp = LIST_FIRST(&wl->wl_bufs);
2243:
2244: while (bp) {
2245: int cnt;
2246: struct buf *obp = bp;
2247:
2248: KASSERT(bp->b_flags & B_LOCKED);
2249:
2250: wc->wc_type = WAPBL_WC_BLOCKS;
2251: wc->wc_len = blocklen;
2252: wc->wc_blkcount = 0;
2253: while (bp && (wc->wc_blkcount < bph)) {
2254: /*
2255: * Make sure all the physical block numbers are up to
2256: * date. If this is not always true on a given
2257: * filesystem, then VOP_BMAP must be called. We
2258: * could call VOP_BMAP here, or else in the filesystem
2259: * specific flush callback, although neither of those
2260: * solutions allow us to take the vnode lock. If a
2261: * filesystem requires that we must take the vnode lock
2262: * to call VOP_BMAP, then we can probably do it in
2263: * bwrite when the vnode lock should already be held
2264: * by the invoking code.
2265: */
2266: KASSERT((bp->b_vp->v_type == VBLK) ||
2267: (bp->b_blkno != bp->b_lblkno));
2268: KASSERT(bp->b_blkno > 0);
2269:
2270: wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2271: wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2272: wc->wc_len += bp->b_bcount;
2273: wc->wc_blkcount++;
2274: bp = LIST_NEXT(bp, b_wapbllist);
2275: }
1.7 joerg 2276: if (wc->wc_len % blocklen != 0) {
2277: padding = blocklen - wc->wc_len % blocklen;
2278: wc->wc_len += padding;
2279: } else {
2280: padding = 0;
2281: }
2282:
1.2 simonb 2283: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1.7 joerg 2284: ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2285: wc->wc_len, padding, (intmax_t)off));
1.2 simonb 2286:
2287: error = wapbl_circ_write(wl, wc, blocklen, &off);
2288: if (error)
2289: return error;
2290: bp = obp;
2291: cnt = 0;
2292: while (bp && (cnt++ < bph)) {
2293: error = wapbl_circ_write(wl, bp->b_data,
2294: bp->b_bcount, &off);
2295: if (error)
2296: return error;
2297: bp = LIST_NEXT(bp, b_wapbllist);
2298: }
1.7 joerg 2299: if (padding) {
2300: void *zero;
2301:
1.51 para 2302: zero = wapbl_alloc(padding);
1.7 joerg 2303: memset(zero, 0, padding);
2304: error = wapbl_circ_write(wl, zero, padding, &off);
1.18 yamt 2305: wapbl_free(zero, padding);
1.7 joerg 2306: if (error)
2307: return error;
2308: }
1.2 simonb 2309: }
2310: *offp = off;
2311: return 0;
2312: }
2313:
1.71 riastrad 2314: /*
2315: * wapbl_write_revocations(wl, offp)
2316: *
2317: * Write all pending deallocations in the current transaction from
2318: * wapbl_register_deallocation to the log on disk, adding to the
2319: * circular queue's head at byte offset *offp, and returning the
2320: * new head's byte offset in *offp.
2321: */
1.2 simonb 2322: static int
2323: wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2324: {
2325: struct wapbl_wc_blocklist *wc =
2326: (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2327: int i;
2328: int blocklen = 1<<wl->wl_log_dev_bshift;
2329: int bph;
2330: off_t off = *offp;
2331: int error;
2332:
2333: if (wl->wl_dealloccnt == 0)
2334: return 0;
2335:
2336: bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2337: sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2338:
2339: i = 0;
2340: while (i < wl->wl_dealloccnt) {
2341: wc->wc_type = WAPBL_WC_REVOCATIONS;
2342: wc->wc_len = blocklen;
2343: wc->wc_blkcount = 0;
2344: while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2345: wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2346: wl->wl_deallocblks[i];
2347: wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2348: wl->wl_dealloclens[i];
2349: wc->wc_blkcount++;
2350: i++;
2351: }
2352: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2353: ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2354: wc->wc_len, (intmax_t)off));
2355: error = wapbl_circ_write(wl, wc, blocklen, &off);
2356: if (error)
2357: return error;
2358: }
2359: *offp = off;
2360: return 0;
2361: }
2362:
1.71 riastrad 2363: /*
2364: * wapbl_write_inodes(wl, offp)
2365: *
2366: * Write all pending inode allocations in the current transaction
2367: * from wapbl_register_inode to the log on disk, adding to the
2368: * circular queue's head at byte offset *offp and returning the
2369: * new head's byte offset in *offp.
2370: */
1.2 simonb 2371: static int
2372: wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2373: {
2374: struct wapbl_wc_inodelist *wc =
2375: (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2376: int i;
1.14 joerg 2377: int blocklen = 1 << wl->wl_log_dev_bshift;
1.2 simonb 2378: off_t off = *offp;
2379: int error;
2380:
2381: struct wapbl_ino_head *wih;
2382: struct wapbl_ino *wi;
2383: int iph;
2384:
2385: iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2386: sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2387:
2388: i = 0;
2389: wih = &wl->wl_inohash[0];
2390: wi = 0;
2391: do {
2392: wc->wc_type = WAPBL_WC_INODES;
2393: wc->wc_len = blocklen;
2394: wc->wc_inocnt = 0;
2395: wc->wc_clear = (i == 0);
2396: while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2397: while (!wi) {
2398: KASSERT((wih - &wl->wl_inohash[0])
2399: <= wl->wl_inohashmask);
2400: wi = LIST_FIRST(wih++);
2401: }
2402: wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2403: wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2404: wc->wc_inocnt++;
2405: i++;
2406: wi = LIST_NEXT(wi, wi_hash);
2407: }
2408: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2409: ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2410: wc->wc_len, (intmax_t)off));
2411: error = wapbl_circ_write(wl, wc, blocklen, &off);
2412: if (error)
2413: return error;
2414: } while (i < wl->wl_inohashcnt);
2415:
2416: *offp = off;
2417: return 0;
2418: }
2419:
2420: #endif /* _KERNEL */
2421:
2422: /****************************************************************/
2423:
/*
 * wapbl_blk
 *
 *	Replay-time record mapping a file system disk block to the log
 *	offset of its most recent journalled copy.  Entries live in
 *	wr->wr_blkhash; see wapbl_blkhash_ins/get/rem below.
 */
struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;	/* hash chain linkage */
	daddr_t wb_blk;			/* disk address of the block */
	off_t wb_off; /* Offset of this block in the log */
};
/* Minimum bucket-count hint for the replay block hash table. */
#define WAPBL_BLKPOOL_MIN 83
2430:
2431: static void
2432: wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2433: {
2434: if (size < WAPBL_BLKPOOL_MIN)
2435: size = WAPBL_BLKPOOL_MIN;
2436: KASSERT(wr->wr_blkhash == 0);
2437: #ifdef _KERNEL
2438: wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2439: #else /* ! _KERNEL */
2440: /* Manually implement hashinit */
2441: {
1.25 lukem 2442: unsigned long i, hashsize;
1.2 simonb 2443: for (hashsize = 1; hashsize < size; hashsize <<= 1)
2444: continue;
1.51 para 2445: wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
1.37 drochner 2446: for (i = 0; i < hashsize; i++)
1.2 simonb 2447: LIST_INIT(&wr->wr_blkhash[i]);
2448: wr->wr_blkhashmask = hashsize - 1;
2449: }
2450: #endif /* ! _KERNEL */
2451: }
2452:
/*
 * wapbl_blkhash_free(wr)
 *
 *	Release the replay block hash table.  The table must already
 *	be empty (see wapbl_blkhash_clear).
 */
static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
	KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
	/* Userland allocated mask+1 buckets in wapbl_blkhash_init. */
	wapbl_free(wr->wr_blkhash,
	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* ! _KERNEL */
}
2464:
2465: static struct wapbl_blk *
2466: wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2467: {
2468: struct wapbl_blk_head *wbh;
2469: struct wapbl_blk *wb;
2470: wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2471: LIST_FOREACH(wb, wbh, wb_hash) {
2472: if (blk == wb->wb_blk)
2473: return wb;
2474: }
2475: return 0;
2476: }
2477:
2478: static void
2479: wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2480: {
2481: struct wapbl_blk_head *wbh;
2482: struct wapbl_blk *wb;
2483: wb = wapbl_blkhash_get(wr, blk);
2484: if (wb) {
2485: KASSERT(wb->wb_blk == blk);
2486: wb->wb_off = off;
2487: } else {
1.51 para 2488: wb = wapbl_alloc(sizeof(*wb));
1.2 simonb 2489: wb->wb_blk = blk;
2490: wb->wb_off = off;
2491: wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2492: LIST_INSERT_HEAD(wbh, wb, wb_hash);
2493: wr->wr_blkhashcnt++;
2494: }
2495: }
2496:
2497: static void
2498: wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2499: {
2500: struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2501: if (wb) {
2502: KASSERT(wr->wr_blkhashcnt > 0);
2503: wr->wr_blkhashcnt--;
2504: LIST_REMOVE(wb, wb_hash);
1.18 yamt 2505: wapbl_free(wb, sizeof(*wb));
1.2 simonb 2506: }
2507: }
2508:
2509: static void
2510: wapbl_blkhash_clear(struct wapbl_replay *wr)
2511: {
1.25 lukem 2512: unsigned long i;
1.2 simonb 2513: for (i = 0; i <= wr->wr_blkhashmask; i++) {
2514: struct wapbl_blk *wb;
2515:
2516: while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2517: KASSERT(wr->wr_blkhashcnt > 0);
2518: wr->wr_blkhashcnt--;
2519: LIST_REMOVE(wb, wb_hash);
1.18 yamt 2520: wapbl_free(wb, sizeof(*wb));
1.2 simonb 2521: }
2522: }
2523: KASSERT(wr->wr_blkhashcnt == 0);
2524: }
2525:
2526: /****************************************************************/
2527:
1.71 riastrad 2528: /*
2529: * wapbl_circ_read(wr, data, len, offp)
2530: *
2531: * Read len bytes into data from the circular queue of wr,
2532: * starting at the linear byte offset *offp, and returning the new
2533: * linear byte offset in *offp.
2534: *
2535: * If the starting linear byte offset precedes wr->wr_circ_off,
2536: * the read instead begins at wr->wr_circ_off. XXX WTF? This
2537: * should be a KASSERT, not a conditional.
2538: */
1.2 simonb 2539: static int
2540: wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2541: {
2542: size_t slen;
2543: off_t off = *offp;
2544: int error;
1.34 mlelstv 2545: daddr_t pbn;
1.2 simonb 2546:
1.14 joerg 2547: KASSERT(((len >> wr->wr_log_dev_bshift) <<
2548: wr->wr_log_dev_bshift) == len);
1.34 mlelstv 2549:
1.14 joerg 2550: if (off < wr->wr_circ_off)
2551: off = wr->wr_circ_off;
2552: slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2 simonb 2553: if (slen < len) {
1.34 mlelstv 2554: pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2555: #ifdef _KERNEL
2556: pbn = btodb(pbn << wr->wr_log_dev_bshift);
2557: #endif
2558: error = wapbl_read(data, slen, wr->wr_devvp, pbn);
1.2 simonb 2559: if (error)
2560: return error;
2561: data = (uint8_t *)data + slen;
2562: len -= slen;
1.14 joerg 2563: off = wr->wr_circ_off;
1.2 simonb 2564: }
1.34 mlelstv 2565: pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2566: #ifdef _KERNEL
2567: pbn = btodb(pbn << wr->wr_log_dev_bshift);
2568: #endif
2569: error = wapbl_read(data, len, wr->wr_devvp, pbn);
1.2 simonb 2570: if (error)
2571: return error;
2572: off += len;
1.14 joerg 2573: if (off >= wr->wr_circ_off + wr->wr_circ_size)
2574: off = wr->wr_circ_off;
1.2 simonb 2575: *offp = off;
2576: return 0;
2577: }
2578:
1.71 riastrad 2579: /*
2580: * wapbl_circ_advance(wr, len, offp)
2581: *
2582: * Compute the linear byte offset of the circular queue of wr that
2583: * is len bytes past *offp, and store it in *offp.
2584: *
2585: * This is as if wapbl_circ_read, but without actually reading
2586: * anything.
2587: *
2588: * If the starting linear byte offset precedes wr->wr_circ_off, it
2589: * is taken to be wr->wr_circ_off instead. XXX WTF? This should
2590: * be a KASSERT, not a conditional.
2591: */
1.2 simonb 2592: static void
2593: wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2594: {
2595: size_t slen;
2596: off_t off = *offp;
2597:
1.14 joerg 2598: KASSERT(((len >> wr->wr_log_dev_bshift) <<
2599: wr->wr_log_dev_bshift) == len);
1.2 simonb 2600:
1.14 joerg 2601: if (off < wr->wr_circ_off)
2602: off = wr->wr_circ_off;
2603: slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2 simonb 2604: if (slen < len) {
2605: len -= slen;
1.14 joerg 2606: off = wr->wr_circ_off;
1.2 simonb 2607: }
2608: off += len;
1.14 joerg 2609: if (off >= wr->wr_circ_off + wr->wr_circ_size)
2610: off = wr->wr_circ_off;
1.2 simonb 2611: *offp = off;
2612: }
2613:
2614: /****************************************************************/
2615:
/*
 * wapbl_replay_start(wrp, vp, off, count, blksize)
 *
 *	Open the log at block off of vp (count blocks of blksize bytes)
 *	for replay: read and validate the commit headers, build the
 *	block hash and inode list from the log records, and return the
 *	replay state in *wrp.  Returns 0 on success or an errno.
 */
int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = ilog2(blksize);
	size_t used;
	daddr_t pbn;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (off < 0)
		return EINVAL;

	/* Log block size must be a positive multiple of DEV_BSIZE. */
	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/* XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif
	/* Translate the logical log offset to device + physical block. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_alloc(MAXBSIZE);

	/* Read both commit header slots in one go. */
	pbn = logpbn;
#ifdef _KERNEL
	pbn = btodb(pbn << log_dev_bshift);
#endif
	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 =
	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	/* The slot with the larger generation is the newest commit. */
	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	wr->wr_scratch = scratch;

	/* Adopt the geometry recorded in the chosen commit header. */
	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	/* Size the block hash from the amount of journalled data. */
	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		/* wapbl_replay_stop/free release scratch and wr. */
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

 errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}
2721:
2722: void
2723: wapbl_replay_stop(struct wapbl_replay *wr)
2724: {
2725:
1.4 joerg 2726: if (!wapbl_replay_isopen(wr))
2727: return;
2728:
1.2 simonb 2729: WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2730:
1.18 yamt 2731: wapbl_free(wr->wr_scratch, MAXBSIZE);
2732: wr->wr_scratch = NULL;
1.2 simonb 2733:
1.18 yamt 2734: wr->wr_logvp = NULL;
1.2 simonb 2735:
2736: wapbl_blkhash_clear(wr);
2737: wapbl_blkhash_free(wr);
2738: }
2739:
2740: void
2741: wapbl_replay_free(struct wapbl_replay *wr)
2742: {
2743:
2744: KDASSERT(!wapbl_replay_isopen(wr));
2745:
2746: if (wr->wr_inodes)
1.18 yamt 2747: wapbl_free(wr->wr_inodes,
2748: wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2749: wapbl_free(wr, sizeof(*wr));
1.2 simonb 2750: }
2751:
1.4 joerg 2752: #ifdef _KERNEL
1.2 simonb 2753: int
2754: wapbl_replay_isopen1(struct wapbl_replay *wr)
2755: {
2756:
2757: return wapbl_replay_isopen(wr);
2758: }
1.4 joerg 2759: #endif
1.2 simonb 2760:
1.62 mlelstv 2761: /*
2762: * calculate the disk address for the i'th block in the wc_blockblist
2763: * offset by j blocks of size blen.
2764: *
2765: * wc_daddr is always a kernel disk address in DEV_BSIZE units that
2766: * was written to the journal.
2767: *
2768: * The kernel needs that address plus the offset in DEV_BSIZE units.
2769: *
2770: * Userland needs that address plus the offset in blen units.
2771: *
2772: */
2773: static daddr_t
2774: wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
2775: {
2776: daddr_t pbn;
2777:
2778: #ifdef _KERNEL
2779: pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
2780: #else
2781: pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
2782: #endif
2783:
2784: return pbn;
2785: }
2786:
1.10 joerg 2787: static void
2788: wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2789: {
2790: struct wapbl_wc_blocklist *wc =
2791: (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.14 joerg 2792: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10 joerg 2793: int i, j, n;
2794:
2795: for (i = 0; i < wc->wc_blkcount; i++) {
2796: /*
2797: * Enter each physical block into the hashtable independently.
2798: */
1.14 joerg 2799: n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10 joerg 2800: for (j = 0; j < n; j++) {
1.62 mlelstv 2801: wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen),
1.10 joerg 2802: *offp);
2803: wapbl_circ_advance(wr, fsblklen, offp);
2804: }
2805: }
2806: }
2807:
2808: static void
2809: wapbl_replay_process_revocations(struct wapbl_replay *wr)
2810: {
2811: struct wapbl_wc_blocklist *wc =
2812: (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.34 mlelstv 2813: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10 joerg 2814: int i, j, n;
2815:
2816: for (i = 0; i < wc->wc_blkcount; i++) {
2817: /*
2818: * Remove any blocks found from the hashtable.
2819: */
1.14 joerg 2820: n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10 joerg 2821: for (j = 0; j < n; j++)
1.62 mlelstv 2822: wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen));
1.10 joerg 2823: }
2824: }
2825:
/*
 * wapbl_replay_process_inodes(wr, oldoff, newoff)
 *
 *	Absorb a WAPBL_WC_INODES record (in wr->wr_scratch) into the
 *	replay state's flat inode array.  oldoff/newoff are the log
 *	offsets at which the record starts and ends.
 */
static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	/* On-disk and in-core inode records must be interchangeable. */
	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Keep track of where we found this so location won't be
	 * overwritten.
	 */
	if (wc->wc_clear) {
		/* This record restarts the list: drop what we have so far. */
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	/* Grow the array and append this record's inodes. */
	new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}
2863:
1.2 simonb 2864: static int
1.14 joerg 2865: wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
1.2 simonb 2866: {
2867: off_t off;
2868: int error;
2869:
1.14 joerg 2870: int logblklen = 1 << wr->wr_log_dev_bshift;
1.2 simonb 2871:
2872: wapbl_blkhash_clear(wr);
2873:
1.14 joerg 2874: off = tail;
2875: while (off != head) {
1.2 simonb 2876: struct wapbl_wc_null *wcn;
2877: off_t saveoff = off;
2878: error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2879: if (error)
2880: goto errout;
2881: wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2882: switch (wcn->wc_type) {
2883: case WAPBL_WC_BLOCKS:
1.10 joerg 2884: wapbl_replay_process_blocks(wr, &off);
1.2 simonb 2885: break;
2886:
2887: case WAPBL_WC_REVOCATIONS:
1.10 joerg 2888: wapbl_replay_process_revocations(wr);
1.2 simonb 2889: break;
2890:
2891: case WAPBL_WC_INODES:
1.10 joerg 2892: wapbl_replay_process_inodes(wr, saveoff, off);
1.2 simonb 2893: break;
1.10 joerg 2894:
1.2 simonb 2895: default:
2896: printf("Unrecognized wapbl type: 0x%08x\n",
2897: wcn->wc_type);
2898: error = EFTYPE;
2899: goto errout;
2900: }
2901: wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2902: if (off != saveoff) {
2903: printf("wapbl_replay: corrupted records\n");
2904: error = EFTYPE;
2905: goto errout;
2906: }
2907: }
2908: return 0;
2909:
2910: errout:
2911: wapbl_blkhash_clear(wr);
2912: return error;
2913: }
2914:
1.13 joerg 2915: #if 0
1.2 simonb 2916: int
2917: wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2918: {
2919: off_t off;
2920: int mismatchcnt = 0;
1.14 joerg 2921: int logblklen = 1 << wr->wr_log_dev_bshift;
2922: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.51 para 2923: void *scratch1 = wapbl_alloc(MAXBSIZE);
2924: void *scratch2 = wapbl_alloc(MAXBSIZE);
1.2 simonb 2925: int error = 0;
2926:
2927: KDASSERT(wapbl_replay_isopen(wr));
2928:
2929: off = wch->wc_tail;
2930: while (off != wch->wc_head) {
2931: struct wapbl_wc_null *wcn;
2932: #ifdef DEBUG
2933: off_t saveoff = off;
2934: #endif
2935: error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2936: if (error)
2937: goto out;
2938: wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2939: switch (wcn->wc_type) {
2940: case WAPBL_WC_BLOCKS:
2941: {
2942: struct wapbl_wc_blocklist *wc =
2943: (struct wapbl_wc_blocklist *)wr->wr_scratch;
2944: int i;
2945: for (i = 0; i < wc->wc_blkcount; i++) {
2946: int foundcnt = 0;
2947: int dirtycnt = 0;
2948: int j, n;
2949: /*
2950: * Check each physical block into the
2951: * hashtable independently
2952: */
2953: n = wc->wc_blocks[i].wc_dlen >>
2954: wch->wc_fs_dev_bshift;
2955: for (j = 0; j < n; j++) {
2956: struct wapbl_blk *wb =
2957: wapbl_blkhash_get(wr,
1.62 mlelstv 2958: wapbl_block_daddr(wc, i, j, fsblklen));
1.2 simonb 2959: if (wb && (wb->wb_off == off)) {
2960: foundcnt++;
2961: error =
2962: wapbl_circ_read(wr,
2963: scratch1, fsblklen,
2964: &off);
2965: if (error)
2966: goto out;
2967: error =
2968: wapbl_read(scratch2,
2969: fsblklen, fsdevvp,
2970: wb->wb_blk);
2971: if (error)
2972: goto out;
2973: if (memcmp(scratch1,
2974: scratch2,
2975: fsblklen)) {
2976: printf(
2977: "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2978: wb->wb_blk, (intmax_t)off);
2979: dirtycnt++;
2980: mismatchcnt++;
2981: }
2982: } else {
2983: wapbl_circ_advance(wr,
2984: fsblklen, &off);
2985: }
2986: }
2987: #if 0
2988: /*
2989: * If all of the blocks in an entry
2990: * are clean, then remove all of its
2991: * blocks from the hashtable since they
2992: * never will need replay.
2993: */
2994: if ((foundcnt != 0) &&
2995: (dirtycnt == 0)) {
2996: off = saveoff;
2997: wapbl_circ_advance(wr,
2998: logblklen, &off);
2999: for (j = 0; j < n; j++) {
3000: struct wapbl_blk *wb =
3001: wapbl_blkhash_get(wr,
1.62 mlelstv 3002: wapbl_block_daddr(wc, i, j, fsblklen));
1.2 simonb 3003: if (wb &&
3004: (wb->wb_off == off)) {
3005: wapbl_blkhash_rem(wr, wb->wb_blk);
3006: }
3007: wapbl_circ_advance(wr,
3008: fsblklen, &off);
3009: }
3010: }
3011: #endif
3012: }
3013: }
3014: break;
3015: case WAPBL_WC_REVOCATIONS:
3016: case WAPBL_WC_INODES:
3017: break;
3018: default:
3019: KASSERT(0);
3020: }
3021: #ifdef DEBUG
3022: wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3023: KASSERT(off == saveoff);
3024: #endif
3025: }
3026: out:
1.18 yamt 3027: wapbl_free(scratch1, MAXBSIZE);
3028: wapbl_free(scratch2, MAXBSIZE);
1.2 simonb 3029: if (!error && mismatchcnt)
3030: error = EFTYPE;
3031: return error;
3032: }
3033: #endif
3034:
3035: int
3036: wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
3037: {
1.9 joerg 3038: struct wapbl_blk *wb;
3039: size_t i;
1.2 simonb 3040: off_t off;
1.9 joerg 3041: void *scratch;
1.2 simonb 3042: int error = 0;
1.14 joerg 3043: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2 simonb 3044:
3045: KDASSERT(wapbl_replay_isopen(wr));
3046:
1.51 para 3047: scratch = wapbl_alloc(MAXBSIZE);
1.2 simonb 3048:
1.37 drochner 3049: for (i = 0; i <= wr->wr_blkhashmask; ++i) {
1.9 joerg 3050: LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
3051: off = wb->wb_off;
3052: error = wapbl_circ_read(wr, scratch, fsblklen, &off);
3053: if (error)
3054: break;
3055: error = wapbl_write(scratch, fsblklen, fsdevvp,
3056: wb->wb_blk);
3057: if (error)
3058: break;
1.2 simonb 3059: }
3060: }
1.9 joerg 3061:
1.18 yamt 3062: wapbl_free(scratch, MAXBSIZE);
1.2 simonb 3063: return error;
3064: }
3065:
3066: int
1.6 joerg 3067: wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
3068: {
1.14 joerg 3069: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.6 joerg 3070:
3071: KDASSERT(wapbl_replay_isopen(wr));
3072: KASSERT((len % fsblklen) == 0);
3073:
3074: while (len != 0) {
3075: struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3076: if (wb)
3077: return 1;
3078: len -= fsblklen;
3079: }
3080: return 0;
3081: }
3082:
3083: int
1.2 simonb 3084: wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
3085: {
1.14 joerg 3086: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2 simonb 3087:
3088: KDASSERT(wapbl_replay_isopen(wr));
3089:
3090: KASSERT((len % fsblklen) == 0);
3091:
3092: while (len != 0) {
3093: struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3094: if (wb) {
3095: off_t off = wb->wb_off;
3096: int error;
3097: error = wapbl_circ_read(wr, data, fsblklen, &off);
3098: if (error)
3099: return error;
3100: }
3101: data = (uint8_t *)data + fsblklen;
3102: len -= fsblklen;
3103: blk++;
3104: }
3105: return 0;
3106: }
1.35 pooka 3107:
#ifdef _KERNEL

MODULE(MODULE_CLASS_VFS, wapbl, NULL);

/*
 * Module control entry point: set up the wapbl subsystem when the
 * module is loaded, tear it down when it is unloaded.
 */
static int
wapbl_modcmd(modcmd_t cmd, void *arg)
{

	if (cmd == MODULE_CMD_INIT) {
		wapbl_init();
		return 0;
	}
	if (cmd == MODULE_CMD_FINI)
		return wapbl_fini();
	return ENOTTY;
}
#endif /* _KERNEL */
CVSweb <webmaster@jp.NetBSD.org>