Annotation of src/sys/kern/vfs_wapbl.c, Revision 1.78.2.1
1.78.2.1! pgoyette 1: /* $NetBSD: vfs_wapbl.c,v 1.85 2016/10/28 20:38:12 jdolecek Exp $ */
1.2 simonb 2:
3: /*-
1.23 ad 4: * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
1.2 simonb 5: * All rights reserved.
6: *
7: * This code is derived from software contributed to The NetBSD Foundation
8: * by Wasabi Systems, Inc.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
18: *
19: * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29: * POSSIBILITY OF SUCH DAMAGE.
30: */
31:
32: /*
33: * This implements file system independent write ahead filesystem logging.
34: */
1.4 joerg 35:
36: #define WAPBL_INTERNAL
37:
1.2 simonb 38: #include <sys/cdefs.h>
1.78.2.1! pgoyette 39: __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.85 2016/10/28 20:38:12 jdolecek Exp $");
1.2 simonb 40:
41: #include <sys/param.h>
1.31 mlelstv 42: #include <sys/bitops.h>
1.68 riastrad 43: #include <sys/time.h>
44: #include <sys/wapbl.h>
45: #include <sys/wapbl_replay.h>
1.2 simonb 46:
47: #ifdef _KERNEL
1.68 riastrad 48:
49: #include <sys/atomic.h>
50: #include <sys/conf.h>
51: #include <sys/file.h>
52: #include <sys/kauth.h>
53: #include <sys/kernel.h>
54: #include <sys/module.h>
55: #include <sys/mount.h>
56: #include <sys/mutex.h>
1.2 simonb 57: #include <sys/namei.h>
58: #include <sys/proc.h>
1.68 riastrad 59: #include <sys/resourcevar.h>
1.39 christos 60: #include <sys/sysctl.h>
1.2 simonb 61: #include <sys/uio.h>
62: #include <sys/vnode.h>
63:
64: #include <miscfs/specfs/specdev.h>
65:
1.51 para 66: #define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
67: #define wapbl_free(a, s) kmem_free((a), (s))
68: #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
1.2 simonb 69:
1.39 christos 70: static struct sysctllog *wapbl_sysctl;
71: static int wapbl_flush_disk_cache = 1;
72: static int wapbl_verbose_commit = 0;
73:
1.57 joerg 74: static inline size_t wapbl_space_free(size_t, off_t, off_t);
75:
1.2 simonb 76: #else /* !_KERNEL */
1.68 riastrad 77:
1.2 simonb 78: #include <assert.h>
79: #include <errno.h>
1.68 riastrad 80: #include <stdbool.h>
1.2 simonb 81: #include <stdio.h>
82: #include <stdlib.h>
83: #include <string.h>
84:
85: #define KDASSERT(x) assert(x)
86: #define KASSERT(x) assert(x)
1.51 para 87: #define wapbl_alloc(s) malloc(s)
1.18 yamt 88: #define wapbl_free(a, s) free(a)
1.2 simonb 89: #define wapbl_calloc(n, s) calloc((n), (s))
90:
91: #endif /* !_KERNEL */
92:
93: /*
94: * INTERNAL DATA STRUCTURES
95: */
96:
97: /*
98: * This structure holds per-mount log information.
99: *
100: * Legend: a = atomic access only
101: * r = read-only after init
102: * l = rwlock held
103: * m = mutex held
1.38 hannken 104: * lm = rwlock held writing or mutex held
1.2 simonb 105: * u = unlocked access ok
106: * b = bufcache_lock held
107: */
1.60 matt 108: LIST_HEAD(wapbl_ino_head, wapbl_ino);
1.2 simonb 109: struct wapbl {
110: struct vnode *wl_logvp; /* r: log here */
111: struct vnode *wl_devvp; /* r: log on this device */
112: struct mount *wl_mount; /* r: mountpoint wl is associated with */
113: daddr_t wl_logpbn; /* r: Physical block number of start of log */
114: int wl_log_dev_bshift; /* r: logarithm of device block size of log
115: device */
116: int wl_fs_dev_bshift; /* r: logarithm of device block size of
117: filesystem device */
118:
1.3 yamt 119: unsigned wl_lock_count; /* m: Count of transactions in progress */
1.2 simonb 120:
121: size_t wl_circ_size; /* r: Number of bytes in buffer of log */
122: size_t wl_circ_off; /* r: Number of bytes reserved at start */
123:
124: size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
125: size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
126:
127: off_t wl_head; /* l: Byte offset of log head */
128: off_t wl_tail; /* l: Byte offset of log tail */
129: /*
1.71 riastrad 130: * WAPBL log layout, stored on wl_devvp at wl_logpbn:
131: *
132: * ___________________ wl_circ_size __________________
133: * / \
134: * +---------+---------+-------+--------------+--------+
135: * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
136: * +---------+---------+-------+--------------+--------+
137: * wl_circ_off --^ ^-- wl_head ^-- wl_tail
138: *
139: * commit0 and commit1 are commit headers. A commit header has
140: * a generation number, indicating which of the two headers is
141: * more recent, and an assignment of head and tail pointers.
142: * The rest is a circular queue of log records, starting at
143: * the byte offset wl_circ_off.
144: *
145: * E marks empty space for records.
146: * W marks records for block writes issued but waiting.
147: * C marks completed records.
148: *
149: * wapbl_flush writes new records to empty `E' spaces after
150: * wl_head from the current transaction in memory.
151: *
152: * wapbl_truncate advances wl_tail past any completed `C'
153: * records, freeing them up for use.
154: *
155: * head == tail == 0 means log is empty.
156: * head == tail != 0 means log is full.
157: *
158: * See assertions in wapbl_advance() for other boundary
159: * conditions.
160: *
161: * Only wapbl_flush moves the head, except when wapbl_truncate
162: * sets it to 0 to indicate that the log is empty.
163: *
164: * Only wapbl_truncate moves the tail, except when wapbl_flush
165: * sets it to wl_circ_off to indicate that the log is full.
1.2 simonb 166: */
167:
168: struct wapbl_wc_header *wl_wc_header; /* l */
169: void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */
170:
171: kmutex_t wl_mtx; /* u: short-term lock */
172: krwlock_t wl_rwlock; /* u: File system transaction lock */
173:
174: /*
175: * Must be held while accessing
176: * wl_count or wl_bufs or head or tail
177: */
178:
179: /*
180: * Callback called from within the flush routine to flush any extra
181: * bits. Note that flush may be skipped without calling this if
182: * there are no outstanding buffers in the transaction.
183: */
1.5 joerg 184: #if _KERNEL
1.2 simonb 185: wapbl_flush_fn_t wl_flush; /* r */
186: wapbl_flush_fn_t wl_flush_abort;/* r */
1.5 joerg 187: #endif
1.2 simonb 188:
189: size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
190: size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
191: size_t wl_bcount; /* m: Total bcount of wl_bufs */
192:
193: LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
194:
195: kcondvar_t wl_reclaimable_cv; /* m (obviously) */
196: size_t wl_reclaimable_bytes; /* m: Amount of space available for
197: reclamation by truncate */
198: int wl_error_count; /* m: # of wl_entries with errors */
199: size_t wl_reserved_bytes; /* never truncate log smaller than this */
200:
201: #ifdef WAPBL_DEBUG_BUFBYTES
202: size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
203: #endif
204:
1.78.2.1! pgoyette 205: #if _KERNEL
! 206: int wl_brperjblock; /* r Block records per journal block */
! 207: #endif
! 208:
! 209: SIMPLEQ_HEAD(, wapbl_dealloc) wl_dealloclist; /* lm: list head */
! 210: int wl_dealloccnt; /* lm: total count */
! 211: int wl_dealloclim; /* r: max count */
1.2 simonb 212:
213: /* hashtable of inode numbers for allocated but unlinked inodes */
214: /* synch ??? */
1.60 matt 215: struct wapbl_ino_head *wl_inohash;
1.2 simonb 216: u_long wl_inohashmask;
217: int wl_inohashcnt;
218:
219: SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
220: accounting */
1.54 hannken 221:
222: u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */
223: daddr_t wl_buffer_dblk; /* l: buffer disk block address */
224: size_t wl_buffer_used; /* l: buffer current use */
1.2 simonb 225: };
226:
227: #ifdef WAPBL_DEBUG_PRINT
228: int wapbl_debug_print = WAPBL_DEBUG_PRINT;
229: #endif
230:
231: /****************************************************************/
232: #ifdef _KERNEL
233:
234: #ifdef WAPBL_DEBUG
235: struct wapbl *wapbl_debug_wl;
236: #endif
237:
238: static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
239: static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
240: static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
241: static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
242: #endif /* _KERNEL */
243:
1.14 joerg 244: static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
1.2 simonb 245:
1.30 uebayasi 246: static inline size_t wapbl_space_used(size_t avail, off_t head,
1.2 simonb 247: off_t tail);
248:
249: #ifdef _KERNEL
250:
1.51 para 251: static struct pool wapbl_entry_pool;
1.78.2.1! pgoyette 252: static struct pool wapbl_dealloc_pool;
1.51 para 253:
1.2 simonb 254: #define WAPBL_INODETRK_SIZE 83
255: static int wapbl_ino_pool_refcount;
256: static struct pool wapbl_ino_pool;
/*
 * One tracked inode: an entry in the per-log inode hash (wl_inohash),
 * allocated from wapbl_ino_pool.
 */
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;	/* linkage within a hash bucket */
	ino_t wi_ino;			/* inode number */
	mode_t wi_mode;			/* mode recorded with the inode */
};
262:
263: static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
264: static void wapbl_inodetrk_free(struct wapbl *wl);
265: static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
266:
267: static size_t wapbl_transaction_len(struct wapbl *wl);
1.30 uebayasi 268: static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
1.2 simonb 269:
1.13 joerg 270: #if 0
1.4 joerg 271: int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
272: #endif
273:
274: static int wapbl_replay_isopen1(struct wapbl_replay *);
275:
1.2 simonb 276: struct wapbl_ops wapbl_ops = {
277: .wo_wapbl_discard = wapbl_discard,
278: .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
1.6 joerg 279: .wo_wapbl_replay_can_read = wapbl_replay_can_read,
1.2 simonb 280: .wo_wapbl_replay_read = wapbl_replay_read,
281: .wo_wapbl_add_buf = wapbl_add_buf,
282: .wo_wapbl_remove_buf = wapbl_remove_buf,
283: .wo_wapbl_resize_buf = wapbl_resize_buf,
284: .wo_wapbl_begin = wapbl_begin,
285: .wo_wapbl_end = wapbl_end,
286: .wo_wapbl_junlock_assert= wapbl_junlock_assert,
287:
288: /* XXX: the following is only used to say "this is a wapbl buf" */
289: .wo_wapbl_biodone = wapbl_biodone,
290: };
291:
1.21 yamt 292: static int
1.39 christos 293: wapbl_sysctl_init(void)
294: {
295: int rv;
296: const struct sysctlnode *rnode, *cnode;
297:
298: wapbl_sysctl = NULL;
299:
300: rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
301: CTLFLAG_PERMANENT,
302: CTLTYPE_NODE, "wapbl",
303: SYSCTL_DESCR("WAPBL journaling options"),
304: NULL, 0, NULL, 0,
1.59 pooka 305: CTL_VFS, CTL_CREATE, CTL_EOL);
1.39 christos 306: if (rv)
307: return rv;
308:
309: rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
310: CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
311: CTLTYPE_INT, "flush_disk_cache",
312: SYSCTL_DESCR("flush disk cache"),
313: NULL, 0, &wapbl_flush_disk_cache, 0,
314: CTL_CREATE, CTL_EOL);
315: if (rv)
316: return rv;
317:
318: rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
319: CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
320: CTLTYPE_INT, "verbose_commit",
321: SYSCTL_DESCR("show time and size of wapbl log commits"),
322: NULL, 0, &wapbl_verbose_commit, 0,
323: CTL_CREATE, CTL_EOL);
324: return rv;
325: }
326:
327: static void
328: wapbl_init(void)
329: {
1.51 para 330:
331: pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
332: "wapblentrypl", &pool_allocator_kmem, IPL_VM);
1.78.2.1! pgoyette 333: pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0,
! 334: "wapbldealloc", &pool_allocator_nointr, IPL_NONE);
1.51 para 335:
1.39 christos 336: wapbl_sysctl_init();
337: }
338:
339: static int
1.74 riastrad 340: wapbl_fini(void)
1.39 christos 341: {
1.51 para 342:
1.63 pgoyette 343: if (wapbl_sysctl != NULL)
344: sysctl_teardown(&wapbl_sysctl);
1.51 para 345:
1.78.2.1! pgoyette 346: pool_destroy(&wapbl_dealloc_pool);
1.51 para 347: pool_destroy(&wapbl_entry_pool);
348:
1.39 christos 349: return 0;
350: }
351:
352: static int
1.15 joerg 353: wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
354: {
355: int error, i;
356:
357: WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
358: ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
359:
360: /*
361: * Its only valid to reuse the replay log if its
362: * the same as the new log we just opened.
363: */
364: KDASSERT(!wapbl_replay_isopen(wr));
1.47 christos 365: KASSERT(wl->wl_devvp->v_type == VBLK);
366: KASSERT(wr->wr_devvp->v_type == VBLK);
1.15 joerg 367: KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
368: KASSERT(wl->wl_logpbn == wr->wr_logpbn);
369: KASSERT(wl->wl_circ_size == wr->wr_circ_size);
370: KASSERT(wl->wl_circ_off == wr->wr_circ_off);
371: KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
372: KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
373:
374: wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
375:
376: for (i = 0; i < wr->wr_inodescnt; i++)
377: wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
378: wr->wr_inodes[i].wr_imode);
379:
380: /* Make sure new transaction won't overwrite old inodes list */
381: KDASSERT(wapbl_transaction_len(wl) <=
382: wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
383: wr->wr_inodestail));
384:
385: wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
386: wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
387: wapbl_transaction_len(wl);
388:
389: error = wapbl_write_inodes(wl, &wl->wl_head);
390: if (error)
391: return error;
392:
393: KASSERT(wl->wl_head != wl->wl_tail);
394: KASSERT(wl->wl_head != 0);
395:
396: return 0;
397: }
398:
1.2 simonb 399: int
400: wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
401: daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
402: wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
403: {
404: struct wapbl *wl;
405: struct vnode *devvp;
406: daddr_t logpbn;
407: int error;
1.31 mlelstv 408: int log_dev_bshift = ilog2(blksize);
1.32 mlelstv 409: int fs_dev_bshift = log_dev_bshift;
1.2 simonb 410: int run;
411:
412: WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
413: " count=%zu blksize=%zu\n", vp, off, count, blksize));
414:
415: if (log_dev_bshift > fs_dev_bshift) {
416: WAPBL_PRINTF(WAPBL_PRINT_OPEN,
417: ("wapbl: log device's block size cannot be larger "
418: "than filesystem's\n"));
419: /*
420: * Not currently implemented, although it could be if
421: * needed someday.
422: */
423: return ENOSYS;
424: }
425:
426: if (off < 0)
427: return EINVAL;
428:
429: if (blksize < DEV_BSIZE)
430: return EINVAL;
431: if (blksize % DEV_BSIZE)
432: return EINVAL;
433:
434: /* XXXTODO: verify that the full load is writable */
435:
436: /*
437: * XXX check for minimum log size
438: * minimum is governed by minimum amount of space
439: * to complete a transaction. (probably truncate)
440: */
441: /* XXX for now pick something minimal */
442: if ((count * blksize) < MAXPHYS) {
443: return ENOSPC;
444: }
445:
446: if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
447: return error;
448: }
449:
450: wl = wapbl_calloc(1, sizeof(*wl));
451: rw_init(&wl->wl_rwlock);
452: mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
453: cv_init(&wl->wl_reclaimable_cv, "wapblrec");
454: LIST_INIT(&wl->wl_bufs);
455: SIMPLEQ_INIT(&wl->wl_entries);
456:
457: wl->wl_logvp = vp;
458: wl->wl_devvp = devvp;
459: wl->wl_mount = mp;
460: wl->wl_logpbn = logpbn;
461: wl->wl_log_dev_bshift = log_dev_bshift;
462: wl->wl_fs_dev_bshift = fs_dev_bshift;
463:
464: wl->wl_flush = flushfn;
465: wl->wl_flush_abort = flushabortfn;
466:
467: /* Reserve two log device blocks for the commit headers */
468: wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
1.34 mlelstv 469: wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
1.2 simonb 470: /* truncate the log usage to a multiple of log_dev_bshift */
471: wl->wl_circ_size >>= wl->wl_log_dev_bshift;
472: wl->wl_circ_size <<= wl->wl_log_dev_bshift;
473:
474: /*
475: * wl_bufbytes_max limits the size of the in memory transaction space.
476: * - Since buffers are allocated and accounted for in units of
477: * PAGE_SIZE it is required to be a multiple of PAGE_SIZE
478: * (i.e. 1<<PAGE_SHIFT)
479: * - Since the log device has to be written in units of
480: * 1<<wl_log_dev_bshift it is required to be a mulitple of
481: * 1<<wl_log_dev_bshift.
482: * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
483: * it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
484: * Therefore it must be multiple of the least common multiple of those
485: * three quantities. Fortunately, all of those quantities are
486: * guaranteed to be a power of two, and the least common multiple of
487: * a set of numbers which are all powers of two is simply the maximum
488: * of those numbers. Finally, the maximum logarithm of a power of two
489: * is the same as the log of the maximum power of two. So we can do
490: * the following operations to size wl_bufbytes_max:
491: */
492:
493: /* XXX fix actual number of pages reserved per filesystem. */
494: wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
495:
496: /* Round wl_bufbytes_max to the largest power of two constraint */
497: wl->wl_bufbytes_max >>= PAGE_SHIFT;
498: wl->wl_bufbytes_max <<= PAGE_SHIFT;
499: wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
500: wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
501: wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
502: wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
503:
504: /* XXX maybe use filesystem fragment size instead of 1024 */
505: /* XXX fix actual number of buffers reserved per filesystem. */
506: wl->wl_bufcount_max = (nbuf / 2) * 1024;
507:
1.78.2.1! pgoyette 508: wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift)
! 509: - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
! 510: sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
! 511: KASSERT(wl->wl_brperjblock > 0);
! 512:
1.2 simonb 513: /* XXX tie this into resource estimation */
1.41 hannken 514: wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
1.78.2.1! pgoyette 515: SIMPLEQ_INIT(&wl->wl_dealloclist);
1.2 simonb 516:
1.54 hannken 517: wl->wl_buffer = wapbl_alloc(MAXPHYS);
518: wl->wl_buffer_used = 0;
519:
1.2 simonb 520: wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
521:
522: /* Initialize the commit header */
523: {
524: struct wapbl_wc_header *wc;
1.14 joerg 525: size_t len = 1 << wl->wl_log_dev_bshift;
1.2 simonb 526: wc = wapbl_calloc(1, len);
527: wc->wc_type = WAPBL_WC_HEADER;
528: wc->wc_len = len;
529: wc->wc_circ_off = wl->wl_circ_off;
530: wc->wc_circ_size = wl->wl_circ_size;
531: /* XXX wc->wc_fsid */
532: wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
533: wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
534: wl->wl_wc_header = wc;
1.51 para 535: wl->wl_wc_scratch = wapbl_alloc(len);
1.2 simonb 536: }
537:
538: /*
539: * if there was an existing set of unlinked but
540: * allocated inodes, preserve it in the new
541: * log.
542: */
543: if (wr && wr->wr_inodescnt) {
1.15 joerg 544: error = wapbl_start_flush_inodes(wl, wr);
1.2 simonb 545: if (error)
546: goto errout;
547: }
548:
549: error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
550: if (error) {
551: goto errout;
552: }
553:
554: *wlp = wl;
555: #if defined(WAPBL_DEBUG)
556: wapbl_debug_wl = wl;
557: #endif
558:
559: return 0;
560: errout:
561: wapbl_discard(wl);
1.18 yamt 562: wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
563: wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
1.54 hannken 564: wapbl_free(wl->wl_buffer, MAXPHYS);
1.2 simonb 565: wapbl_inodetrk_free(wl);
1.18 yamt 566: wapbl_free(wl, sizeof(*wl));
1.2 simonb 567:
568: return error;
569: }
570:
571: /*
572: * Like wapbl_flush, only discards the transaction
573: * completely
574: */
575:
576: void
577: wapbl_discard(struct wapbl *wl)
578: {
579: struct wapbl_entry *we;
1.78.2.1! pgoyette 580: struct wapbl_dealloc *wd;
1.2 simonb 581: struct buf *bp;
582: int i;
583:
584: /*
585: * XXX we may consider using upgrade here
586: * if we want to call flush from inside a transaction
587: */
588: rw_enter(&wl->wl_rwlock, RW_WRITER);
1.78.2.1! pgoyette 589: wl->wl_flush(wl->wl_mount, SIMPLEQ_FIRST(&wl->wl_dealloclist));
1.2 simonb 590:
591: #ifdef WAPBL_DEBUG_PRINT
592: {
593: pid_t pid = -1;
594: lwpid_t lid = -1;
595: if (curproc)
596: pid = curproc->p_pid;
597: if (curlwp)
598: lid = curlwp->l_lid;
599: #ifdef WAPBL_DEBUG_BUFBYTES
600: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
601: ("wapbl_discard: thread %d.%d discarding "
602: "transaction\n"
603: "\tbufcount=%zu bufbytes=%zu bcount=%zu "
604: "deallocs=%d inodes=%d\n"
605: "\terrcnt = %u, reclaimable=%zu reserved=%zu "
606: "unsynced=%zu\n",
607: pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
608: wl->wl_bcount, wl->wl_dealloccnt,
609: wl->wl_inohashcnt, wl->wl_error_count,
610: wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
611: wl->wl_unsynced_bufbytes));
612: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
613: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
614: ("\tentry: bufcount = %zu, reclaimable = %zu, "
615: "error = %d, unsynced = %zu\n",
616: we->we_bufcount, we->we_reclaimable_bytes,
617: we->we_error, we->we_unsynced_bufbytes));
618: }
619: #else /* !WAPBL_DEBUG_BUFBYTES */
620: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
621: ("wapbl_discard: thread %d.%d discarding transaction\n"
622: "\tbufcount=%zu bufbytes=%zu bcount=%zu "
623: "deallocs=%d inodes=%d\n"
624: "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
625: pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
626: wl->wl_bcount, wl->wl_dealloccnt,
627: wl->wl_inohashcnt, wl->wl_error_count,
628: wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
629: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
630: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
631: ("\tentry: bufcount = %zu, reclaimable = %zu, "
632: "error = %d\n",
633: we->we_bufcount, we->we_reclaimable_bytes,
634: we->we_error));
635: }
636: #endif /* !WAPBL_DEBUG_BUFBYTES */
637: }
638: #endif /* WAPBL_DEBUG_PRINT */
639:
640: for (i = 0; i <= wl->wl_inohashmask; i++) {
641: struct wapbl_ino_head *wih;
642: struct wapbl_ino *wi;
643:
644: wih = &wl->wl_inohash[i];
645: while ((wi = LIST_FIRST(wih)) != NULL) {
646: LIST_REMOVE(wi, wi_hash);
647: pool_put(&wapbl_ino_pool, wi);
648: KASSERT(wl->wl_inohashcnt > 0);
649: wl->wl_inohashcnt--;
650: }
651: }
652:
653: /*
654: * clean buffer list
655: */
656: mutex_enter(&bufcache_lock);
657: mutex_enter(&wl->wl_mtx);
658: while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
659: if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
660: /*
661: * The buffer will be unlocked and
662: * removed from the transaction in brelse
663: */
664: mutex_exit(&wl->wl_mtx);
665: brelsel(bp, 0);
666: mutex_enter(&wl->wl_mtx);
667: }
668: }
669: mutex_exit(&wl->wl_mtx);
670: mutex_exit(&bufcache_lock);
671:
672: /*
673: * Remove references to this wl from wl_entries, free any which
674: * no longer have buffers, others will be freed in wapbl_biodone
675: * when they no longer have any buffers.
676: */
677: while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
678: SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
679: /* XXX should we be accumulating wl_error_count
680: * and increasing reclaimable bytes ? */
681: we->we_wapbl = NULL;
682: if (we->we_bufcount == 0) {
683: #ifdef WAPBL_DEBUG_BUFBYTES
684: KASSERT(we->we_unsynced_bufbytes == 0);
685: #endif
1.51 para 686: pool_put(&wapbl_entry_pool, we);
1.2 simonb 687: }
688: }
689:
690: /* Discard list of deallocs */
1.78.2.1! pgoyette 691: while ((wd = SIMPLEQ_FIRST(&wl->wl_dealloclist)) != NULL) {
! 692: SIMPLEQ_REMOVE_HEAD(&wl->wl_dealloclist, wd_entries);
! 693: pool_put(&wapbl_dealloc_pool, wd);
! 694: wl->wl_dealloccnt--;
! 695: }
! 696:
1.2 simonb 697: /* XXX should we clear wl_reserved_bytes? */
698:
699: KASSERT(wl->wl_bufbytes == 0);
700: KASSERT(wl->wl_bcount == 0);
701: KASSERT(wl->wl_bufcount == 0);
702: KASSERT(LIST_EMPTY(&wl->wl_bufs));
703: KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
704: KASSERT(wl->wl_inohashcnt == 0);
1.78.2.1! pgoyette 705: KASSERT(SIMPLEQ_EMPTY(&wl->wl_dealloclist));
! 706: KASSERT(wl->wl_dealloccnt == 0);
1.2 simonb 707:
708: rw_exit(&wl->wl_rwlock);
709: }
710:
711: int
712: wapbl_stop(struct wapbl *wl, int force)
713: {
714: int error;
715:
716: WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
717: error = wapbl_flush(wl, 1);
718: if (error) {
719: if (force)
720: wapbl_discard(wl);
721: else
722: return error;
723: }
724:
725: /* Unlinked inodes persist after a flush */
726: if (wl->wl_inohashcnt) {
727: if (force) {
728: wapbl_discard(wl);
729: } else {
730: return EBUSY;
731: }
732: }
733:
734: KASSERT(wl->wl_bufbytes == 0);
735: KASSERT(wl->wl_bcount == 0);
736: KASSERT(wl->wl_bufcount == 0);
737: KASSERT(LIST_EMPTY(&wl->wl_bufs));
738: KASSERT(wl->wl_dealloccnt == 0);
739: KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
740: KASSERT(wl->wl_inohashcnt == 0);
1.78.2.1! pgoyette 741: KASSERT(SIMPLEQ_EMPTY(&wl->wl_dealloclist));
! 742: KASSERT(wl->wl_dealloccnt == 0);
1.2 simonb 743:
1.18 yamt 744: wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
745: wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
1.54 hannken 746: wapbl_free(wl->wl_buffer, MAXPHYS);
1.2 simonb 747: wapbl_inodetrk_free(wl);
748:
749: cv_destroy(&wl->wl_reclaimable_cv);
750: mutex_destroy(&wl->wl_mtx);
751: rw_destroy(&wl->wl_rwlock);
1.18 yamt 752: wapbl_free(wl, sizeof(*wl));
1.2 simonb 753:
754: return 0;
755: }
756:
1.71 riastrad 757: /****************************************************************/
758: /*
759: * Unbuffered disk I/O
760: */
761:
1.2 simonb 762: static int
763: wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
764: {
765: struct pstats *pstats = curlwp->l_proc->p_stats;
766: struct buf *bp;
767: int error;
768:
769: KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
770: KASSERT(devvp->v_type == VBLK);
771:
772: if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
1.45 rmind 773: mutex_enter(devvp->v_interlock);
1.2 simonb 774: devvp->v_numoutput++;
1.45 rmind 775: mutex_exit(devvp->v_interlock);
1.2 simonb 776: pstats->p_ru.ru_oublock++;
777: } else {
778: pstats->p_ru.ru_inblock++;
779: }
780:
781: bp = getiobuf(devvp, true);
782: bp->b_flags = flags;
783: bp->b_cflags = BC_BUSY; /* silly & dubious */
784: bp->b_dev = devvp->v_rdev;
785: bp->b_data = data;
786: bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
787: bp->b_blkno = pbn;
1.52 chs 788: BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1.2 simonb 789:
790: WAPBL_PRINTF(WAPBL_PRINT_IO,
1.29 pooka 791: ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
1.2 simonb 792: BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
793: bp->b_blkno, bp->b_dev));
794:
795: VOP_STRATEGY(devvp, bp);
796:
797: error = biowait(bp);
798: putiobuf(bp);
799:
800: if (error) {
801: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
802: ("wapbl_doio: %s %zu bytes at block %" PRId64
1.29 pooka 803: " on dev 0x%"PRIx64" failed with error %d\n",
1.2 simonb 804: (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
805: "write" : "read"),
806: len, pbn, devvp->v_rdev, error));
807: }
808:
809: return error;
810: }
811:
1.71 riastrad 812: /*
813: * wapbl_write(data, len, devvp, pbn)
814: *
815: * Synchronously write len bytes from data to physical block pbn
816: * on devvp.
817: */
1.2 simonb 818: int
819: wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
820: {
821:
822: return wapbl_doio(data, len, devvp, pbn, B_WRITE);
823: }
824:
1.71 riastrad 825: /*
826: * wapbl_read(data, len, devvp, pbn)
827: *
828: * Synchronously read len bytes into data from physical block pbn
829: * on devvp.
830: */
1.2 simonb 831: int
832: wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
833: {
834:
835: return wapbl_doio(data, len, devvp, pbn, B_READ);
836: }
837:
1.71 riastrad 838: /****************************************************************/
839: /*
840: * Buffered disk writes -- try to coalesce writes and emit
841: * MAXPHYS-aligned blocks.
842: */
843:
1.2 simonb 844: /*
1.71 riastrad 845: * wapbl_buffered_flush(wl)
846: *
847: * Flush any buffered writes from wapbl_buffered_write.
1.54 hannken 848: */
849: static int
850: wapbl_buffered_flush(struct wapbl *wl)
851: {
852: int error;
853:
854: if (wl->wl_buffer_used == 0)
855: return 0;
856:
857: error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
858: wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
859: wl->wl_buffer_used = 0;
860:
861: return error;
862: }
863:
864: /*
1.71 riastrad 865: * wapbl_buffered_write(data, len, wl, pbn)
866: *
867: * Write len bytes from data to physical block pbn on
868: * wl->wl_devvp. The write may not complete until
869: * wapbl_buffered_flush.
1.54 hannken 870: */
871: static int
872: wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
873: {
874: int error;
875: size_t resid;
876:
877: /*
878: * If not adjacent to buffered data flush first. Disk block
879: * address is always valid for non-empty buffer.
880: */
881: if (wl->wl_buffer_used > 0 &&
882: pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
883: error = wapbl_buffered_flush(wl);
884: if (error)
885: return error;
886: }
887: /*
888: * If this write goes to an empty buffer we have to
889: * save the disk block address first.
890: */
891: if (wl->wl_buffer_used == 0)
892: wl->wl_buffer_dblk = pbn;
893: /*
894: * Remaining space so this buffer ends on a MAXPHYS boundary.
895: *
896: * Cannot become less or equal zero as the buffer would have been
897: * flushed on the last call then.
898: */
899: resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
900: wl->wl_buffer_used;
901: KASSERT(resid > 0);
902: KASSERT(dbtob(btodb(resid)) == resid);
903: if (len >= resid) {
904: memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
905: wl->wl_buffer_used += resid;
906: error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
907: wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
908: data = (uint8_t *)data + resid;
909: len -= resid;
910: wl->wl_buffer_dblk = pbn + btodb(resid);
911: wl->wl_buffer_used = 0;
912: if (error)
913: return error;
914: }
915: KASSERT(len < MAXPHYS);
916: if (len > 0) {
917: memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
918: wl->wl_buffer_used += len;
919: }
920:
921: return 0;
922: }
923:
924: /*
1.71 riastrad 925: * wapbl_circ_write(wl, data, len, offp)
926: *
927: * Write len bytes from data to the circular queue of wl, starting
928: * at linear byte offset *offp, and returning the new linear byte
929: * offset in *offp.
930: *
931: * If the starting linear byte offset precedes wl->wl_circ_off,
932: * the write instead begins at wl->wl_circ_off. XXX WTF? This
933: * should be a KASSERT, not a conditional.
934: *
935: * The write is buffered in wl and must be flushed with
936: * wapbl_buffered_flush before it will be submitted to the disk.
1.2 simonb 937: */
938: static int
939: wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
940: {
941: size_t slen;
942: off_t off = *offp;
943: int error;
1.34 mlelstv 944: daddr_t pbn;
1.2 simonb 945:
946: KDASSERT(((len >> wl->wl_log_dev_bshift) <<
947: wl->wl_log_dev_bshift) == len);
948:
949: if (off < wl->wl_circ_off)
950: off = wl->wl_circ_off;
951: slen = wl->wl_circ_off + wl->wl_circ_size - off;
952: if (slen < len) {
1.34 mlelstv 953: pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
954: #ifdef _KERNEL
955: pbn = btodb(pbn << wl->wl_log_dev_bshift);
956: #endif
1.54 hannken 957: error = wapbl_buffered_write(data, slen, wl, pbn);
1.2 simonb 958: if (error)
959: return error;
960: data = (uint8_t *)data + slen;
961: len -= slen;
962: off = wl->wl_circ_off;
963: }
1.34 mlelstv 964: pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
965: #ifdef _KERNEL
966: pbn = btodb(pbn << wl->wl_log_dev_bshift);
967: #endif
1.54 hannken 968: error = wapbl_buffered_write(data, len, wl, pbn);
1.2 simonb 969: if (error)
970: return error;
971: off += len;
972: if (off >= wl->wl_circ_off + wl->wl_circ_size)
973: off = wl->wl_circ_off;
974: *offp = off;
975: return 0;
976: }
977:
978: /****************************************************************/
1.71 riastrad 979: /*
980: * WAPBL transactions: entering, adding/removing bufs, and exiting
981: */
1.2 simonb 982:
983: int
984: wapbl_begin(struct wapbl *wl, const char *file, int line)
985: {
986: int doflush;
987: unsigned lockcount;
988:
989: KDASSERT(wl);
990:
991: /*
992: * XXX this needs to be made much more sophisticated.
993: * perhaps each wapbl_begin could reserve a specified
994: * number of buffers and bytes.
995: */
996: mutex_enter(&wl->wl_mtx);
997: lockcount = wl->wl_lock_count;
998: doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
999: wl->wl_bufbytes_max / 2) ||
1000: ((wl->wl_bufcount + (lockcount * 10)) >
1001: wl->wl_bufcount_max / 2) ||
1.28 pooka 1002: (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
1.42 hannken 1003: (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
1.2 simonb 1004: mutex_exit(&wl->wl_mtx);
1005:
1006: if (doflush) {
1007: WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1008: ("force flush lockcnt=%d bufbytes=%zu "
1.28 pooka 1009: "(max=%zu) bufcount=%zu (max=%zu) "
1010: "dealloccnt %d (lim=%d)\n",
1.2 simonb 1011: lockcount, wl->wl_bufbytes,
1012: wl->wl_bufbytes_max, wl->wl_bufcount,
1.28 pooka 1013: wl->wl_bufcount_max,
1014: wl->wl_dealloccnt, wl->wl_dealloclim));
1.2 simonb 1015: }
1016:
1017: if (doflush) {
1018: int error = wapbl_flush(wl, 0);
1019: if (error)
1020: return error;
1021: }
1022:
1.23 ad 1023: rw_enter(&wl->wl_rwlock, RW_READER);
1.2 simonb 1024: mutex_enter(&wl->wl_mtx);
1025: wl->wl_lock_count++;
1026: mutex_exit(&wl->wl_mtx);
1027:
1.23 ad 1028: #if defined(WAPBL_DEBUG_PRINT)
1.2 simonb 1029: WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
1030: ("wapbl_begin thread %d.%d with bufcount=%zu "
1031: "bufbytes=%zu bcount=%zu at %s:%d\n",
1032: curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1033: wl->wl_bufbytes, wl->wl_bcount, file, line));
1034: #endif
1035:
1036: return 0;
1037: }
1038:
1039: void
1040: wapbl_end(struct wapbl *wl)
1041: {
1042:
1.23 ad 1043: #if defined(WAPBL_DEBUG_PRINT)
1.2 simonb 1044: WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
1045: ("wapbl_end thread %d.%d with bufcount=%zu "
1046: "bufbytes=%zu bcount=%zu\n",
1047: curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1048: wl->wl_bufbytes, wl->wl_bcount));
1049: #endif
1050:
1.65 riastrad 1051: /*
1052: * XXX this could be handled more gracefully, perhaps place
1053: * only a partial transaction in the log and allow the
1054: * remaining to flush without the protection of the journal.
1055: */
1.67 riastrad 1056: KASSERTMSG((wapbl_transaction_len(wl) <=
1057: (wl->wl_circ_size - wl->wl_reserved_bytes)),
1.65 riastrad 1058: "wapbl_end: current transaction too big to flush");
1.40 bouyer 1059:
1.2 simonb 1060: mutex_enter(&wl->wl_mtx);
1061: KASSERT(wl->wl_lock_count > 0);
1062: wl->wl_lock_count--;
1063: mutex_exit(&wl->wl_mtx);
1064:
1065: rw_exit(&wl->wl_rwlock);
1066: }
1067:
1068: void
1069: wapbl_add_buf(struct wapbl *wl, struct buf * bp)
1070: {
1071:
1072: KASSERT(bp->b_cflags & BC_BUSY);
1073: KASSERT(bp->b_vp);
1074:
1075: wapbl_jlock_assert(wl);
1076:
1077: #if 0
1078: /*
1079: * XXX this might be an issue for swapfiles.
1080: * see uvm_swap.c:1702
1081: *
1082: * XXX2 why require it then? leap of semantics?
1083: */
1084: KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
1085: #endif
1086:
1087: mutex_enter(&wl->wl_mtx);
1088: if (bp->b_flags & B_LOCKED) {
1089: LIST_REMOVE(bp, b_wapbllist);
1090: WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
1091: ("wapbl_add_buf thread %d.%d re-adding buf %p "
1092: "with %d bytes %d bcount\n",
1093: curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
1094: bp->b_bcount));
1095: } else {
1096: /* unlocked by dirty buffers shouldn't exist */
1097: KASSERT(!(bp->b_oflags & BO_DELWRI));
1098: wl->wl_bufbytes += bp->b_bufsize;
1099: wl->wl_bcount += bp->b_bcount;
1100: wl->wl_bufcount++;
1101: WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
1102: ("wapbl_add_buf thread %d.%d adding buf %p "
1103: "with %d bytes %d bcount\n",
1104: curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
1105: bp->b_bcount));
1106: }
1107: LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
1108: mutex_exit(&wl->wl_mtx);
1109:
1110: bp->b_flags |= B_LOCKED;
1111: }
1112:
1113: static void
1114: wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
1115: {
1116:
1117: KASSERT(mutex_owned(&wl->wl_mtx));
1118: KASSERT(bp->b_cflags & BC_BUSY);
1119: wapbl_jlock_assert(wl);
1120:
1121: #if 0
1122: /*
1123: * XXX this might be an issue for swapfiles.
1124: * see uvm_swap.c:1725
1125: *
1126: * XXXdeux: see above
1127: */
1128: KASSERT((bp->b_flags & BC_NOCACHE) == 0);
1129: #endif
1130: KASSERT(bp->b_flags & B_LOCKED);
1131:
1132: WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
1133: ("wapbl_remove_buf thread %d.%d removing buf %p with "
1134: "%d bytes %d bcount\n",
1135: curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
1136:
1137: KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
1138: wl->wl_bufbytes -= bp->b_bufsize;
1139: KASSERT(wl->wl_bcount >= bp->b_bcount);
1140: wl->wl_bcount -= bp->b_bcount;
1141: KASSERT(wl->wl_bufcount > 0);
1142: wl->wl_bufcount--;
1143: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1144: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1145: LIST_REMOVE(bp, b_wapbllist);
1146:
1147: bp->b_flags &= ~B_LOCKED;
1148: }
1149:
1150: /* called from brelsel() in vfs_bio among other places */
1151: void
1152: wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
1153: {
1154:
1155: mutex_enter(&wl->wl_mtx);
1156: wapbl_remove_buf_locked(wl, bp);
1157: mutex_exit(&wl->wl_mtx);
1158: }
1159:
1160: void
1161: wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
1162: {
1163:
1164: KASSERT(bp->b_cflags & BC_BUSY);
1165:
1166: /*
1167: * XXX: why does this depend on B_LOCKED? otherwise the buf
1168: * is not for a transaction? if so, why is this called in the
1169: * first place?
1170: */
1171: if (bp->b_flags & B_LOCKED) {
1172: mutex_enter(&wl->wl_mtx);
1173: wl->wl_bufbytes += bp->b_bufsize - oldsz;
1174: wl->wl_bcount += bp->b_bcount - oldcnt;
1175: mutex_exit(&wl->wl_mtx);
1176: }
1177: }
1178:
1179: #endif /* _KERNEL */
1180:
1181: /****************************************************************/
1182: /* Some utility inlines */
1183:
/*
 * wapbl_space_used(avail, head, tail)
 *
 *	Return how many of the avail bytes of a circular queue are
 *	occupied by the region running from tail up to head.  A tail
 *	of 0 stands for an empty queue (head must then be 0 as well)
 *	and yields 0; in every other case the result lies in
 *	1..avail, so head == tail with a nonzero tail means "full".
 */
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail != 0)
		return ((head + (avail - 1) - tail) % avail) + 1;

	KASSERT(head == 0);
	return 0;
}
1200:
1201: #ifdef _KERNEL
/*
 * wapbl_advance(size, off, oldoff, delta)
 *
 *	For a circular queue of size bytes whose first byte is at byte
 *	offset off, return the offset reached by moving delta bytes
 *	forward from oldoff, wrapping around at off + size.
 *
 *	An oldoff of 0 is special-cased: moving it by a nonzero delta
 *	gives off + delta, while a zero delta leaves it at 0.
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
{
	off_t newoff;

	/* Preconditions on the inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
	KASSERT(oldoff < (off_t)(size + off));

	if ((oldoff == 0) && (delta != 0)) {
		newoff = off + delta;
	} else {
		newoff = oldoff + delta;
		/* Went past the end of the queue: wrap around. */
		if ((oldoff + delta) >= (size + off))
			newoff -= size;
	}

	/* Properties that follow from the above. */
	KASSERT((delta != 0) || (newoff == oldoff));
	KASSERT((delta == 0) || (newoff != 0));
	KASSERT((delta != (size)) || (newoff == oldoff));

	/* Postconditions on the result. */
	KASSERT((newoff == 0) || ((size_t)newoff >= off));
	KASSERT((size_t)newoff < (size + off));

	return newoff;
}
1236:
/*
 * wapbl_space_free(avail, head, tail)
 *
 *	Return how many of the avail bytes of a circular queue are
 *	unused, given that the region from tail to head is in use.
 *	This is simply avail minus wapbl_space_used().
 */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{
	const size_t used = wapbl_space_used(avail, head, tail);

	return avail - used;
}
1249:
/*
 * wapbl_advance_head(size, off, delta, headp, tailp)
 *
 *	Account for delta bytes appended at the head of a circular
 *	queue of size bytes starting at off.  *headp and *tailp hold
 *	the current head and tail offsets on entry and the updated
 *	ones on return.  If the tail was 0 and the head ends up
 *	nonzero, the tail is set to off.
 *
 *	=> delta must not exceed the free space in the queue.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t newhead = *headp;
	off_t newtail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, newhead, newtail));

	newhead = wapbl_advance(size, off, newhead, delta);
	if ((newtail == 0) && (newhead != 0))
		newtail = off;

	*headp = newhead;
	*tailp = newtail;
}
1272:
/*
 * wapbl_advance_tail(size, off, delta, headp, tailp)
 *
 *	Account for delta bytes consumed from the tail of a circular
 *	queue of size bytes starting at off.  *headp and *tailp hold
 *	the current head and tail offsets on entry and the updated
 *	ones on return.  If the tail catches up with the head, both
 *	are reset to 0.
 *
 *	=> delta must not exceed the used space in the queue.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t newhead = *headp;
	off_t newtail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, newhead, newtail));

	newtail = wapbl_advance(size, off, newtail, delta);
	if (newhead == newtail) {
		newhead = 0;
		newtail = 0;
	}

	*headp = newhead;
	*tailp = newtail;
}
1296:
1297:
1298: /****************************************************************/
1299:
1300: /*
1.73 riastrad 1301: * wapbl_truncate(wl, minfree)
1.71 riastrad 1302: *
1303: * Wait until at least minfree bytes are available in the log.
1304: *
1.73 riastrad 1305: * If it was necessary to wait for writes to complete,
1306: * advance the circular queue tail to reflect the new write
1307: * completions and issue a write commit to the log.
1.71 riastrad 1308: *
1309: * => Caller must hold wl->wl_rwlock writer lock.
1.2 simonb 1310: */
1311: static int
1.73 riastrad 1312: wapbl_truncate(struct wapbl *wl, size_t minfree)
1.2 simonb 1313: {
1314: size_t delta;
1315: size_t avail;
1316: off_t head;
1317: off_t tail;
1318: int error = 0;
1319:
1320: KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
1321: KASSERT(rw_write_held(&wl->wl_rwlock));
1322:
1323: mutex_enter(&wl->wl_mtx);
1324:
1325: /*
1326: * First check to see if we have to do a commit
1327: * at all.
1328: */
1329: avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
1330: if (minfree < avail) {
1331: mutex_exit(&wl->wl_mtx);
1332: return 0;
1333: }
1334: minfree -= avail;
1335: while ((wl->wl_error_count == 0) &&
1336: (wl->wl_reclaimable_bytes < minfree)) {
1337: WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1338: ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
1339: "minfree=%zd\n",
1340: &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
1341: minfree));
1342:
1343: cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
1344: }
1345: if (wl->wl_reclaimable_bytes < minfree) {
1346: KASSERT(wl->wl_error_count);
1347: /* XXX maybe get actual error from buffer instead someday? */
1348: error = EIO;
1349: }
1350: head = wl->wl_head;
1351: tail = wl->wl_tail;
1352: delta = wl->wl_reclaimable_bytes;
1353:
1354: /* If all of of the entries are flushed, then be sure to keep
1355: * the reserved bytes reserved. Watch out for discarded transactions,
1356: * which could leave more bytes reserved than are reclaimable.
1357: */
1358: if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
1359: (delta >= wl->wl_reserved_bytes)) {
1360: delta -= wl->wl_reserved_bytes;
1361: }
1362: wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
1363: &tail);
1364: KDASSERT(wl->wl_reserved_bytes <=
1365: wapbl_space_used(wl->wl_circ_size, head, tail));
1366: mutex_exit(&wl->wl_mtx);
1367:
1368: if (error)
1369: return error;
1370:
1371: /*
1372: * This is where head, tail and delta are unprotected
1373: * from races against itself or flush. This is ok since
1374: * we only call this routine from inside flush itself.
1375: *
1376: * XXX: how can it race against itself when accessed only
1377: * from behind the write-locked rwlock?
1378: */
1379: error = wapbl_write_commit(wl, head, tail);
1380: if (error)
1381: return error;
1382:
1383: wl->wl_head = head;
1384: wl->wl_tail = tail;
1385:
1386: mutex_enter(&wl->wl_mtx);
1387: KASSERT(wl->wl_reclaimable_bytes >= delta);
1388: wl->wl_reclaimable_bytes -= delta;
1389: mutex_exit(&wl->wl_mtx);
1390: WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1391: ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
1392: curproc->p_pid, curlwp->l_lid, delta));
1393:
1394: return 0;
1395: }
1396:
1397: /****************************************************************/
1398:
1399: void
1400: wapbl_biodone(struct buf *bp)
1401: {
1402: struct wapbl_entry *we = bp->b_private;
1403: struct wapbl *wl = we->we_wapbl;
1.53 hannken 1404: #ifdef WAPBL_DEBUG_BUFBYTES
1405: const int bufsize = bp->b_bufsize;
1406: #endif
1.2 simonb 1407:
1408: /*
1409: * Handle possible flushing of buffers after log has been
1410: * decomissioned.
1411: */
1412: if (!wl) {
1413: KASSERT(we->we_bufcount > 0);
1414: we->we_bufcount--;
1415: #ifdef WAPBL_DEBUG_BUFBYTES
1.53 hannken 1416: KASSERT(we->we_unsynced_bufbytes >= bufsize);
1417: we->we_unsynced_bufbytes -= bufsize;
1.2 simonb 1418: #endif
1419:
1420: if (we->we_bufcount == 0) {
1421: #ifdef WAPBL_DEBUG_BUFBYTES
1422: KASSERT(we->we_unsynced_bufbytes == 0);
1423: #endif
1.51 para 1424: pool_put(&wapbl_entry_pool, we);
1.2 simonb 1425: }
1426:
1427: brelse(bp, 0);
1428: return;
1429: }
1430:
1431: #ifdef ohbother
1.44 uebayasi 1432: KDASSERT(bp->b_oflags & BO_DONE);
1433: KDASSERT(!(bp->b_oflags & BO_DELWRI));
1.2 simonb 1434: KDASSERT(bp->b_flags & B_ASYNC);
1.44 uebayasi 1435: KDASSERT(bp->b_cflags & BC_BUSY);
1.2 simonb 1436: KDASSERT(!(bp->b_flags & B_LOCKED));
1437: KDASSERT(!(bp->b_flags & B_READ));
1.44 uebayasi 1438: KDASSERT(!(bp->b_cflags & BC_INVAL));
1439: KDASSERT(!(bp->b_cflags & BC_NOCACHE));
1.2 simonb 1440: #endif
1441:
1442: if (bp->b_error) {
1.26 apb 1443: /*
1.78 riastrad 1444: * If an error occurs, it would be nice to leave the buffer
1445: * as a delayed write on the LRU queue so that we can retry
1446: * it later. But buffercache(9) can't handle dirty buffer
1447: * reuse, so just mark the log permanently errored out.
1.26 apb 1448: */
1.2 simonb 1449: mutex_enter(&wl->wl_mtx);
1450: if (wl->wl_error_count == 0) {
1451: wl->wl_error_count++;
1452: cv_broadcast(&wl->wl_reclaimable_cv);
1453: }
1454: mutex_exit(&wl->wl_mtx);
1455: }
1456:
1.53 hannken 1457: /*
1458: * Release the buffer here. wapbl_flush() may wait for the
1459: * log to become empty and we better unbusy the buffer before
1460: * wapbl_flush() returns.
1461: */
1462: brelse(bp, 0);
1463:
1.2 simonb 1464: mutex_enter(&wl->wl_mtx);
1465:
1466: KASSERT(we->we_bufcount > 0);
1467: we->we_bufcount--;
1468: #ifdef WAPBL_DEBUG_BUFBYTES
1.53 hannken 1469: KASSERT(we->we_unsynced_bufbytes >= bufsize);
1470: we->we_unsynced_bufbytes -= bufsize;
1471: KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
1472: wl->wl_unsynced_bufbytes -= bufsize;
1.2 simonb 1473: #endif
1474:
1475: /*
1476: * If the current transaction can be reclaimed, start
1477: * at the beginning and reclaim any consecutive reclaimable
1478: * transactions. If we successfully reclaim anything,
1479: * then wakeup anyone waiting for the reclaim.
1480: */
1481: if (we->we_bufcount == 0) {
1482: size_t delta = 0;
1483: int errcnt = 0;
1484: #ifdef WAPBL_DEBUG_BUFBYTES
1485: KDASSERT(we->we_unsynced_bufbytes == 0);
1486: #endif
1487: /*
1488: * clear any posted error, since the buffer it came from
1489: * has successfully flushed by now
1490: */
1491: while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
1492: (we->we_bufcount == 0)) {
1493: delta += we->we_reclaimable_bytes;
1494: if (we->we_error)
1495: errcnt++;
1496: SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
1.51 para 1497: pool_put(&wapbl_entry_pool, we);
1.2 simonb 1498: }
1499:
1500: if (delta) {
1501: wl->wl_reclaimable_bytes += delta;
1502: KASSERT(wl->wl_error_count >= errcnt);
1503: wl->wl_error_count -= errcnt;
1504: cv_broadcast(&wl->wl_reclaimable_cv);
1505: }
1506: }
1507:
1508: mutex_exit(&wl->wl_mtx);
1509: }
1510:
1511: /*
1.71 riastrad 1512: * wapbl_flush(wl, wait)
1513: *
1514: * Flush pending block writes, deallocations, and inodes from
1515: * the current transaction in memory to the log on disk:
1516: *
1517: * 1. Call the file system's wl_flush callback to flush any
1518: * per-file-system pending updates.
1519: * 2. Wait for enough space in the log for the current transaction.
1520: * 3. Synchronously write the new log records, advancing the
1521: * circular queue head.
1.77 riastrad 1522: * 4. Issue the pending block writes asynchronously, now that they
1523: * are recorded in the log and can be replayed after crash.
1524: * 5. If wait is true, wait for all writes to complete and for the
1525: * log to become empty.
1.71 riastrad 1526: *
1527: * On failure, call the file system's wl_flush_abort callback.
1.2 simonb 1528: */
1529: int
1530: wapbl_flush(struct wapbl *wl, int waitfor)
1531: {
1532: struct buf *bp;
1533: struct wapbl_entry *we;
1534: off_t off;
1535: off_t head;
1536: off_t tail;
1537: size_t delta = 0;
1538: size_t flushsize;
1539: size_t reserved;
1540: int error = 0;
1541:
1542: /*
1543: * Do a quick check to see if a full flush can be skipped
1544: * This assumes that the flush callback does not need to be called
1545: * unless there are other outstanding bufs.
1546: */
1547: if (!waitfor) {
1548: size_t nbufs;
1549: mutex_enter(&wl->wl_mtx); /* XXX need mutex here to
1550: protect the KASSERTS */
1551: nbufs = wl->wl_bufcount;
1552: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1553: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1554: mutex_exit(&wl->wl_mtx);
1555: if (nbufs == 0)
1556: return 0;
1557: }
1558:
1559: /*
1560: * XXX we may consider using LK_UPGRADE here
1561: * if we want to call flush from inside a transaction
1562: */
1563: rw_enter(&wl->wl_rwlock, RW_WRITER);
1.78.2.1! pgoyette 1564: wl->wl_flush(wl->wl_mount, SIMPLEQ_FIRST(&wl->wl_dealloclist));
1.2 simonb 1565:
1566: /*
1.75 riastrad 1567: * Now that we are exclusively locked and the file system has
1568: * issued any deferred block writes for this transaction, check
1569: * whether there are any blocks to write to the log. If not,
1570: * skip waiting for space or writing any log entries.
1571: *
1572: * XXX Shouldn't this also check wl_dealloccnt and
1573: * wl_inohashcnt? Perhaps wl_dealloccnt doesn't matter if the
1574: * file system didn't produce any blocks as a consequence of
1575: * it, but the same does not seem to be so of wl_inohashcnt.
1.2 simonb 1576: */
1577: if (wl->wl_bufcount == 0) {
1.69 riastrad 1578: goto wait_out;
1.2 simonb 1579: }
1580:
1581: #if 0
1582: WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1583: ("wapbl_flush thread %d.%d flushing entries with "
1584: "bufcount=%zu bufbytes=%zu\n",
1585: curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1586: wl->wl_bufbytes));
1587: #endif
1588:
1589: /* Calculate amount of space needed to flush */
1590: flushsize = wapbl_transaction_len(wl);
1.39 christos 1591: if (wapbl_verbose_commit) {
1592: struct timespec ts;
1593: getnanotime(&ts);
1.43 nakayama 1594: printf("%s: %lld.%09ld this transaction = %zu bytes\n",
1.39 christos 1595: __func__, (long long)ts.tv_sec,
1596: (long)ts.tv_nsec, flushsize);
1597: }
1.2 simonb 1598:
1599: if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
1600: /*
1601: * XXX this could be handled more gracefully, perhaps place
1602: * only a partial transaction in the log and allow the
1603: * remaining to flush without the protection of the journal.
1604: */
1.66 riastrad 1605: panic("wapbl_flush: current transaction too big to flush");
1.2 simonb 1606: }
1607:
1.73 riastrad 1608: error = wapbl_truncate(wl, flushsize);
1.2 simonb 1609: if (error)
1.69 riastrad 1610: goto out;
1.2 simonb 1611:
1612: off = wl->wl_head;
1.70 riastrad 1613: KASSERT((off == 0) || (off >= wl->wl_circ_off));
1614: KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size));
1.2 simonb 1615: error = wapbl_write_blocks(wl, &off);
1616: if (error)
1.69 riastrad 1617: goto out;
1.2 simonb 1618: error = wapbl_write_revocations(wl, &off);
1619: if (error)
1.69 riastrad 1620: goto out;
1.2 simonb 1621: error = wapbl_write_inodes(wl, &off);
1622: if (error)
1.69 riastrad 1623: goto out;
1.2 simonb 1624:
1625: reserved = 0;
1626: if (wl->wl_inohashcnt)
1627: reserved = wapbl_transaction_inodes_len(wl);
1628:
1629: head = wl->wl_head;
1630: tail = wl->wl_tail;
1631:
1632: wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
1633: &head, &tail);
1.72 riastrad 1634:
1635: KASSERTMSG(head == off,
1636: "lost head! head=%"PRIdMAX" tail=%" PRIdMAX
1637: " off=%"PRIdMAX" flush=%zu",
1638: (intmax_t)head, (intmax_t)tail, (intmax_t)off,
1639: flushsize);
1.2 simonb 1640:
1641: /* Opportunistically move the tail forward if we can */
1.73 riastrad 1642: mutex_enter(&wl->wl_mtx);
1643: delta = wl->wl_reclaimable_bytes;
1644: mutex_exit(&wl->wl_mtx);
1645: wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
1646: &head, &tail);
1.2 simonb 1647:
1648: error = wapbl_write_commit(wl, head, tail);
1649: if (error)
1.69 riastrad 1650: goto out;
1.2 simonb 1651:
1.51 para 1652: we = pool_get(&wapbl_entry_pool, PR_WAITOK);
1.2 simonb 1653:
1654: #ifdef WAPBL_DEBUG_BUFBYTES
1655: WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1656: ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1657: " unsynced=%zu"
1658: "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1659: "inodes=%d\n",
1660: curproc->p_pid, curlwp->l_lid, flushsize, delta,
1661: wapbl_space_used(wl->wl_circ_size, head, tail),
1662: wl->wl_unsynced_bufbytes, wl->wl_bufcount,
1663: wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
1664: wl->wl_inohashcnt));
1665: #else
1666: WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1667: ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1668: "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1669: "inodes=%d\n",
1670: curproc->p_pid, curlwp->l_lid, flushsize, delta,
1671: wapbl_space_used(wl->wl_circ_size, head, tail),
1672: wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1673: wl->wl_dealloccnt, wl->wl_inohashcnt));
1674: #endif
1675:
1676:
1677: mutex_enter(&bufcache_lock);
1678: mutex_enter(&wl->wl_mtx);
1679:
1680: wl->wl_reserved_bytes = reserved;
1681: wl->wl_head = head;
1682: wl->wl_tail = tail;
1683: KASSERT(wl->wl_reclaimable_bytes >= delta);
1684: wl->wl_reclaimable_bytes -= delta;
1.78.2.1! pgoyette 1685: KDASSERT(wl->wl_dealloccnt == 0);
1.2 simonb 1686: #ifdef WAPBL_DEBUG_BUFBYTES
1687: wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
1688: #endif
1689:
1690: we->we_wapbl = wl;
1691: we->we_bufcount = wl->wl_bufcount;
1692: #ifdef WAPBL_DEBUG_BUFBYTES
1693: we->we_unsynced_bufbytes = wl->wl_bufbytes;
1694: #endif
1695: we->we_reclaimable_bytes = flushsize;
1696: we->we_error = 0;
1697: SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1698:
1699: /*
1700: * this flushes bufs in reverse order than they were queued
1701: * it shouldn't matter, but if we care we could use TAILQ instead.
1702: * XXX Note they will get put on the lru queue when they flush
1703: * so we might actually want to change this to preserve order.
1704: */
1705: while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
1706: if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1707: continue;
1708: }
1709: bp->b_iodone = wapbl_biodone;
1710: bp->b_private = we;
1711: bremfree(bp);
1712: wapbl_remove_buf_locked(wl, bp);
1713: mutex_exit(&wl->wl_mtx);
1714: mutex_exit(&bufcache_lock);
1715: bawrite(bp);
1716: mutex_enter(&bufcache_lock);
1717: mutex_enter(&wl->wl_mtx);
1718: }
1719: mutex_exit(&wl->wl_mtx);
1720: mutex_exit(&bufcache_lock);
1721:
1722: #if 0
1723: WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1724: ("wapbl_flush thread %d.%d done flushing entries...\n",
1725: curproc->p_pid, curlwp->l_lid));
1726: #endif
1727:
1.69 riastrad 1728: wait_out:
1.2 simonb 1729:
1730: /*
1731: * If the waitfor flag is set, don't return until everything is
1732: * fully flushed and the on disk log is empty.
1733: */
1734: if (waitfor) {
1735: error = wapbl_truncate(wl, wl->wl_circ_size -
1.73 riastrad 1736: wl->wl_reserved_bytes);
1.2 simonb 1737: }
1738:
1.69 riastrad 1739: out:
1.2 simonb 1740: if (error) {
1.78.2.1! pgoyette 1741: wl->wl_flush_abort(wl->wl_mount,
! 1742: SIMPLEQ_FIRST(&wl->wl_dealloclist));
1.2 simonb 1743: }
1744:
1745: #ifdef WAPBL_DEBUG_PRINT
1746: if (error) {
1747: pid_t pid = -1;
1748: lwpid_t lid = -1;
1749: if (curproc)
1750: pid = curproc->p_pid;
1751: if (curlwp)
1752: lid = curlwp->l_lid;
1753: mutex_enter(&wl->wl_mtx);
1754: #ifdef WAPBL_DEBUG_BUFBYTES
1755: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1756: ("wapbl_flush: thread %d.%d aborted flush: "
1757: "error = %d\n"
1758: "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1759: "deallocs=%d inodes=%d\n"
1760: "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1761: "unsynced=%zu\n",
1762: pid, lid, error, wl->wl_bufcount,
1763: wl->wl_bufbytes, wl->wl_bcount,
1764: wl->wl_dealloccnt, wl->wl_inohashcnt,
1765: wl->wl_error_count, wl->wl_reclaimable_bytes,
1766: wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1767: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1768: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1769: ("\tentry: bufcount = %zu, reclaimable = %zu, "
1770: "error = %d, unsynced = %zu\n",
1771: we->we_bufcount, we->we_reclaimable_bytes,
1772: we->we_error, we->we_unsynced_bufbytes));
1773: }
1774: #else
1775: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1776: ("wapbl_flush: thread %d.%d aborted flush: "
1777: "error = %d\n"
1778: "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1779: "deallocs=%d inodes=%d\n"
1780: "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1781: pid, lid, error, wl->wl_bufcount,
1782: wl->wl_bufbytes, wl->wl_bcount,
1783: wl->wl_dealloccnt, wl->wl_inohashcnt,
1784: wl->wl_error_count, wl->wl_reclaimable_bytes,
1785: wl->wl_reserved_bytes));
1786: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1787: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1788: ("\tentry: bufcount = %zu, reclaimable = %zu, "
1789: "error = %d\n", we->we_bufcount,
1790: we->we_reclaimable_bytes, we->we_error));
1791: }
1792: #endif
1793: mutex_exit(&wl->wl_mtx);
1794: }
1795: #endif
1796:
1797: rw_exit(&wl->wl_rwlock);
1798: return error;
1799: }
1800:
1801: /****************************************************************/
1802:
1803: void
1804: wapbl_jlock_assert(struct wapbl *wl)
1805: {
1806:
1.23 ad 1807: KASSERT(rw_lock_held(&wl->wl_rwlock));
1.2 simonb 1808: }
1809:
1810: void
1811: wapbl_junlock_assert(struct wapbl *wl)
1812: {
1813:
1814: KASSERT(!rw_write_held(&wl->wl_rwlock));
1815: }
1816:
1817: /****************************************************************/
1818:
1819: /* locks missing */
1820: void
1821: wapbl_print(struct wapbl *wl,
1822: int full,
1823: void (*pr)(const char *, ...))
1824: {
1825: struct buf *bp;
1826: struct wapbl_entry *we;
1827: (*pr)("wapbl %p", wl);
1828: (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
1829: wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
1830: (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1831: wl->wl_circ_size, wl->wl_circ_off,
1832: (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1833: (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1834: wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
1835: #ifdef WAPBL_DEBUG_BUFBYTES
1836: (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1837: "reserved = %zu errcnt = %d unsynced = %zu\n",
1838: wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1839: wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1840: wl->wl_error_count, wl->wl_unsynced_bufbytes);
1841: #else
1842: (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1843: "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
1844: wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1845: wl->wl_error_count);
1846: #endif
1847: (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
1848: wl->wl_dealloccnt, wl->wl_dealloclim);
1849: (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1850: wl->wl_inohashcnt, wl->wl_inohashmask);
1851: (*pr)("entries:\n");
1852: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1853: #ifdef WAPBL_DEBUG_BUFBYTES
1854: (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1855: "unsynced = %zu\n",
1856: we->we_bufcount, we->we_reclaimable_bytes,
1857: we->we_error, we->we_unsynced_bufbytes);
1858: #else
1859: (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1860: we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1861: #endif
1862: }
1863: if (full) {
1864: int cnt = 0;
1865: (*pr)("bufs =");
1866: LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1867: if (!LIST_NEXT(bp, b_wapbllist)) {
1868: (*pr)(" %p", bp);
1869: } else if ((++cnt % 6) == 0) {
1870: (*pr)(" %p,\n\t", bp);
1871: } else {
1872: (*pr)(" %p,", bp);
1873: }
1874: }
1875: (*pr)("\n");
1876:
1877: (*pr)("dealloced blks = ");
1878: {
1.78.2.1! pgoyette 1879: struct wapbl_dealloc *wd;
1.2 simonb 1880: cnt = 0;
1.78.2.1! pgoyette 1881: SIMPLEQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) {
1.2 simonb 1882: (*pr)(" %"PRId64":%d,",
1.78.2.1! pgoyette 1883: wd->wd_blkno,
! 1884: wd->wd_len);
1.2 simonb 1885: if ((++cnt % 4) == 0) {
1886: (*pr)("\n\t");
1887: }
1888: }
1889: }
1890: (*pr)("\n");
1891:
1892: (*pr)("registered inodes = ");
1893: {
1894: int i;
1895: cnt = 0;
1896: for (i = 0; i <= wl->wl_inohashmask; i++) {
1897: struct wapbl_ino_head *wih;
1898: struct wapbl_ino *wi;
1899:
1900: wih = &wl->wl_inohash[i];
1901: LIST_FOREACH(wi, wih, wi_hash) {
1902: if (wi->wi_ino == 0)
1903: continue;
1.55 christos 1904: (*pr)(" %"PRIu64"/0%06"PRIo32",",
1.2 simonb 1905: wi->wi_ino, wi->wi_mode);
1906: if ((++cnt % 4) == 0) {
1907: (*pr)("\n\t");
1908: }
1909: }
1910: }
1911: (*pr)("\n");
1912: }
1913: }
1914: }
1915:
#if defined(WAPBL_DEBUG) || defined(DDB)
/*
 * wapbl_dump(wl)
 *
 *	Print the full state of wl with printf via wapbl_print.  When
 *	built with WAPBL_DEBUG, a NULL wl selects wapbl_debug_wl; if wl
 *	is still NULL after that, nothing is printed.
 */
void
wapbl_dump(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG)
	if (wl == NULL)
		wl = wapbl_debug_wl;
#endif
	if (wl != NULL)
		wapbl_print(wl, 1, printf);
}
#endif
1929:
1930: /****************************************************************/
1931:
1.78.2.1! pgoyette 1932: int
! 1933: wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force)
1.2 simonb 1934: {
1.78.2.1! pgoyette 1935: struct wapbl_dealloc *wd;
! 1936: int error = 0;
1.2 simonb 1937:
1938: wapbl_jlock_assert(wl);
1939:
1.38 hannken 1940: mutex_enter(&wl->wl_mtx);
1.27 pooka 1941:
1.78.2.1! pgoyette 1942: if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) {
! 1943: if (!force) {
! 1944: error = EAGAIN;
! 1945: goto out;
! 1946: }
! 1947:
! 1948: /*
! 1949: * Forced registration can only be used when:
! 1950: * 1) the caller can't cope with failure
! 1951: * 2) the path can be triggered only bounded, small
! 1952: * times per transaction
! 1953: * If this is not fullfilled, and the path would be triggered
! 1954: * many times, this could overflow maximum transaction size
! 1955: * and panic later.
! 1956: */
! 1957: printf("%s: forced dealloc registration over limit: %d >= %d\n",
! 1958: wl->wl_mount->mnt_stat.f_mntonname,
! 1959: wl->wl_dealloccnt, wl->wl_dealloclim);
! 1960: }
! 1961:
1.2 simonb 1962: wl->wl_dealloccnt++;
1.38 hannken 1963: mutex_exit(&wl->wl_mtx);
1.78.2.1! pgoyette 1964:
! 1965: wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK);
! 1966: wd->wd_blkno = blk;
! 1967: wd->wd_len = len;
! 1968:
! 1969: mutex_enter(&wl->wl_mtx);
! 1970: SIMPLEQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries);
! 1971:
! 1972: out:
! 1973: mutex_exit(&wl->wl_mtx);
! 1974:
! 1975: WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
! 1976: ("wapbl_register_deallocation: blk=%"PRId64" len=%d error=%d\n",
! 1977: blk, len, error));
! 1978:
! 1979: return error;
1.2 simonb 1980: }
1981:
1982: /****************************************************************/
1983:
1984: static void
1985: wapbl_inodetrk_init(struct wapbl *wl, u_int size)
1986: {
1987:
1988: wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
1989: if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
1990: pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
1991: "wapblinopl", &pool_allocator_nointr, IPL_NONE);
1992: }
1993: }
1994:
1995: static void
1996: wapbl_inodetrk_free(struct wapbl *wl)
1997: {
1998:
1999: /* XXX this KASSERT needs locking/mutex analysis */
2000: KASSERT(wl->wl_inohashcnt == 0);
2001: hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
2002: if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
2003: pool_destroy(&wapbl_ino_pool);
2004: }
2005: }
2006:
2007: static struct wapbl_ino *
2008: wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
2009: {
2010: struct wapbl_ino_head *wih;
2011: struct wapbl_ino *wi;
2012:
2013: KASSERT(mutex_owned(&wl->wl_mtx));
2014:
2015: wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2016: LIST_FOREACH(wi, wih, wi_hash) {
2017: if (ino == wi->wi_ino)
2018: return wi;
2019: }
2020: return 0;
2021: }
2022:
2023: void
2024: wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2025: {
2026: struct wapbl_ino_head *wih;
2027: struct wapbl_ino *wi;
2028:
2029: wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
2030:
2031: mutex_enter(&wl->wl_mtx);
2032: if (wapbl_inodetrk_get(wl, ino) == NULL) {
2033: wi->wi_ino = ino;
2034: wi->wi_mode = mode;
2035: wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2036: LIST_INSERT_HEAD(wih, wi, wi_hash);
2037: wl->wl_inohashcnt++;
2038: WAPBL_PRINTF(WAPBL_PRINT_INODE,
2039: ("wapbl_register_inode: ino=%"PRId64"\n", ino));
2040: mutex_exit(&wl->wl_mtx);
2041: } else {
2042: mutex_exit(&wl->wl_mtx);
2043: pool_put(&wapbl_ino_pool, wi);
2044: }
2045: }
2046:
2047: void
2048: wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2049: {
2050: struct wapbl_ino *wi;
2051:
2052: mutex_enter(&wl->wl_mtx);
2053: wi = wapbl_inodetrk_get(wl, ino);
2054: if (wi) {
2055: WAPBL_PRINTF(WAPBL_PRINT_INODE,
2056: ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
2057: KASSERT(wl->wl_inohashcnt > 0);
2058: wl->wl_inohashcnt--;
2059: LIST_REMOVE(wi, wi_hash);
2060: mutex_exit(&wl->wl_mtx);
2061:
2062: pool_put(&wapbl_ino_pool, wi);
2063: } else {
2064: mutex_exit(&wl->wl_mtx);
2065: }
2066: }
2067:
2068: /****************************************************************/
2069:
1.71 riastrad 2070: /*
2071: * wapbl_transaction_inodes_len(wl)
2072: *
2073: * Calculate the number of bytes required for inode registration
2074: * log records in wl.
2075: */
1.30 uebayasi 2076: static inline size_t
1.2 simonb 2077: wapbl_transaction_inodes_len(struct wapbl *wl)
2078: {
2079: int blocklen = 1<<wl->wl_log_dev_bshift;
2080: int iph;
2081:
2082: /* Calculate number of inodes described in a inodelist header */
2083: iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2084: sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2085:
2086: KASSERT(iph > 0);
2087:
1.39 christos 2088: return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
1.2 simonb 2089: }
2090:
2091:
1.71 riastrad 2092: /*
2093: * wapbl_transaction_len(wl)
2094: *
2095: * Calculate number of bytes required for all log records in wl.
2096: */
1.2 simonb 2097: static size_t
2098: wapbl_transaction_len(struct wapbl *wl)
2099: {
2100: int blocklen = 1<<wl->wl_log_dev_bshift;
2101: size_t len;
2102:
2103: /* Calculate number of blocks described in a blocklist header */
2104: len = wl->wl_bcount;
1.78.2.1! pgoyette 2105: len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen;
! 2106: len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen;
1.2 simonb 2107: len += wapbl_transaction_inodes_len(wl);
2108:
2109: return len;
2110: }
2111:
2112: /*
1.71 riastrad 2113: * wapbl_cache_sync(wl, msg)
2114: *
2115: * Issue DIOCCACHESYNC to wl->wl_devvp.
2116: *
2117: * If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
2118: * including msg about the duration of the cache sync.
1.48 yamt 2119: */
2120: static int
2121: wapbl_cache_sync(struct wapbl *wl, const char *msg)
2122: {
2123: const bool verbose = wapbl_verbose_commit >= 2;
2124: struct bintime start_time;
2125: int force = 1;
2126: int error;
2127:
2128: if (!wapbl_flush_disk_cache) {
2129: return 0;
2130: }
2131: if (verbose) {
2132: bintime(&start_time);
2133: }
2134: error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2135: FWRITE, FSCRED);
2136: if (error) {
2137: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1.76 riastrad 2138: ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
2139: "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
1.48 yamt 2140: }
2141: if (verbose) {
2142: struct bintime d;
2143: struct timespec ts;
2144:
2145: bintime(&d);
2146: bintime_sub(&d, &start_time);
2147: bintime2timespec(&d, &ts);
2148: printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2149: msg, (uintmax_t)wl->wl_devvp->v_rdev,
2150: (uintmax_t)ts.tv_sec, ts.tv_nsec);
2151: }
2152: return error;
2153: }
2154:
2155: /*
1.71 riastrad 2156: * wapbl_write_commit(wl, head, tail)
2157: *
2158: * Issue a disk cache sync to wait for all pending writes to the
2159: * log to complete, and then synchronously commit the current
2160: * circular queue head and tail to the log, in the next of two
2161: * locations for commit headers on disk.
1.2 simonb 2162: *
1.71 riastrad 2163: * Increment the generation number. If the generation number
2164: * rolls over to zero, then a subsequent commit would appear to
2165: * have an older generation than this one -- in that case, issue a
2166: * duplicate commit to avoid this.
2167: *
2168: * => Caller must have exclusive access to wl, either by holding
2169: * wl->wl_rwlock for writer or by being wapbl_start before anyone
2170: * else has seen wl.
1.2 simonb 2171: */
2172: static int
2173: wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2174: {
2175: struct wapbl_wc_header *wc = wl->wl_wc_header;
2176: struct timespec ts;
2177: int error;
1.34 mlelstv 2178: daddr_t pbn;
1.2 simonb 2179:
1.54 hannken 2180: error = wapbl_buffered_flush(wl);
2181: if (error)
2182: return error;
1.49 yamt 2183: /*
2184: * flush disk cache to ensure that blocks we've written are actually
2185: * written to the stable storage before the commit header.
2186: *
2187: * XXX Calc checksum here, instead we do this for now
2188: */
1.48 yamt 2189: wapbl_cache_sync(wl, "1");
1.2 simonb 2190:
2191: wc->wc_head = head;
2192: wc->wc_tail = tail;
2193: wc->wc_checksum = 0;
2194: wc->wc_version = 1;
2195: getnanotime(&ts);
1.17 yamt 2196: wc->wc_time = ts.tv_sec;
1.2 simonb 2197: wc->wc_timensec = ts.tv_nsec;
2198:
2199: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2200: ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
2201: (intmax_t)head, (intmax_t)tail));
2202:
2203: /*
1.49 yamt 2204: * write the commit header.
2205: *
1.2 simonb 2206: * XXX if generation will rollover, then first zero
2207: * over second commit header before trying to write both headers.
2208: */
2209:
1.34 mlelstv 2210: pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2211: #ifdef _KERNEL
2212: pbn = btodb(pbn << wc->wc_log_dev_bshift);
2213: #endif
1.54 hannken 2214: error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2215: if (error)
2216: return error;
2217: error = wapbl_buffered_flush(wl);
1.2 simonb 2218: if (error)
2219: return error;
2220:
1.49 yamt 2221: /*
2222: * flush disk cache to ensure that the commit header is actually
2223: * written before meta data blocks.
2224: */
1.48 yamt 2225: wapbl_cache_sync(wl, "2");
1.2 simonb 2226:
2227: /*
2228: * If the generation number was zero, write it out a second time.
2229: * This handles initialization and generation number rollover
2230: */
2231: if (wc->wc_generation++ == 0) {
2232: error = wapbl_write_commit(wl, head, tail);
2233: /*
2234: * This panic should be able to be removed if we do the
2235: * zero'ing mentioned above, and we are certain to roll
2236: * back generation number on failure.
2237: */
2238: if (error)
2239: panic("wapbl_write_commit: error writing duplicate "
1.66 riastrad 2240: "log header: %d", error);
1.2 simonb 2241: }
2242: return 0;
2243: }
2244:
1.71 riastrad 2245: /*
2246: * wapbl_write_blocks(wl, offp)
2247: *
2248: * Write all pending physical blocks in the current transaction
2249: * from wapbl_add_buf to the log on disk, adding to the circular
2250: * queue head at byte offset *offp, and returning the new head's
2251: * byte offset in *offp.
2252: */
1.2 simonb 2253: static int
2254: wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2255: {
2256: struct wapbl_wc_blocklist *wc =
2257: (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2258: int blocklen = 1<<wl->wl_log_dev_bshift;
2259: struct buf *bp;
2260: off_t off = *offp;
2261: int error;
1.7 joerg 2262: size_t padding;
1.2 simonb 2263:
2264: KASSERT(rw_write_held(&wl->wl_rwlock));
2265:
2266: bp = LIST_FIRST(&wl->wl_bufs);
2267:
2268: while (bp) {
2269: int cnt;
2270: struct buf *obp = bp;
2271:
2272: KASSERT(bp->b_flags & B_LOCKED);
2273:
2274: wc->wc_type = WAPBL_WC_BLOCKS;
2275: wc->wc_len = blocklen;
2276: wc->wc_blkcount = 0;
1.78.2.1! pgoyette 2277: while (bp && (wc->wc_blkcount < wl->wl_brperjblock)) {
1.2 simonb 2278: /*
2279: * Make sure all the physical block numbers are up to
2280: * date. If this is not always true on a given
2281: * filesystem, then VOP_BMAP must be called. We
2282: * could call VOP_BMAP here, or else in the filesystem
2283: * specific flush callback, although neither of those
2284: * solutions allow us to take the vnode lock. If a
2285: * filesystem requires that we must take the vnode lock
2286: * to call VOP_BMAP, then we can probably do it in
2287: * bwrite when the vnode lock should already be held
2288: * by the invoking code.
2289: */
2290: KASSERT((bp->b_vp->v_type == VBLK) ||
2291: (bp->b_blkno != bp->b_lblkno));
2292: KASSERT(bp->b_blkno > 0);
2293:
2294: wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2295: wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2296: wc->wc_len += bp->b_bcount;
2297: wc->wc_blkcount++;
2298: bp = LIST_NEXT(bp, b_wapbllist);
2299: }
1.7 joerg 2300: if (wc->wc_len % blocklen != 0) {
2301: padding = blocklen - wc->wc_len % blocklen;
2302: wc->wc_len += padding;
2303: } else {
2304: padding = 0;
2305: }
2306:
1.2 simonb 2307: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1.7 joerg 2308: ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2309: wc->wc_len, padding, (intmax_t)off));
1.2 simonb 2310:
2311: error = wapbl_circ_write(wl, wc, blocklen, &off);
2312: if (error)
2313: return error;
2314: bp = obp;
2315: cnt = 0;
1.78.2.1! pgoyette 2316: while (bp && (cnt++ < wl->wl_brperjblock)) {
1.2 simonb 2317: error = wapbl_circ_write(wl, bp->b_data,
2318: bp->b_bcount, &off);
2319: if (error)
2320: return error;
2321: bp = LIST_NEXT(bp, b_wapbllist);
2322: }
1.7 joerg 2323: if (padding) {
2324: void *zero;
2325:
1.51 para 2326: zero = wapbl_alloc(padding);
1.7 joerg 2327: memset(zero, 0, padding);
2328: error = wapbl_circ_write(wl, zero, padding, &off);
1.18 yamt 2329: wapbl_free(zero, padding);
1.7 joerg 2330: if (error)
2331: return error;
2332: }
1.2 simonb 2333: }
2334: *offp = off;
2335: return 0;
2336: }
2337:
1.71 riastrad 2338: /*
2339: * wapbl_write_revocations(wl, offp)
2340: *
2341: * Write all pending deallocations in the current transaction from
2342: * wapbl_register_deallocation to the log on disk, adding to the
2343: * circular queue's head at byte offset *offp, and returning the
2344: * new head's byte offset in *offp.
2345: */
1.2 simonb 2346: static int
2347: wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2348: {
2349: struct wapbl_wc_blocklist *wc =
2350: (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1.78.2.1! pgoyette 2351: struct wapbl_dealloc *wd, *lwd;
1.2 simonb 2352: int blocklen = 1<<wl->wl_log_dev_bshift;
2353: off_t off = *offp;
2354: int error;
2355:
2356: if (wl->wl_dealloccnt == 0)
2357: return 0;
2358:
1.78.2.1! pgoyette 2359: while ((wd = SIMPLEQ_FIRST(&wl->wl_dealloclist)) != NULL) {
1.2 simonb 2360: wc->wc_type = WAPBL_WC_REVOCATIONS;
2361: wc->wc_len = blocklen;
2362: wc->wc_blkcount = 0;
1.78.2.1! pgoyette 2363: while (wd && (wc->wc_blkcount < wl->wl_brperjblock)) {
1.2 simonb 2364: wc->wc_blocks[wc->wc_blkcount].wc_daddr =
1.78.2.1! pgoyette 2365: wd->wd_blkno;
1.2 simonb 2366: wc->wc_blocks[wc->wc_blkcount].wc_dlen =
1.78.2.1! pgoyette 2367: wd->wd_len;
1.2 simonb 2368: wc->wc_blkcount++;
1.78.2.1! pgoyette 2369:
! 2370: wd = SIMPLEQ_NEXT(wd, wd_entries);
1.2 simonb 2371: }
2372: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2373: ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2374: wc->wc_len, (intmax_t)off));
2375: error = wapbl_circ_write(wl, wc, blocklen, &off);
2376: if (error)
2377: return error;
1.78.2.1! pgoyette 2378:
! 2379: /* free all successfully written deallocs */
! 2380: lwd = wd;
! 2381: while ((wd = SIMPLEQ_FIRST(&wl->wl_dealloclist)) != NULL) {
! 2382: if (wd == lwd)
! 2383: break;
! 2384: SIMPLEQ_REMOVE_HEAD(&wl->wl_dealloclist, wd_entries);
! 2385: pool_put(&wapbl_dealloc_pool, wd);
! 2386: wl->wl_dealloccnt--;
! 2387: }
1.2 simonb 2388: }
2389: *offp = off;
2390: return 0;
2391: }
2392:
1.71 riastrad 2393: /*
2394: * wapbl_write_inodes(wl, offp)
2395: *
2396: * Write all pending inode allocations in the current transaction
2397: * from wapbl_register_inode to the log on disk, adding to the
2398: * circular queue's head at byte offset *offp and returning the
2399: * new head's byte offset in *offp.
2400: */
1.2 simonb 2401: static int
2402: wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2403: {
2404: struct wapbl_wc_inodelist *wc =
2405: (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2406: int i;
1.14 joerg 2407: int blocklen = 1 << wl->wl_log_dev_bshift;
1.2 simonb 2408: off_t off = *offp;
2409: int error;
2410:
2411: struct wapbl_ino_head *wih;
2412: struct wapbl_ino *wi;
2413: int iph;
2414:
2415: iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2416: sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2417:
2418: i = 0;
2419: wih = &wl->wl_inohash[0];
2420: wi = 0;
2421: do {
2422: wc->wc_type = WAPBL_WC_INODES;
2423: wc->wc_len = blocklen;
2424: wc->wc_inocnt = 0;
2425: wc->wc_clear = (i == 0);
2426: while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2427: while (!wi) {
2428: KASSERT((wih - &wl->wl_inohash[0])
2429: <= wl->wl_inohashmask);
2430: wi = LIST_FIRST(wih++);
2431: }
2432: wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2433: wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2434: wc->wc_inocnt++;
2435: i++;
2436: wi = LIST_NEXT(wi, wi_hash);
2437: }
2438: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2439: ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2440: wc->wc_len, (intmax_t)off));
2441: error = wapbl_circ_write(wl, wc, blocklen, &off);
2442: if (error)
2443: return error;
2444: } while (i < wl->wl_inohashcnt);
2445:
2446: *offp = off;
2447: return 0;
2448: }
2449:
2450: #endif /* _KERNEL */
2451:
2452: /****************************************************************/
2453:
2454: struct wapbl_blk {
2455: LIST_ENTRY(wapbl_blk) wb_hash;
2456: daddr_t wb_blk;
2457: off_t wb_off; /* Offset of this block in the log */
2458: };
2459: #define WAPBL_BLKPOOL_MIN 83
2460:
2461: static void
2462: wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2463: {
2464: if (size < WAPBL_BLKPOOL_MIN)
2465: size = WAPBL_BLKPOOL_MIN;
2466: KASSERT(wr->wr_blkhash == 0);
2467: #ifdef _KERNEL
2468: wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2469: #else /* ! _KERNEL */
2470: /* Manually implement hashinit */
2471: {
1.25 lukem 2472: unsigned long i, hashsize;
1.2 simonb 2473: for (hashsize = 1; hashsize < size; hashsize <<= 1)
2474: continue;
1.51 para 2475: wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
1.37 drochner 2476: for (i = 0; i < hashsize; i++)
1.2 simonb 2477: LIST_INIT(&wr->wr_blkhash[i]);
2478: wr->wr_blkhashmask = hashsize - 1;
2479: }
2480: #endif /* ! _KERNEL */
2481: }
2482:
2483: static void
2484: wapbl_blkhash_free(struct wapbl_replay *wr)
2485: {
2486: KASSERT(wr->wr_blkhashcnt == 0);
2487: #ifdef _KERNEL
2488: hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2489: #else /* ! _KERNEL */
1.18 yamt 2490: wapbl_free(wr->wr_blkhash,
2491: (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
1.2 simonb 2492: #endif /* ! _KERNEL */
2493: }
2494:
2495: static struct wapbl_blk *
2496: wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2497: {
2498: struct wapbl_blk_head *wbh;
2499: struct wapbl_blk *wb;
2500: wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2501: LIST_FOREACH(wb, wbh, wb_hash) {
2502: if (blk == wb->wb_blk)
2503: return wb;
2504: }
2505: return 0;
2506: }
2507:
2508: static void
2509: wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2510: {
2511: struct wapbl_blk_head *wbh;
2512: struct wapbl_blk *wb;
2513: wb = wapbl_blkhash_get(wr, blk);
2514: if (wb) {
2515: KASSERT(wb->wb_blk == blk);
2516: wb->wb_off = off;
2517: } else {
1.51 para 2518: wb = wapbl_alloc(sizeof(*wb));
1.2 simonb 2519: wb->wb_blk = blk;
2520: wb->wb_off = off;
2521: wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2522: LIST_INSERT_HEAD(wbh, wb, wb_hash);
2523: wr->wr_blkhashcnt++;
2524: }
2525: }
2526:
2527: static void
2528: wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2529: {
2530: struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2531: if (wb) {
2532: KASSERT(wr->wr_blkhashcnt > 0);
2533: wr->wr_blkhashcnt--;
2534: LIST_REMOVE(wb, wb_hash);
1.18 yamt 2535: wapbl_free(wb, sizeof(*wb));
1.2 simonb 2536: }
2537: }
2538:
2539: static void
2540: wapbl_blkhash_clear(struct wapbl_replay *wr)
2541: {
1.25 lukem 2542: unsigned long i;
1.2 simonb 2543: for (i = 0; i <= wr->wr_blkhashmask; i++) {
2544: struct wapbl_blk *wb;
2545:
2546: while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2547: KASSERT(wr->wr_blkhashcnt > 0);
2548: wr->wr_blkhashcnt--;
2549: LIST_REMOVE(wb, wb_hash);
1.18 yamt 2550: wapbl_free(wb, sizeof(*wb));
1.2 simonb 2551: }
2552: }
2553: KASSERT(wr->wr_blkhashcnt == 0);
2554: }
2555:
2556: /****************************************************************/
2557:
1.71 riastrad 2558: /*
2559: * wapbl_circ_read(wr, data, len, offp)
2560: *
2561: * Read len bytes into data from the circular queue of wr,
2562: * starting at the linear byte offset *offp, and returning the new
2563: * linear byte offset in *offp.
2564: *
2565: * If the starting linear byte offset precedes wr->wr_circ_off,
2566: * the read instead begins at wr->wr_circ_off. XXX WTF? This
2567: * should be a KASSERT, not a conditional.
2568: */
1.2 simonb 2569: static int
2570: wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2571: {
2572: size_t slen;
2573: off_t off = *offp;
2574: int error;
1.34 mlelstv 2575: daddr_t pbn;
1.2 simonb 2576:
1.14 joerg 2577: KASSERT(((len >> wr->wr_log_dev_bshift) <<
2578: wr->wr_log_dev_bshift) == len);
1.34 mlelstv 2579:
1.14 joerg 2580: if (off < wr->wr_circ_off)
2581: off = wr->wr_circ_off;
2582: slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2 simonb 2583: if (slen < len) {
1.34 mlelstv 2584: pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2585: #ifdef _KERNEL
2586: pbn = btodb(pbn << wr->wr_log_dev_bshift);
2587: #endif
2588: error = wapbl_read(data, slen, wr->wr_devvp, pbn);
1.2 simonb 2589: if (error)
2590: return error;
2591: data = (uint8_t *)data + slen;
2592: len -= slen;
1.14 joerg 2593: off = wr->wr_circ_off;
1.2 simonb 2594: }
1.34 mlelstv 2595: pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2596: #ifdef _KERNEL
2597: pbn = btodb(pbn << wr->wr_log_dev_bshift);
2598: #endif
2599: error = wapbl_read(data, len, wr->wr_devvp, pbn);
1.2 simonb 2600: if (error)
2601: return error;
2602: off += len;
1.14 joerg 2603: if (off >= wr->wr_circ_off + wr->wr_circ_size)
2604: off = wr->wr_circ_off;
1.2 simonb 2605: *offp = off;
2606: return 0;
2607: }
2608:
1.71 riastrad 2609: /*
2610: * wapbl_circ_advance(wr, len, offp)
2611: *
2612: * Compute the linear byte offset of the circular queue of wr that
2613: * is len bytes past *offp, and store it in *offp.
2614: *
2615: * This is as if wapbl_circ_read, but without actually reading
2616: * anything.
2617: *
2618: * If the starting linear byte offset precedes wr->wr_circ_off, it
2619: * is taken to be wr->wr_circ_off instead. XXX WTF? This should
2620: * be a KASSERT, not a conditional.
2621: */
1.2 simonb 2622: static void
2623: wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2624: {
2625: size_t slen;
2626: off_t off = *offp;
2627:
1.14 joerg 2628: KASSERT(((len >> wr->wr_log_dev_bshift) <<
2629: wr->wr_log_dev_bshift) == len);
1.2 simonb 2630:
1.14 joerg 2631: if (off < wr->wr_circ_off)
2632: off = wr->wr_circ_off;
2633: slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2 simonb 2634: if (slen < len) {
2635: len -= slen;
1.14 joerg 2636: off = wr->wr_circ_off;
1.2 simonb 2637: }
2638: off += len;
1.14 joerg 2639: if (off >= wr->wr_circ_off + wr->wr_circ_size)
2640: off = wr->wr_circ_off;
1.2 simonb 2641: *offp = off;
2642: }
2643:
2644: /****************************************************************/
2645:
2646: int
2647: wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2648: daddr_t off, size_t count, size_t blksize)
2649: {
2650: struct wapbl_replay *wr;
2651: int error;
2652: struct vnode *devvp;
2653: daddr_t logpbn;
2654: uint8_t *scratch;
2655: struct wapbl_wc_header *wch;
2656: struct wapbl_wc_header *wch2;
2657: /* Use this until we read the actual log header */
1.31 mlelstv 2658: int log_dev_bshift = ilog2(blksize);
1.2 simonb 2659: size_t used;
1.34 mlelstv 2660: daddr_t pbn;
1.2 simonb 2661:
2662: WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2663: ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2664: vp, off, count, blksize));
2665:
2666: if (off < 0)
2667: return EINVAL;
2668:
2669: if (blksize < DEV_BSIZE)
2670: return EINVAL;
2671: if (blksize % DEV_BSIZE)
2672: return EINVAL;
2673:
2674: #ifdef _KERNEL
2675: #if 0
2676: /* XXX vp->v_size isn't reliably set for VBLK devices,
2677: * especially root. However, we might still want to verify
2678: * that the full load is readable */
2679: if ((off + count) * blksize > vp->v_size)
2680: return EINVAL;
2681: #endif
2682: if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2683: return error;
2684: }
2685: #else /* ! _KERNEL */
2686: devvp = vp;
2687: logpbn = off;
2688: #endif /* ! _KERNEL */
2689:
1.51 para 2690: scratch = wapbl_alloc(MAXBSIZE);
1.2 simonb 2691:
1.34 mlelstv 2692: pbn = logpbn;
2693: #ifdef _KERNEL
2694: pbn = btodb(pbn << log_dev_bshift);
2695: #endif
2696: error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
1.2 simonb 2697: if (error)
2698: goto errout;
2699:
2700: wch = (struct wapbl_wc_header *)scratch;
2701: wch2 =
2702: (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2703: /* XXX verify checksums and magic numbers */
2704: if (wch->wc_type != WAPBL_WC_HEADER) {
2705: printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2706: error = EFTYPE;
2707: goto errout;
2708: }
2709:
2710: if (wch2->wc_generation > wch->wc_generation)
2711: wch = wch2;
2712:
2713: wr = wapbl_calloc(1, sizeof(*wr));
2714:
2715: wr->wr_logvp = vp;
2716: wr->wr_devvp = devvp;
2717: wr->wr_logpbn = logpbn;
2718:
2719: wr->wr_scratch = scratch;
2720:
1.14 joerg 2721: wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2722: wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2723: wr->wr_circ_off = wch->wc_circ_off;
2724: wr->wr_circ_size = wch->wc_circ_size;
2725: wr->wr_generation = wch->wc_generation;
1.2 simonb 2726:
2727: used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2728:
2729: WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2730: ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2731: " len=%"PRId64" used=%zu\n",
2732: wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2733: wch->wc_circ_size, used));
2734:
2735: wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
1.11 joerg 2736:
1.14 joerg 2737: error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
1.2 simonb 2738: if (error) {
2739: wapbl_replay_stop(wr);
2740: wapbl_replay_free(wr);
2741: return error;
2742: }
2743:
2744: *wrp = wr;
2745: return 0;
2746:
2747: errout:
1.18 yamt 2748: wapbl_free(scratch, MAXBSIZE);
1.2 simonb 2749: return error;
2750: }
2751:
2752: void
2753: wapbl_replay_stop(struct wapbl_replay *wr)
2754: {
2755:
1.4 joerg 2756: if (!wapbl_replay_isopen(wr))
2757: return;
2758:
1.2 simonb 2759: WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2760:
1.18 yamt 2761: wapbl_free(wr->wr_scratch, MAXBSIZE);
2762: wr->wr_scratch = NULL;
1.2 simonb 2763:
1.18 yamt 2764: wr->wr_logvp = NULL;
1.2 simonb 2765:
2766: wapbl_blkhash_clear(wr);
2767: wapbl_blkhash_free(wr);
2768: }
2769:
2770: void
2771: wapbl_replay_free(struct wapbl_replay *wr)
2772: {
2773:
2774: KDASSERT(!wapbl_replay_isopen(wr));
2775:
2776: if (wr->wr_inodes)
1.18 yamt 2777: wapbl_free(wr->wr_inodes,
2778: wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2779: wapbl_free(wr, sizeof(*wr));
1.2 simonb 2780: }
2781:
#ifdef _KERNEL
/*
 * wapbl_replay_isopen1(wr)
 *
 *	Function wrapper that returns the result of
 *	wapbl_replay_isopen(wr).
 */
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{
	const int isopen = wapbl_replay_isopen(wr);

	return isopen;
}
#endif
1.2 simonb 2790:
1.62 mlelstv 2791: /*
2792: * calculate the disk address for the i'th block in the wc_blockblist
2793: * offset by j blocks of size blen.
2794: *
2795: * wc_daddr is always a kernel disk address in DEV_BSIZE units that
2796: * was written to the journal.
2797: *
2798: * The kernel needs that address plus the offset in DEV_BSIZE units.
2799: *
2800: * Userland needs that address plus the offset in blen units.
2801: *
2802: */
2803: static daddr_t
2804: wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
2805: {
2806: daddr_t pbn;
2807:
2808: #ifdef _KERNEL
2809: pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
2810: #else
2811: pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
2812: #endif
2813:
2814: return pbn;
2815: }
2816:
1.10 joerg 2817: static void
2818: wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2819: {
2820: struct wapbl_wc_blocklist *wc =
2821: (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.14 joerg 2822: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10 joerg 2823: int i, j, n;
2824:
2825: for (i = 0; i < wc->wc_blkcount; i++) {
2826: /*
2827: * Enter each physical block into the hashtable independently.
2828: */
1.14 joerg 2829: n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10 joerg 2830: for (j = 0; j < n; j++) {
1.62 mlelstv 2831: wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen),
1.10 joerg 2832: *offp);
2833: wapbl_circ_advance(wr, fsblklen, offp);
2834: }
2835: }
2836: }
2837:
2838: static void
2839: wapbl_replay_process_revocations(struct wapbl_replay *wr)
2840: {
2841: struct wapbl_wc_blocklist *wc =
2842: (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.34 mlelstv 2843: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10 joerg 2844: int i, j, n;
2845:
2846: for (i = 0; i < wc->wc_blkcount; i++) {
2847: /*
2848: * Remove any blocks found from the hashtable.
2849: */
1.14 joerg 2850: n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10 joerg 2851: for (j = 0; j < n; j++)
1.62 mlelstv 2852: wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen));
1.10 joerg 2853: }
2854: }
2855:
2856: static void
2857: wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2858: {
2859: struct wapbl_wc_inodelist *wc =
2860: (struct wapbl_wc_inodelist *)wr->wr_scratch;
1.18 yamt 2861: void *new_inodes;
2862: const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2863:
2864: KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2865:
1.10 joerg 2866: /*
2867: * Keep track of where we found this so location won't be
2868: * overwritten.
2869: */
2870: if (wc->wc_clear) {
2871: wr->wr_inodestail = oldoff;
2872: wr->wr_inodescnt = 0;
1.12 joerg 2873: if (wr->wr_inodes != NULL) {
1.18 yamt 2874: wapbl_free(wr->wr_inodes, oldsize);
1.12 joerg 2875: wr->wr_inodes = NULL;
2876: }
1.10 joerg 2877: }
2878: wr->wr_inodeshead = newoff;
2879: if (wc->wc_inocnt == 0)
2880: return;
2881:
1.51 para 2882: new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
1.18 yamt 2883: sizeof(wr->wr_inodes[0]));
2884: if (wr->wr_inodes != NULL) {
2885: memcpy(new_inodes, wr->wr_inodes, oldsize);
2886: wapbl_free(wr->wr_inodes, oldsize);
2887: }
2888: wr->wr_inodes = new_inodes;
1.10 joerg 2889: memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
1.18 yamt 2890: wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
1.10 joerg 2891: wr->wr_inodescnt += wc->wc_inocnt;
2892: }
2893:
1.2 simonb 2894: static int
1.14 joerg 2895: wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
1.2 simonb 2896: {
2897: off_t off;
2898: int error;
2899:
1.14 joerg 2900: int logblklen = 1 << wr->wr_log_dev_bshift;
1.2 simonb 2901:
2902: wapbl_blkhash_clear(wr);
2903:
1.14 joerg 2904: off = tail;
2905: while (off != head) {
1.2 simonb 2906: struct wapbl_wc_null *wcn;
2907: off_t saveoff = off;
2908: error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2909: if (error)
2910: goto errout;
2911: wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2912: switch (wcn->wc_type) {
2913: case WAPBL_WC_BLOCKS:
1.10 joerg 2914: wapbl_replay_process_blocks(wr, &off);
1.2 simonb 2915: break;
2916:
2917: case WAPBL_WC_REVOCATIONS:
1.10 joerg 2918: wapbl_replay_process_revocations(wr);
1.2 simonb 2919: break;
2920:
2921: case WAPBL_WC_INODES:
1.10 joerg 2922: wapbl_replay_process_inodes(wr, saveoff, off);
1.2 simonb 2923: break;
1.10 joerg 2924:
1.2 simonb 2925: default:
2926: printf("Unrecognized wapbl type: 0x%08x\n",
2927: wcn->wc_type);
2928: error = EFTYPE;
2929: goto errout;
2930: }
2931: wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2932: if (off != saveoff) {
2933: printf("wapbl_replay: corrupted records\n");
2934: error = EFTYPE;
2935: goto errout;
2936: }
2937: }
2938: return 0;
2939:
2940: errout:
2941: wapbl_blkhash_clear(wr);
2942: return error;
2943: }
2944:
1.13 joerg 2945: #if 0
/*
 * wapbl_replay_verify: (compiled out) compare logged blocks against
 * the copies on the file system device.
 *
 * Walks the log records between wch->wc_tail and wch->wc_head.  For
 * every block of a WAPBL_WC_BLOCKS record whose hash entry points at
 * the current log offset, the logged data is read into scratch1, the
 * on-disk data is read from fsdevvp into scratch2, and the two are
 * compared; differences are printed and counted.  Revocation and
 * inode records are skipped.
 *
 * Returns the first I/O error encountered, EFTYPE if no I/O error
 * occurred but at least one block differed, and 0 otherwise.
 */
1.2 simonb 2946: int
2947: wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2948: {
2949: off_t off;
2950: int mismatchcnt = 0;
1.14 joerg 2951: int logblklen = 1 << wr->wr_log_dev_bshift;
2952: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.51 para 2953: void *scratch1 = wapbl_alloc(MAXBSIZE);
2954: void *scratch2 = wapbl_alloc(MAXBSIZE);
1.2 simonb 2955: int error = 0;
2956:
2957: KDASSERT(wapbl_replay_isopen(wr));
2958:
/*
 * NOTE(review): `wch' is not declared anywhere in this function, so
 * this code would not compile if the #if 0 were removed -- confirm
 * where head/tail and the fs block shift should come from.
 */
2959: off = wch->wc_tail;
2960: while (off != wch->wc_head) {
2961: struct wapbl_wc_null *wcn;
2962: #ifdef DEBUG
2963: off_t saveoff = off;
2964: #endif
2965: error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2966: if (error)
2967: goto out;
2968: wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2969: switch (wcn->wc_type) {
2970: case WAPBL_WC_BLOCKS:
2971: {
2972: struct wapbl_wc_blocklist *wc =
2973: (struct wapbl_wc_blocklist *)wr->wr_scratch;
2974: int i;
2975: for (i = 0; i < wc->wc_blkcount; i++) {
2976: int foundcnt = 0;
2977: int dirtycnt = 0;
2978: int j, n;
2979: /*
2980: * Check each physical block into the
2981: * hashtable independently
2982: */
2983: n = wc->wc_blocks[i].wc_dlen >>
2984: wch->wc_fs_dev_bshift;
2985: for (j = 0; j < n; j++) {
2986: struct wapbl_blk *wb =
2987: wapbl_blkhash_get(wr,
1.62 mlelstv 2988: wapbl_block_daddr(wc, i, j, fsblklen));
/*
 * Compare only when the hash entry refers to this very copy in the
 * log; otherwise just step over the block's data.
 */
1.2 simonb 2989: if (wb && (wb->wb_off == off)) {
2990: foundcnt++;
2991: error =
2992: wapbl_circ_read(wr,
2993: scratch1, fsblklen,
2994: &off);
2995: if (error)
2996: goto out;
2997: error =
2998: wapbl_read(scratch2,
2999: fsblklen, fsdevvp,
3000: wb->wb_blk);
3001: if (error)
3002: goto out;
3003: if (memcmp(scratch1,
3004: scratch2,
3005: fsblklen)) {
3006: printf(
3007: "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
3008: wb->wb_blk, (intmax_t)off);
3009: dirtycnt++;
3010: mismatchcnt++;
3011: }
3012: } else {
3013: wapbl_circ_advance(wr,
3014: fsblklen, &off);
3015: }
3016: }
/*
 * NOTE(review): the disabled section below uses `saveoff', which is
 * only declared when DEBUG is defined.
 */
3017: #if 0
3018: /*
3019: * If all of the blocks in an entry
3020: * are clean, then remove all of its
3021: * blocks from the hashtable since they
3022: * never will need replay.
3023: */
3024: if ((foundcnt != 0) &&
3025: (dirtycnt == 0)) {
3026: off = saveoff;
3027: wapbl_circ_advance(wr,
3028: logblklen, &off);
3029: for (j = 0; j < n; j++) {
3030: struct wapbl_blk *wb =
3031: wapbl_blkhash_get(wr,
1.62 mlelstv 3032: wapbl_block_daddr(wc, i, j, fsblklen));
1.2 simonb 3033: if (wb &&
3034: (wb->wb_off == off)) {
3035: wapbl_blkhash_rem(wr, wb->wb_blk);
3036: }
3037: wapbl_circ_advance(wr,
3038: fsblklen, &off);
3039: }
3040: }
3041: #endif
3042: }
3043: }
3044: break;
3045: case WAPBL_WC_REVOCATIONS:
3046: case WAPBL_WC_INODES:
3047: break;
3048: default:
3049: KASSERT(0);
3050: }
/* Debug-only check that the record consumed exactly wc_len bytes. */
3051: #ifdef DEBUG
3052: wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3053: KASSERT(off == saveoff);
3054: #endif
3055: }
3056: out:
1.18 yamt 3057: wapbl_free(scratch1, MAXBSIZE);
3058: wapbl_free(scratch2, MAXBSIZE);
/* An I/O error takes precedence; otherwise mismatches become EFTYPE. */
1.2 simonb 3059: if (!error && mismatchcnt)
3060: error = EFTYPE;
3061: return error;
3062: }
3063: #endif
3064:
3065: int
3066: wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
3067: {
1.9 joerg 3068: struct wapbl_blk *wb;
3069: size_t i;
1.2 simonb 3070: off_t off;
1.9 joerg 3071: void *scratch;
1.2 simonb 3072: int error = 0;
1.14 joerg 3073: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2 simonb 3074:
3075: KDASSERT(wapbl_replay_isopen(wr));
3076:
1.51 para 3077: scratch = wapbl_alloc(MAXBSIZE);
1.2 simonb 3078:
1.37 drochner 3079: for (i = 0; i <= wr->wr_blkhashmask; ++i) {
1.9 joerg 3080: LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
3081: off = wb->wb_off;
3082: error = wapbl_circ_read(wr, scratch, fsblklen, &off);
3083: if (error)
3084: break;
3085: error = wapbl_write(scratch, fsblklen, fsdevvp,
3086: wb->wb_blk);
3087: if (error)
3088: break;
1.2 simonb 3089: }
3090: }
1.9 joerg 3091:
1.18 yamt 3092: wapbl_free(scratch, MAXBSIZE);
1.2 simonb 3093: return error;
3094: }
3095:
3096: int
1.6 joerg 3097: wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
3098: {
1.14 joerg 3099: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.6 joerg 3100:
3101: KDASSERT(wapbl_replay_isopen(wr));
3102: KASSERT((len % fsblklen) == 0);
3103:
3104: while (len != 0) {
3105: struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3106: if (wb)
3107: return 1;
3108: len -= fsblklen;
3109: }
3110: return 0;
3111: }
3112:
3113: int
1.2 simonb 3114: wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
3115: {
1.14 joerg 3116: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2 simonb 3117:
3118: KDASSERT(wapbl_replay_isopen(wr));
3119:
3120: KASSERT((len % fsblklen) == 0);
3121:
3122: while (len != 0) {
3123: struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3124: if (wb) {
3125: off_t off = wb->wb_off;
3126: int error;
3127: error = wapbl_circ_read(wr, data, fsblklen, &off);
3128: if (error)
3129: return error;
3130: }
3131: data = (uint8_t *)data + fsblklen;
3132: len -= fsblklen;
3133: blk++;
3134: }
3135: return 0;
3136: }
1.35 pooka 3137:
#ifdef _KERNEL

MODULE(MODULE_CLASS_VFS, wapbl, NULL);

/*
 * Module command handler for wapbl.
 *
 * MODULE_CMD_INIT calls wapbl_init() and reports success;
 * MODULE_CMD_FINI returns whatever wapbl_fini() returns; any other
 * command yields ENOTTY.  The arg parameter is not used.
 */
static int
wapbl_modcmd(modcmd_t cmd, void *arg)
{
	int rv;

	if (cmd == MODULE_CMD_INIT) {
		wapbl_init();
		rv = 0;
	} else if (cmd == MODULE_CMD_FINI) {
		rv = wapbl_fini();
	} else {
		rv = ENOTTY;
	}

	return rv;
}
#endif /* _KERNEL */
CVSweb <webmaster@jp.NetBSD.org>