Annotation of src/sys/kern/vfs_wapbl.c, Revision 1.34
1.34 ! mlelstv 1: /* $NetBSD: vfs_wapbl.c,v 1.33 2010/02/27 12:04:19 mlelstv Exp $ */
1.2 simonb 2:
3: /*-
1.23 ad 4: * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
1.2 simonb 5: * All rights reserved.
6: *
7: * This code is derived from software contributed to The NetBSD Foundation
8: * by Wasabi Systems, Inc.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
18: *
19: * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29: * POSSIBILITY OF SUCH DAMAGE.
30: */
31:
32: /*
33: * This implements file system independent write ahead filesystem logging.
34: */
1.4 joerg 35:
36: #define WAPBL_INTERNAL
37:
1.2 simonb 38: #include <sys/cdefs.h>
1.34 ! mlelstv 39: __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.33 2010/02/27 12:04:19 mlelstv Exp $");
1.2 simonb 40:
41: #include <sys/param.h>
1.31 mlelstv 42: #include <sys/bitops.h>
1.2 simonb 43:
44: #ifdef _KERNEL
45: #include <sys/param.h>
46: #include <sys/namei.h>
47: #include <sys/proc.h>
48: #include <sys/uio.h>
49: #include <sys/vnode.h>
50: #include <sys/file.h>
1.19 yamt 51: #include <sys/malloc.h>
1.2 simonb 52: #include <sys/resourcevar.h>
53: #include <sys/conf.h>
54: #include <sys/mount.h>
55: #include <sys/kernel.h>
56: #include <sys/kauth.h>
57: #include <sys/mutex.h>
58: #include <sys/atomic.h>
59: #include <sys/wapbl.h>
1.16 joerg 60: #include <sys/wapbl_replay.h>
1.2 simonb 61:
62: #include <miscfs/specfs/specdev.h>
63:
1.19 yamt 64: #if 0 /* notyet */
1.18 yamt 65: #define wapbl_malloc(s) kmem_alloc((s), KM_SLEEP)
66: #define wapbl_free(a, s) kmem_free((a), (s))
67: #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
1.19 yamt 68: #else
69: MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
70: #define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
71: #define wapbl_free(a, s) free((a), M_WAPBL)
72: #define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
73: #endif
1.2 simonb 74:
75: #else /* !_KERNEL */
76: #include <assert.h>
77: #include <errno.h>
78: #include <stdio.h>
79: #include <stdbool.h>
80: #include <stdlib.h>
81: #include <string.h>
82:
83: #include <sys/time.h>
84: #include <sys/wapbl.h>
1.16 joerg 85: #include <sys/wapbl_replay.h>
1.2 simonb 86:
87: #define KDASSERT(x) assert(x)
88: #define KASSERT(x) assert(x)
89: #define wapbl_malloc(s) malloc(s)
1.18 yamt 90: #define wapbl_free(a, s) free(a)
1.2 simonb 91: #define wapbl_calloc(n, s) calloc((n), (s))
92:
93: #endif /* !_KERNEL */
94:
95: /*
96: * INTERNAL DATA STRUCTURES
97: */
98:
99: /*
100: * This structure holds per-mount log information.
101: *
102: * Legend: a = atomic access only
103: * r = read-only after init
104: * l = rwlock held
105: * m = mutex held
106: * u = unlocked access ok
107: * b = bufcache_lock held
108: */
109: struct wapbl {
110: struct vnode *wl_logvp; /* r: log here */
111: struct vnode *wl_devvp; /* r: log on this device */
112: struct mount *wl_mount; /* r: mountpoint wl is associated with */
113: daddr_t wl_logpbn; /* r: Physical block number of start of log */
114: int wl_log_dev_bshift; /* r: logarithm of device block size of log
115: device */
116: int wl_fs_dev_bshift; /* r: logarithm of device block size of
117: filesystem device */
118:
1.3 yamt 119: unsigned wl_lock_count; /* m: Count of transactions in progress */
1.2 simonb 120:
121: size_t wl_circ_size; /* r: Number of bytes in buffer of log */
122: size_t wl_circ_off; /* r: Number of bytes reserved at start */
123:
124: size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
125: size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
126:
127: off_t wl_head; /* l: Byte offset of log head */
128: off_t wl_tail; /* l: Byte offset of log tail */
129: /*
130: * head == tail == 0 means log is empty
131: * head == tail != 0 means log is full
132: * see assertions in wapbl_advance() for other boundary conditions.
133: * only truncate moves the tail, except when flush sets it to
134: * wl_header_size only flush moves the head, except when truncate
135: * sets it to 0.
136: */
137:
138: struct wapbl_wc_header *wl_wc_header; /* l */
139: void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */
140:
141: kmutex_t wl_mtx; /* u: short-term lock */
142: krwlock_t wl_rwlock; /* u: File system transaction lock */
143:
144: /*
145: * Must be held while accessing
146: * wl_count or wl_bufs or head or tail
147: */
148:
149: /*
150: * Callback called from within the flush routine to flush any extra
151: * bits. Note that flush may be skipped without calling this if
152: * there are no outstanding buffers in the transaction.
153: */
1.5 joerg 154: #if _KERNEL
1.2 simonb 155: wapbl_flush_fn_t wl_flush; /* r */
156: wapbl_flush_fn_t wl_flush_abort;/* r */
1.5 joerg 157: #endif
1.2 simonb 158:
159: size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
160: size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
161: size_t wl_bcount; /* m: Total bcount of wl_bufs */
162:
163: LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
164:
165: kcondvar_t wl_reclaimable_cv; /* m (obviously) */
166: size_t wl_reclaimable_bytes; /* m: Amount of space available for
167: reclamation by truncate */
168: int wl_error_count; /* m: # of wl_entries with errors */
169: size_t wl_reserved_bytes; /* never truncate log smaller than this */
170:
171: #ifdef WAPBL_DEBUG_BUFBYTES
172: size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
173: #endif
174:
175: daddr_t *wl_deallocblks;/* l: address of block */
1.20 yamt 176: int *wl_dealloclens; /* l: size of block */
1.2 simonb 177: int wl_dealloccnt; /* l: total count */
178: int wl_dealloclim; /* l: max count */
179:
180: /* hashtable of inode numbers for allocated but unlinked inodes */
181: /* synch ??? */
182: LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
183: u_long wl_inohashmask;
184: int wl_inohashcnt;
185:
186: SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
187: accounting */
188: };
189:
190: #ifdef WAPBL_DEBUG_PRINT
191: int wapbl_debug_print = WAPBL_DEBUG_PRINT;
192: #endif
193:
194: /****************************************************************/
195: #ifdef _KERNEL
196:
197: #ifdef WAPBL_DEBUG
198: struct wapbl *wapbl_debug_wl;
199: #endif
200:
201: static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
202: static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
203: static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
204: static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
205: #endif /* _KERNEL */
206:
1.14 joerg 207: static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
1.2 simonb 208:
1.30 uebayasi 209: static inline size_t wapbl_space_free(size_t avail, off_t head,
1.2 simonb 210: off_t tail);
1.30 uebayasi 211: static inline size_t wapbl_space_used(size_t avail, off_t head,
1.2 simonb 212: off_t tail);
213:
214: #ifdef _KERNEL
215:
216: #define WAPBL_INODETRK_SIZE 83
217: static int wapbl_ino_pool_refcount;
218: static struct pool wapbl_ino_pool;
219: struct wapbl_ino {
220: LIST_ENTRY(wapbl_ino) wi_hash;
221: ino_t wi_ino;
222: mode_t wi_mode;
223: };
224:
225: static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
226: static void wapbl_inodetrk_free(struct wapbl *wl);
227: static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
228:
229: static size_t wapbl_transaction_len(struct wapbl *wl);
1.30 uebayasi 230: static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
1.2 simonb 231:
1.13 joerg 232: #if 0
1.4 joerg 233: int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
234: #endif
235:
236: static int wapbl_replay_isopen1(struct wapbl_replay *);
237:
/*
 * Debugging aid: when set, the log is truncated only when that is
 * actually necessary.
 */
int wapbl_lazy_truncate = 0;
243:
244: struct wapbl_ops wapbl_ops = {
245: .wo_wapbl_discard = wapbl_discard,
246: .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
1.6 joerg 247: .wo_wapbl_replay_can_read = wapbl_replay_can_read,
1.2 simonb 248: .wo_wapbl_replay_read = wapbl_replay_read,
249: .wo_wapbl_add_buf = wapbl_add_buf,
250: .wo_wapbl_remove_buf = wapbl_remove_buf,
251: .wo_wapbl_resize_buf = wapbl_resize_buf,
252: .wo_wapbl_begin = wapbl_begin,
253: .wo_wapbl_end = wapbl_end,
254: .wo_wapbl_junlock_assert= wapbl_junlock_assert,
255:
256: /* XXX: the following is only used to say "this is a wapbl buf" */
257: .wo_wapbl_biodone = wapbl_biodone,
258: };
259:
260: void
1.24 cegger 261: wapbl_init(void)
1.2 simonb 262: {
263:
1.22 yamt 264: malloc_type_attach(M_WAPBL);
1.2 simonb 265: }
266:
1.21 yamt 267: static int
1.15 joerg 268: wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
269: {
270: int error, i;
271:
272: WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
273: ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
274:
275: /*
276: * Its only valid to reuse the replay log if its
277: * the same as the new log we just opened.
278: */
279: KDASSERT(!wapbl_replay_isopen(wr));
280: KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
281: KASSERT(wl->wl_logpbn == wr->wr_logpbn);
282: KASSERT(wl->wl_circ_size == wr->wr_circ_size);
283: KASSERT(wl->wl_circ_off == wr->wr_circ_off);
284: KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
285: KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
286:
287: wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
288:
289: for (i = 0; i < wr->wr_inodescnt; i++)
290: wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
291: wr->wr_inodes[i].wr_imode);
292:
293: /* Make sure new transaction won't overwrite old inodes list */
294: KDASSERT(wapbl_transaction_len(wl) <=
295: wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
296: wr->wr_inodestail));
297:
298: wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
299: wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
300: wapbl_transaction_len(wl);
301:
302: error = wapbl_write_inodes(wl, &wl->wl_head);
303: if (error)
304: return error;
305:
306: KASSERT(wl->wl_head != wl->wl_tail);
307: KASSERT(wl->wl_head != 0);
308:
309: return 0;
310: }
311:
1.2 simonb 312: int
313: wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
314: daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
315: wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
316: {
317: struct wapbl *wl;
318: struct vnode *devvp;
319: daddr_t logpbn;
320: int error;
1.31 mlelstv 321: int log_dev_bshift = ilog2(blksize);
1.32 mlelstv 322: int fs_dev_bshift = log_dev_bshift;
1.2 simonb 323: int run;
324:
325: WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
326: " count=%zu blksize=%zu\n", vp, off, count, blksize));
327:
328: if (log_dev_bshift > fs_dev_bshift) {
329: WAPBL_PRINTF(WAPBL_PRINT_OPEN,
330: ("wapbl: log device's block size cannot be larger "
331: "than filesystem's\n"));
332: /*
333: * Not currently implemented, although it could be if
334: * needed someday.
335: */
336: return ENOSYS;
337: }
338:
339: if (off < 0)
340: return EINVAL;
341:
342: if (blksize < DEV_BSIZE)
343: return EINVAL;
344: if (blksize % DEV_BSIZE)
345: return EINVAL;
346:
347: /* XXXTODO: verify that the full load is writable */
348:
349: /*
350: * XXX check for minimum log size
351: * minimum is governed by minimum amount of space
352: * to complete a transaction. (probably truncate)
353: */
354: /* XXX for now pick something minimal */
355: if ((count * blksize) < MAXPHYS) {
356: return ENOSPC;
357: }
358:
359: if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
360: return error;
361: }
362:
363: wl = wapbl_calloc(1, sizeof(*wl));
364: rw_init(&wl->wl_rwlock);
365: mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
366: cv_init(&wl->wl_reclaimable_cv, "wapblrec");
367: LIST_INIT(&wl->wl_bufs);
368: SIMPLEQ_INIT(&wl->wl_entries);
369:
370: wl->wl_logvp = vp;
371: wl->wl_devvp = devvp;
372: wl->wl_mount = mp;
373: wl->wl_logpbn = logpbn;
374: wl->wl_log_dev_bshift = log_dev_bshift;
375: wl->wl_fs_dev_bshift = fs_dev_bshift;
376:
377: wl->wl_flush = flushfn;
378: wl->wl_flush_abort = flushabortfn;
379:
380: /* Reserve two log device blocks for the commit headers */
381: wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
1.34 ! mlelstv 382: wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
1.2 simonb 383: /* truncate the log usage to a multiple of log_dev_bshift */
384: wl->wl_circ_size >>= wl->wl_log_dev_bshift;
385: wl->wl_circ_size <<= wl->wl_log_dev_bshift;
386:
387: /*
388: * wl_bufbytes_max limits the size of the in memory transaction space.
389: * - Since buffers are allocated and accounted for in units of
390: * PAGE_SIZE it is required to be a multiple of PAGE_SIZE
391: * (i.e. 1<<PAGE_SHIFT)
392: * - Since the log device has to be written in units of
393: * 1<<wl_log_dev_bshift it is required to be a mulitple of
394: * 1<<wl_log_dev_bshift.
395: * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
396: * it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
397: * Therefore it must be multiple of the least common multiple of those
398: * three quantities. Fortunately, all of those quantities are
399: * guaranteed to be a power of two, and the least common multiple of
400: * a set of numbers which are all powers of two is simply the maximum
401: * of those numbers. Finally, the maximum logarithm of a power of two
402: * is the same as the log of the maximum power of two. So we can do
403: * the following operations to size wl_bufbytes_max:
404: */
405:
406: /* XXX fix actual number of pages reserved per filesystem. */
407: wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
408:
409: /* Round wl_bufbytes_max to the largest power of two constraint */
410: wl->wl_bufbytes_max >>= PAGE_SHIFT;
411: wl->wl_bufbytes_max <<= PAGE_SHIFT;
412: wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
413: wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
414: wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
415: wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
416:
417: /* XXX maybe use filesystem fragment size instead of 1024 */
418: /* XXX fix actual number of buffers reserved per filesystem. */
419: wl->wl_bufcount_max = (nbuf / 2) * 1024;
420:
421: /* XXX tie this into resource estimation */
422: wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);
423:
424: wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
425: wl->wl_dealloclim);
426: wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
427: wl->wl_dealloclim);
428:
429: wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
430:
431: /* Initialize the commit header */
432: {
433: struct wapbl_wc_header *wc;
1.14 joerg 434: size_t len = 1 << wl->wl_log_dev_bshift;
1.2 simonb 435: wc = wapbl_calloc(1, len);
436: wc->wc_type = WAPBL_WC_HEADER;
437: wc->wc_len = len;
438: wc->wc_circ_off = wl->wl_circ_off;
439: wc->wc_circ_size = wl->wl_circ_size;
440: /* XXX wc->wc_fsid */
441: wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
442: wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
443: wl->wl_wc_header = wc;
444: wl->wl_wc_scratch = wapbl_malloc(len);
445: }
446:
447: /*
448: * if there was an existing set of unlinked but
449: * allocated inodes, preserve it in the new
450: * log.
451: */
452: if (wr && wr->wr_inodescnt) {
1.15 joerg 453: error = wapbl_start_flush_inodes(wl, wr);
1.2 simonb 454: if (error)
455: goto errout;
456: }
457:
458: error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
459: if (error) {
460: goto errout;
461: }
462:
463: *wlp = wl;
464: #if defined(WAPBL_DEBUG)
465: wapbl_debug_wl = wl;
466: #endif
467:
468: return 0;
469: errout:
470: wapbl_discard(wl);
1.18 yamt 471: wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
472: wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
473: wapbl_free(wl->wl_deallocblks,
474: sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
475: wapbl_free(wl->wl_dealloclens,
476: sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
1.2 simonb 477: wapbl_inodetrk_free(wl);
1.18 yamt 478: wapbl_free(wl, sizeof(*wl));
1.2 simonb 479:
480: return error;
481: }
482:
483: /*
484: * Like wapbl_flush, only discards the transaction
485: * completely
486: */
487:
488: void
489: wapbl_discard(struct wapbl *wl)
490: {
491: struct wapbl_entry *we;
492: struct buf *bp;
493: int i;
494:
495: /*
496: * XXX we may consider using upgrade here
497: * if we want to call flush from inside a transaction
498: */
499: rw_enter(&wl->wl_rwlock, RW_WRITER);
500: wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
501: wl->wl_dealloccnt);
502:
503: #ifdef WAPBL_DEBUG_PRINT
504: {
505: pid_t pid = -1;
506: lwpid_t lid = -1;
507: if (curproc)
508: pid = curproc->p_pid;
509: if (curlwp)
510: lid = curlwp->l_lid;
511: #ifdef WAPBL_DEBUG_BUFBYTES
512: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
513: ("wapbl_discard: thread %d.%d discarding "
514: "transaction\n"
515: "\tbufcount=%zu bufbytes=%zu bcount=%zu "
516: "deallocs=%d inodes=%d\n"
517: "\terrcnt = %u, reclaimable=%zu reserved=%zu "
518: "unsynced=%zu\n",
519: pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
520: wl->wl_bcount, wl->wl_dealloccnt,
521: wl->wl_inohashcnt, wl->wl_error_count,
522: wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
523: wl->wl_unsynced_bufbytes));
524: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
525: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
526: ("\tentry: bufcount = %zu, reclaimable = %zu, "
527: "error = %d, unsynced = %zu\n",
528: we->we_bufcount, we->we_reclaimable_bytes,
529: we->we_error, we->we_unsynced_bufbytes));
530: }
531: #else /* !WAPBL_DEBUG_BUFBYTES */
532: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
533: ("wapbl_discard: thread %d.%d discarding transaction\n"
534: "\tbufcount=%zu bufbytes=%zu bcount=%zu "
535: "deallocs=%d inodes=%d\n"
536: "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
537: pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
538: wl->wl_bcount, wl->wl_dealloccnt,
539: wl->wl_inohashcnt, wl->wl_error_count,
540: wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
541: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
542: WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
543: ("\tentry: bufcount = %zu, reclaimable = %zu, "
544: "error = %d\n",
545: we->we_bufcount, we->we_reclaimable_bytes,
546: we->we_error));
547: }
548: #endif /* !WAPBL_DEBUG_BUFBYTES */
549: }
550: #endif /* WAPBL_DEBUG_PRINT */
551:
552: for (i = 0; i <= wl->wl_inohashmask; i++) {
553: struct wapbl_ino_head *wih;
554: struct wapbl_ino *wi;
555:
556: wih = &wl->wl_inohash[i];
557: while ((wi = LIST_FIRST(wih)) != NULL) {
558: LIST_REMOVE(wi, wi_hash);
559: pool_put(&wapbl_ino_pool, wi);
560: KASSERT(wl->wl_inohashcnt > 0);
561: wl->wl_inohashcnt--;
562: }
563: }
564:
565: /*
566: * clean buffer list
567: */
568: mutex_enter(&bufcache_lock);
569: mutex_enter(&wl->wl_mtx);
570: while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
571: if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
572: /*
573: * The buffer will be unlocked and
574: * removed from the transaction in brelse
575: */
576: mutex_exit(&wl->wl_mtx);
577: brelsel(bp, 0);
578: mutex_enter(&wl->wl_mtx);
579: }
580: }
581: mutex_exit(&wl->wl_mtx);
582: mutex_exit(&bufcache_lock);
583:
584: /*
585: * Remove references to this wl from wl_entries, free any which
586: * no longer have buffers, others will be freed in wapbl_biodone
587: * when they no longer have any buffers.
588: */
589: while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
590: SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
591: /* XXX should we be accumulating wl_error_count
592: * and increasing reclaimable bytes ? */
593: we->we_wapbl = NULL;
594: if (we->we_bufcount == 0) {
595: #ifdef WAPBL_DEBUG_BUFBYTES
596: KASSERT(we->we_unsynced_bufbytes == 0);
597: #endif
1.18 yamt 598: wapbl_free(we, sizeof(*we));
1.2 simonb 599: }
600: }
601:
602: /* Discard list of deallocs */
603: wl->wl_dealloccnt = 0;
604: /* XXX should we clear wl_reserved_bytes? */
605:
606: KASSERT(wl->wl_bufbytes == 0);
607: KASSERT(wl->wl_bcount == 0);
608: KASSERT(wl->wl_bufcount == 0);
609: KASSERT(LIST_EMPTY(&wl->wl_bufs));
610: KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
611: KASSERT(wl->wl_inohashcnt == 0);
612:
613: rw_exit(&wl->wl_rwlock);
614: }
615:
616: int
617: wapbl_stop(struct wapbl *wl, int force)
618: {
619: struct vnode *vp;
620: int error;
621:
622: WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
623: error = wapbl_flush(wl, 1);
624: if (error) {
625: if (force)
626: wapbl_discard(wl);
627: else
628: return error;
629: }
630:
631: /* Unlinked inodes persist after a flush */
632: if (wl->wl_inohashcnt) {
633: if (force) {
634: wapbl_discard(wl);
635: } else {
636: return EBUSY;
637: }
638: }
639:
640: KASSERT(wl->wl_bufbytes == 0);
641: KASSERT(wl->wl_bcount == 0);
642: KASSERT(wl->wl_bufcount == 0);
643: KASSERT(LIST_EMPTY(&wl->wl_bufs));
644: KASSERT(wl->wl_dealloccnt == 0);
645: KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
646: KASSERT(wl->wl_inohashcnt == 0);
647:
648: vp = wl->wl_logvp;
649:
1.18 yamt 650: wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
651: wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
652: wapbl_free(wl->wl_deallocblks,
653: sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
654: wapbl_free(wl->wl_dealloclens,
655: sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
1.2 simonb 656: wapbl_inodetrk_free(wl);
657:
658: cv_destroy(&wl->wl_reclaimable_cv);
659: mutex_destroy(&wl->wl_mtx);
660: rw_destroy(&wl->wl_rwlock);
1.18 yamt 661: wapbl_free(wl, sizeof(*wl));
1.2 simonb 662:
663: return 0;
664: }
665:
666: static int
667: wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
668: {
669: struct pstats *pstats = curlwp->l_proc->p_stats;
670: struct buf *bp;
671: int error;
672:
673: KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
674: KASSERT(devvp->v_type == VBLK);
675:
676: if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
677: mutex_enter(&devvp->v_interlock);
678: devvp->v_numoutput++;
679: mutex_exit(&devvp->v_interlock);
680: pstats->p_ru.ru_oublock++;
681: } else {
682: pstats->p_ru.ru_inblock++;
683: }
684:
685: bp = getiobuf(devvp, true);
686: bp->b_flags = flags;
687: bp->b_cflags = BC_BUSY; /* silly & dubious */
688: bp->b_dev = devvp->v_rdev;
689: bp->b_data = data;
690: bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
691: bp->b_blkno = pbn;
692:
693: WAPBL_PRINTF(WAPBL_PRINT_IO,
1.29 pooka 694: ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
1.2 simonb 695: BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
696: bp->b_blkno, bp->b_dev));
697:
698: VOP_STRATEGY(devvp, bp);
699:
700: error = biowait(bp);
701: putiobuf(bp);
702:
703: if (error) {
704: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
705: ("wapbl_doio: %s %zu bytes at block %" PRId64
1.29 pooka 706: " on dev 0x%"PRIx64" failed with error %d\n",
1.2 simonb 707: (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
708: "write" : "read"),
709: len, pbn, devvp->v_rdev, error));
710: }
711:
712: return error;
713: }
714:
715: int
716: wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
717: {
718:
719: return wapbl_doio(data, len, devvp, pbn, B_WRITE);
720: }
721:
722: int
723: wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
724: {
725:
726: return wapbl_doio(data, len, devvp, pbn, B_READ);
727: }
728:
729: /*
730: * Off is byte offset returns new offset for next write
731: * handles log wraparound
732: */
733: static int
734: wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
735: {
736: size_t slen;
737: off_t off = *offp;
738: int error;
1.34 ! mlelstv 739: daddr_t pbn;
1.2 simonb 740:
741: KDASSERT(((len >> wl->wl_log_dev_bshift) <<
742: wl->wl_log_dev_bshift) == len);
743:
744: if (off < wl->wl_circ_off)
745: off = wl->wl_circ_off;
746: slen = wl->wl_circ_off + wl->wl_circ_size - off;
747: if (slen < len) {
1.34 ! mlelstv 748: pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
! 749: #ifdef _KERNEL
! 750: pbn = btodb(pbn << wl->wl_log_dev_bshift);
! 751: #endif
! 752: error = wapbl_write(data, slen, wl->wl_devvp, pbn);
1.2 simonb 753: if (error)
754: return error;
755: data = (uint8_t *)data + slen;
756: len -= slen;
757: off = wl->wl_circ_off;
758: }
1.34 ! mlelstv 759: pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
! 760: #ifdef _KERNEL
! 761: pbn = btodb(pbn << wl->wl_log_dev_bshift);
! 762: #endif
! 763: error = wapbl_write(data, len, wl->wl_devvp, pbn);
1.2 simonb 764: if (error)
765: return error;
766: off += len;
767: if (off >= wl->wl_circ_off + wl->wl_circ_size)
768: off = wl->wl_circ_off;
769: *offp = off;
770: return 0;
771: }
772:
773: /****************************************************************/
774:
775: int
776: wapbl_begin(struct wapbl *wl, const char *file, int line)
777: {
778: int doflush;
779: unsigned lockcount;
780:
781: KDASSERT(wl);
782:
783: /*
784: * XXX this needs to be made much more sophisticated.
785: * perhaps each wapbl_begin could reserve a specified
786: * number of buffers and bytes.
787: */
788: mutex_enter(&wl->wl_mtx);
789: lockcount = wl->wl_lock_count;
790: doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
791: wl->wl_bufbytes_max / 2) ||
792: ((wl->wl_bufcount + (lockcount * 10)) >
793: wl->wl_bufcount_max / 2) ||
1.28 pooka 794: (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
795: (wl->wl_dealloccnt >=
796: (wl->wl_dealloclim - (wl->wl_dealloclim >> 8)));
1.2 simonb 797: mutex_exit(&wl->wl_mtx);
798:
799: if (doflush) {
800: WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
801: ("force flush lockcnt=%d bufbytes=%zu "
1.28 pooka 802: "(max=%zu) bufcount=%zu (max=%zu) "
803: "dealloccnt %d (lim=%d)\n",
1.2 simonb 804: lockcount, wl->wl_bufbytes,
805: wl->wl_bufbytes_max, wl->wl_bufcount,
1.28 pooka 806: wl->wl_bufcount_max,
807: wl->wl_dealloccnt, wl->wl_dealloclim));
1.2 simonb 808: }
809:
810: if (doflush) {
811: int error = wapbl_flush(wl, 0);
812: if (error)
813: return error;
814: }
815:
1.23 ad 816: rw_enter(&wl->wl_rwlock, RW_READER);
1.2 simonb 817: mutex_enter(&wl->wl_mtx);
818: wl->wl_lock_count++;
819: mutex_exit(&wl->wl_mtx);
820:
1.23 ad 821: #if defined(WAPBL_DEBUG_PRINT)
1.2 simonb 822: WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
823: ("wapbl_begin thread %d.%d with bufcount=%zu "
824: "bufbytes=%zu bcount=%zu at %s:%d\n",
825: curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
826: wl->wl_bufbytes, wl->wl_bcount, file, line));
827: #endif
828:
829: return 0;
830: }
831:
832: void
833: wapbl_end(struct wapbl *wl)
834: {
835:
1.23 ad 836: #if defined(WAPBL_DEBUG_PRINT)
1.2 simonb 837: WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
838: ("wapbl_end thread %d.%d with bufcount=%zu "
839: "bufbytes=%zu bcount=%zu\n",
840: curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
841: wl->wl_bufbytes, wl->wl_bcount));
842: #endif
843:
844: mutex_enter(&wl->wl_mtx);
845: KASSERT(wl->wl_lock_count > 0);
846: wl->wl_lock_count--;
847: mutex_exit(&wl->wl_mtx);
848:
849: rw_exit(&wl->wl_rwlock);
850: }
851:
852: void
853: wapbl_add_buf(struct wapbl *wl, struct buf * bp)
854: {
855:
856: KASSERT(bp->b_cflags & BC_BUSY);
857: KASSERT(bp->b_vp);
858:
859: wapbl_jlock_assert(wl);
860:
861: #if 0
862: /*
863: * XXX this might be an issue for swapfiles.
864: * see uvm_swap.c:1702
865: *
866: * XXX2 why require it then? leap of semantics?
867: */
868: KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
869: #endif
870:
871: mutex_enter(&wl->wl_mtx);
872: if (bp->b_flags & B_LOCKED) {
873: LIST_REMOVE(bp, b_wapbllist);
874: WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
875: ("wapbl_add_buf thread %d.%d re-adding buf %p "
876: "with %d bytes %d bcount\n",
877: curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
878: bp->b_bcount));
879: } else {
880: /* unlocked by dirty buffers shouldn't exist */
881: KASSERT(!(bp->b_oflags & BO_DELWRI));
882: wl->wl_bufbytes += bp->b_bufsize;
883: wl->wl_bcount += bp->b_bcount;
884: wl->wl_bufcount++;
885: WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
886: ("wapbl_add_buf thread %d.%d adding buf %p "
887: "with %d bytes %d bcount\n",
888: curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
889: bp->b_bcount));
890: }
891: LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
892: mutex_exit(&wl->wl_mtx);
893:
894: bp->b_flags |= B_LOCKED;
895: }
896:
897: static void
898: wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
899: {
900:
901: KASSERT(mutex_owned(&wl->wl_mtx));
902: KASSERT(bp->b_cflags & BC_BUSY);
903: wapbl_jlock_assert(wl);
904:
905: #if 0
906: /*
907: * XXX this might be an issue for swapfiles.
908: * see uvm_swap.c:1725
909: *
910: * XXXdeux: see above
911: */
912: KASSERT((bp->b_flags & BC_NOCACHE) == 0);
913: #endif
914: KASSERT(bp->b_flags & B_LOCKED);
915:
916: WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
917: ("wapbl_remove_buf thread %d.%d removing buf %p with "
918: "%d bytes %d bcount\n",
919: curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
920:
921: KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
922: wl->wl_bufbytes -= bp->b_bufsize;
923: KASSERT(wl->wl_bcount >= bp->b_bcount);
924: wl->wl_bcount -= bp->b_bcount;
925: KASSERT(wl->wl_bufcount > 0);
926: wl->wl_bufcount--;
927: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
928: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
929: LIST_REMOVE(bp, b_wapbllist);
930:
931: bp->b_flags &= ~B_LOCKED;
932: }
933:
934: /* called from brelsel() in vfs_bio among other places */
935: void
936: wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
937: {
938:
939: mutex_enter(&wl->wl_mtx);
940: wapbl_remove_buf_locked(wl, bp);
941: mutex_exit(&wl->wl_mtx);
942: }
943:
944: void
945: wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
946: {
947:
948: KASSERT(bp->b_cflags & BC_BUSY);
949:
950: /*
951: * XXX: why does this depend on B_LOCKED? otherwise the buf
952: * is not for a transaction? if so, why is this called in the
953: * first place?
954: */
955: if (bp->b_flags & B_LOCKED) {
956: mutex_enter(&wl->wl_mtx);
957: wl->wl_bufbytes += bp->b_bufsize - oldsz;
958: wl->wl_bcount += bp->b_bcount - oldcnt;
959: mutex_exit(&wl->wl_mtx);
960: }
961: }
962:
963: #endif /* _KERNEL */
964:
965: /****************************************************************/
966: /* Some utility inlines */
967:
/*
 * wapbl_advance: move a log offset forward.
 *
 * Returns the position reached by advancing old by delta bytes in a
 * circular area of size bytes that begins at offset off.  An old value
 * of 0 is special: any non-zero advance from it starts at off.
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t new;

	/* Acceptable ranges for the inputs. */
	KASSERT(delta <= size);
	KASSERT((old == 0) || (old >= off));
	KASSERT(old < (size + off));

	if ((old != 0) || (delta == 0)) {
		if ((old + delta) < (size + off)) {
			/* stays inside the area */
			new = old + delta;
		} else {
			/* runs past the end: wrap around */
			new = (old + delta) - size;
		}
	} else {
		/* first advance away from 0 */
		new = off + delta;
	}

	/* Some interesting axioms. */
	KASSERT((delta != 0) || (new == old));
	KASSERT((delta == 0) || (new != 0));
	KASSERT((delta != (size)) || (new == old));

	/* Acceptable ranges for the output. */
	KASSERT((new == 0) || (new >= off));
	KASSERT(new < (size + off));

	return new;
}
996:
/*
 * Number of bytes in use in a circular region of avail bytes, given
 * its head and tail positions.  A tail of 0 means nothing is in use
 * (and head is asserted to be 0 as well).
 */
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail != 0)
		return ((head + (avail - 1) - tail) % avail) + 1;

	KASSERT(head == 0);
	return 0;
}
1007:
/*
 * Number of bytes not in use in a circular region of avail bytes:
 * avail minus whatever wapbl_space_used() reports for head/tail.
 */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{
	size_t used = wapbl_space_used(avail, head, tail);

	return avail - used;
}
1014:
/*
 * Move the head of the circular region forward by delta bytes.
 *
 * *headp and *tailp are read, then both written back.  delta must not
 * exceed the free space.  If the tail was 0 and the new head is not,
 * the tail is set to off.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t nhead;
	off_t ntail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, *headp, ntail));

	nhead = wapbl_advance(size, off, *headp, delta);
	if ((ntail == 0) && (nhead != 0))
		ntail = off;

	*headp = nhead;
	*tailp = ntail;
}
1029:
/*
 * Move the tail of the circular region forward by delta bytes.
 *
 * *headp and *tailp are read, then both written back.  delta must not
 * exceed the used space.  If the tail catches up with the head, both
 * are reset to 0.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t nhead = *headp;
	off_t ntail;

	KASSERT(delta <= wapbl_space_used(size, nhead, *tailp));

	ntail = wapbl_advance(size, off, *tailp, delta);
	if (nhead == ntail)
		nhead = ntail = 0;

	*headp = nhead;
	*tailp = ntail;
}
1045:
1046: #ifdef _KERNEL
1047:
1048: /****************************************************************/
1049:
1050: /*
1051: * Remove transactions whose buffers are completely flushed to disk.
1052: * Will block until at least minfree space is available.
1053: * only intended to be called from inside wapbl_flush and therefore
1054: * does not protect against commit races with itself or with flush.
1055: */
1056: static int
1057: wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
1058: {
1059: size_t delta;
1060: size_t avail;
1061: off_t head;
1062: off_t tail;
1063: int error = 0;
1064:
1065: KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
1066: KASSERT(rw_write_held(&wl->wl_rwlock));
1067:
1068: mutex_enter(&wl->wl_mtx);
1069:
1070: /*
1071: * First check to see if we have to do a commit
1072: * at all.
1073: */
1074: avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
1075: if (minfree < avail) {
1076: mutex_exit(&wl->wl_mtx);
1077: return 0;
1078: }
1079: minfree -= avail;
1080: while ((wl->wl_error_count == 0) &&
1081: (wl->wl_reclaimable_bytes < minfree)) {
1082: WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1083: ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
1084: "minfree=%zd\n",
1085: &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
1086: minfree));
1087:
1088: cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
1089: }
1090: if (wl->wl_reclaimable_bytes < minfree) {
1091: KASSERT(wl->wl_error_count);
1092: /* XXX maybe get actual error from buffer instead someday? */
1093: error = EIO;
1094: }
1095: head = wl->wl_head;
1096: tail = wl->wl_tail;
1097: delta = wl->wl_reclaimable_bytes;
1098:
1099: /* If all of of the entries are flushed, then be sure to keep
1100: * the reserved bytes reserved. Watch out for discarded transactions,
1101: * which could leave more bytes reserved than are reclaimable.
1102: */
1103: if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
1104: (delta >= wl->wl_reserved_bytes)) {
1105: delta -= wl->wl_reserved_bytes;
1106: }
1107: wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
1108: &tail);
1109: KDASSERT(wl->wl_reserved_bytes <=
1110: wapbl_space_used(wl->wl_circ_size, head, tail));
1111: mutex_exit(&wl->wl_mtx);
1112:
1113: if (error)
1114: return error;
1115:
1116: if (waitonly)
1117: return 0;
1118:
1119: /*
1120: * This is where head, tail and delta are unprotected
1121: * from races against itself or flush. This is ok since
1122: * we only call this routine from inside flush itself.
1123: *
1124: * XXX: how can it race against itself when accessed only
1125: * from behind the write-locked rwlock?
1126: */
1127: error = wapbl_write_commit(wl, head, tail);
1128: if (error)
1129: return error;
1130:
1131: wl->wl_head = head;
1132: wl->wl_tail = tail;
1133:
1134: mutex_enter(&wl->wl_mtx);
1135: KASSERT(wl->wl_reclaimable_bytes >= delta);
1136: wl->wl_reclaimable_bytes -= delta;
1137: mutex_exit(&wl->wl_mtx);
1138: WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1139: ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
1140: curproc->p_pid, curlwp->l_lid, delta));
1141:
1142: return 0;
1143: }
1144:
1145: /****************************************************************/
1146:
1147: void
1148: wapbl_biodone(struct buf *bp)
1149: {
1150: struct wapbl_entry *we = bp->b_private;
1151: struct wapbl *wl = we->we_wapbl;
1152:
1153: /*
1154: * Handle possible flushing of buffers after log has been
1155: * decomissioned.
1156: */
1157: if (!wl) {
1158: KASSERT(we->we_bufcount > 0);
1159: we->we_bufcount--;
1160: #ifdef WAPBL_DEBUG_BUFBYTES
1161: KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
1162: we->we_unsynced_bufbytes -= bp->b_bufsize;
1163: #endif
1164:
1165: if (we->we_bufcount == 0) {
1166: #ifdef WAPBL_DEBUG_BUFBYTES
1167: KASSERT(we->we_unsynced_bufbytes == 0);
1168: #endif
1.18 yamt 1169: wapbl_free(we, sizeof(*we));
1.2 simonb 1170: }
1171:
1172: brelse(bp, 0);
1173: return;
1174: }
1175:
1176: #ifdef ohbother
1177: KDASSERT(bp->b_flags & B_DONE);
1178: KDASSERT(!(bp->b_flags & B_DELWRI));
1179: KDASSERT(bp->b_flags & B_ASYNC);
1180: KDASSERT(bp->b_flags & B_BUSY);
1181: KDASSERT(!(bp->b_flags & B_LOCKED));
1182: KDASSERT(!(bp->b_flags & B_READ));
1183: KDASSERT(!(bp->b_flags & B_INVAL));
1184: KDASSERT(!(bp->b_flags & B_NOCACHE));
1185: #endif
1186:
1187: if (bp->b_error) {
1188: #ifdef notyet /* Can't currently handle possible dirty buffer reuse */
1.26 apb 1189: /*
1190: * XXXpooka: interfaces not fully updated
1191: * Note: this was not enabled in the original patch
1192: * against netbsd4 either. I don't know if comment
1193: * above is true or not.
1194: */
1.2 simonb 1195:
1196: /*
1197: * If an error occurs, report the error and leave the
1198: * buffer as a delayed write on the LRU queue.
1199: * restarting the write would likely result in
1200: * an error spinloop, so let it be done harmlessly
1201: * by the syncer.
1202: */
1203: bp->b_flags &= ~(B_DONE);
1204: simple_unlock(&bp->b_interlock);
1205:
1206: if (we->we_error == 0) {
1207: mutex_enter(&wl->wl_mtx);
1208: wl->wl_error_count++;
1209: mutex_exit(&wl->wl_mtx);
1210: cv_broadcast(&wl->wl_reclaimable_cv);
1211: }
1212: we->we_error = bp->b_error;
1213: bp->b_error = 0;
1214: brelse(bp);
1215: return;
1216: #else
1217: /* For now, just mark the log permanently errored out */
1218:
1219: mutex_enter(&wl->wl_mtx);
1220: if (wl->wl_error_count == 0) {
1221: wl->wl_error_count++;
1222: cv_broadcast(&wl->wl_reclaimable_cv);
1223: }
1224: mutex_exit(&wl->wl_mtx);
1225: #endif
1226: }
1227:
1228: mutex_enter(&wl->wl_mtx);
1229:
1230: KASSERT(we->we_bufcount > 0);
1231: we->we_bufcount--;
1232: #ifdef WAPBL_DEBUG_BUFBYTES
1233: KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
1234: we->we_unsynced_bufbytes -= bp->b_bufsize;
1235: KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
1236: wl->wl_unsynced_bufbytes -= bp->b_bufsize;
1237: #endif
1238:
1239: /*
1240: * If the current transaction can be reclaimed, start
1241: * at the beginning and reclaim any consecutive reclaimable
1242: * transactions. If we successfully reclaim anything,
1243: * then wakeup anyone waiting for the reclaim.
1244: */
1245: if (we->we_bufcount == 0) {
1246: size_t delta = 0;
1247: int errcnt = 0;
1248: #ifdef WAPBL_DEBUG_BUFBYTES
1249: KDASSERT(we->we_unsynced_bufbytes == 0);
1250: #endif
1251: /*
1252: * clear any posted error, since the buffer it came from
1253: * has successfully flushed by now
1254: */
1255: while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
1256: (we->we_bufcount == 0)) {
1257: delta += we->we_reclaimable_bytes;
1258: if (we->we_error)
1259: errcnt++;
1260: SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
1.18 yamt 1261: wapbl_free(we, sizeof(*we));
1.2 simonb 1262: }
1263:
1264: if (delta) {
1265: wl->wl_reclaimable_bytes += delta;
1266: KASSERT(wl->wl_error_count >= errcnt);
1267: wl->wl_error_count -= errcnt;
1268: cv_broadcast(&wl->wl_reclaimable_cv);
1269: }
1270: }
1271:
1272: mutex_exit(&wl->wl_mtx);
1273: brelse(bp, 0);
1274: }
1275:
1276: /*
1277: * Write transactions to disk + start I/O for contents
1278: */
1279: int
1280: wapbl_flush(struct wapbl *wl, int waitfor)
1281: {
1282: struct buf *bp;
1283: struct wapbl_entry *we;
1284: off_t off;
1285: off_t head;
1286: off_t tail;
1287: size_t delta = 0;
1288: size_t flushsize;
1289: size_t reserved;
1290: int error = 0;
1291:
1292: /*
1293: * Do a quick check to see if a full flush can be skipped
1294: * This assumes that the flush callback does not need to be called
1295: * unless there are other outstanding bufs.
1296: */
1297: if (!waitfor) {
1298: size_t nbufs;
1299: mutex_enter(&wl->wl_mtx); /* XXX need mutex here to
1300: protect the KASSERTS */
1301: nbufs = wl->wl_bufcount;
1302: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1303: KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1304: mutex_exit(&wl->wl_mtx);
1305: if (nbufs == 0)
1306: return 0;
1307: }
1308:
1309: /*
1310: * XXX we may consider using LK_UPGRADE here
1311: * if we want to call flush from inside a transaction
1312: */
1313: rw_enter(&wl->wl_rwlock, RW_WRITER);
1314: wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
1315: wl->wl_dealloccnt);
1316:
1317: /*
1318: * Now that we are fully locked and flushed,
1319: * do another check for nothing to do.
1320: */
1321: if (wl->wl_bufcount == 0) {
1322: goto out;
1323: }
1324:
1325: #if 0
1326: WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1327: ("wapbl_flush thread %d.%d flushing entries with "
1328: "bufcount=%zu bufbytes=%zu\n",
1329: curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1330: wl->wl_bufbytes));
1331: #endif
1332:
1333: /* Calculate amount of space needed to flush */
1334: flushsize = wapbl_transaction_len(wl);
1335:
1336: if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
1337: /*
1338: * XXX this could be handled more gracefully, perhaps place
1339: * only a partial transaction in the log and allow the
1340: * remaining to flush without the protection of the journal.
1341: */
1342: panic("wapbl_flush: current transaction too big to flush\n");
1343: }
1344:
1345: error = wapbl_truncate(wl, flushsize, 0);
1346: if (error)
1347: goto out2;
1348:
1349: off = wl->wl_head;
1350: KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
1351: (off < wl->wl_circ_off + wl->wl_circ_size)));
1352: error = wapbl_write_blocks(wl, &off);
1353: if (error)
1354: goto out2;
1355: error = wapbl_write_revocations(wl, &off);
1356: if (error)
1357: goto out2;
1358: error = wapbl_write_inodes(wl, &off);
1359: if (error)
1360: goto out2;
1361:
1362: reserved = 0;
1363: if (wl->wl_inohashcnt)
1364: reserved = wapbl_transaction_inodes_len(wl);
1365:
1366: head = wl->wl_head;
1367: tail = wl->wl_tail;
1368:
1369: wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
1370: &head, &tail);
1371: #ifdef WAPBL_DEBUG
1372: if (head != off) {
1373: panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
1374: " off=%"PRIdMAX" flush=%zu\n",
1375: (intmax_t)head, (intmax_t)tail, (intmax_t)off,
1376: flushsize);
1377: }
1378: #else
1379: KASSERT(head == off);
1380: #endif
1381:
1382: /* Opportunistically move the tail forward if we can */
1383: if (!wapbl_lazy_truncate) {
1384: mutex_enter(&wl->wl_mtx);
1385: delta = wl->wl_reclaimable_bytes;
1386: mutex_exit(&wl->wl_mtx);
1387: wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
1388: &head, &tail);
1389: }
1390:
1391: error = wapbl_write_commit(wl, head, tail);
1392: if (error)
1393: goto out2;
1394:
1395: we = wapbl_calloc(1, sizeof(*we));
1396:
1397: #ifdef WAPBL_DEBUG_BUFBYTES
1398: WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1399: ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1400: " unsynced=%zu"
1401: "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1402: "inodes=%d\n",
1403: curproc->p_pid, curlwp->l_lid, flushsize, delta,
1404: wapbl_space_used(wl->wl_circ_size, head, tail),
1405: wl->wl_unsynced_bufbytes, wl->wl_bufcount,
1406: wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
1407: wl->wl_inohashcnt));
1408: #else
1409: WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1410: ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1411: "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1412: "inodes=%d\n",
1413: curproc->p_pid, curlwp->l_lid, flushsize, delta,
1414: wapbl_space_used(wl->wl_circ_size, head, tail),
1415: wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1416: wl->wl_dealloccnt, wl->wl_inohashcnt));
1417: #endif
1418:
1419:
1420: mutex_enter(&bufcache_lock);
1421: mutex_enter(&wl->wl_mtx);
1422:
1423: wl->wl_reserved_bytes = reserved;
1424: wl->wl_head = head;
1425: wl->wl_tail = tail;
1426: KASSERT(wl->wl_reclaimable_bytes >= delta);
1427: wl->wl_reclaimable_bytes -= delta;
1428: wl->wl_dealloccnt = 0;
1429: #ifdef WAPBL_DEBUG_BUFBYTES
1430: wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
1431: #endif
1432:
1433: we->we_wapbl = wl;
1434: we->we_bufcount = wl->wl_bufcount;
1435: #ifdef WAPBL_DEBUG_BUFBYTES
1436: we->we_unsynced_bufbytes = wl->wl_bufbytes;
1437: #endif
1438: we->we_reclaimable_bytes = flushsize;
1439: we->we_error = 0;
1440: SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1441:
1442: /*
1443: * this flushes bufs in reverse order than they were queued
1444: * it shouldn't matter, but if we care we could use TAILQ instead.
1445: * XXX Note they will get put on the lru queue when they flush
1446: * so we might actually want to change this to preserve order.
1447: */
1448: while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
1449: if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1450: continue;
1451: }
1452: bp->b_iodone = wapbl_biodone;
1453: bp->b_private = we;
1454: bremfree(bp);
1455: wapbl_remove_buf_locked(wl, bp);
1456: mutex_exit(&wl->wl_mtx);
1457: mutex_exit(&bufcache_lock);
1458: bawrite(bp);
1459: mutex_enter(&bufcache_lock);
1460: mutex_enter(&wl->wl_mtx);
1461: }
1462: mutex_exit(&wl->wl_mtx);
1463: mutex_exit(&bufcache_lock);
1464:
1465: #if 0
1466: WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1467: ("wapbl_flush thread %d.%d done flushing entries...\n",
1468: curproc->p_pid, curlwp->l_lid));
1469: #endif
1470:
1471: out:
1472:
1473: /*
1474: * If the waitfor flag is set, don't return until everything is
1475: * fully flushed and the on disk log is empty.
1476: */
1477: if (waitfor) {
1478: error = wapbl_truncate(wl, wl->wl_circ_size -
1479: wl->wl_reserved_bytes, wapbl_lazy_truncate);
1480: }
1481:
1482: out2:
1483: if (error) {
1484: wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
1485: wl->wl_dealloclens, wl->wl_dealloccnt);
1486: }
1487:
1488: #ifdef WAPBL_DEBUG_PRINT
1489: if (error) {
1490: pid_t pid = -1;
1491: lwpid_t lid = -1;
1492: if (curproc)
1493: pid = curproc->p_pid;
1494: if (curlwp)
1495: lid = curlwp->l_lid;
1496: mutex_enter(&wl->wl_mtx);
1497: #ifdef WAPBL_DEBUG_BUFBYTES
1498: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1499: ("wapbl_flush: thread %d.%d aborted flush: "
1500: "error = %d\n"
1501: "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1502: "deallocs=%d inodes=%d\n"
1503: "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1504: "unsynced=%zu\n",
1505: pid, lid, error, wl->wl_bufcount,
1506: wl->wl_bufbytes, wl->wl_bcount,
1507: wl->wl_dealloccnt, wl->wl_inohashcnt,
1508: wl->wl_error_count, wl->wl_reclaimable_bytes,
1509: wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1510: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1511: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1512: ("\tentry: bufcount = %zu, reclaimable = %zu, "
1513: "error = %d, unsynced = %zu\n",
1514: we->we_bufcount, we->we_reclaimable_bytes,
1515: we->we_error, we->we_unsynced_bufbytes));
1516: }
1517: #else
1518: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1519: ("wapbl_flush: thread %d.%d aborted flush: "
1520: "error = %d\n"
1521: "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1522: "deallocs=%d inodes=%d\n"
1523: "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1524: pid, lid, error, wl->wl_bufcount,
1525: wl->wl_bufbytes, wl->wl_bcount,
1526: wl->wl_dealloccnt, wl->wl_inohashcnt,
1527: wl->wl_error_count, wl->wl_reclaimable_bytes,
1528: wl->wl_reserved_bytes));
1529: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1530: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1531: ("\tentry: bufcount = %zu, reclaimable = %zu, "
1532: "error = %d\n", we->we_bufcount,
1533: we->we_reclaimable_bytes, we->we_error));
1534: }
1535: #endif
1536: mutex_exit(&wl->wl_mtx);
1537: }
1538: #endif
1539:
1540: rw_exit(&wl->wl_rwlock);
1541: return error;
1542: }
1543:
1544: /****************************************************************/
1545:
1546: void
1547: wapbl_jlock_assert(struct wapbl *wl)
1548: {
1549:
1.23 ad 1550: KASSERT(rw_lock_held(&wl->wl_rwlock));
1.2 simonb 1551: }
1552:
1553: void
1554: wapbl_junlock_assert(struct wapbl *wl)
1555: {
1556:
1557: KASSERT(!rw_write_held(&wl->wl_rwlock));
1558: }
1559:
1560: /****************************************************************/
1561:
1562: /* locks missing */
1563: void
1564: wapbl_print(struct wapbl *wl,
1565: int full,
1566: void (*pr)(const char *, ...))
1567: {
1568: struct buf *bp;
1569: struct wapbl_entry *we;
1570: (*pr)("wapbl %p", wl);
1571: (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
1572: wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
1573: (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1574: wl->wl_circ_size, wl->wl_circ_off,
1575: (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1576: (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1577: wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
1578: #ifdef WAPBL_DEBUG_BUFBYTES
1579: (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1580: "reserved = %zu errcnt = %d unsynced = %zu\n",
1581: wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1582: wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1583: wl->wl_error_count, wl->wl_unsynced_bufbytes);
1584: #else
1585: (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1586: "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
1587: wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1588: wl->wl_error_count);
1589: #endif
1590: (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
1591: wl->wl_dealloccnt, wl->wl_dealloclim);
1592: (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1593: wl->wl_inohashcnt, wl->wl_inohashmask);
1594: (*pr)("entries:\n");
1595: SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1596: #ifdef WAPBL_DEBUG_BUFBYTES
1597: (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1598: "unsynced = %zu\n",
1599: we->we_bufcount, we->we_reclaimable_bytes,
1600: we->we_error, we->we_unsynced_bufbytes);
1601: #else
1602: (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1603: we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1604: #endif
1605: }
1606: if (full) {
1607: int cnt = 0;
1608: (*pr)("bufs =");
1609: LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1610: if (!LIST_NEXT(bp, b_wapbllist)) {
1611: (*pr)(" %p", bp);
1612: } else if ((++cnt % 6) == 0) {
1613: (*pr)(" %p,\n\t", bp);
1614: } else {
1615: (*pr)(" %p,", bp);
1616: }
1617: }
1618: (*pr)("\n");
1619:
1620: (*pr)("dealloced blks = ");
1621: {
1622: int i;
1623: cnt = 0;
1624: for (i = 0; i < wl->wl_dealloccnt; i++) {
1625: (*pr)(" %"PRId64":%d,",
1626: wl->wl_deallocblks[i],
1627: wl->wl_dealloclens[i]);
1628: if ((++cnt % 4) == 0) {
1629: (*pr)("\n\t");
1630: }
1631: }
1632: }
1633: (*pr)("\n");
1634:
1635: (*pr)("registered inodes = ");
1636: {
1637: int i;
1638: cnt = 0;
1639: for (i = 0; i <= wl->wl_inohashmask; i++) {
1640: struct wapbl_ino_head *wih;
1641: struct wapbl_ino *wi;
1642:
1643: wih = &wl->wl_inohash[i];
1644: LIST_FOREACH(wi, wih, wi_hash) {
1645: if (wi->wi_ino == 0)
1646: continue;
1647: (*pr)(" %"PRId32"/0%06"PRIo32",",
1648: wi->wi_ino, wi->wi_mode);
1649: if ((++cnt % 4) == 0) {
1650: (*pr)("\n\t");
1651: }
1652: }
1653: }
1654: (*pr)("\n");
1655: }
1656: }
1657: }
1658:
#if defined(WAPBL_DEBUG) || defined(DDB)
/*
 * Print a full description of a log via printf.  With WAPBL_DEBUG a
 * NULL wl falls back to wapbl_debug_wl; if there is still no log,
 * nothing is printed.
 */
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (wl == NULL)
		wl = wapbl_debug_wl;
#endif
	if (wl != NULL)
		wapbl_print(wl, 1, printf);
}
#endif
1672:
1673: /****************************************************************/
1674:
1675: void
1676: wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
1677: {
1678:
1679: wapbl_jlock_assert(wl);
1680:
1681: /* XXX should eventually instead tie this into resource estimation */
1.27 pooka 1682: /*
1683: * XXX this panic needs locking/mutex analysis and the
1684: * ability to cope with the failure.
1685: */
1686: /* XXX this XXX doesn't have enough XXX */
1687: if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
1688: panic("wapbl_register_deallocation: out of resources");
1689:
1.2 simonb 1690: wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
1691: wl->wl_dealloclens[wl->wl_dealloccnt] = len;
1692: wl->wl_dealloccnt++;
1693: WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
1694: ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
1695: }
1696:
1697: /****************************************************************/
1698:
1699: static void
1700: wapbl_inodetrk_init(struct wapbl *wl, u_int size)
1701: {
1702:
1703: wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
1704: if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
1705: pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
1706: "wapblinopl", &pool_allocator_nointr, IPL_NONE);
1707: }
1708: }
1709:
1710: static void
1711: wapbl_inodetrk_free(struct wapbl *wl)
1712: {
1713:
1714: /* XXX this KASSERT needs locking/mutex analysis */
1715: KASSERT(wl->wl_inohashcnt == 0);
1716: hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
1717: if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
1718: pool_destroy(&wapbl_ino_pool);
1719: }
1720: }
1721:
1722: static struct wapbl_ino *
1723: wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1724: {
1725: struct wapbl_ino_head *wih;
1726: struct wapbl_ino *wi;
1727:
1728: KASSERT(mutex_owned(&wl->wl_mtx));
1729:
1730: wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1731: LIST_FOREACH(wi, wih, wi_hash) {
1732: if (ino == wi->wi_ino)
1733: return wi;
1734: }
1735: return 0;
1736: }
1737:
1738: void
1739: wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1740: {
1741: struct wapbl_ino_head *wih;
1742: struct wapbl_ino *wi;
1743:
1744: wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
1745:
1746: mutex_enter(&wl->wl_mtx);
1747: if (wapbl_inodetrk_get(wl, ino) == NULL) {
1748: wi->wi_ino = ino;
1749: wi->wi_mode = mode;
1750: wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1751: LIST_INSERT_HEAD(wih, wi, wi_hash);
1752: wl->wl_inohashcnt++;
1753: WAPBL_PRINTF(WAPBL_PRINT_INODE,
1754: ("wapbl_register_inode: ino=%"PRId64"\n", ino));
1755: mutex_exit(&wl->wl_mtx);
1756: } else {
1757: mutex_exit(&wl->wl_mtx);
1758: pool_put(&wapbl_ino_pool, wi);
1759: }
1760: }
1761:
1762: void
1763: wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1764: {
1765: struct wapbl_ino *wi;
1766:
1767: mutex_enter(&wl->wl_mtx);
1768: wi = wapbl_inodetrk_get(wl, ino);
1769: if (wi) {
1770: WAPBL_PRINTF(WAPBL_PRINT_INODE,
1771: ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
1772: KASSERT(wl->wl_inohashcnt > 0);
1773: wl->wl_inohashcnt--;
1774: LIST_REMOVE(wi, wi_hash);
1775: mutex_exit(&wl->wl_mtx);
1776:
1777: pool_put(&wapbl_ino_pool, wi);
1778: } else {
1779: mutex_exit(&wl->wl_mtx);
1780: }
1781: }
1782:
1783: /****************************************************************/
1784:
1.30 uebayasi 1785: static inline size_t
1.2 simonb 1786: wapbl_transaction_inodes_len(struct wapbl *wl)
1787: {
1788: int blocklen = 1<<wl->wl_log_dev_bshift;
1789: int iph;
1790:
1791: /* Calculate number of inodes described in a inodelist header */
1792: iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
1793: sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
1794:
1795: KASSERT(iph > 0);
1796:
1797: return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen;
1798: }
1799:
1800:
1801: /* Calculate amount of space a transaction will take on disk */
1802: static size_t
1803: wapbl_transaction_len(struct wapbl *wl)
1804: {
1805: int blocklen = 1<<wl->wl_log_dev_bshift;
1806: size_t len;
1807: int bph;
1808:
1809: /* Calculate number of blocks described in a blocklist header */
1810: bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1811: sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1812:
1813: KASSERT(bph > 0);
1814:
1815: len = wl->wl_bcount;
1816: len += howmany(wl->wl_bufcount, bph)*blocklen;
1817: len += howmany(wl->wl_dealloccnt, bph)*blocklen;
1818: len += wapbl_transaction_inodes_len(wl);
1819:
1820: return len;
1821: }
1822:
1823: /*
1824: * Perform commit operation
1825: *
1826: * Note that generation number incrementation needs to
1827: * be protected against racing with other invocations
1828: * of wapbl_commit. This is ok since this routine
1829: * is only invoked from wapbl_flush
1830: */
1831: static int
1832: wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
1833: {
1834: struct wapbl_wc_header *wc = wl->wl_wc_header;
1835: struct timespec ts;
1836: int error;
1837: int force = 1;
1.34 ! mlelstv 1838: daddr_t pbn;
1.2 simonb 1839:
1840: /* XXX Calc checksum here, instead we do this for now */
1841: error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
1842: if (error) {
1843: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1.29 pooka 1844: ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%"PRIx64
1845: " returned %d\n", wl->wl_devvp->v_rdev, error));
1.2 simonb 1846: }
1847:
1848: wc->wc_head = head;
1849: wc->wc_tail = tail;
1850: wc->wc_checksum = 0;
1851: wc->wc_version = 1;
1852: getnanotime(&ts);
1.17 yamt 1853: wc->wc_time = ts.tv_sec;
1.2 simonb 1854: wc->wc_timensec = ts.tv_nsec;
1855:
1856: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1857: ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
1858: (intmax_t)head, (intmax_t)tail));
1859:
1860: /*
1861: * XXX if generation will rollover, then first zero
1862: * over second commit header before trying to write both headers.
1863: */
1864:
1.34 ! mlelstv 1865: pbn = wl->wl_logpbn + (wc->wc_generation % 2);
! 1866: #ifdef _KERNEL
! 1867: pbn = btodb(pbn << wc->wc_log_dev_bshift);
! 1868: #endif
! 1869: error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
1.2 simonb 1870: if (error)
1871: return error;
1872:
1873: error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
1874: if (error) {
1875: WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1.29 pooka 1876: ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%"PRIx64
1877: " returned %d\n", wl->wl_devvp->v_rdev, error));
1.2 simonb 1878: }
1879:
1880: /*
1881: * If the generation number was zero, write it out a second time.
1882: * This handles initialization and generation number rollover
1883: */
1884: if (wc->wc_generation++ == 0) {
1885: error = wapbl_write_commit(wl, head, tail);
1886: /*
1887: * This panic should be able to be removed if we do the
1888: * zero'ing mentioned above, and we are certain to roll
1889: * back generation number on failure.
1890: */
1891: if (error)
1892: panic("wapbl_write_commit: error writing duplicate "
1893: "log header: %d\n", error);
1894: }
1895: return 0;
1896: }
1897:
1898: /* Returns new offset value */
1899: static int
1900: wapbl_write_blocks(struct wapbl *wl, off_t *offp)
1901: {
1902: struct wapbl_wc_blocklist *wc =
1903: (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1904: int blocklen = 1<<wl->wl_log_dev_bshift;
1905: int bph;
1906: struct buf *bp;
1907: off_t off = *offp;
1908: int error;
1.7 joerg 1909: size_t padding;
1.2 simonb 1910:
1911: KASSERT(rw_write_held(&wl->wl_rwlock));
1912:
1913: bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1914: sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1915:
1916: bp = LIST_FIRST(&wl->wl_bufs);
1917:
1918: while (bp) {
1919: int cnt;
1920: struct buf *obp = bp;
1921:
1922: KASSERT(bp->b_flags & B_LOCKED);
1923:
1924: wc->wc_type = WAPBL_WC_BLOCKS;
1925: wc->wc_len = blocklen;
1926: wc->wc_blkcount = 0;
1927: while (bp && (wc->wc_blkcount < bph)) {
1928: /*
1929: * Make sure all the physical block numbers are up to
1930: * date. If this is not always true on a given
1931: * filesystem, then VOP_BMAP must be called. We
1932: * could call VOP_BMAP here, or else in the filesystem
1933: * specific flush callback, although neither of those
1934: * solutions allow us to take the vnode lock. If a
1935: * filesystem requires that we must take the vnode lock
1936: * to call VOP_BMAP, then we can probably do it in
1937: * bwrite when the vnode lock should already be held
1938: * by the invoking code.
1939: */
1940: KASSERT((bp->b_vp->v_type == VBLK) ||
1941: (bp->b_blkno != bp->b_lblkno));
1942: KASSERT(bp->b_blkno > 0);
1943:
1944: wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
1945: wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
1946: wc->wc_len += bp->b_bcount;
1947: wc->wc_blkcount++;
1948: bp = LIST_NEXT(bp, b_wapbllist);
1949: }
1.7 joerg 1950: if (wc->wc_len % blocklen != 0) {
1951: padding = blocklen - wc->wc_len % blocklen;
1952: wc->wc_len += padding;
1953: } else {
1954: padding = 0;
1955: }
1956:
1.2 simonb 1957: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1.7 joerg 1958: ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
1959: wc->wc_len, padding, (intmax_t)off));
1.2 simonb 1960:
1961: error = wapbl_circ_write(wl, wc, blocklen, &off);
1962: if (error)
1963: return error;
1964: bp = obp;
1965: cnt = 0;
1966: while (bp && (cnt++ < bph)) {
1967: error = wapbl_circ_write(wl, bp->b_data,
1968: bp->b_bcount, &off);
1969: if (error)
1970: return error;
1971: bp = LIST_NEXT(bp, b_wapbllist);
1972: }
1.7 joerg 1973: if (padding) {
1974: void *zero;
1975:
1976: zero = wapbl_malloc(padding);
1977: memset(zero, 0, padding);
1978: error = wapbl_circ_write(wl, zero, padding, &off);
1.18 yamt 1979: wapbl_free(zero, padding);
1.7 joerg 1980: if (error)
1981: return error;
1982: }
1.2 simonb 1983: }
1984: *offp = off;
1985: return 0;
1986: }
1987:
1988: static int
1989: wapbl_write_revocations(struct wapbl *wl, off_t *offp)
1990: {
1991: struct wapbl_wc_blocklist *wc =
1992: (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1993: int i;
1994: int blocklen = 1<<wl->wl_log_dev_bshift;
1995: int bph;
1996: off_t off = *offp;
1997: int error;
1998:
1999: if (wl->wl_dealloccnt == 0)
2000: return 0;
2001:
2002: bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2003: sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2004:
2005: i = 0;
2006: while (i < wl->wl_dealloccnt) {
2007: wc->wc_type = WAPBL_WC_REVOCATIONS;
2008: wc->wc_len = blocklen;
2009: wc->wc_blkcount = 0;
2010: while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2011: wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2012: wl->wl_deallocblks[i];
2013: wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2014: wl->wl_dealloclens[i];
2015: wc->wc_blkcount++;
2016: i++;
2017: }
2018: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2019: ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2020: wc->wc_len, (intmax_t)off));
2021: error = wapbl_circ_write(wl, wc, blocklen, &off);
2022: if (error)
2023: return error;
2024: }
2025: *offp = off;
2026: return 0;
2027: }
2028:
2029: static int
2030: wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2031: {
2032: struct wapbl_wc_inodelist *wc =
2033: (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2034: int i;
1.14 joerg 2035: int blocklen = 1 << wl->wl_log_dev_bshift;
1.2 simonb 2036: off_t off = *offp;
2037: int error;
2038:
2039: struct wapbl_ino_head *wih;
2040: struct wapbl_ino *wi;
2041: int iph;
2042:
2043: iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2044: sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2045:
2046: i = 0;
2047: wih = &wl->wl_inohash[0];
2048: wi = 0;
2049: do {
2050: wc->wc_type = WAPBL_WC_INODES;
2051: wc->wc_len = blocklen;
2052: wc->wc_inocnt = 0;
2053: wc->wc_clear = (i == 0);
2054: while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2055: while (!wi) {
2056: KASSERT((wih - &wl->wl_inohash[0])
2057: <= wl->wl_inohashmask);
2058: wi = LIST_FIRST(wih++);
2059: }
2060: wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2061: wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2062: wc->wc_inocnt++;
2063: i++;
2064: wi = LIST_NEXT(wi, wi_hash);
2065: }
2066: WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2067: ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2068: wc->wc_len, (intmax_t)off));
2069: error = wapbl_circ_write(wl, wc, blocklen, &off);
2070: if (error)
2071: return error;
2072: } while (i < wl->wl_inohashcnt);
2073:
2074: *offp = off;
2075: return 0;
2076: }
2077:
2078: #endif /* _KERNEL */
2079:
2080: /****************************************************************/
2081:
2082: struct wapbl_blk {
2083: LIST_ENTRY(wapbl_blk) wb_hash;
2084: daddr_t wb_blk;
2085: off_t wb_off; /* Offset of this block in the log */
2086: };
2087: #define WAPBL_BLKPOOL_MIN 83
2088:
2089: static void
2090: wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2091: {
2092: if (size < WAPBL_BLKPOOL_MIN)
2093: size = WAPBL_BLKPOOL_MIN;
2094: KASSERT(wr->wr_blkhash == 0);
2095: #ifdef _KERNEL
2096: wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2097: #else /* ! _KERNEL */
2098: /* Manually implement hashinit */
2099: {
1.25 lukem 2100: unsigned long i, hashsize;
1.2 simonb 2101: for (hashsize = 1; hashsize < size; hashsize <<= 1)
2102: continue;
2103: wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
2104: for (i = 0; i < wr->wr_blkhashmask; i++)
2105: LIST_INIT(&wr->wr_blkhash[i]);
2106: wr->wr_blkhashmask = hashsize - 1;
2107: }
2108: #endif /* ! _KERNEL */
2109: }
2110:
2111: static void
2112: wapbl_blkhash_free(struct wapbl_replay *wr)
2113: {
2114: KASSERT(wr->wr_blkhashcnt == 0);
2115: #ifdef _KERNEL
2116: hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2117: #else /* ! _KERNEL */
1.18 yamt 2118: wapbl_free(wr->wr_blkhash,
2119: (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
1.2 simonb 2120: #endif /* ! _KERNEL */
2121: }
2122:
2123: static struct wapbl_blk *
2124: wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2125: {
2126: struct wapbl_blk_head *wbh;
2127: struct wapbl_blk *wb;
2128: wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2129: LIST_FOREACH(wb, wbh, wb_hash) {
2130: if (blk == wb->wb_blk)
2131: return wb;
2132: }
2133: return 0;
2134: }
2135:
2136: static void
2137: wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2138: {
2139: struct wapbl_blk_head *wbh;
2140: struct wapbl_blk *wb;
2141: wb = wapbl_blkhash_get(wr, blk);
2142: if (wb) {
2143: KASSERT(wb->wb_blk == blk);
2144: wb->wb_off = off;
2145: } else {
2146: wb = wapbl_malloc(sizeof(*wb));
2147: wb->wb_blk = blk;
2148: wb->wb_off = off;
2149: wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2150: LIST_INSERT_HEAD(wbh, wb, wb_hash);
2151: wr->wr_blkhashcnt++;
2152: }
2153: }
2154:
2155: static void
2156: wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2157: {
2158: struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2159: if (wb) {
2160: KASSERT(wr->wr_blkhashcnt > 0);
2161: wr->wr_blkhashcnt--;
2162: LIST_REMOVE(wb, wb_hash);
1.18 yamt 2163: wapbl_free(wb, sizeof(*wb));
1.2 simonb 2164: }
2165: }
2166:
2167: static void
2168: wapbl_blkhash_clear(struct wapbl_replay *wr)
2169: {
1.25 lukem 2170: unsigned long i;
1.2 simonb 2171: for (i = 0; i <= wr->wr_blkhashmask; i++) {
2172: struct wapbl_blk *wb;
2173:
2174: while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2175: KASSERT(wr->wr_blkhashcnt > 0);
2176: wr->wr_blkhashcnt--;
2177: LIST_REMOVE(wb, wb_hash);
1.18 yamt 2178: wapbl_free(wb, sizeof(*wb));
1.2 simonb 2179: }
2180: }
2181: KASSERT(wr->wr_blkhashcnt == 0);
2182: }
2183:
2184: /****************************************************************/
2185:
2186: static int
2187: wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2188: {
2189: size_t slen;
2190: off_t off = *offp;
2191: int error;
1.34 ! mlelstv 2192: daddr_t pbn;
1.2 simonb 2193:
1.14 joerg 2194: KASSERT(((len >> wr->wr_log_dev_bshift) <<
2195: wr->wr_log_dev_bshift) == len);
1.34 ! mlelstv 2196:
1.14 joerg 2197: if (off < wr->wr_circ_off)
2198: off = wr->wr_circ_off;
2199: slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2 simonb 2200: if (slen < len) {
1.34 ! mlelstv 2201: pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
! 2202: #ifdef _KERNEL
! 2203: pbn = btodb(pbn << wr->wr_log_dev_bshift);
! 2204: #endif
! 2205: error = wapbl_read(data, slen, wr->wr_devvp, pbn);
1.2 simonb 2206: if (error)
2207: return error;
2208: data = (uint8_t *)data + slen;
2209: len -= slen;
1.14 joerg 2210: off = wr->wr_circ_off;
1.2 simonb 2211: }
1.34 ! mlelstv 2212: pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
! 2213: #ifdef _KERNEL
! 2214: pbn = btodb(pbn << wr->wr_log_dev_bshift);
! 2215: #endif
! 2216: error = wapbl_read(data, len, wr->wr_devvp, pbn);
1.2 simonb 2217: if (error)
2218: return error;
2219: off += len;
1.14 joerg 2220: if (off >= wr->wr_circ_off + wr->wr_circ_size)
2221: off = wr->wr_circ_off;
1.2 simonb 2222: *offp = off;
2223: return 0;
2224: }
2225:
2226: static void
2227: wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2228: {
2229: size_t slen;
2230: off_t off = *offp;
2231:
1.14 joerg 2232: KASSERT(((len >> wr->wr_log_dev_bshift) <<
2233: wr->wr_log_dev_bshift) == len);
1.2 simonb 2234:
1.14 joerg 2235: if (off < wr->wr_circ_off)
2236: off = wr->wr_circ_off;
2237: slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2 simonb 2238: if (slen < len) {
2239: len -= slen;
1.14 joerg 2240: off = wr->wr_circ_off;
1.2 simonb 2241: }
2242: off += len;
1.14 joerg 2243: if (off >= wr->wr_circ_off + wr->wr_circ_size)
2244: off = wr->wr_circ_off;
1.2 simonb 2245: *offp = off;
2246: }
2247:
2248: /****************************************************************/
2249:
2250: int
2251: wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2252: daddr_t off, size_t count, size_t blksize)
2253: {
2254: struct wapbl_replay *wr;
2255: int error;
2256: struct vnode *devvp;
2257: daddr_t logpbn;
2258: uint8_t *scratch;
2259: struct wapbl_wc_header *wch;
2260: struct wapbl_wc_header *wch2;
2261: /* Use this until we read the actual log header */
1.31 mlelstv 2262: int log_dev_bshift = ilog2(blksize);
1.2 simonb 2263: size_t used;
1.34 ! mlelstv 2264: daddr_t pbn;
1.2 simonb 2265:
2266: WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2267: ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2268: vp, off, count, blksize));
2269:
2270: if (off < 0)
2271: return EINVAL;
2272:
2273: if (blksize < DEV_BSIZE)
2274: return EINVAL;
2275: if (blksize % DEV_BSIZE)
2276: return EINVAL;
2277:
2278: #ifdef _KERNEL
2279: #if 0
2280: /* XXX vp->v_size isn't reliably set for VBLK devices,
2281: * especially root. However, we might still want to verify
2282: * that the full load is readable */
2283: if ((off + count) * blksize > vp->v_size)
2284: return EINVAL;
2285: #endif
2286: if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2287: return error;
2288: }
2289: #else /* ! _KERNEL */
2290: devvp = vp;
2291: logpbn = off;
2292: #endif /* ! _KERNEL */
2293:
2294: scratch = wapbl_malloc(MAXBSIZE);
2295:
1.34 ! mlelstv 2296: pbn = logpbn;
! 2297: #ifdef _KERNEL
! 2298: pbn = btodb(pbn << log_dev_bshift);
! 2299: #endif
! 2300: error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
1.2 simonb 2301: if (error)
2302: goto errout;
2303:
2304: wch = (struct wapbl_wc_header *)scratch;
2305: wch2 =
2306: (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2307: /* XXX verify checksums and magic numbers */
2308: if (wch->wc_type != WAPBL_WC_HEADER) {
2309: printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2310: error = EFTYPE;
2311: goto errout;
2312: }
2313:
2314: if (wch2->wc_generation > wch->wc_generation)
2315: wch = wch2;
2316:
2317: wr = wapbl_calloc(1, sizeof(*wr));
2318:
2319: wr->wr_logvp = vp;
2320: wr->wr_devvp = devvp;
2321: wr->wr_logpbn = logpbn;
2322:
2323: wr->wr_scratch = scratch;
2324:
1.14 joerg 2325: wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2326: wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2327: wr->wr_circ_off = wch->wc_circ_off;
2328: wr->wr_circ_size = wch->wc_circ_size;
2329: wr->wr_generation = wch->wc_generation;
1.2 simonb 2330:
2331: used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2332:
2333: WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2334: ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2335: " len=%"PRId64" used=%zu\n",
2336: wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2337: wch->wc_circ_size, used));
2338:
2339: wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
1.11 joerg 2340:
1.14 joerg 2341: error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
1.2 simonb 2342: if (error) {
2343: wapbl_replay_stop(wr);
2344: wapbl_replay_free(wr);
2345: return error;
2346: }
2347:
2348: *wrp = wr;
2349: return 0;
2350:
2351: errout:
1.18 yamt 2352: wapbl_free(scratch, MAXBSIZE);
1.2 simonb 2353: return error;
2354: }
2355:
2356: void
2357: wapbl_replay_stop(struct wapbl_replay *wr)
2358: {
2359:
1.4 joerg 2360: if (!wapbl_replay_isopen(wr))
2361: return;
2362:
1.2 simonb 2363: WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2364:
1.18 yamt 2365: wapbl_free(wr->wr_scratch, MAXBSIZE);
2366: wr->wr_scratch = NULL;
1.2 simonb 2367:
1.18 yamt 2368: wr->wr_logvp = NULL;
1.2 simonb 2369:
2370: wapbl_blkhash_clear(wr);
2371: wapbl_blkhash_free(wr);
2372: }
2373:
2374: void
2375: wapbl_replay_free(struct wapbl_replay *wr)
2376: {
2377:
2378: KDASSERT(!wapbl_replay_isopen(wr));
2379:
2380: if (wr->wr_inodes)
1.18 yamt 2381: wapbl_free(wr->wr_inodes,
2382: wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2383: wapbl_free(wr, sizeof(*wr));
1.2 simonb 2384: }
2385:
#ifdef _KERNEL
/*
 * Function form of wapbl_replay_isopen(); returns its result unchanged.
 */
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{
	const int isopen = wapbl_replay_isopen(wr);

	return isopen;
}
#endif
1.2 simonb 2394:
1.10 joerg 2395: static void
2396: wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2397: {
2398: struct wapbl_wc_blocklist *wc =
2399: (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.14 joerg 2400: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10 joerg 2401: int i, j, n;
2402:
2403: for (i = 0; i < wc->wc_blkcount; i++) {
2404: /*
2405: * Enter each physical block into the hashtable independently.
2406: */
1.14 joerg 2407: n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10 joerg 2408: for (j = 0; j < n; j++) {
1.34 ! mlelstv 2409: wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
1.10 joerg 2410: *offp);
2411: wapbl_circ_advance(wr, fsblklen, offp);
2412: }
2413: }
2414: }
2415:
2416: static void
2417: wapbl_replay_process_revocations(struct wapbl_replay *wr)
2418: {
2419: struct wapbl_wc_blocklist *wc =
2420: (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.34 ! mlelstv 2421: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10 joerg 2422: int i, j, n;
2423:
2424: for (i = 0; i < wc->wc_blkcount; i++) {
2425: /*
2426: * Remove any blocks found from the hashtable.
2427: */
1.14 joerg 2428: n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10 joerg 2429: for (j = 0; j < n; j++)
1.34 ! mlelstv 2430: wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
1.10 joerg 2431: }
2432: }
2433:
2434: static void
2435: wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2436: {
2437: struct wapbl_wc_inodelist *wc =
2438: (struct wapbl_wc_inodelist *)wr->wr_scratch;
1.18 yamt 2439: void *new_inodes;
2440: const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2441:
2442: KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2443:
1.10 joerg 2444: /*
2445: * Keep track of where we found this so location won't be
2446: * overwritten.
2447: */
2448: if (wc->wc_clear) {
2449: wr->wr_inodestail = oldoff;
2450: wr->wr_inodescnt = 0;
1.12 joerg 2451: if (wr->wr_inodes != NULL) {
1.18 yamt 2452: wapbl_free(wr->wr_inodes, oldsize);
1.12 joerg 2453: wr->wr_inodes = NULL;
2454: }
1.10 joerg 2455: }
2456: wr->wr_inodeshead = newoff;
2457: if (wc->wc_inocnt == 0)
2458: return;
2459:
1.18 yamt 2460: new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) *
2461: sizeof(wr->wr_inodes[0]));
2462: if (wr->wr_inodes != NULL) {
2463: memcpy(new_inodes, wr->wr_inodes, oldsize);
2464: wapbl_free(wr->wr_inodes, oldsize);
2465: }
2466: wr->wr_inodes = new_inodes;
1.10 joerg 2467: memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
1.18 yamt 2468: wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
1.10 joerg 2469: wr->wr_inodescnt += wc->wc_inocnt;
2470: }
2471:
1.2 simonb 2472: static int
1.14 joerg 2473: wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
1.2 simonb 2474: {
2475: off_t off;
2476: int error;
2477:
1.14 joerg 2478: int logblklen = 1 << wr->wr_log_dev_bshift;
1.2 simonb 2479:
2480: wapbl_blkhash_clear(wr);
2481:
1.14 joerg 2482: off = tail;
2483: while (off != head) {
1.2 simonb 2484: struct wapbl_wc_null *wcn;
2485: off_t saveoff = off;
2486: error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2487: if (error)
2488: goto errout;
2489: wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2490: switch (wcn->wc_type) {
2491: case WAPBL_WC_BLOCKS:
1.10 joerg 2492: wapbl_replay_process_blocks(wr, &off);
1.2 simonb 2493: break;
2494:
2495: case WAPBL_WC_REVOCATIONS:
1.10 joerg 2496: wapbl_replay_process_revocations(wr);
1.2 simonb 2497: break;
2498:
2499: case WAPBL_WC_INODES:
1.10 joerg 2500: wapbl_replay_process_inodes(wr, saveoff, off);
1.2 simonb 2501: break;
1.10 joerg 2502:
1.2 simonb 2503: default:
2504: printf("Unrecognized wapbl type: 0x%08x\n",
2505: wcn->wc_type);
2506: error = EFTYPE;
2507: goto errout;
2508: }
2509: wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2510: if (off != saveoff) {
2511: printf("wapbl_replay: corrupted records\n");
2512: error = EFTYPE;
2513: goto errout;
2514: }
2515: }
2516: return 0;
2517:
2518: errout:
2519: wapbl_blkhash_clear(wr);
2520: return error;
2521: }
2522:
#if 0
/*
 * Disabled (compiled out) consistency check.
 *
 * Walks the log and, for every block whose hash entry points at the copy
 * currently being visited, compares the log copy against what is on
 * `fsdevvp'.  Mismatches are reported with printf() and make the function
 * return EFTYPE; I/O errors are returned as-is.
 *
 * NOTE(review): as written this references `wch', which is not declared
 * in this function, so it would need updating before it could be
 * re-enabled.
 */
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	void *scratch1 = wapbl_malloc(MAXBSIZE);	/* copy read from the log */
	void *scratch2 = wapbl_malloc(MAXBSIZE);	/* copy read from fsdevvp */
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
#ifdef DEBUG
		off_t saveoff = off;
#endif
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			{
				struct wapbl_wc_blocklist *wc =
				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
				int i;
				for (i = 0; i < wc->wc_blkcount; i++) {
					int foundcnt = 0;
					int dirtycnt = 0;
					int j, n;
					/*
					 * Check each physical block against
					 * the hashtable independently.
					 */
					n = wc->wc_blocks[i].wc_dlen >>
					    wch->wc_fs_dev_bshift;
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						   wapbl_blkhash_get(wr,
						   wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
						/*
						 * Only compare when the hash
						 * entry points at this very
						 * copy; otherwise just step
						 * over the data.
						 */
						if (wb && (wb->wb_off == off)) {
							foundcnt++;
							error =
							    wapbl_circ_read(wr,
							    scratch1, fsblklen,
							    &off);
							if (error)
								goto out;
							error =
							    wapbl_read(scratch2,
							    fsblklen, fsdevvp,
							    wb->wb_blk);
							if (error)
								goto out;
							if (memcmp(scratch1,
								   scratch2,
								   fsblklen)) {
								printf(
		"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
		wb->wb_blk, (intmax_t)off);
								dirtycnt++;
								mismatchcnt++;
							}
						} else {
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#if 0
					/*
					 * If all of the blocks in an entry
					 * are clean, then remove all of its
					 * blocks from the hashtable since they
					 * never will need replay.
					 *
					 * NOTE(review): uses saveoff, which is
					 * only declared under DEBUG above.
					 */
					if ((foundcnt != 0) &&
					    (dirtycnt == 0)) {
						off = saveoff;
						wapbl_circ_advance(wr,
						    logblklen, &off);
						for (j = 0; j < n; j++) {
							struct wapbl_blk *wb =
							   wapbl_blkhash_get(wr,
							   wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
							if (wb &&
							  (wb->wb_off == off)) {
								wapbl_blkhash_rem(wr, wb->wb_blk);
							}
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#endif
				}
			}
			break;
		case WAPBL_WC_REVOCATIONS:
		case WAPBL_WC_INODES:
			/* Nothing to compare for these record types. */
			break;
		default:
			KASSERT(0);
		}
#ifdef DEBUG
		/* The offset reached must match the record's own length. */
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		KASSERT(off == saveoff);
#endif
	}
 out:
	wapbl_free(scratch1, MAXBSIZE);
	wapbl_free(scratch2, MAXBSIZE);
	if (!error && mismatchcnt)
		error = EFTYPE;
	return error;
}
#endif
2642:
2643: int
2644: wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2645: {
1.9 joerg 2646: struct wapbl_blk *wb;
2647: size_t i;
1.2 simonb 2648: off_t off;
1.9 joerg 2649: void *scratch;
1.2 simonb 2650: int error = 0;
1.14 joerg 2651: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2 simonb 2652:
2653: KDASSERT(wapbl_replay_isopen(wr));
2654:
1.9 joerg 2655: scratch = wapbl_malloc(MAXBSIZE);
1.2 simonb 2656:
1.9 joerg 2657: for (i = 0; i < wr->wr_blkhashmask; ++i) {
2658: LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2659: off = wb->wb_off;
2660: error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2661: if (error)
2662: break;
2663: error = wapbl_write(scratch, fsblklen, fsdevvp,
2664: wb->wb_blk);
2665: if (error)
2666: break;
1.2 simonb 2667: }
2668: }
1.9 joerg 2669:
1.18 yamt 2670: wapbl_free(scratch, MAXBSIZE);
1.2 simonb 2671: return error;
2672: }
2673:
2674: int
1.6 joerg 2675: wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2676: {
1.14 joerg 2677: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.6 joerg 2678:
2679: KDASSERT(wapbl_replay_isopen(wr));
2680: KASSERT((len % fsblklen) == 0);
2681:
2682: while (len != 0) {
2683: struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2684: if (wb)
2685: return 1;
2686: len -= fsblklen;
2687: }
2688: return 0;
2689: }
2690:
2691: int
1.2 simonb 2692: wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2693: {
1.14 joerg 2694: int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2 simonb 2695:
2696: KDASSERT(wapbl_replay_isopen(wr));
2697:
2698: KASSERT((len % fsblklen) == 0);
2699:
2700: while (len != 0) {
2701: struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2702: if (wb) {
2703: off_t off = wb->wb_off;
2704: int error;
2705: error = wapbl_circ_read(wr, data, fsblklen, &off);
2706: if (error)
2707: return error;
2708: }
2709: data = (uint8_t *)data + fsblklen;
2710: len -= fsblklen;
2711: blk++;
2712: }
2713: return 0;
2714: }
CVSweb <webmaster@jp.NetBSD.org>