[BACK]Return to vfs_wapbl.c CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / sys / kern

Annotation of src/sys/kern/vfs_wapbl.c, Revision 1.34

1.34    ! mlelstv     1: /*     $NetBSD: vfs_wapbl.c,v 1.33 2010/02/27 12:04:19 mlelstv Exp $   */
1.2       simonb      2:
                      3: /*-
1.23      ad          4:  * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
1.2       simonb      5:  * All rights reserved.
                      6:  *
                      7:  * This code is derived from software contributed to The NetBSD Foundation
                      8:  * by Wasabi Systems, Inc.
                      9:  *
                     10:  * Redistribution and use in source and binary forms, with or without
                     11:  * modification, are permitted provided that the following conditions
                     12:  * are met:
                     13:  * 1. Redistributions of source code must retain the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer.
                     15:  * 2. Redistributions in binary form must reproduce the above copyright
                     16:  *    notice, this list of conditions and the following disclaimer in the
                     17:  *    documentation and/or other materials provided with the distribution.
                     18:  *
                     19:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     20:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     21:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     22:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     23:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     24:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     25:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     26:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     27:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     28:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     29:  * POSSIBILITY OF SUCH DAMAGE.
                     30:  */
                     31:
                     32: /*
                     33:  * This implements file system independent write ahead filesystem logging.
                     34:  */
1.4       joerg      35:
                     36: #define WAPBL_INTERNAL
                     37:
1.2       simonb     38: #include <sys/cdefs.h>
1.34    ! mlelstv    39: __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.33 2010/02/27 12:04:19 mlelstv Exp $");
1.2       simonb     40:
                     41: #include <sys/param.h>
1.31      mlelstv    42: #include <sys/bitops.h>
1.2       simonb     43:
                     44: #ifdef _KERNEL
                     45: #include <sys/param.h>
                     46: #include <sys/namei.h>
                     47: #include <sys/proc.h>
                     48: #include <sys/uio.h>
                     49: #include <sys/vnode.h>
                     50: #include <sys/file.h>
1.19      yamt       51: #include <sys/malloc.h>
1.2       simonb     52: #include <sys/resourcevar.h>
                     53: #include <sys/conf.h>
                     54: #include <sys/mount.h>
                     55: #include <sys/kernel.h>
                     56: #include <sys/kauth.h>
                     57: #include <sys/mutex.h>
                     58: #include <sys/atomic.h>
                     59: #include <sys/wapbl.h>
1.16      joerg      60: #include <sys/wapbl_replay.h>
1.2       simonb     61:
                     62: #include <miscfs/specfs/specdev.h>
                     63:
1.19      yamt       64: #if 0 /* notyet */
1.18      yamt       65: #define        wapbl_malloc(s) kmem_alloc((s), KM_SLEEP)
                     66: #define        wapbl_free(a, s) kmem_free((a), (s))
                     67: #define        wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
1.19      yamt       68: #else
                     69: MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
                     70: #define        wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
                     71: #define        wapbl_free(a, s) free((a), M_WAPBL)
                     72: #define        wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
                     73: #endif
1.2       simonb     74:
                     75: #else /* !_KERNEL */
                     76: #include <assert.h>
                     77: #include <errno.h>
                     78: #include <stdio.h>
                     79: #include <stdbool.h>
                     80: #include <stdlib.h>
                     81: #include <string.h>
                     82:
                     83: #include <sys/time.h>
                     84: #include <sys/wapbl.h>
1.16      joerg      85: #include <sys/wapbl_replay.h>
1.2       simonb     86:
                     87: #define        KDASSERT(x) assert(x)
                     88: #define        KASSERT(x) assert(x)
                     89: #define        wapbl_malloc(s) malloc(s)
1.18      yamt       90: #define        wapbl_free(a, s) free(a)
1.2       simonb     91: #define        wapbl_calloc(n, s) calloc((n), (s))
                     92:
                     93: #endif /* !_KERNEL */
                     94:
                     95: /*
                     96:  * INTERNAL DATA STRUCTURES
                     97:  */
                     98:
                     99: /*
                    100:  * This structure holds per-mount log information.
                    101:  *
                    102:  * Legend:     a = atomic access only
                    103:  *             r = read-only after init
                    104:  *             l = rwlock held
                    105:  *             m = mutex held
                    106:  *             u = unlocked access ok
                    107:  *             b = bufcache_lock held
                    108:  */
                    109: struct wapbl {
                    110:        struct vnode *wl_logvp; /* r:   log here */
                    111:        struct vnode *wl_devvp; /* r:   log on this device */
                    112:        struct mount *wl_mount; /* r:   mountpoint wl is associated with */
                    113:        daddr_t wl_logpbn;      /* r:   Physical block number of start of log */
                    114:        int wl_log_dev_bshift;  /* r:   logarithm of device block size of log
                    115:                                        device */
                    116:        int wl_fs_dev_bshift;   /* r:   logarithm of device block size of
                    117:                                        filesystem device */
                    118:
1.3       yamt      119:        unsigned wl_lock_count; /* m:   Count of transactions in progress */
1.2       simonb    120:
                    121:        size_t wl_circ_size;    /* r:   Number of bytes in buffer of log */
                    122:        size_t wl_circ_off;     /* r:   Number of bytes reserved at start */
                    123:
                    124:        size_t wl_bufcount_max; /* r:   Number of buffers reserved for log */
                    125:        size_t wl_bufbytes_max; /* r:   Number of buf bytes reserved for log */
                    126:
                    127:        off_t wl_head;          /* l:   Byte offset of log head */
                    128:        off_t wl_tail;          /* l:   Byte offset of log tail */
                    129:        /*
                    130:         * head == tail == 0 means log is empty
                    131:         * head == tail != 0 means log is full
                    132:         * see assertions in wapbl_advance() for other boundary conditions.
                    133:         * only truncate moves the tail, except when flush sets it to
                    134:         * wl_header_size only flush moves the head, except when truncate
                    135:         * sets it to 0.
                    136:         */
                    137:
                    138:        struct wapbl_wc_header *wl_wc_header;   /* l    */
                    139:        void *wl_wc_scratch;    /* l:   scratch space (XXX: por que?!?) */
                    140:
                    141:        kmutex_t wl_mtx;        /* u:   short-term lock */
                    142:        krwlock_t wl_rwlock;    /* u:   File system transaction lock */
                    143:
                    144:        /*
                    145:         * Must be held while accessing
                    146:         * wl_count or wl_bufs or head or tail
                    147:         */
                    148:
                    149:        /*
                    150:         * Callback called from within the flush routine to flush any extra
                    151:         * bits.  Note that flush may be skipped without calling this if
                    152:         * there are no outstanding buffers in the transaction.
                    153:         */
1.5       joerg     154: #if _KERNEL
1.2       simonb    155:        wapbl_flush_fn_t wl_flush;      /* r    */
                    156:        wapbl_flush_fn_t wl_flush_abort;/* r    */
1.5       joerg     157: #endif
1.2       simonb    158:
                    159:        size_t wl_bufbytes;     /* m:   Byte count of pages in wl_bufs */
                    160:        size_t wl_bufcount;     /* m:   Count of buffers in wl_bufs */
                    161:        size_t wl_bcount;       /* m:   Total bcount of wl_bufs */
                    162:
                    163:        LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
                    164:
                    165:        kcondvar_t wl_reclaimable_cv;   /* m (obviously) */
                    166:        size_t wl_reclaimable_bytes; /* m:      Amount of space available for
                    167:                                                reclamation by truncate */
                    168:        int wl_error_count;     /* m:   # of wl_entries with errors */
                    169:        size_t wl_reserved_bytes; /* never truncate log smaller than this */
                    170:
                    171: #ifdef WAPBL_DEBUG_BUFBYTES
                    172:        size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
                    173: #endif
                    174:
                    175:        daddr_t *wl_deallocblks;/* l:   address of block */
1.20      yamt      176:        int *wl_dealloclens;    /* l:   size of block */
1.2       simonb    177:        int wl_dealloccnt;      /* l:   total count */
                    178:        int wl_dealloclim;      /* l:   max count */
                    179:
                    180:        /* hashtable of inode numbers for allocated but unlinked inodes */
                    181:        /* synch ??? */
                    182:        LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
                    183:        u_long wl_inohashmask;
                    184:        int wl_inohashcnt;
                    185:
                    186:        SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
                    187:                                                   accounting */
                    188: };
                    189:
                    190: #ifdef WAPBL_DEBUG_PRINT
                    191: int wapbl_debug_print = WAPBL_DEBUG_PRINT;
                    192: #endif
                    193:
                    194: /****************************************************************/
                    195: #ifdef _KERNEL
                    196:
                    197: #ifdef WAPBL_DEBUG
                    198: struct wapbl *wapbl_debug_wl;
                    199: #endif
                    200:
                    201: static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
                    202: static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
                    203: static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
                    204: static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
                    205: #endif /* _KERNEL */
                    206:
1.14      joerg     207: static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
1.2       simonb    208:
1.30      uebayasi  209: static inline size_t wapbl_space_free(size_t avail, off_t head,
1.2       simonb    210:        off_t tail);
1.30      uebayasi  211: static inline size_t wapbl_space_used(size_t avail, off_t head,
1.2       simonb    212:        off_t tail);
                    213:
                    214: #ifdef _KERNEL
                    215:
                    216: #define        WAPBL_INODETRK_SIZE 83
                    217: static int wapbl_ino_pool_refcount;
                    218: static struct pool wapbl_ino_pool;
                    219: struct wapbl_ino {
                    220:        LIST_ENTRY(wapbl_ino) wi_hash;
                    221:        ino_t wi_ino;
                    222:        mode_t wi_mode;
                    223: };
                    224:
                    225: static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
                    226: static void wapbl_inodetrk_free(struct wapbl *wl);
                    227: static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
                    228:
                    229: static size_t wapbl_transaction_len(struct wapbl *wl);
1.30      uebayasi  230: static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
1.2       simonb    231:
1.13      joerg     232: #if 0
1.4       joerg     233: int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
                    234: #endif
                    235:
                    236: static int wapbl_replay_isopen1(struct wapbl_replay *);
                    237:
1.2       simonb    238: /*
                    239:  * This is useful for debugging.  If set, the log will
                    240:  * only be truncated when necessary.
                    241:  */
                    242: int wapbl_lazy_truncate = 0;
                    243:
                    244: struct wapbl_ops wapbl_ops = {
                    245:        .wo_wapbl_discard       = wapbl_discard,
                    246:        .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
1.6       joerg     247:        .wo_wapbl_replay_can_read = wapbl_replay_can_read,
1.2       simonb    248:        .wo_wapbl_replay_read   = wapbl_replay_read,
                    249:        .wo_wapbl_add_buf       = wapbl_add_buf,
                    250:        .wo_wapbl_remove_buf    = wapbl_remove_buf,
                    251:        .wo_wapbl_resize_buf    = wapbl_resize_buf,
                    252:        .wo_wapbl_begin         = wapbl_begin,
                    253:        .wo_wapbl_end           = wapbl_end,
                    254:        .wo_wapbl_junlock_assert= wapbl_junlock_assert,
                    255:
                    256:        /* XXX: the following is only used to say "this is a wapbl buf" */
                    257:        .wo_wapbl_biodone       = wapbl_biodone,
                    258: };
                    259:
                    260: void
1.24      cegger    261: wapbl_init(void)
1.2       simonb    262: {
                    263:
1.22      yamt      264:        malloc_type_attach(M_WAPBL);
1.2       simonb    265: }
                    266:
1.21      yamt      267: static int
1.15      joerg     268: wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
                    269: {
                    270:        int error, i;
                    271:
                    272:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
                    273:            ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
                    274:
                    275:        /*
                    276:         * Its only valid to reuse the replay log if its
                    277:         * the same as the new log we just opened.
                    278:         */
                    279:        KDASSERT(!wapbl_replay_isopen(wr));
                    280:        KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
                    281:        KASSERT(wl->wl_logpbn == wr->wr_logpbn);
                    282:        KASSERT(wl->wl_circ_size == wr->wr_circ_size);
                    283:        KASSERT(wl->wl_circ_off == wr->wr_circ_off);
                    284:        KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
                    285:        KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
                    286:
                    287:        wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
                    288:
                    289:        for (i = 0; i < wr->wr_inodescnt; i++)
                    290:                wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
                    291:                    wr->wr_inodes[i].wr_imode);
                    292:
                    293:        /* Make sure new transaction won't overwrite old inodes list */
                    294:        KDASSERT(wapbl_transaction_len(wl) <=
                    295:            wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
                    296:            wr->wr_inodestail));
                    297:
                    298:        wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
                    299:        wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
                    300:            wapbl_transaction_len(wl);
                    301:
                    302:        error = wapbl_write_inodes(wl, &wl->wl_head);
                    303:        if (error)
                    304:                return error;
                    305:
                    306:        KASSERT(wl->wl_head != wl->wl_tail);
                    307:        KASSERT(wl->wl_head != 0);
                    308:
                    309:        return 0;
                    310: }
                    311:
1.2       simonb    312: int
                    313: wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
                    314:        daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
                    315:        wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
                    316: {
                    317:        struct wapbl *wl;
                    318:        struct vnode *devvp;
                    319:        daddr_t logpbn;
                    320:        int error;
1.31      mlelstv   321:        int log_dev_bshift = ilog2(blksize);
1.32      mlelstv   322:        int fs_dev_bshift = log_dev_bshift;
1.2       simonb    323:        int run;
                    324:
                    325:        WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
                    326:            " count=%zu blksize=%zu\n", vp, off, count, blksize));
                    327:
                    328:        if (log_dev_bshift > fs_dev_bshift) {
                    329:                WAPBL_PRINTF(WAPBL_PRINT_OPEN,
                    330:                        ("wapbl: log device's block size cannot be larger "
                    331:                         "than filesystem's\n"));
                    332:                /*
                    333:                 * Not currently implemented, although it could be if
                    334:                 * needed someday.
                    335:                 */
                    336:                return ENOSYS;
                    337:        }
                    338:
                    339:        if (off < 0)
                    340:                return EINVAL;
                    341:
                    342:        if (blksize < DEV_BSIZE)
                    343:                return EINVAL;
                    344:        if (blksize % DEV_BSIZE)
                    345:                return EINVAL;
                    346:
                    347:        /* XXXTODO: verify that the full load is writable */
                    348:
                    349:        /*
                    350:         * XXX check for minimum log size
                    351:         * minimum is governed by minimum amount of space
                    352:         * to complete a transaction. (probably truncate)
                    353:         */
                    354:        /* XXX for now pick something minimal */
                    355:        if ((count * blksize) < MAXPHYS) {
                    356:                return ENOSPC;
                    357:        }
                    358:
                    359:        if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
                    360:                return error;
                    361:        }
                    362:
                    363:        wl = wapbl_calloc(1, sizeof(*wl));
                    364:        rw_init(&wl->wl_rwlock);
                    365:        mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
                    366:        cv_init(&wl->wl_reclaimable_cv, "wapblrec");
                    367:        LIST_INIT(&wl->wl_bufs);
                    368:        SIMPLEQ_INIT(&wl->wl_entries);
                    369:
                    370:        wl->wl_logvp = vp;
                    371:        wl->wl_devvp = devvp;
                    372:        wl->wl_mount = mp;
                    373:        wl->wl_logpbn = logpbn;
                    374:        wl->wl_log_dev_bshift = log_dev_bshift;
                    375:        wl->wl_fs_dev_bshift = fs_dev_bshift;
                    376:
                    377:        wl->wl_flush = flushfn;
                    378:        wl->wl_flush_abort = flushabortfn;
                    379:
                    380:        /* Reserve two log device blocks for the commit headers */
                    381:        wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
1.34    ! mlelstv   382:        wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
1.2       simonb    383:        /* truncate the log usage to a multiple of log_dev_bshift */
                    384:        wl->wl_circ_size >>= wl->wl_log_dev_bshift;
                    385:        wl->wl_circ_size <<= wl->wl_log_dev_bshift;
                    386:
                    387:        /*
                    388:         * wl_bufbytes_max limits the size of the in memory transaction space.
                    389:         * - Since buffers are allocated and accounted for in units of
                    390:         *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
                    391:         *   (i.e. 1<<PAGE_SHIFT)
                    392:         * - Since the log device has to be written in units of
                    393:         *   1<<wl_log_dev_bshift it is required to be a mulitple of
                    394:         *   1<<wl_log_dev_bshift.
                    395:         * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
                    396:         *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
                    397:         * Therefore it must be multiple of the least common multiple of those
                    398:         * three quantities.  Fortunately, all of those quantities are
                    399:         * guaranteed to be a power of two, and the least common multiple of
                    400:         * a set of numbers which are all powers of two is simply the maximum
                    401:         * of those numbers.  Finally, the maximum logarithm of a power of two
                    402:         * is the same as the log of the maximum power of two.  So we can do
                    403:         * the following operations to size wl_bufbytes_max:
                    404:         */
                    405:
                    406:        /* XXX fix actual number of pages reserved per filesystem. */
                    407:        wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
                    408:
                    409:        /* Round wl_bufbytes_max to the largest power of two constraint */
                    410:        wl->wl_bufbytes_max >>= PAGE_SHIFT;
                    411:        wl->wl_bufbytes_max <<= PAGE_SHIFT;
                    412:        wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
                    413:        wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
                    414:        wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
                    415:        wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
                    416:
                    417:        /* XXX maybe use filesystem fragment size instead of 1024 */
                    418:        /* XXX fix actual number of buffers reserved per filesystem. */
                    419:        wl->wl_bufcount_max = (nbuf / 2) * 1024;
                    420:
                    421:        /* XXX tie this into resource estimation */
                    422:        wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);
                    423:
                    424:        wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
                    425:            wl->wl_dealloclim);
                    426:        wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
                    427:            wl->wl_dealloclim);
                    428:
                    429:        wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
                    430:
                    431:        /* Initialize the commit header */
                    432:        {
                    433:                struct wapbl_wc_header *wc;
1.14      joerg     434:                size_t len = 1 << wl->wl_log_dev_bshift;
1.2       simonb    435:                wc = wapbl_calloc(1, len);
                    436:                wc->wc_type = WAPBL_WC_HEADER;
                    437:                wc->wc_len = len;
                    438:                wc->wc_circ_off = wl->wl_circ_off;
                    439:                wc->wc_circ_size = wl->wl_circ_size;
                    440:                /* XXX wc->wc_fsid */
                    441:                wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
                    442:                wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
                    443:                wl->wl_wc_header = wc;
                    444:                wl->wl_wc_scratch = wapbl_malloc(len);
                    445:        }
                    446:
                    447:        /*
                    448:         * if there was an existing set of unlinked but
                    449:         * allocated inodes, preserve it in the new
                    450:         * log.
                    451:         */
                    452:        if (wr && wr->wr_inodescnt) {
1.15      joerg     453:                error = wapbl_start_flush_inodes(wl, wr);
1.2       simonb    454:                if (error)
                    455:                        goto errout;
                    456:        }
                    457:
                    458:        error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
                    459:        if (error) {
                    460:                goto errout;
                    461:        }
                    462:
                    463:        *wlp = wl;
                    464: #if defined(WAPBL_DEBUG)
                    465:        wapbl_debug_wl = wl;
                    466: #endif
                    467:
                    468:        return 0;
                    469:  errout:
                    470:        wapbl_discard(wl);
1.18      yamt      471:        wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
                    472:        wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
                    473:        wapbl_free(wl->wl_deallocblks,
                    474:            sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
                    475:        wapbl_free(wl->wl_dealloclens,
                    476:            sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
1.2       simonb    477:        wapbl_inodetrk_free(wl);
1.18      yamt      478:        wapbl_free(wl, sizeof(*wl));
1.2       simonb    479:
                    480:        return error;
                    481: }
                    482:
                    483: /*
                    484:  * Like wapbl_flush, only discards the transaction
                    485:  * completely
                    486:  */
                    487:
                    488: void
                    489: wapbl_discard(struct wapbl *wl)
                    490: {
                    491:        struct wapbl_entry *we;
                    492:        struct buf *bp;
                    493:        int i;
                    494:
                    495:        /*
                    496:         * XXX we may consider using upgrade here
                    497:         * if we want to call flush from inside a transaction
                    498:         */
                    499:        rw_enter(&wl->wl_rwlock, RW_WRITER);
                    500:        wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
                    501:            wl->wl_dealloccnt);
                    502:
                    503: #ifdef WAPBL_DEBUG_PRINT
                    504:        {
                    505:                pid_t pid = -1;
                    506:                lwpid_t lid = -1;
                    507:                if (curproc)
                    508:                        pid = curproc->p_pid;
                    509:                if (curlwp)
                    510:                        lid = curlwp->l_lid;
                    511: #ifdef WAPBL_DEBUG_BUFBYTES
                    512:                WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    513:                    ("wapbl_discard: thread %d.%d discarding "
                    514:                    "transaction\n"
                    515:                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    516:                    "deallocs=%d inodes=%d\n"
                    517:                    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
                    518:                    "unsynced=%zu\n",
                    519:                    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
                    520:                    wl->wl_bcount, wl->wl_dealloccnt,
                    521:                    wl->wl_inohashcnt, wl->wl_error_count,
                    522:                    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
                    523:                    wl->wl_unsynced_bufbytes));
                    524:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                    525:                        WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    526:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                    527:                             "error = %d, unsynced = %zu\n",
                    528:                             we->we_bufcount, we->we_reclaimable_bytes,
                    529:                             we->we_error, we->we_unsynced_bufbytes));
                    530:                }
                    531: #else /* !WAPBL_DEBUG_BUFBYTES */
                    532:                WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    533:                    ("wapbl_discard: thread %d.%d discarding transaction\n"
                    534:                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    535:                    "deallocs=%d inodes=%d\n"
                    536:                    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
                    537:                    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
                    538:                    wl->wl_bcount, wl->wl_dealloccnt,
                    539:                    wl->wl_inohashcnt, wl->wl_error_count,
                    540:                    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
                    541:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                    542:                        WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    543:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                    544:                             "error = %d\n",
                    545:                             we->we_bufcount, we->we_reclaimable_bytes,
                    546:                             we->we_error));
                    547:                }
                    548: #endif /* !WAPBL_DEBUG_BUFBYTES */
                    549:        }
                    550: #endif /* WAPBL_DEBUG_PRINT */
                    551:
                    552:        for (i = 0; i <= wl->wl_inohashmask; i++) {
                    553:                struct wapbl_ino_head *wih;
                    554:                struct wapbl_ino *wi;
                    555:
                    556:                wih = &wl->wl_inohash[i];
                    557:                while ((wi = LIST_FIRST(wih)) != NULL) {
                    558:                        LIST_REMOVE(wi, wi_hash);
                    559:                        pool_put(&wapbl_ino_pool, wi);
                    560:                        KASSERT(wl->wl_inohashcnt > 0);
                    561:                        wl->wl_inohashcnt--;
                    562:                }
                    563:        }
                    564:
                    565:        /*
                    566:         * clean buffer list
                    567:         */
                    568:        mutex_enter(&bufcache_lock);
                    569:        mutex_enter(&wl->wl_mtx);
                    570:        while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
                    571:                if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
                    572:                        /*
                    573:                         * The buffer will be unlocked and
                    574:                         * removed from the transaction in brelse
                    575:                         */
                    576:                        mutex_exit(&wl->wl_mtx);
                    577:                        brelsel(bp, 0);
                    578:                        mutex_enter(&wl->wl_mtx);
                    579:                }
                    580:        }
                    581:        mutex_exit(&wl->wl_mtx);
                    582:        mutex_exit(&bufcache_lock);
                    583:
                    584:        /*
                    585:         * Remove references to this wl from wl_entries, free any which
                    586:         * no longer have buffers, others will be freed in wapbl_biodone
                    587:         * when they no longer have any buffers.
                    588:         */
                    589:        while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
                    590:                SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
                    591:                /* XXX should we be accumulating wl_error_count
                    592:                 * and increasing reclaimable bytes ? */
                    593:                we->we_wapbl = NULL;
                    594:                if (we->we_bufcount == 0) {
                    595: #ifdef WAPBL_DEBUG_BUFBYTES
                    596:                        KASSERT(we->we_unsynced_bufbytes == 0);
                    597: #endif
1.18      yamt      598:                        wapbl_free(we, sizeof(*we));
1.2       simonb    599:                }
                    600:        }
                    601:
                    602:        /* Discard list of deallocs */
                    603:        wl->wl_dealloccnt = 0;
                    604:        /* XXX should we clear wl_reserved_bytes? */
                    605:
                    606:        KASSERT(wl->wl_bufbytes == 0);
                    607:        KASSERT(wl->wl_bcount == 0);
                    608:        KASSERT(wl->wl_bufcount == 0);
                    609:        KASSERT(LIST_EMPTY(&wl->wl_bufs));
                    610:        KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
                    611:        KASSERT(wl->wl_inohashcnt == 0);
                    612:
                    613:        rw_exit(&wl->wl_rwlock);
                    614: }
                    615:
                    616: int
                    617: wapbl_stop(struct wapbl *wl, int force)
                    618: {
                    619:        struct vnode *vp;
                    620:        int error;
                    621:
                    622:        WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
                    623:        error = wapbl_flush(wl, 1);
                    624:        if (error) {
                    625:                if (force)
                    626:                        wapbl_discard(wl);
                    627:                else
                    628:                        return error;
                    629:        }
                    630:
                    631:        /* Unlinked inodes persist after a flush */
                    632:        if (wl->wl_inohashcnt) {
                    633:                if (force) {
                    634:                        wapbl_discard(wl);
                    635:                } else {
                    636:                        return EBUSY;
                    637:                }
                    638:        }
                    639:
                    640:        KASSERT(wl->wl_bufbytes == 0);
                    641:        KASSERT(wl->wl_bcount == 0);
                    642:        KASSERT(wl->wl_bufcount == 0);
                    643:        KASSERT(LIST_EMPTY(&wl->wl_bufs));
                    644:        KASSERT(wl->wl_dealloccnt == 0);
                    645:        KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
                    646:        KASSERT(wl->wl_inohashcnt == 0);
                    647:
                    648:        vp = wl->wl_logvp;
                    649:
1.18      yamt      650:        wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
                    651:        wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
                    652:        wapbl_free(wl->wl_deallocblks,
                    653:            sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
                    654:        wapbl_free(wl->wl_dealloclens,
                    655:            sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
1.2       simonb    656:        wapbl_inodetrk_free(wl);
                    657:
                    658:        cv_destroy(&wl->wl_reclaimable_cv);
                    659:        mutex_destroy(&wl->wl_mtx);
                    660:        rw_destroy(&wl->wl_rwlock);
1.18      yamt      661:        wapbl_free(wl, sizeof(*wl));
1.2       simonb    662:
                    663:        return 0;
                    664: }
                    665:
                    666: static int
                    667: wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
                    668: {
                    669:        struct pstats *pstats = curlwp->l_proc->p_stats;
                    670:        struct buf *bp;
                    671:        int error;
                    672:
                    673:        KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
                    674:        KASSERT(devvp->v_type == VBLK);
                    675:
                    676:        if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
                    677:                mutex_enter(&devvp->v_interlock);
                    678:                devvp->v_numoutput++;
                    679:                mutex_exit(&devvp->v_interlock);
                    680:                pstats->p_ru.ru_oublock++;
                    681:        } else {
                    682:                pstats->p_ru.ru_inblock++;
                    683:        }
                    684:
                    685:        bp = getiobuf(devvp, true);
                    686:        bp->b_flags = flags;
                    687:        bp->b_cflags = BC_BUSY; /* silly & dubious */
                    688:        bp->b_dev = devvp->v_rdev;
                    689:        bp->b_data = data;
                    690:        bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
                    691:        bp->b_blkno = pbn;
                    692:
                    693:        WAPBL_PRINTF(WAPBL_PRINT_IO,
1.29      pooka     694:            ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
1.2       simonb    695:            BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
                    696:            bp->b_blkno, bp->b_dev));
                    697:
                    698:        VOP_STRATEGY(devvp, bp);
                    699:
                    700:        error = biowait(bp);
                    701:        putiobuf(bp);
                    702:
                    703:        if (error) {
                    704:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                    705:                    ("wapbl_doio: %s %zu bytes at block %" PRId64
1.29      pooka     706:                    " on dev 0x%"PRIx64" failed with error %d\n",
1.2       simonb    707:                    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
                    708:                     "write" : "read"),
                    709:                    len, pbn, devvp->v_rdev, error));
                    710:        }
                    711:
                    712:        return error;
                    713: }
                    714:
                    715: int
                    716: wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
                    717: {
                    718:
                    719:        return wapbl_doio(data, len, devvp, pbn, B_WRITE);
                    720: }
                    721:
                    722: int
                    723: wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
                    724: {
                    725:
                    726:        return wapbl_doio(data, len, devvp, pbn, B_READ);
                    727: }
                    728:
                    729: /*
                    730:  * Off is byte offset returns new offset for next write
                    731:  * handles log wraparound
                    732:  */
                    733: static int
                    734: wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
                    735: {
                    736:        size_t slen;
                    737:        off_t off = *offp;
                    738:        int error;
1.34    ! mlelstv   739:        daddr_t pbn;
1.2       simonb    740:
                    741:        KDASSERT(((len >> wl->wl_log_dev_bshift) <<
                    742:            wl->wl_log_dev_bshift) == len);
                    743:
                    744:        if (off < wl->wl_circ_off)
                    745:                off = wl->wl_circ_off;
                    746:        slen = wl->wl_circ_off + wl->wl_circ_size - off;
                    747:        if (slen < len) {
1.34    ! mlelstv   748:                pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
        !           749: #ifdef _KERNEL
        !           750:                pbn = btodb(pbn << wl->wl_log_dev_bshift);
        !           751: #endif
        !           752:                error = wapbl_write(data, slen, wl->wl_devvp, pbn);
1.2       simonb    753:                if (error)
                    754:                        return error;
                    755:                data = (uint8_t *)data + slen;
                    756:                len -= slen;
                    757:                off = wl->wl_circ_off;
                    758:        }
1.34    ! mlelstv   759:        pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
        !           760: #ifdef _KERNEL
        !           761:        pbn = btodb(pbn << wl->wl_log_dev_bshift);
        !           762: #endif
        !           763:        error = wapbl_write(data, len, wl->wl_devvp, pbn);
1.2       simonb    764:        if (error)
                    765:                return error;
                    766:        off += len;
                    767:        if (off >= wl->wl_circ_off + wl->wl_circ_size)
                    768:                off = wl->wl_circ_off;
                    769:        *offp = off;
                    770:        return 0;
                    771: }
                    772:
                    773: /****************************************************************/
                    774:
                    775: int
                    776: wapbl_begin(struct wapbl *wl, const char *file, int line)
                    777: {
                    778:        int doflush;
                    779:        unsigned lockcount;
                    780:
                    781:        KDASSERT(wl);
                    782:
                    783:        /*
                    784:         * XXX this needs to be made much more sophisticated.
                    785:         * perhaps each wapbl_begin could reserve a specified
                    786:         * number of buffers and bytes.
                    787:         */
                    788:        mutex_enter(&wl->wl_mtx);
                    789:        lockcount = wl->wl_lock_count;
                    790:        doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
                    791:                   wl->wl_bufbytes_max / 2) ||
                    792:                  ((wl->wl_bufcount + (lockcount * 10)) >
                    793:                   wl->wl_bufcount_max / 2) ||
1.28      pooka     794:                  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
                    795:                  (wl->wl_dealloccnt >=
                    796:                   (wl->wl_dealloclim - (wl->wl_dealloclim >> 8)));
1.2       simonb    797:        mutex_exit(&wl->wl_mtx);
                    798:
                    799:        if (doflush) {
                    800:                WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                    801:                    ("force flush lockcnt=%d bufbytes=%zu "
1.28      pooka     802:                    "(max=%zu) bufcount=%zu (max=%zu) "
                    803:                    "dealloccnt %d (lim=%d)\n",
1.2       simonb    804:                    lockcount, wl->wl_bufbytes,
                    805:                    wl->wl_bufbytes_max, wl->wl_bufcount,
1.28      pooka     806:                    wl->wl_bufcount_max,
                    807:                    wl->wl_dealloccnt, wl->wl_dealloclim));
1.2       simonb    808:        }
                    809:
                    810:        if (doflush) {
                    811:                int error = wapbl_flush(wl, 0);
                    812:                if (error)
                    813:                        return error;
                    814:        }
                    815:
1.23      ad        816:        rw_enter(&wl->wl_rwlock, RW_READER);
1.2       simonb    817:        mutex_enter(&wl->wl_mtx);
                    818:        wl->wl_lock_count++;
                    819:        mutex_exit(&wl->wl_mtx);
                    820:
1.23      ad        821: #if defined(WAPBL_DEBUG_PRINT)
1.2       simonb    822:        WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
                    823:            ("wapbl_begin thread %d.%d with bufcount=%zu "
                    824:            "bufbytes=%zu bcount=%zu at %s:%d\n",
                    825:            curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
                    826:            wl->wl_bufbytes, wl->wl_bcount, file, line));
                    827: #endif
                    828:
                    829:        return 0;
                    830: }
                    831:
                    832: void
                    833: wapbl_end(struct wapbl *wl)
                    834: {
                    835:
1.23      ad        836: #if defined(WAPBL_DEBUG_PRINT)
1.2       simonb    837:        WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
                    838:             ("wapbl_end thread %d.%d with bufcount=%zu "
                    839:              "bufbytes=%zu bcount=%zu\n",
                    840:              curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
                    841:              wl->wl_bufbytes, wl->wl_bcount));
                    842: #endif
                    843:
                    844:        mutex_enter(&wl->wl_mtx);
                    845:        KASSERT(wl->wl_lock_count > 0);
                    846:        wl->wl_lock_count--;
                    847:        mutex_exit(&wl->wl_mtx);
                    848:
                    849:        rw_exit(&wl->wl_rwlock);
                    850: }
                    851:
                    852: void
                    853: wapbl_add_buf(struct wapbl *wl, struct buf * bp)
                    854: {
                    855:
                    856:        KASSERT(bp->b_cflags & BC_BUSY);
                    857:        KASSERT(bp->b_vp);
                    858:
                    859:        wapbl_jlock_assert(wl);
                    860:
                    861: #if 0
                    862:        /*
                    863:         * XXX this might be an issue for swapfiles.
                    864:         * see uvm_swap.c:1702
                    865:         *
                    866:         * XXX2 why require it then?  leap of semantics?
                    867:         */
                    868:        KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
                    869: #endif
                    870:
                    871:        mutex_enter(&wl->wl_mtx);
                    872:        if (bp->b_flags & B_LOCKED) {
                    873:                LIST_REMOVE(bp, b_wapbllist);
                    874:                WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
                    875:                   ("wapbl_add_buf thread %d.%d re-adding buf %p "
                    876:                    "with %d bytes %d bcount\n",
                    877:                    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
                    878:                    bp->b_bcount));
                    879:        } else {
                    880:                /* unlocked by dirty buffers shouldn't exist */
                    881:                KASSERT(!(bp->b_oflags & BO_DELWRI));
                    882:                wl->wl_bufbytes += bp->b_bufsize;
                    883:                wl->wl_bcount += bp->b_bcount;
                    884:                wl->wl_bufcount++;
                    885:                WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
                    886:                   ("wapbl_add_buf thread %d.%d adding buf %p "
                    887:                    "with %d bytes %d bcount\n",
                    888:                    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
                    889:                    bp->b_bcount));
                    890:        }
                    891:        LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
                    892:        mutex_exit(&wl->wl_mtx);
                    893:
                    894:        bp->b_flags |= B_LOCKED;
                    895: }
                    896:
                    897: static void
                    898: wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
                    899: {
                    900:
                    901:        KASSERT(mutex_owned(&wl->wl_mtx));
                    902:        KASSERT(bp->b_cflags & BC_BUSY);
                    903:        wapbl_jlock_assert(wl);
                    904:
                    905: #if 0
                    906:        /*
                    907:         * XXX this might be an issue for swapfiles.
                    908:         * see uvm_swap.c:1725
                    909:         *
                    910:         * XXXdeux: see above
                    911:         */
                    912:        KASSERT((bp->b_flags & BC_NOCACHE) == 0);
                    913: #endif
                    914:        KASSERT(bp->b_flags & B_LOCKED);
                    915:
                    916:        WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
                    917:           ("wapbl_remove_buf thread %d.%d removing buf %p with "
                    918:            "%d bytes %d bcount\n",
                    919:            curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
                    920:
                    921:        KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
                    922:        wl->wl_bufbytes -= bp->b_bufsize;
                    923:        KASSERT(wl->wl_bcount >= bp->b_bcount);
                    924:        wl->wl_bcount -= bp->b_bcount;
                    925:        KASSERT(wl->wl_bufcount > 0);
                    926:        wl->wl_bufcount--;
                    927:        KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
                    928:        KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
                    929:        LIST_REMOVE(bp, b_wapbllist);
                    930:
                    931:        bp->b_flags &= ~B_LOCKED;
                    932: }
                    933:
                    934: /* called from brelsel() in vfs_bio among other places */
                    935: void
                    936: wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
                    937: {
                    938:
                    939:        mutex_enter(&wl->wl_mtx);
                    940:        wapbl_remove_buf_locked(wl, bp);
                    941:        mutex_exit(&wl->wl_mtx);
                    942: }
                    943:
                    944: void
                    945: wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
                    946: {
                    947:
                    948:        KASSERT(bp->b_cflags & BC_BUSY);
                    949:
                    950:        /*
                    951:         * XXX: why does this depend on B_LOCKED?  otherwise the buf
                    952:         * is not for a transaction?  if so, why is this called in the
                    953:         * first place?
                    954:         */
                    955:        if (bp->b_flags & B_LOCKED) {
                    956:                mutex_enter(&wl->wl_mtx);
                    957:                wl->wl_bufbytes += bp->b_bufsize - oldsz;
                    958:                wl->wl_bcount += bp->b_bcount - oldcnt;
                    959:                mutex_exit(&wl->wl_mtx);
                    960:        }
                    961: }
                    962:
                    963: #endif /* _KERNEL */
                    964:
                    965: /****************************************************************/
                    966: /* Some utility inlines */
                    967:
                    968: /* This is used to advance the pointer at old to new value at old+delta */
1.30      uebayasi  969: static inline off_t
1.2       simonb    970: wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
                    971: {
                    972:        off_t new;
                    973:
                    974:        /* Define acceptable ranges for inputs. */
                    975:        KASSERT(delta <= size);
                    976:        KASSERT((old == 0) || (old >= off));
                    977:        KASSERT(old < (size + off));
                    978:
                    979:        if ((old == 0) && (delta != 0))
                    980:                new = off + delta;
                    981:        else if ((old + delta) < (size + off))
                    982:                new = old + delta;
                    983:        else
                    984:                new = (old + delta) - size;
                    985:
                    986:        /* Note some interesting axioms */
                    987:        KASSERT((delta != 0) || (new == old));
                    988:        KASSERT((delta == 0) || (new != 0));
                    989:        KASSERT((delta != (size)) || (new == old));
                    990:
                    991:        /* Define acceptable ranges for output. */
                    992:        KASSERT((new == 0) || (new >= off));
                    993:        KASSERT(new < (size + off));
                    994:        return new;
                    995: }
                    996:
1.30      uebayasi  997: static inline size_t
1.2       simonb    998: wapbl_space_used(size_t avail, off_t head, off_t tail)
                    999: {
                   1000:
                   1001:        if (tail == 0) {
                   1002:                KASSERT(head == 0);
                   1003:                return 0;
                   1004:        }
                   1005:        return ((head + (avail - 1) - tail) % avail) + 1;
                   1006: }
                   1007:
1.30      uebayasi 1008: static inline size_t
1.2       simonb   1009: wapbl_space_free(size_t avail, off_t head, off_t tail)
                   1010: {
                   1011:
                   1012:        return avail - wapbl_space_used(avail, head, tail);
                   1013: }
                   1014:
1.30      uebayasi 1015: static inline void
1.2       simonb   1016: wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
                   1017:                   off_t *tailp)
                   1018: {
                   1019:        off_t head = *headp;
                   1020:        off_t tail = *tailp;
                   1021:
                   1022:        KASSERT(delta <= wapbl_space_free(size, head, tail));
                   1023:        head = wapbl_advance(size, off, head, delta);
                   1024:        if ((tail == 0) && (head != 0))
                   1025:                tail = off;
                   1026:        *headp = head;
                   1027:        *tailp = tail;
                   1028: }
                   1029:
1.30      uebayasi 1030: static inline void
1.2       simonb   1031: wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
                   1032:                   off_t *tailp)
                   1033: {
                   1034:        off_t head = *headp;
                   1035:        off_t tail = *tailp;
                   1036:
                   1037:        KASSERT(delta <= wapbl_space_used(size, head, tail));
                   1038:        tail = wapbl_advance(size, off, tail, delta);
                   1039:        if (head == tail) {
                   1040:                head = tail = 0;
                   1041:        }
                   1042:        *headp = head;
                   1043:        *tailp = tail;
                   1044: }
                   1045:
                   1046: #ifdef _KERNEL
                   1047:
                   1048: /****************************************************************/
                   1049:
                   1050: /*
                   1051:  * Remove transactions whose buffers are completely flushed to disk.
                   1052:  * Will block until at least minfree space is available.
                   1053:  * only intended to be called from inside wapbl_flush and therefore
                   1054:  * does not protect against commit races with itself or with flush.
                   1055:  */
                   1056: static int
                   1057: wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
                   1058: {
                   1059:        size_t delta;
                   1060:        size_t avail;
                   1061:        off_t head;
                   1062:        off_t tail;
                   1063:        int error = 0;
                   1064:
                   1065:        KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
                   1066:        KASSERT(rw_write_held(&wl->wl_rwlock));
                   1067:
                   1068:        mutex_enter(&wl->wl_mtx);
                   1069:
                   1070:        /*
                   1071:         * First check to see if we have to do a commit
                   1072:         * at all.
                   1073:         */
                   1074:        avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
                   1075:        if (minfree < avail) {
                   1076:                mutex_exit(&wl->wl_mtx);
                   1077:                return 0;
                   1078:        }
                   1079:        minfree -= avail;
                   1080:        while ((wl->wl_error_count == 0) &&
                   1081:            (wl->wl_reclaimable_bytes < minfree)) {
                   1082:                WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
                   1083:                    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
                   1084:                    "minfree=%zd\n",
                   1085:                     &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
                   1086:                    minfree));
                   1087:
                   1088:                cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
                   1089:        }
                   1090:        if (wl->wl_reclaimable_bytes < minfree) {
                   1091:                KASSERT(wl->wl_error_count);
                   1092:                /* XXX maybe get actual error from buffer instead someday? */
                   1093:                error = EIO;
                   1094:        }
                   1095:        head = wl->wl_head;
                   1096:        tail = wl->wl_tail;
                   1097:        delta = wl->wl_reclaimable_bytes;
                   1098:
                   1099:        /* If all of the entries are flushed, then be sure to keep
                   1100:         * the reserved bytes reserved.  Watch out for discarded transactions,
                   1101:         * which could leave more bytes reserved than are reclaimable.
                   1102:         */
                   1103:        if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
                   1104:            (delta >= wl->wl_reserved_bytes)) {
                   1105:                delta -= wl->wl_reserved_bytes;
                   1106:        }
                   1107:        wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
                   1108:                           &tail);
                   1109:        KDASSERT(wl->wl_reserved_bytes <=
                   1110:                wapbl_space_used(wl->wl_circ_size, head, tail));
                   1111:        mutex_exit(&wl->wl_mtx);
                   1112:
                   1113:        if (error)
                   1114:                return error;
                   1115:
                   1116:        if (waitonly)
                   1117:                return 0;
                   1118:
                   1119:        /*
                   1120:         * This is where head, tail and delta are unprotected
                   1121:         * from races against itself or flush.  This is ok since
                   1122:         * we only call this routine from inside flush itself.
                   1123:         *
                   1124:         * XXX: how can it race against itself when accessed only
                   1125:         * from behind the write-locked rwlock?
                   1126:         */
                   1127:        error = wapbl_write_commit(wl, head, tail);
                   1128:        if (error)
                   1129:                return error;
                   1130:
                   1131:        wl->wl_head = head;
                   1132:        wl->wl_tail = tail;
                   1133:
                   1134:        mutex_enter(&wl->wl_mtx);
                   1135:        KASSERT(wl->wl_reclaimable_bytes >= delta);
                   1136:        wl->wl_reclaimable_bytes -= delta;
                   1137:        mutex_exit(&wl->wl_mtx);
                   1138:        WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
                   1139:            ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
                   1140:            curproc->p_pid, curlwp->l_lid, delta));
                   1141:
                   1142:        return 0;
                   1143: }
                   1144:
                   1145: /****************************************************************/
                   1146:
                         /*
                          * wapbl_biodone: I/O completion callback for buffers written out by
                          * wapbl_flush(), which stores this function in bp->b_iodone and the
                          * owning wapbl_entry in bp->b_private.
                          *
                          * Each call decrements the entry's outstanding buffer count.  Once
                          * that count reaches zero, every entry at the head of wl_entries
                          * whose count is also zero is unlinked and freed; if any bytes were
                          * reclaimed they are added to wl_reclaimable_bytes and waiters on
                          * wl_reclaimable_cv are woken.  A buffer that completes with b_error
                          * set marks the log as errored via wl_error_count.  The buffer is
                          * released with brelse() on every compiled path.
                          */
                   1147: void
                   1148: wapbl_biodone(struct buf *bp)
                   1149: {
                   1150:        struct wapbl_entry *we = bp->b_private;
                   1151:        struct wapbl *wl = we->we_wapbl;
                   1152:
                   1153:        /*
                   1154:         * Handle possible flushing of buffers after log has been
                   1155:         * decommissioned.
                   1156:         */
                   1157:        if (!wl) {
                                        /*
                                         * The entry no longer points at a log, so only the
                                         * entry's own counters are updated (no wl_mtx taken);
                                         * the entry is freed when its last buffer completes.
                                         */
                   1158:                KASSERT(we->we_bufcount > 0);
                   1159:                we->we_bufcount--;
                   1160: #ifdef WAPBL_DEBUG_BUFBYTES
                   1161:                KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
                   1162:                we->we_unsynced_bufbytes -= bp->b_bufsize;
                   1163: #endif
                   1164:
                   1165:                if (we->we_bufcount == 0) {
                   1166: #ifdef WAPBL_DEBUG_BUFBYTES
                   1167:                        KASSERT(we->we_unsynced_bufbytes == 0);
                   1168: #endif
1.18      yamt     1169:                        wapbl_free(we, sizeof(*we));
1.2       simonb   1170:                }
                   1171:
                   1172:                brelse(bp, 0);
                   1173:                return;
                   1174:        }
                   1175:
                   1176: #ifdef ohbother
                   1177:        KDASSERT(bp->b_flags & B_DONE);
                   1178:        KDASSERT(!(bp->b_flags & B_DELWRI));
                   1179:        KDASSERT(bp->b_flags & B_ASYNC);
                   1180:        KDASSERT(bp->b_flags & B_BUSY);
                   1181:        KDASSERT(!(bp->b_flags & B_LOCKED));
                   1182:        KDASSERT(!(bp->b_flags & B_READ));
                   1183:        KDASSERT(!(bp->b_flags & B_INVAL));
                   1184:        KDASSERT(!(bp->b_flags & B_NOCACHE));
                   1185: #endif
                   1186:
                   1187:        if (bp->b_error) {
                   1188: #ifdef notyet /* Can't currently handle possible dirty buffer reuse */
1.26      apb      1189:                /*
                   1190:                 * XXXpooka: interfaces not fully updated
                   1191:                 * Note: this was not enabled in the original patch
                   1192:                 * against netbsd4 either.  I don't know if comment
                   1193:                 * above is true or not.
                   1194:                 */
1.2       simonb   1195:
                   1196:                /*
                   1197:                 * If an error occurs, report the error and leave the
                   1198:                 * buffer as a delayed write on the LRU queue.
                   1199:                 * restarting the write would likely result in
                   1200:                 * an error spinloop, so let it be done harmlessly
                   1201:                 * by the syncer.
                   1202:                 */
                   1203:                bp->b_flags &= ~(B_DONE);
                   1204:                simple_unlock(&bp->b_interlock);
                   1205:
                   1206:                if (we->we_error == 0) {
                   1207:                        mutex_enter(&wl->wl_mtx);
                   1208:                        wl->wl_error_count++;
                   1209:                        mutex_exit(&wl->wl_mtx);
                   1210:                        cv_broadcast(&wl->wl_reclaimable_cv);
                   1211:                }
                   1212:                we->we_error = bp->b_error;
                   1213:                bp->b_error = 0;
                   1214:                brelse(bp);
                   1215:                return;
                   1216: #else
                   1217:                /* For now, just mark the log permanently errored out */
                                        /*
                                         * wl_error_count is only raised from zero here, and
                                         * this branch does not record the error in
                                         * we->we_error.  Waiters are woken so that they can
                                         * observe the error count.
                                         */
                   1218:
                   1219:                mutex_enter(&wl->wl_mtx);
                   1220:                if (wl->wl_error_count == 0) {
                   1221:                        wl->wl_error_count++;
                   1222:                        cv_broadcast(&wl->wl_reclaimable_cv);
                   1223:                }
                   1224:                mutex_exit(&wl->wl_mtx);
                   1225: #endif
                   1226:        }
                   1227:
                                /* The entry and log accounting below is done under wl_mtx. */
                   1228:        mutex_enter(&wl->wl_mtx);
                   1229:
                   1230:        KASSERT(we->we_bufcount > 0);
                   1231:        we->we_bufcount--;
                   1232: #ifdef WAPBL_DEBUG_BUFBYTES
                   1233:        KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
                   1234:        we->we_unsynced_bufbytes -= bp->b_bufsize;
                   1235:        KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
                   1236:        wl->wl_unsynced_bufbytes -= bp->b_bufsize;
                   1237: #endif
                   1238:
                   1239:        /*
                   1240:         * If the current transaction can be reclaimed, start
                   1241:         * at the beginning and reclaim any consecutive reclaimable
                   1242:         * transactions.  If we successfully reclaim anything,
                   1243:         * then wakeup anyone waiting for the reclaim.
                   1244:         */
                   1245:        if (we->we_bufcount == 0) {
                   1246:                size_t delta = 0;
                   1247:                int errcnt = 0;
                   1248: #ifdef WAPBL_DEBUG_BUFBYTES
                   1249:                KDASSERT(we->we_unsynced_bufbytes == 0);
                   1250: #endif
                   1251:                /*
                   1252:                 * clear any posted error, since the buffer it came from
                   1253:                 * has successfully flushed by now
                   1254:                 */
                                        /*
                                         * Note that "we" is reused as the list cursor from
                                         * here on; entries are only taken from the head of
                                         * the queue, so the walk stops at the first entry
                                         * that still has buffers outstanding.
                                         */
                   1255:                while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
                   1256:                       (we->we_bufcount == 0)) {
                   1257:                        delta += we->we_reclaimable_bytes;
                   1258:                        if (we->we_error)
                   1259:                                errcnt++;
                   1260:                        SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
1.18      yamt     1261:                        wapbl_free(we, sizeof(*we));
1.2       simonb   1262:                }
                   1263:
                                        /*
                                         * Credit the reclaimed space and wake threads
                                         * sleeping on wl_reclaimable_cv (wapbl_truncate()
                                         * waits on it).
                                         */
                   1264:                if (delta) {
                   1265:                        wl->wl_reclaimable_bytes += delta;
                   1266:                        KASSERT(wl->wl_error_count >= errcnt);
                   1267:                        wl->wl_error_count -= errcnt;
                   1268:                        cv_broadcast(&wl->wl_reclaimable_cv);
                   1269:                }
                   1270:        }
                   1271:
                   1272:        mutex_exit(&wl->wl_mtx);
                   1273:        brelse(bp, 0);
                   1274: }
                   1275:
                   1276: /*
                   1277:  * Write transactions to disk + start I/O for contents
                          *
                          * wl:      journal to flush.
                          * waitfor: when zero, return 0 immediately if no buffers are queued
                          *          (wl_bufcount == 0).  When non-zero, additionally call
                          *          wapbl_truncate() for the whole circular log less
                          *          wl_reserved_bytes before returning.
                          *
                          * wl_rwlock is held as writer from the wl_flush callback until return.
                          * The sequence is: run the wl_flush callback, make room in the log,
                          * write blocks, revocations and inodes, write the commit record,
                          * queue a new wapbl_entry, then start asynchronous writes of every
                          * buffer on wl_bufs with wapbl_biodone() as completion callback.
                          *
                          * Returns 0 on success, otherwise the error returned by
                          * wapbl_truncate() or one of the wapbl_write_*() routines; on error
                          * the wl_flush_abort callback is invoked.  Panics if the transaction
                          * is larger than wl_circ_size - wl_reserved_bytes.
                   1278:  */
                   1279: int
                   1280: wapbl_flush(struct wapbl *wl, int waitfor)
                   1281: {
                   1282:        struct buf *bp;
                   1283:        struct wapbl_entry *we;
                   1284:        off_t off;
                   1285:        off_t head;
                   1286:        off_t tail;
                   1287:        size_t delta = 0;
                   1288:        size_t flushsize;
                   1289:        size_t reserved;
                   1290:        int error = 0;
                   1291:
                   1292:        /*
                   1293:         * Do a quick check to see if a full flush can be skipped
                   1294:         * This assumes that the flush callback does not need to be called
                   1295:         * unless there are other outstanding bufs.
                   1296:         */
                   1297:        if (!waitfor) {
                   1298:                size_t nbufs;
                   1299:                mutex_enter(&wl->wl_mtx);       /* XXX need mutex here to
                   1300:                                                   protect the KASSERTS */
                   1301:                nbufs = wl->wl_bufcount;
                   1302:                KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
                   1303:                KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
                   1304:                mutex_exit(&wl->wl_mtx);
                   1305:                if (nbufs == 0)
                   1306:                        return 0;
                   1307:        }
                   1308:
                   1309:        /*
                   1310:         * XXX we may consider using LK_UPGRADE here
                   1311:         * if we want to call flush from inside a transaction
                   1312:         */
                   1313:        rw_enter(&wl->wl_rwlock, RW_WRITER);
                                /* Run the wl_flush callback with the pending deallocation arrays. */
                   1314:        wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
                   1315:            wl->wl_dealloccnt);
                   1316:
                   1317:        /*
                   1318:         * Now that we are fully locked and flushed,
                   1319:         * do another check for nothing to do.
                   1320:         */
                   1321:        if (wl->wl_bufcount == 0) {
                   1322:                goto out;
                   1323:        }
                   1324:
                   1325: #if 0
                   1326:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1327:                     ("wapbl_flush thread %d.%d flushing entries with "
                   1328:                      "bufcount=%zu bufbytes=%zu\n",
                   1329:                      curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
                   1330:                      wl->wl_bufbytes));
                   1331: #endif
                   1332:
                   1333:        /* Calculate amount of space needed to flush */
                   1334:        flushsize = wapbl_transaction_len(wl);
                   1335:
                   1336:        if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
                   1337:                /*
                   1338:                 * XXX this could be handled more gracefully, perhaps place
                   1339:                 * only a partial transaction in the log and allow the
                   1340:                 * remaining to flush without the protection of the journal.
                   1341:                 */
                   1342:                panic("wapbl_flush: current transaction too big to flush\n");
                   1343:        }
                   1344:
                                /* Make sure the log has room for flushsize bytes before writing. */
                   1345:        error = wapbl_truncate(wl, flushsize, 0);
                   1346:        if (error)
                   1347:                goto out2;
                   1348:
                                /*
                                 * Write the transaction starting at the current head; "off" is
                                 * advanced by each writer and is checked against the computed
                                 * new head further down.
                                 */
                   1349:        off = wl->wl_head;
                   1350:        KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
                   1351:                              (off < wl->wl_circ_off + wl->wl_circ_size)));
                   1352:        error = wapbl_write_blocks(wl, &off);
                   1353:        if (error)
                   1354:                goto out2;
                   1355:        error = wapbl_write_revocations(wl, &off);
                   1356:        if (error)
                   1357:                goto out2;
                   1358:        error = wapbl_write_inodes(wl, &off);
                   1359:        if (error)
                   1360:                goto out2;
                   1361:
                                /* Space to keep reserved while inodes remain registered. */
                   1362:        reserved = 0;
                   1363:        if (wl->wl_inohashcnt)
                   1364:                reserved = wapbl_transaction_inodes_len(wl);
                   1365:
                   1366:        head = wl->wl_head;
                   1367:        tail = wl->wl_tail;
                   1368:
                   1369:        wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
                   1370:            &head, &tail);
                   1371: #ifdef WAPBL_DEBUG
                   1372:        if (head != off) {
                   1373:                panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
                   1374:                      " off=%"PRIdMAX" flush=%zu\n",
                   1375:                      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
                   1376:                      flushsize);
                   1377:        }
                   1378: #else
                   1379:        KASSERT(head == off);
                   1380: #endif
                   1381:
                   1382:        /* Opportunistically move the tail forward if we can */
                   1383:        if (!wapbl_lazy_truncate) {
                   1384:                mutex_enter(&wl->wl_mtx);
                   1385:                delta = wl->wl_reclaimable_bytes;
                   1386:                mutex_exit(&wl->wl_mtx);
                   1387:                wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
                   1388:                    &head, &tail);
                   1389:        }
                   1390:
                   1391:        error = wapbl_write_commit(wl, head, tail);
                   1392:        if (error)
                   1393:                goto out2;
                   1394:
                                /*
                                 * Entry tracking the buffers of this transaction;
                                 * wapbl_biodone() frees it after the last buffer completes.
                                 */
                   1395:        we = wapbl_calloc(1, sizeof(*we));
                   1396:
                   1397: #ifdef WAPBL_DEBUG_BUFBYTES
                   1398:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1399:                ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
                   1400:                 " unsynced=%zu"
                   1401:                 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
                   1402:                 "inodes=%d\n",
                   1403:                 curproc->p_pid, curlwp->l_lid, flushsize, delta,
                   1404:                 wapbl_space_used(wl->wl_circ_size, head, tail),
                   1405:                 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
                   1406:                 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
                   1407:                 wl->wl_inohashcnt));
                   1408: #else
                   1409:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1410:                ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
                   1411:                 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
                   1412:                 "inodes=%d\n",
                   1413:                 curproc->p_pid, curlwp->l_lid, flushsize, delta,
                   1414:                 wapbl_space_used(wl->wl_circ_size, head, tail),
                   1415:                 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
                   1416:                 wl->wl_dealloccnt, wl->wl_inohashcnt));
                   1417: #endif
                   1418:
                   1419:
                                /*
                                 * Publish the new log state and queue the entry.  bufcache_lock
                                 * is taken before wl_mtx here and in the loop below.
                                 */
                   1420:        mutex_enter(&bufcache_lock);
                   1421:        mutex_enter(&wl->wl_mtx);
                   1422:
                   1423:        wl->wl_reserved_bytes = reserved;
                   1424:        wl->wl_head = head;
                   1425:        wl->wl_tail = tail;
                   1426:        KASSERT(wl->wl_reclaimable_bytes >= delta);
                   1427:        wl->wl_reclaimable_bytes -= delta;
                   1428:        wl->wl_dealloccnt = 0;
                   1429: #ifdef WAPBL_DEBUG_BUFBYTES
                   1430:        wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
                   1431: #endif
                   1432:
                   1433:        we->we_wapbl = wl;
                   1434:        we->we_bufcount = wl->wl_bufcount;
                   1435: #ifdef WAPBL_DEBUG_BUFBYTES
                   1436:        we->we_unsynced_bufbytes = wl->wl_bufbytes;
                   1437: #endif
                   1438:        we->we_reclaimable_bytes = flushsize;
                   1439:        we->we_error = 0;
                   1440:        SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
                   1441:
                   1442:        /*
                   1443:         * this flushes bufs in reverse order than they were queued
                   1444:         * it shouldn't matter, but if we care we could use TAILQ instead.
                   1445:         * XXX Note they will get put on the lru queue when they flush
                   1446:         * so we might actually want to change this to preserve order.
                   1447:         */
                   1448:        while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
                                        /*
                                         * A non-zero return from bbusy() restarts the scan
                                         * from the head of the list.
                                         */
                   1449:                if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
                   1450:                        continue;
                   1451:                }
                   1452:                bp->b_iodone = wapbl_biodone;
                   1453:                bp->b_private = we;
                   1454:                bremfree(bp);
                   1455:                wapbl_remove_buf_locked(wl, bp);
                                        /*
                                         * Both locks are dropped around bawrite() and
                                         * retaken in the same order afterwards.
                                         */
                   1456:                mutex_exit(&wl->wl_mtx);
                   1457:                mutex_exit(&bufcache_lock);
                   1458:                bawrite(bp);
                   1459:                mutex_enter(&bufcache_lock);
                   1460:                mutex_enter(&wl->wl_mtx);
                   1461:        }
                   1462:        mutex_exit(&wl->wl_mtx);
                   1463:        mutex_exit(&bufcache_lock);
                   1464:
                   1465: #if 0
                   1466:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1467:                     ("wapbl_flush thread %d.%d done flushing entries...\n",
                   1468:                     curproc->p_pid, curlwp->l_lid));
                   1469: #endif
                   1470:
                   1471:  out:
                   1472:
                   1473:        /*
                   1474:         * If the waitfor flag is set, don't return until everything is
                   1475:         * fully flushed and the on disk log is empty.
                   1476:         */
                   1477:        if (waitfor) {
                   1478:                error = wapbl_truncate(wl, wl->wl_circ_size -
                   1479:                        wl->wl_reserved_bytes, wapbl_lazy_truncate);
                   1480:        }
                   1481:
                                /*
                                 * out2 is the error exit; the success path falls through to it
                                 * with error == 0 unless the truncate above failed.
                                 */
                   1482:  out2:
                   1483:        if (error) {
                   1484:                wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
                   1485:                    wl->wl_dealloclens, wl->wl_dealloccnt);
                   1486:        }
                   1487:
                   1488: #ifdef WAPBL_DEBUG_PRINT
                   1489:        if (error) {
                   1490:                pid_t pid = -1;
                   1491:                lwpid_t lid = -1;
                   1492:                if (curproc)
                   1493:                        pid = curproc->p_pid;
                   1494:                if (curlwp)
                   1495:                        lid = curlwp->l_lid;
                   1496:                mutex_enter(&wl->wl_mtx);
                   1497: #ifdef WAPBL_DEBUG_BUFBYTES
                   1498:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1499:                    ("wapbl_flush: thread %d.%d aborted flush: "
                   1500:                    "error = %d\n"
                   1501:                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                   1502:                    "deallocs=%d inodes=%d\n"
                   1503:                    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
                   1504:                    "unsynced=%zu\n",
                   1505:                    pid, lid, error, wl->wl_bufcount,
                   1506:                    wl->wl_bufbytes, wl->wl_bcount,
                   1507:                    wl->wl_dealloccnt, wl->wl_inohashcnt,
                   1508:                    wl->wl_error_count, wl->wl_reclaimable_bytes,
                   1509:                    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
                   1510:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                   1511:                        WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1512:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                   1513:                             "error = %d, unsynced = %zu\n",
                   1514:                             we->we_bufcount, we->we_reclaimable_bytes,
                   1515:                             we->we_error, we->we_unsynced_bufbytes));
                   1516:                }
                   1517: #else
                   1518:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1519:                    ("wapbl_flush: thread %d.%d aborted flush: "
                   1520:                     "error = %d\n"
                   1521:                     "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                   1522:                     "deallocs=%d inodes=%d\n"
                   1523:                     "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
                   1524:                     pid, lid, error, wl->wl_bufcount,
                   1525:                     wl->wl_bufbytes, wl->wl_bcount,
                   1526:                     wl->wl_dealloccnt, wl->wl_inohashcnt,
                   1527:                     wl->wl_error_count, wl->wl_reclaimable_bytes,
                   1528:                     wl->wl_reserved_bytes));
                   1529:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                   1530:                        WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1531:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                   1532:                             "error = %d\n", we->we_bufcount,
                   1533:                             we->we_reclaimable_bytes, we->we_error));
                   1534:                }
                   1535: #endif
                   1536:                mutex_exit(&wl->wl_mtx);
                   1537:        }
                   1538: #endif
                   1539:
                   1540:        rw_exit(&wl->wl_rwlock);
                   1541:        return error;
                   1542: }
                   1543:
                   1544: /****************************************************************/
                   1545:
                   1546: void
                   1547: wapbl_jlock_assert(struct wapbl *wl)
                   1548: {
                   1549:
                                /*
                                 * Assert that the journal lock wl_rwlock is held.
                                 * NOTE(review): rw_lock_held() presumably accepts either a
                                 * read or a write hold -- confirm against rwlock(9).
                                 */
1.23      ad       1550:        KASSERT(rw_lock_held(&wl->wl_rwlock));
1.2       simonb   1551: }
                   1552:
                   1553: void
                   1554: wapbl_junlock_assert(struct wapbl *wl)
                   1555: {
                   1556:
                                /*
                                 * Assert that the journal lock wl_rwlock is not write-held.
                                 * NOTE(review): only the write hold is tested, so a read
                                 * hold does not trip this assertion.
                                 */
                   1557:        KASSERT(!rw_write_held(&wl->wl_rwlock));
                   1558: }
                   1559:
                   1560: /****************************************************************/
                   1561:
                   1562: /* locks missing */
                         /*
                          * wapbl_print: dump the state of journal wl through the printf-like
                          * callback pr.
                          *
                          * Always prints the log location, the circular-buffer geometry, the
                          * summary counters and one line per entry on wl_entries.  When full
                          * is non-zero it additionally lists the buffers on wl_bufs, the
                          * pending deallocations and the registered inodes.
                          *
                          * No locks are taken while the structure is read.
                          */
                   1563: void
                   1564: wapbl_print(struct wapbl *wl,
                   1565:                int full,
                   1566:                void (*pr)(const char *, ...))
                   1567: {
                   1568:        struct buf *bp;
                   1569:        struct wapbl_entry *we;
                   1570:        (*pr)("wapbl %p", wl);
                   1571:        (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
                   1572:              wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
                   1573:        (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
                   1574:              wl->wl_circ_size, wl->wl_circ_off,
                   1575:              (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
                                /* Arguments follow the order of the labels in the format string. */
                   1576:        (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
                   1577:              wl->wl_fs_dev_bshift, wl->wl_log_dev_bshift);
                   1578: #ifdef WAPBL_DEBUG_BUFBYTES
                   1579:        (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
                   1580:              "reserved = %zu errcnt = %d unsynced = %zu\n",
                   1581:              wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
                   1582:              wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
                   1583:                                wl->wl_error_count, wl->wl_unsynced_bufbytes);
                   1584: #else
                   1585:        (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
                   1586:              "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
                   1587:              wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
                   1588:                                wl->wl_error_count);
                   1589: #endif
                   1590:        (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
                   1591:              wl->wl_dealloccnt, wl->wl_dealloclim);
                   1592:        (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
                   1593:              wl->wl_inohashcnt, wl->wl_inohashmask);
                   1594:        (*pr)("entries:\n");
                   1595:        SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                   1596: #ifdef WAPBL_DEBUG_BUFBYTES
                   1597:                (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
                   1598:                      "unsynced = %zu\n",
                   1599:                      we->we_bufcount, we->we_reclaimable_bytes,
                   1600:                      we->we_error, we->we_unsynced_bufbytes);
                   1601: #else
                   1602:                (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
                   1603:                      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
                   1604: #endif
                   1605:        }
                   1606:        if (full) {
                   1607:                int cnt = 0;
                   1608:                (*pr)("bufs =");
                                        /*
                                         * Comma-separated buffer pointers, wrapped after every
                                         * sixth one; the last buffer gets no trailing comma.
                                         */
                   1609:                LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
                   1610:                        if (!LIST_NEXT(bp, b_wapbllist)) {
                   1611:                                (*pr)(" %p", bp);
                   1612:                        } else if ((++cnt % 6) == 0) {
                   1613:                                (*pr)(" %p,\n\t", bp);
                   1614:                        } else {
                   1615:                                (*pr)(" %p,", bp);
                   1616:                        }
                   1617:                }
                   1618:                (*pr)("\n");
                   1619:
                   1620:                (*pr)("dealloced blks = ");
                   1621:                {
                   1622:                        int i;
                   1623:                        cnt = 0;
                                                /* "block:length" pairs, four per output line. */
                   1624:                        for (i = 0; i < wl->wl_dealloccnt; i++) {
                   1625:                                (*pr)(" %"PRId64":%d,",
                   1626:                                      wl->wl_deallocblks[i],
                   1627:                                      wl->wl_dealloclens[i]);
                   1628:                                if ((++cnt % 4) == 0) {
                   1629:                                        (*pr)("\n\t");
                   1630:                                }
                   1631:                        }
                   1632:                }
                   1633:                (*pr)("\n");
                   1634:
                   1635:                (*pr)("registered inodes = ");
                   1636:                {
                   1637:                        int i;
                   1638:                        cnt = 0;
                                                /*
                                                 * Walk every hash bucket (0..wl_inohashmask);
                                                 * entries with wi_ino == 0 are skipped.
                                                 */
                   1639:                        for (i = 0; i <= wl->wl_inohashmask; i++) {
                   1640:                                struct wapbl_ino_head *wih;
                   1641:                                struct wapbl_ino *wi;
                   1642:
                   1643:                                wih = &wl->wl_inohash[i];
                   1644:                                LIST_FOREACH(wi, wih, wi_hash) {
                   1645:                                        if (wi->wi_ino == 0)
                   1646:                                                continue;
                   1647:                                        (*pr)(" %"PRId32"/0%06"PRIo32",",
                   1648:                                            wi->wi_ino, wi->wi_mode);
                   1649:                                        if ((++cnt % 4) == 0) {
                   1650:                                                (*pr)("\n\t");
                   1651:                                        }
                   1652:                                }
                   1653:                        }
                   1654:                        (*pr)("\n");
                   1655:                }
                   1656:        }
                   1657: }
                   1658:
                   #if defined(WAPBL_DEBUG) || defined(DDB)
                   /*
                    * Print the full description of a log through printf.
                    *
                    * When built with WAPBL_DEBUG, a NULL wl selects wapbl_debug_wl
                    * instead.  If there is still no log to describe, nothing is printed.
                    */
                   void
                   wapbl_dump(struct wapbl *wl)
                   {
                   	struct wapbl *target = wl;

                   #if defined(WAPBL_DEBUG)
                   	if (target == NULL)
                   		target = wapbl_debug_wl;
                   #endif
                   	if (target != NULL)
                   		wapbl_print(target, 1, printf);
                   }
                   #endif
                   1672:
                   1673: /****************************************************************/
                   1674:
                   1675: void
                   1676: wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
                   1677: {
                   1678:
                   1679:        wapbl_jlock_assert(wl);
                   1680:
                   1681:        /* XXX should eventually instead tie this into resource estimation */
1.27      pooka    1682:        /*
                   1683:         * XXX this panic needs locking/mutex analysis and the
                   1684:         * ability to cope with the failure.
                   1685:         */
                   1686:        /* XXX this XXX doesn't have enough XXX */
                   1687:        if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
                   1688:                panic("wapbl_register_deallocation: out of resources");
                   1689:
1.2       simonb   1690:        wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
                   1691:        wl->wl_dealloclens[wl->wl_dealloccnt] = len;
                   1692:        wl->wl_dealloccnt++;
                   1693:        WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
                   1694:            ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
                   1695: }
                   1696:
                   1697: /****************************************************************/
                   1698:
                   1699: static void
                   1700: wapbl_inodetrk_init(struct wapbl *wl, u_int size)
                   1701: {
                   1702:
                   1703:        wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
                   1704:        if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
                   1705:                pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
                   1706:                    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
                   1707:        }
                   1708: }
                   1709:
                   1710: static void
                   1711: wapbl_inodetrk_free(struct wapbl *wl)
                   1712: {
                   1713:
                   1714:        /* XXX this KASSERT needs locking/mutex analysis */
                   1715:        KASSERT(wl->wl_inohashcnt == 0);
                   1716:        hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
                   1717:        if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
                   1718:                pool_destroy(&wapbl_ino_pool);
                   1719:        }
                   1720: }
                   1721:
                   1722: static struct wapbl_ino *
                   1723: wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
                   1724: {
                   1725:        struct wapbl_ino_head *wih;
                   1726:        struct wapbl_ino *wi;
                   1727:
                   1728:        KASSERT(mutex_owned(&wl->wl_mtx));
                   1729:
                   1730:        wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
                   1731:        LIST_FOREACH(wi, wih, wi_hash) {
                   1732:                if (ino == wi->wi_ino)
                   1733:                        return wi;
                   1734:        }
                   1735:        return 0;
                   1736: }
                   1737:
                   1738: void
                   1739: wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
                   1740: {
                   1741:        struct wapbl_ino_head *wih;
                   1742:        struct wapbl_ino *wi;
                   1743:
                   1744:        wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
                   1745:
                   1746:        mutex_enter(&wl->wl_mtx);
                   1747:        if (wapbl_inodetrk_get(wl, ino) == NULL) {
                   1748:                wi->wi_ino = ino;
                   1749:                wi->wi_mode = mode;
                   1750:                wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
                   1751:                LIST_INSERT_HEAD(wih, wi, wi_hash);
                   1752:                wl->wl_inohashcnt++;
                   1753:                WAPBL_PRINTF(WAPBL_PRINT_INODE,
                   1754:                    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
                   1755:                mutex_exit(&wl->wl_mtx);
                   1756:        } else {
                   1757:                mutex_exit(&wl->wl_mtx);
                   1758:                pool_put(&wapbl_ino_pool, wi);
                   1759:        }
                   1760: }
                   1761:
                   1762: void
                   1763: wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
                   1764: {
                   1765:        struct wapbl_ino *wi;
                   1766:
                   1767:        mutex_enter(&wl->wl_mtx);
                   1768:        wi = wapbl_inodetrk_get(wl, ino);
                   1769:        if (wi) {
                   1770:                WAPBL_PRINTF(WAPBL_PRINT_INODE,
                   1771:                    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
                   1772:                KASSERT(wl->wl_inohashcnt > 0);
                   1773:                wl->wl_inohashcnt--;
                   1774:                LIST_REMOVE(wi, wi_hash);
                   1775:                mutex_exit(&wl->wl_mtx);
                   1776:
                   1777:                pool_put(&wapbl_ino_pool, wi);
                   1778:        } else {
                   1779:                mutex_exit(&wl->wl_mtx);
                   1780:        }
                   1781: }
                   1782:
                   1783: /****************************************************************/
                   1784:
1.30      uebayasi 1785: static inline size_t
1.2       simonb   1786: wapbl_transaction_inodes_len(struct wapbl *wl)
                   1787: {
                   1788:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   1789:        int iph;
                   1790:
                   1791:        /* Calculate number of inodes described in a inodelist header */
                   1792:        iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
                   1793:            sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
                   1794:
                   1795:        KASSERT(iph > 0);
                   1796:
                   1797:        return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen;
                   1798: }
                   1799:
                   1800:
                   1801: /* Calculate amount of space a transaction will take on disk */
                   1802: static size_t
                   1803: wapbl_transaction_len(struct wapbl *wl)
                   1804: {
                   1805:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   1806:        size_t len;
                   1807:        int bph;
                   1808:
                   1809:        /* Calculate number of blocks described in a blocklist header */
                   1810:        bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
                   1811:            sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
                   1812:
                   1813:        KASSERT(bph > 0);
                   1814:
                   1815:        len = wl->wl_bcount;
                   1816:        len += howmany(wl->wl_bufcount, bph)*blocklen;
                   1817:        len += howmany(wl->wl_dealloccnt, bph)*blocklen;
                   1818:        len += wapbl_transaction_inodes_len(wl);
                   1819:
                   1820:        return len;
                   1821: }
                   1822:
                   1823: /*
                   1824:  * Perform commit operation
                   1825:  *
                   1826:  * Note that generation number incrementation needs to
                   1827:  * be protected against racing with other invocations
                   1828:  * of wapbl_commit.  This is ok since this routine
                   1829:  * is only invoked from wapbl_flush
                   1830:  */
                   1831: static int
                   1832: wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
                   1833: {
                   1834:        struct wapbl_wc_header *wc = wl->wl_wc_header;
                   1835:        struct timespec ts;
                   1836:        int error;
                   1837:        int force = 1;
1.34    ! mlelstv  1838:        daddr_t pbn;
1.2       simonb   1839:
                   1840:        /* XXX Calc checksum here, instead we do this for now */
                   1841:        error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
                   1842:        if (error) {
                   1843:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1.29      pooka    1844:                    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%"PRIx64
                   1845:                    " returned %d\n", wl->wl_devvp->v_rdev, error));
1.2       simonb   1846:        }
                   1847:
                   1848:        wc->wc_head = head;
                   1849:        wc->wc_tail = tail;
                   1850:        wc->wc_checksum = 0;
                   1851:        wc->wc_version = 1;
                   1852:        getnanotime(&ts);
1.17      yamt     1853:        wc->wc_time = ts.tv_sec;
1.2       simonb   1854:        wc->wc_timensec = ts.tv_nsec;
                   1855:
                   1856:        WAPBL_PRINTF(WAPBL_PRINT_WRITE,
                   1857:            ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
                   1858:            (intmax_t)head, (intmax_t)tail));
                   1859:
                   1860:        /*
                   1861:         * XXX if generation will rollover, then first zero
                   1862:         * over second commit header before trying to write both headers.
                   1863:         */
                   1864:
1.34    ! mlelstv  1865:        pbn = wl->wl_logpbn + (wc->wc_generation % 2);
        !          1866: #ifdef _KERNEL
        !          1867:        pbn = btodb(pbn << wc->wc_log_dev_bshift);
        !          1868: #endif
        !          1869:        error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
1.2       simonb   1870:        if (error)
                   1871:                return error;
                   1872:
                   1873:        error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
                   1874:        if (error) {
                   1875:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1.29      pooka    1876:                    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%"PRIx64
                   1877:                    " returned %d\n", wl->wl_devvp->v_rdev, error));
1.2       simonb   1878:        }
                   1879:
                   1880:        /*
                   1881:         * If the generation number was zero, write it out a second time.
                   1882:         * This handles initialization and generation number rollover
                   1883:         */
                   1884:        if (wc->wc_generation++ == 0) {
                   1885:                error = wapbl_write_commit(wl, head, tail);
                   1886:                /*
                   1887:                 * This panic should be able to be removed if we do the
                   1888:                 * zero'ing mentioned above, and we are certain to roll
                   1889:                 * back generation number on failure.
                   1890:                 */
                   1891:                if (error)
                   1892:                        panic("wapbl_write_commit: error writing duplicate "
                   1893:                              "log header: %d\n", error);
                   1894:        }
                   1895:        return 0;
                   1896: }
                   1897:
                   1898: /* Returns new offset value */
                   1899: static int
                   1900: wapbl_write_blocks(struct wapbl *wl, off_t *offp)
                   1901: {
                   1902:        struct wapbl_wc_blocklist *wc =
                   1903:            (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
                   1904:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   1905:        int bph;
                   1906:        struct buf *bp;
                   1907:        off_t off = *offp;
                   1908:        int error;
1.7       joerg    1909:        size_t padding;
1.2       simonb   1910:
                   1911:        KASSERT(rw_write_held(&wl->wl_rwlock));
                   1912:
                   1913:        bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
                   1914:            sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
                   1915:
                   1916:        bp = LIST_FIRST(&wl->wl_bufs);
                   1917:
                   1918:        while (bp) {
                   1919:                int cnt;
                   1920:                struct buf *obp = bp;
                   1921:
                   1922:                KASSERT(bp->b_flags & B_LOCKED);
                   1923:
                   1924:                wc->wc_type = WAPBL_WC_BLOCKS;
                   1925:                wc->wc_len = blocklen;
                   1926:                wc->wc_blkcount = 0;
                   1927:                while (bp && (wc->wc_blkcount < bph)) {
                   1928:                        /*
                   1929:                         * Make sure all the physical block numbers are up to
                   1930:                         * date.  If this is not always true on a given
                   1931:                         * filesystem, then VOP_BMAP must be called.  We
                   1932:                         * could call VOP_BMAP here, or else in the filesystem
                   1933:                         * specific flush callback, although neither of those
                   1934:                         * solutions allow us to take the vnode lock.  If a
                   1935:                         * filesystem requires that we must take the vnode lock
                   1936:                         * to call VOP_BMAP, then we can probably do it in
                   1937:                         * bwrite when the vnode lock should already be held
                   1938:                         * by the invoking code.
                   1939:                         */
                   1940:                        KASSERT((bp->b_vp->v_type == VBLK) ||
                   1941:                                 (bp->b_blkno != bp->b_lblkno));
                   1942:                        KASSERT(bp->b_blkno > 0);
                   1943:
                   1944:                        wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
                   1945:                        wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
                   1946:                        wc->wc_len += bp->b_bcount;
                   1947:                        wc->wc_blkcount++;
                   1948:                        bp = LIST_NEXT(bp, b_wapbllist);
                   1949:                }
1.7       joerg    1950:                if (wc->wc_len % blocklen != 0) {
                   1951:                        padding = blocklen - wc->wc_len % blocklen;
                   1952:                        wc->wc_len += padding;
                   1953:                } else {
                   1954:                        padding = 0;
                   1955:                }
                   1956:
1.2       simonb   1957:                WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1.7       joerg    1958:                    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
                   1959:                    wc->wc_len, padding, (intmax_t)off));
1.2       simonb   1960:
                   1961:                error = wapbl_circ_write(wl, wc, blocklen, &off);
                   1962:                if (error)
                   1963:                        return error;
                   1964:                bp = obp;
                   1965:                cnt = 0;
                   1966:                while (bp && (cnt++ < bph)) {
                   1967:                        error = wapbl_circ_write(wl, bp->b_data,
                   1968:                            bp->b_bcount, &off);
                   1969:                        if (error)
                   1970:                                return error;
                   1971:                        bp = LIST_NEXT(bp, b_wapbllist);
                   1972:                }
1.7       joerg    1973:                if (padding) {
                   1974:                        void *zero;
                   1975:
                   1976:                        zero = wapbl_malloc(padding);
                   1977:                        memset(zero, 0, padding);
                   1978:                        error = wapbl_circ_write(wl, zero, padding, &off);
1.18      yamt     1979:                        wapbl_free(zero, padding);
1.7       joerg    1980:                        if (error)
                   1981:                                return error;
                   1982:                }
1.2       simonb   1983:        }
                   1984:        *offp = off;
                   1985:        return 0;
                   1986: }
                   1987:
                   1988: static int
                   1989: wapbl_write_revocations(struct wapbl *wl, off_t *offp)
                   1990: {
                   1991:        struct wapbl_wc_blocklist *wc =
                   1992:            (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
                   1993:        int i;
                   1994:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   1995:        int bph;
                   1996:        off_t off = *offp;
                   1997:        int error;
                   1998:
                   1999:        if (wl->wl_dealloccnt == 0)
                   2000:                return 0;
                   2001:
                   2002:        bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
                   2003:            sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
                   2004:
                   2005:        i = 0;
                   2006:        while (i < wl->wl_dealloccnt) {
                   2007:                wc->wc_type = WAPBL_WC_REVOCATIONS;
                   2008:                wc->wc_len = blocklen;
                   2009:                wc->wc_blkcount = 0;
                   2010:                while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
                   2011:                        wc->wc_blocks[wc->wc_blkcount].wc_daddr =
                   2012:                            wl->wl_deallocblks[i];
                   2013:                        wc->wc_blocks[wc->wc_blkcount].wc_dlen =
                   2014:                            wl->wl_dealloclens[i];
                   2015:                        wc->wc_blkcount++;
                   2016:                        i++;
                   2017:                }
                   2018:                WAPBL_PRINTF(WAPBL_PRINT_WRITE,
                   2019:                    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
                   2020:                    wc->wc_len, (intmax_t)off));
                   2021:                error = wapbl_circ_write(wl, wc, blocklen, &off);
                   2022:                if (error)
                   2023:                        return error;
                   2024:        }
                   2025:        *offp = off;
                   2026:        return 0;
                   2027: }
                   2028:
                   2029: static int
                   2030: wapbl_write_inodes(struct wapbl *wl, off_t *offp)
                   2031: {
                   2032:        struct wapbl_wc_inodelist *wc =
                   2033:            (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
                   2034:        int i;
1.14      joerg    2035:        int blocklen = 1 << wl->wl_log_dev_bshift;
1.2       simonb   2036:        off_t off = *offp;
                   2037:        int error;
                   2038:
                   2039:        struct wapbl_ino_head *wih;
                   2040:        struct wapbl_ino *wi;
                   2041:        int iph;
                   2042:
                   2043:        iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
                   2044:            sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
                   2045:
                   2046:        i = 0;
                   2047:        wih = &wl->wl_inohash[0];
                   2048:        wi = 0;
                   2049:        do {
                   2050:                wc->wc_type = WAPBL_WC_INODES;
                   2051:                wc->wc_len = blocklen;
                   2052:                wc->wc_inocnt = 0;
                   2053:                wc->wc_clear = (i == 0);
                   2054:                while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
                   2055:                        while (!wi) {
                   2056:                                KASSERT((wih - &wl->wl_inohash[0])
                   2057:                                    <= wl->wl_inohashmask);
                   2058:                                wi = LIST_FIRST(wih++);
                   2059:                        }
                   2060:                        wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
                   2061:                        wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
                   2062:                        wc->wc_inocnt++;
                   2063:                        i++;
                   2064:                        wi = LIST_NEXT(wi, wi_hash);
                   2065:                }
                   2066:                WAPBL_PRINTF(WAPBL_PRINT_WRITE,
                   2067:                    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
                   2068:                    wc->wc_len, (intmax_t)off));
                   2069:                error = wapbl_circ_write(wl, wc, blocklen, &off);
                   2070:                if (error)
                   2071:                        return error;
                   2072:        } while (i < wl->wl_inohashcnt);
                   2073:
                   2074:        *offp = off;
                   2075:        return 0;
                   2076: }
                   2077:
                   2078: #endif /* _KERNEL */
                   2079:
                   2080: /****************************************************************/
                   2081:
                   2082: struct wapbl_blk {
                   2083:        LIST_ENTRY(wapbl_blk) wb_hash;
                   2084:        daddr_t wb_blk;
                   2085:        off_t wb_off; /* Offset of this block in the log */
                   2086: };
                   2087: #define        WAPBL_BLKPOOL_MIN 83
                   2088:
                   2089: static void
                   2090: wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
                   2091: {
                   2092:        if (size < WAPBL_BLKPOOL_MIN)
                   2093:                size = WAPBL_BLKPOOL_MIN;
                   2094:        KASSERT(wr->wr_blkhash == 0);
                   2095: #ifdef _KERNEL
                   2096:        wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
                   2097: #else /* ! _KERNEL */
                   2098:        /* Manually implement hashinit */
                   2099:        {
1.25      lukem    2100:                unsigned long i, hashsize;
1.2       simonb   2101:                for (hashsize = 1; hashsize < size; hashsize <<= 1)
                   2102:                        continue;
                   2103:                wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
                   2104:                for (i = 0; i < wr->wr_blkhashmask; i++)
                   2105:                        LIST_INIT(&wr->wr_blkhash[i]);
                   2106:                wr->wr_blkhashmask = hashsize - 1;
                   2107:        }
                   2108: #endif /* ! _KERNEL */
                   2109: }
                   2110:
                   2111: static void
                   2112: wapbl_blkhash_free(struct wapbl_replay *wr)
                   2113: {
                   2114:        KASSERT(wr->wr_blkhashcnt == 0);
                   2115: #ifdef _KERNEL
                   2116:        hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
                   2117: #else /* ! _KERNEL */
1.18      yamt     2118:        wapbl_free(wr->wr_blkhash,
                   2119:            (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
1.2       simonb   2120: #endif /* ! _KERNEL */
                   2121: }
                   2122:
                   2123: static struct wapbl_blk *
                   2124: wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
                   2125: {
                   2126:        struct wapbl_blk_head *wbh;
                   2127:        struct wapbl_blk *wb;
                   2128:        wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
                   2129:        LIST_FOREACH(wb, wbh, wb_hash) {
                   2130:                if (blk == wb->wb_blk)
                   2131:                        return wb;
                   2132:        }
                   2133:        return 0;
                   2134: }
                   2135:
                   2136: static void
                   2137: wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
                   2138: {
                   2139:        struct wapbl_blk_head *wbh;
                   2140:        struct wapbl_blk *wb;
                   2141:        wb = wapbl_blkhash_get(wr, blk);
                   2142:        if (wb) {
                   2143:                KASSERT(wb->wb_blk == blk);
                   2144:                wb->wb_off = off;
                   2145:        } else {
                   2146:                wb = wapbl_malloc(sizeof(*wb));
                   2147:                wb->wb_blk = blk;
                   2148:                wb->wb_off = off;
                   2149:                wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
                   2150:                LIST_INSERT_HEAD(wbh, wb, wb_hash);
                   2151:                wr->wr_blkhashcnt++;
                   2152:        }
                   2153: }
                   2154:
                   2155: static void
                   2156: wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
                   2157: {
                   2158:        struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
                   2159:        if (wb) {
                   2160:                KASSERT(wr->wr_blkhashcnt > 0);
                   2161:                wr->wr_blkhashcnt--;
                   2162:                LIST_REMOVE(wb, wb_hash);
1.18      yamt     2163:                wapbl_free(wb, sizeof(*wb));
1.2       simonb   2164:        }
                   2165: }
                   2166:
                   2167: static void
                   2168: wapbl_blkhash_clear(struct wapbl_replay *wr)
                   2169: {
1.25      lukem    2170:        unsigned long i;
1.2       simonb   2171:        for (i = 0; i <= wr->wr_blkhashmask; i++) {
                   2172:                struct wapbl_blk *wb;
                   2173:
                   2174:                while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
                   2175:                        KASSERT(wr->wr_blkhashcnt > 0);
                   2176:                        wr->wr_blkhashcnt--;
                   2177:                        LIST_REMOVE(wb, wb_hash);
1.18      yamt     2178:                        wapbl_free(wb, sizeof(*wb));
1.2       simonb   2179:                }
                   2180:        }
                   2181:        KASSERT(wr->wr_blkhashcnt == 0);
                   2182: }
                   2183:
                   2184: /****************************************************************/
                   2185:
                   2186: static int
                   2187: wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
                   2188: {
                   2189:        size_t slen;
                   2190:        off_t off = *offp;
                   2191:        int error;
1.34    ! mlelstv  2192:        daddr_t pbn;
1.2       simonb   2193:
1.14      joerg    2194:        KASSERT(((len >> wr->wr_log_dev_bshift) <<
                   2195:            wr->wr_log_dev_bshift) == len);
1.34    ! mlelstv  2196:
1.14      joerg    2197:        if (off < wr->wr_circ_off)
                   2198:                off = wr->wr_circ_off;
                   2199:        slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2       simonb   2200:        if (slen < len) {
1.34    ! mlelstv  2201:                pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
        !          2202: #ifdef _KERNEL
        !          2203:                pbn = btodb(pbn << wr->wr_log_dev_bshift);
        !          2204: #endif
        !          2205:                error = wapbl_read(data, slen, wr->wr_devvp, pbn);
1.2       simonb   2206:                if (error)
                   2207:                        return error;
                   2208:                data = (uint8_t *)data + slen;
                   2209:                len -= slen;
1.14      joerg    2210:                off = wr->wr_circ_off;
1.2       simonb   2211:        }
1.34    ! mlelstv  2212:        pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
        !          2213: #ifdef _KERNEL
        !          2214:        pbn = btodb(pbn << wr->wr_log_dev_bshift);
        !          2215: #endif
        !          2216:        error = wapbl_read(data, len, wr->wr_devvp, pbn);
1.2       simonb   2217:        if (error)
                   2218:                return error;
                   2219:        off += len;
1.14      joerg    2220:        if (off >= wr->wr_circ_off + wr->wr_circ_size)
                   2221:                off = wr->wr_circ_off;
1.2       simonb   2222:        *offp = off;
                   2223:        return 0;
                   2224: }
                   2225:
                   2226: static void
                   2227: wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
                   2228: {
                   2229:        size_t slen;
                   2230:        off_t off = *offp;
                   2231:
1.14      joerg    2232:        KASSERT(((len >> wr->wr_log_dev_bshift) <<
                   2233:            wr->wr_log_dev_bshift) == len);
1.2       simonb   2234:
1.14      joerg    2235:        if (off < wr->wr_circ_off)
                   2236:                off = wr->wr_circ_off;
                   2237:        slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2       simonb   2238:        if (slen < len) {
                   2239:                len -= slen;
1.14      joerg    2240:                off = wr->wr_circ_off;
1.2       simonb   2241:        }
                   2242:        off += len;
1.14      joerg    2243:        if (off >= wr->wr_circ_off + wr->wr_circ_size)
                   2244:                off = wr->wr_circ_off;
1.2       simonb   2245:        *offp = off;
                   2246: }
                   2247:
                   2248: /****************************************************************/
                   2249:
                   2250: int
                   2251: wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
                   2252:        daddr_t off, size_t count, size_t blksize)
                   2253: {
                   2254:        struct wapbl_replay *wr;
                   2255:        int error;
                   2256:        struct vnode *devvp;
                   2257:        daddr_t logpbn;
                   2258:        uint8_t *scratch;
                   2259:        struct wapbl_wc_header *wch;
                   2260:        struct wapbl_wc_header *wch2;
                   2261:        /* Use this until we read the actual log header */
1.31      mlelstv  2262:        int log_dev_bshift = ilog2(blksize);
1.2       simonb   2263:        size_t used;
1.34    ! mlelstv  2264:        daddr_t pbn;
1.2       simonb   2265:
                   2266:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
                   2267:            ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
                   2268:            vp, off, count, blksize));
                   2269:
                   2270:        if (off < 0)
                   2271:                return EINVAL;
                   2272:
                   2273:        if (blksize < DEV_BSIZE)
                   2274:                return EINVAL;
                   2275:        if (blksize % DEV_BSIZE)
                   2276:                return EINVAL;
                   2277:
                   2278: #ifdef _KERNEL
                   2279: #if 0
                   2280:        /* XXX vp->v_size isn't reliably set for VBLK devices,
                   2281:         * especially root.  However, we might still want to verify
                   2282:         * that the full load is readable */
                   2283:        if ((off + count) * blksize > vp->v_size)
                   2284:                return EINVAL;
                   2285: #endif
                   2286:        if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
                   2287:                return error;
                   2288:        }
                   2289: #else /* ! _KERNEL */
                   2290:        devvp = vp;
                   2291:        logpbn = off;
                   2292: #endif /* ! _KERNEL */
                   2293:
                   2294:        scratch = wapbl_malloc(MAXBSIZE);
                   2295:
1.34    ! mlelstv  2296:        pbn = logpbn;
        !          2297: #ifdef _KERNEL
        !          2298:        pbn = btodb(pbn << log_dev_bshift);
        !          2299: #endif
        !          2300:        error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
1.2       simonb   2301:        if (error)
                   2302:                goto errout;
                   2303:
                   2304:        wch = (struct wapbl_wc_header *)scratch;
                   2305:        wch2 =
                   2306:            (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
                   2307:        /* XXX verify checksums and magic numbers */
                   2308:        if (wch->wc_type != WAPBL_WC_HEADER) {
                   2309:                printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
                   2310:                error = EFTYPE;
                   2311:                goto errout;
                   2312:        }
                   2313:
                   2314:        if (wch2->wc_generation > wch->wc_generation)
                   2315:                wch = wch2;
                   2316:
                   2317:        wr = wapbl_calloc(1, sizeof(*wr));
                   2318:
                   2319:        wr->wr_logvp = vp;
                   2320:        wr->wr_devvp = devvp;
                   2321:        wr->wr_logpbn = logpbn;
                   2322:
                   2323:        wr->wr_scratch = scratch;
                   2324:
1.14      joerg    2325:        wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
                   2326:        wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
                   2327:        wr->wr_circ_off = wch->wc_circ_off;
                   2328:        wr->wr_circ_size = wch->wc_circ_size;
                   2329:        wr->wr_generation = wch->wc_generation;
1.2       simonb   2330:
                   2331:        used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
                   2332:
                   2333:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
                   2334:            ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
                   2335:            " len=%"PRId64" used=%zu\n",
                   2336:            wch->wc_head, wch->wc_tail, wch->wc_circ_off,
                   2337:            wch->wc_circ_size, used));
                   2338:
                   2339:        wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
1.11      joerg    2340:
1.14      joerg    2341:        error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
1.2       simonb   2342:        if (error) {
                   2343:                wapbl_replay_stop(wr);
                   2344:                wapbl_replay_free(wr);
                   2345:                return error;
                   2346:        }
                   2347:
                   2348:        *wrp = wr;
                   2349:        return 0;
                   2350:
                   2351:  errout:
1.18      yamt     2352:        wapbl_free(scratch, MAXBSIZE);
1.2       simonb   2353:        return error;
                   2354: }
                   2355:
                   2356: void
                   2357: wapbl_replay_stop(struct wapbl_replay *wr)
                   2358: {
                   2359:
1.4       joerg    2360:        if (!wapbl_replay_isopen(wr))
                   2361:                return;
                   2362:
1.2       simonb   2363:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
                   2364:
1.18      yamt     2365:        wapbl_free(wr->wr_scratch, MAXBSIZE);
                   2366:        wr->wr_scratch = NULL;
1.2       simonb   2367:
1.18      yamt     2368:        wr->wr_logvp = NULL;
1.2       simonb   2369:
                   2370:        wapbl_blkhash_clear(wr);
                   2371:        wapbl_blkhash_free(wr);
                   2372: }
                   2373:
                   2374: void
                   2375: wapbl_replay_free(struct wapbl_replay *wr)
                   2376: {
                   2377:
                   2378:        KDASSERT(!wapbl_replay_isopen(wr));
                   2379:
                   2380:        if (wr->wr_inodes)
1.18      yamt     2381:                wapbl_free(wr->wr_inodes,
                   2382:                    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
                   2383:        wapbl_free(wr, sizeof(*wr));
1.2       simonb   2384: }
                   2385:
#ifdef _KERNEL
/*
 * Function form of wapbl_replay_isopen(); only built for the kernel.
 * Returns whatever wapbl_replay_isopen(wr) evaluates to.
 */
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{
	int is_open;

	is_open = wapbl_replay_isopen(wr);
	return is_open;
}
#endif
1.2       simonb   2394:
1.10      joerg    2395: static void
                   2396: wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
                   2397: {
                   2398:        struct wapbl_wc_blocklist *wc =
                   2399:            (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.14      joerg    2400:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10      joerg    2401:        int i, j, n;
                   2402:
                   2403:        for (i = 0; i < wc->wc_blkcount; i++) {
                   2404:                /*
                   2405:                 * Enter each physical block into the hashtable independently.
                   2406:                 */
1.14      joerg    2407:                n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10      joerg    2408:                for (j = 0; j < n; j++) {
1.34    ! mlelstv  2409:                        wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
1.10      joerg    2410:                            *offp);
                   2411:                        wapbl_circ_advance(wr, fsblklen, offp);
                   2412:                }
                   2413:        }
                   2414: }
                   2415:
                   2416: static void
                   2417: wapbl_replay_process_revocations(struct wapbl_replay *wr)
                   2418: {
                   2419:        struct wapbl_wc_blocklist *wc =
                   2420:            (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.34    ! mlelstv  2421:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10      joerg    2422:        int i, j, n;
                   2423:
                   2424:        for (i = 0; i < wc->wc_blkcount; i++) {
                   2425:                /*
                   2426:                 * Remove any blocks found from the hashtable.
                   2427:                 */
1.14      joerg    2428:                n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10      joerg    2429:                for (j = 0; j < n; j++)
1.34    ! mlelstv  2430:                        wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
1.10      joerg    2431:        }
                   2432: }
                   2433:
                   2434: static void
                   2435: wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
                   2436: {
                   2437:        struct wapbl_wc_inodelist *wc =
                   2438:            (struct wapbl_wc_inodelist *)wr->wr_scratch;
1.18      yamt     2439:        void *new_inodes;
                   2440:        const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
                   2441:
                   2442:        KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
                   2443:
1.10      joerg    2444:        /*
                   2445:         * Keep track of where we found this so location won't be
                   2446:         * overwritten.
                   2447:         */
                   2448:        if (wc->wc_clear) {
                   2449:                wr->wr_inodestail = oldoff;
                   2450:                wr->wr_inodescnt = 0;
1.12      joerg    2451:                if (wr->wr_inodes != NULL) {
1.18      yamt     2452:                        wapbl_free(wr->wr_inodes, oldsize);
1.12      joerg    2453:                        wr->wr_inodes = NULL;
                   2454:                }
1.10      joerg    2455:        }
                   2456:        wr->wr_inodeshead = newoff;
                   2457:        if (wc->wc_inocnt == 0)
                   2458:                return;
                   2459:
1.18      yamt     2460:        new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) *
                   2461:            sizeof(wr->wr_inodes[0]));
                   2462:        if (wr->wr_inodes != NULL) {
                   2463:                memcpy(new_inodes, wr->wr_inodes, oldsize);
                   2464:                wapbl_free(wr->wr_inodes, oldsize);
                   2465:        }
                   2466:        wr->wr_inodes = new_inodes;
1.10      joerg    2467:        memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
1.18      yamt     2468:            wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
1.10      joerg    2469:        wr->wr_inodescnt += wc->wc_inocnt;
                   2470: }
                   2471:
1.2       simonb   2472: static int
1.14      joerg    2473: wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
1.2       simonb   2474: {
                   2475:        off_t off;
                   2476:        int error;
                   2477:
1.14      joerg    2478:        int logblklen = 1 << wr->wr_log_dev_bshift;
1.2       simonb   2479:
                   2480:        wapbl_blkhash_clear(wr);
                   2481:
1.14      joerg    2482:        off = tail;
                   2483:        while (off != head) {
1.2       simonb   2484:                struct wapbl_wc_null *wcn;
                   2485:                off_t saveoff = off;
                   2486:                error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
                   2487:                if (error)
                   2488:                        goto errout;
                   2489:                wcn = (struct wapbl_wc_null *)wr->wr_scratch;
                   2490:                switch (wcn->wc_type) {
                   2491:                case WAPBL_WC_BLOCKS:
1.10      joerg    2492:                        wapbl_replay_process_blocks(wr, &off);
1.2       simonb   2493:                        break;
                   2494:
                   2495:                case WAPBL_WC_REVOCATIONS:
1.10      joerg    2496:                        wapbl_replay_process_revocations(wr);
1.2       simonb   2497:                        break;
                   2498:
                   2499:                case WAPBL_WC_INODES:
1.10      joerg    2500:                        wapbl_replay_process_inodes(wr, saveoff, off);
1.2       simonb   2501:                        break;
1.10      joerg    2502:
1.2       simonb   2503:                default:
                   2504:                        printf("Unrecognized wapbl type: 0x%08x\n",
                   2505:                               wcn->wc_type);
                   2506:                        error = EFTYPE;
                   2507:                        goto errout;
                   2508:                }
                   2509:                wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
                   2510:                if (off != saveoff) {
                   2511:                        printf("wapbl_replay: corrupted records\n");
                   2512:                        error = EFTYPE;
                   2513:                        goto errout;
                   2514:                }
                   2515:        }
                   2516:        return 0;
                   2517:
                   2518:  errout:
                   2519:        wapbl_blkhash_clear(wr);
                   2520:        return error;
                   2521: }
                   2522:
1.13      joerg    2523: #if 0
                         /*
                          * wapbl_replay_verify: compiled out by the enclosing "#if 0".
                          *
                          * Walks the log from wch->wc_tail to wch->wc_head.  For every
                          * WAPBL_WC_BLOCKS record, each block whose hash entry points at the
                          * current log offset is read both from the log and from fsdevvp
                          * (at wb_blk) and the two copies are compared with memcmp();
                          * differences are printed and counted.  Returns the first I/O
                          * error, or EFTYPE if at least one block differed, or 0.
                          *
                          * NOTE(review): "wch" is not declared anywhere in this function, so
                          * the code would need adjusting before it could be re-enabled.
                          */
1.2       simonb   2524: int
                   2525: wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
                   2526: {
                   2527:        off_t off;
                   2528:        int mismatchcnt = 0;
1.14      joerg    2529:        int logblklen = 1 << wr->wr_log_dev_bshift;
                   2530:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2       simonb   2531:        void *scratch1 = wapbl_malloc(MAXBSIZE);
                   2532:        void *scratch2 = wapbl_malloc(MAXBSIZE);
                   2533:        int error = 0;
                   2534:
                   2535:        KDASSERT(wapbl_replay_isopen(wr));
                   2536:
                   2537:        off = wch->wc_tail;
                   2538:        while (off != wch->wc_head) {
                   2539:                struct wapbl_wc_null *wcn;
                   2540: #ifdef DEBUG
                   2541:                off_t saveoff = off;
                   2542: #endif
                   2543:                error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
                   2544:                if (error)
                   2545:                        goto out;
                   2546:                wcn = (struct wapbl_wc_null *)wr->wr_scratch;
                   2547:                switch (wcn->wc_type) {
                   2548:                case WAPBL_WC_BLOCKS:
                   2549:                        {
                   2550:                                struct wapbl_wc_blocklist *wc =
                   2551:                                    (struct wapbl_wc_blocklist *)wr->wr_scratch;
                   2552:                                int i;
                   2553:                                for (i = 0; i < wc->wc_blkcount; i++) {
                   2554:                                        int foundcnt = 0;
                   2555:                                        int dirtycnt = 0;
                   2556:                                        int j, n;
                   2557:                                        /*
                   2558:                                         * Check each physical block into the
                   2559:                                         * hashtable independently
                   2560:                                         */
                   2561:                                        n = wc->wc_blocks[i].wc_dlen >>
                   2562:                                            wch->wc_fs_dev_bshift;
                   2563:                                        for (j = 0; j < n; j++) {
                   2564:                                                struct wapbl_blk *wb =
                   2565:                                                   wapbl_blkhash_get(wr,
1.34    ! mlelstv  2566:                                                   wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
                                                                         /*
                                                                          * Only compare when the hash entry
                                                                          * refers to this very log offset;
                                                                          * otherwise just step over the data.
                                                                          */
1.2       simonb   2567:                                                if (wb && (wb->wb_off == off)) {
                   2568:                                                        foundcnt++;
                   2569:                                                        error =
                   2570:                                                            wapbl_circ_read(wr,
                   2571:                                                            scratch1, fsblklen,
                   2572:                                                            &off);
                   2573:                                                        if (error)
                   2574:                                                                goto out;
                   2575:                                                        error =
                   2576:                                                            wapbl_read(scratch2,
                   2577:                                                            fsblklen, fsdevvp,
                   2578:                                                            wb->wb_blk);
                   2579:                                                        if (error)
                   2580:                                                                goto out;
                   2581:                                                        if (memcmp(scratch1,
                   2582:                                                                   scratch2,
                   2583:                                                                   fsblklen)) {
                   2584:                                                                printf(
                   2585:                "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
                   2586:                wb->wb_blk, (intmax_t)off);
                   2587:                                                                dirtycnt++;
                   2588:                                                                mismatchcnt++;
                   2589:                                                        }
                   2590:                                                } else {
                   2591:                                                        wapbl_circ_advance(wr,
                   2592:                                                            fsblklen, &off);
                   2593:                                                }
                   2594:                                        }
                   2595: #if 0
                   2596:                                        /*
                   2597:                                         * If all of the blocks in an entry
                   2598:                                         * are clean, then remove all of its
                   2599:                                         * blocks from the hashtable since they
                   2600:                                         * never will need replay.
                   2601:                                         */
                   2602:                                        if ((foundcnt != 0) &&
                   2603:                                            (dirtycnt == 0)) {
                   2604:                                                off = saveoff;
                   2605:                                                wapbl_circ_advance(wr,
                   2606:                                                    logblklen, &off);
                   2607:                                                for (j = 0; j < n; j++) {
                   2608:                                                        struct wapbl_blk *wb =
                   2609:                                                           wapbl_blkhash_get(wr,
1.34    ! mlelstv  2610:                                                           wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
1.2       simonb   2611:                                                        if (wb &&
                   2612:                                                          (wb->wb_off == off)) {
                   2613:                                                                wapbl_blkhash_rem(wr, wb->wb_blk);
                   2614:                                                        }
                   2615:                                                        wapbl_circ_advance(wr,
                   2616:                                                            fsblklen, &off);
                   2617:                                                }
                   2618:                                        }
                   2619: #endif
                   2620:                                }
                   2621:                        }
                   2622:                        break;
                   2623:                case WAPBL_WC_REVOCATIONS:
                   2624:                case WAPBL_WC_INODES:
                   2625:                        break;
                   2626:                default:
                   2627:                        KASSERT(0);
                   2628:                }
                   2629: #ifdef DEBUG
                   2630:                wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
                   2631:                KASSERT(off == saveoff);
                   2632: #endif
                   2633:        }
                   2634:  out:
1.18      yamt     2635:        wapbl_free(scratch1, MAXBSIZE);
                   2636:        wapbl_free(scratch2, MAXBSIZE);
                         /* A clean walk that still found differing blocks is an error. */
1.2       simonb   2637:        if (!error && mismatchcnt)
                   2638:                error = EFTYPE;
                   2639:        return error;
                   2640: }
                   2641: #endif
                   2642:
                   2643: int
                   2644: wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
                   2645: {
1.9       joerg    2646:        struct wapbl_blk *wb;
                   2647:        size_t i;
1.2       simonb   2648:        off_t off;
1.9       joerg    2649:        void *scratch;
1.2       simonb   2650:        int error = 0;
1.14      joerg    2651:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2       simonb   2652:
                   2653:        KDASSERT(wapbl_replay_isopen(wr));
                   2654:
1.9       joerg    2655:        scratch = wapbl_malloc(MAXBSIZE);
1.2       simonb   2656:
1.9       joerg    2657:        for (i = 0; i < wr->wr_blkhashmask; ++i) {
                   2658:                LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
                   2659:                        off = wb->wb_off;
                   2660:                        error = wapbl_circ_read(wr, scratch, fsblklen, &off);
                   2661:                        if (error)
                   2662:                                break;
                   2663:                        error = wapbl_write(scratch, fsblklen, fsdevvp,
                   2664:                            wb->wb_blk);
                   2665:                        if (error)
                   2666:                                break;
1.2       simonb   2667:                }
                   2668:        }
1.9       joerg    2669:
1.18      yamt     2670:        wapbl_free(scratch, MAXBSIZE);
1.2       simonb   2671:        return error;
                   2672: }
                   2673:
                   2674: int
1.6       joerg    2675: wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
                   2676: {
1.14      joerg    2677:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.6       joerg    2678:
                   2679:        KDASSERT(wapbl_replay_isopen(wr));
                   2680:        KASSERT((len % fsblklen) == 0);
                   2681:
                   2682:        while (len != 0) {
                   2683:                struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
                   2684:                if (wb)
                   2685:                        return 1;
                   2686:                len -= fsblklen;
                   2687:        }
                   2688:        return 0;
                   2689: }
                   2690:
                   2691: int
1.2       simonb   2692: wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
                   2693: {
1.14      joerg    2694:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2       simonb   2695:
                   2696:        KDASSERT(wapbl_replay_isopen(wr));
                   2697:
                   2698:        KASSERT((len % fsblklen) == 0);
                   2699:
                   2700:        while (len != 0) {
                   2701:                struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
                   2702:                if (wb) {
                   2703:                        off_t off = wb->wb_off;
                   2704:                        int error;
                   2705:                        error = wapbl_circ_read(wr, data, fsblklen, &off);
                   2706:                        if (error)
                   2707:                                return error;
                   2708:                }
                   2709:                data = (uint8_t *)data + fsblklen;
                   2710:                len -= fsblklen;
                   2711:                blk++;
                   2712:        }
                   2713:        return 0;
                   2714: }

CVSweb <webmaster@jp.NetBSD.org>