[BACK]Return to vfs_wapbl.c CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / sys / kern

Annotation of src/sys/kern/vfs_wapbl.c, Revision 1.53

1.53    ! hannken     1: /*     $NetBSD: vfs_wapbl.c,v 1.52 2012/04/29 22:55:11 chs Exp $       */
1.2       simonb      2:
                      3: /*-
1.23      ad          4:  * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
1.2       simonb      5:  * All rights reserved.
                      6:  *
                      7:  * This code is derived from software contributed to The NetBSD Foundation
                      8:  * by Wasabi Systems, Inc.
                      9:  *
                     10:  * Redistribution and use in source and binary forms, with or without
                     11:  * modification, are permitted provided that the following conditions
                     12:  * are met:
                     13:  * 1. Redistributions of source code must retain the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer.
                     15:  * 2. Redistributions in binary form must reproduce the above copyright
                     16:  *    notice, this list of conditions and the following disclaimer in the
                     17:  *    documentation and/or other materials provided with the distribution.
                     18:  *
                     19:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     20:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     21:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     22:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     23:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     24:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     25:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     26:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     27:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     28:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     29:  * POSSIBILITY OF SUCH DAMAGE.
                     30:  */
                     31:
                     32: /*
                     33:  * This implements file system independent write ahead filesystem logging.
                     34:  */
1.4       joerg      35:
                     36: #define WAPBL_INTERNAL
                     37:
1.2       simonb     38: #include <sys/cdefs.h>
1.53    ! hannken    39: __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.52 2012/04/29 22:55:11 chs Exp $");
1.2       simonb     40:
                     41: #include <sys/param.h>
1.31      mlelstv    42: #include <sys/bitops.h>
1.2       simonb     43:
                     44: #ifdef _KERNEL
                     45: #include <sys/param.h>
                     46: #include <sys/namei.h>
                     47: #include <sys/proc.h>
1.39      christos   48: #include <sys/sysctl.h>
1.2       simonb     49: #include <sys/uio.h>
                     50: #include <sys/vnode.h>
                     51: #include <sys/file.h>
1.35      pooka      52: #include <sys/module.h>
1.2       simonb     53: #include <sys/resourcevar.h>
                     54: #include <sys/conf.h>
                     55: #include <sys/mount.h>
                     56: #include <sys/kernel.h>
                     57: #include <sys/kauth.h>
                     58: #include <sys/mutex.h>
                     59: #include <sys/atomic.h>
                     60: #include <sys/wapbl.h>
1.16      joerg      61: #include <sys/wapbl_replay.h>
1.2       simonb     62:
                     63: #include <miscfs/specfs/specdev.h>
                     64:
1.51      para       65: #define        wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
                     66: #define        wapbl_free(a, s) kmem_free((a), (s))
                     67: #define        wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
1.2       simonb     68:
1.39      christos   69: static struct sysctllog *wapbl_sysctl;
                     70: static int wapbl_flush_disk_cache = 1;
                     71: static int wapbl_verbose_commit = 0;
                     72:
1.2       simonb     73: #else /* !_KERNEL */
                     74: #include <assert.h>
                     75: #include <errno.h>
                     76: #include <stdio.h>
                     77: #include <stdbool.h>
                     78: #include <stdlib.h>
                     79: #include <string.h>
                     80:
                     81: #include <sys/time.h>
                     82: #include <sys/wapbl.h>
1.16      joerg      83: #include <sys/wapbl_replay.h>
1.2       simonb     84:
                     85: #define        KDASSERT(x) assert(x)
                     86: #define        KASSERT(x) assert(x)
1.51      para       87: #define        wapbl_alloc(s) malloc(s)
1.18      yamt       88: #define        wapbl_free(a, s) free(a)
1.2       simonb     89: #define        wapbl_calloc(n, s) calloc((n), (s))
                     90:
                     91: #endif /* !_KERNEL */
                     92:
                     93: /*
                     94:  * INTERNAL DATA STRUCTURES
                     95:  */
                     96:
                     97: /*
                     98:  * This structure holds per-mount log information.
                     99:  *
                    100:  * Legend:     a = atomic access only
                    101:  *             r = read-only after init
                    102:  *             l = rwlock held
                    103:  *             m = mutex held
1.38      hannken   104:  *             lm = rwlock held writing or mutex held
1.2       simonb    105:  *             u = unlocked access ok
                    106:  *             b = bufcache_lock held
                    107:  */
                    108: struct wapbl {
                    109:        struct vnode *wl_logvp; /* r:   log here */
                    110:        struct vnode *wl_devvp; /* r:   log on this device */
                    111:        struct mount *wl_mount; /* r:   mountpoint wl is associated with */
                    112:        daddr_t wl_logpbn;      /* r:   Physical block number of start of log */
                    113:        int wl_log_dev_bshift;  /* r:   logarithm of device block size of log
                    114:                                        device */
                    115:        int wl_fs_dev_bshift;   /* r:   logarithm of device block size of
                    116:                                        filesystem device */
                    117:
1.3       yamt      118:        unsigned wl_lock_count; /* m:   Count of transactions in progress */
1.2       simonb    119:
                    120:        size_t wl_circ_size;    /* r:   Number of bytes in buffer of log */
                    121:        size_t wl_circ_off;     /* r:   Number of bytes reserved at start */
                    122:
                    123:        size_t wl_bufcount_max; /* r:   Number of buffers reserved for log */
                    124:        size_t wl_bufbytes_max; /* r:   Number of buf bytes reserved for log */
                    125:
                    126:        off_t wl_head;          /* l:   Byte offset of log head */
                    127:        off_t wl_tail;          /* l:   Byte offset of log tail */
                    128:        /*
                    129:         * head == tail == 0 means log is empty
                    130:         * head == tail != 0 means log is full
                    131:         * see assertions in wapbl_advance() for other boundary conditions.
                    132:         * only truncate moves the tail, except when flush sets it to
                    133:         * wl_header_size only flush moves the head, except when truncate
                    134:         * sets it to 0.
                    135:         */
                    136:
                    137:        struct wapbl_wc_header *wl_wc_header;   /* l    */
                    138:        void *wl_wc_scratch;    /* l:   scratch space (XXX: por que?!?) */
                    139:
                    140:        kmutex_t wl_mtx;        /* u:   short-term lock */
                    141:        krwlock_t wl_rwlock;    /* u:   File system transaction lock */
                    142:
                    143:        /*
                    144:         * Must be held while accessing
                    145:         * wl_count or wl_bufs or head or tail
                    146:         */
                    147:
                    148:        /*
                    149:         * Callback called from within the flush routine to flush any extra
                    150:         * bits.  Note that flush may be skipped without calling this if
                    151:         * there are no outstanding buffers in the transaction.
                    152:         */
1.5       joerg     153: #if _KERNEL
1.2       simonb    154:        wapbl_flush_fn_t wl_flush;      /* r    */
                    155:        wapbl_flush_fn_t wl_flush_abort;/* r    */
1.5       joerg     156: #endif
1.2       simonb    157:
                    158:        size_t wl_bufbytes;     /* m:   Byte count of pages in wl_bufs */
                    159:        size_t wl_bufcount;     /* m:   Count of buffers in wl_bufs */
                    160:        size_t wl_bcount;       /* m:   Total bcount of wl_bufs */
                    161:
                    162:        LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
                    163:
                    164:        kcondvar_t wl_reclaimable_cv;   /* m (obviously) */
                    165:        size_t wl_reclaimable_bytes; /* m:      Amount of space available for
                    166:                                                reclamation by truncate */
                    167:        int wl_error_count;     /* m:   # of wl_entries with errors */
                    168:        size_t wl_reserved_bytes; /* never truncate log smaller than this */
                    169:
                    170: #ifdef WAPBL_DEBUG_BUFBYTES
                    171:        size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
                    172: #endif
                    173:
1.38      hannken   174:        daddr_t *wl_deallocblks;/* lm:  address of block */
                    175:        int *wl_dealloclens;    /* lm:  size of block */
                    176:        int wl_dealloccnt;      /* lm:  total count */
1.2       simonb    177:        int wl_dealloclim;      /* l:   max count */
                    178:
                    179:        /* hashtable of inode numbers for allocated but unlinked inodes */
                    180:        /* synch ??? */
                    181:        LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
                    182:        u_long wl_inohashmask;
                    183:        int wl_inohashcnt;
                    184:
                    185:        SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
                    186:                                                   accounting */
                    187: };
                    188:
                    189: #ifdef WAPBL_DEBUG_PRINT
                    190: int wapbl_debug_print = WAPBL_DEBUG_PRINT;
                    191: #endif
                    192:
                    193: /****************************************************************/
                    194: #ifdef _KERNEL
                    195:
                    196: #ifdef WAPBL_DEBUG
                    197: struct wapbl *wapbl_debug_wl;
                    198: #endif
                    199:
                    200: static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
                    201: static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
                    202: static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
                    203: static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
                    204: #endif /* _KERNEL */
                    205:
1.14      joerg     206: static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
1.2       simonb    207:
1.30      uebayasi  208: static inline size_t wapbl_space_free(size_t avail, off_t head,
1.2       simonb    209:        off_t tail);
1.30      uebayasi  210: static inline size_t wapbl_space_used(size_t avail, off_t head,
1.2       simonb    211:        off_t tail);
                    212:
                    213: #ifdef _KERNEL
                    214:
1.51      para      215: static struct pool wapbl_entry_pool;
                    216:
1.2       simonb    217: #define        WAPBL_INODETRK_SIZE 83
                    218: static int wapbl_ino_pool_refcount;
                    219: static struct pool wapbl_ino_pool;
                    220: struct wapbl_ino {
                    221:        LIST_ENTRY(wapbl_ino) wi_hash;
                    222:        ino_t wi_ino;
                    223:        mode_t wi_mode;
                    224: };
                    225:
                    226: static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
                    227: static void wapbl_inodetrk_free(struct wapbl *wl);
                    228: static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
                    229:
                    230: static size_t wapbl_transaction_len(struct wapbl *wl);
1.30      uebayasi  231: static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
1.2       simonb    232:
1.13      joerg     233: #if 0
1.4       joerg     234: int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
                    235: #endif
                    236:
                    237: static int wapbl_replay_isopen1(struct wapbl_replay *);
                    238:
1.2       simonb    239: /*
                    240:  * This is useful for debugging.  If set, the log will
                    241:  * only be truncated when necessary.
                    242:  */
                    243: int wapbl_lazy_truncate = 0;
                    244:
                    245: struct wapbl_ops wapbl_ops = {
                    246:        .wo_wapbl_discard       = wapbl_discard,
                    247:        .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
1.6       joerg     248:        .wo_wapbl_replay_can_read = wapbl_replay_can_read,
1.2       simonb    249:        .wo_wapbl_replay_read   = wapbl_replay_read,
                    250:        .wo_wapbl_add_buf       = wapbl_add_buf,
                    251:        .wo_wapbl_remove_buf    = wapbl_remove_buf,
                    252:        .wo_wapbl_resize_buf    = wapbl_resize_buf,
                    253:        .wo_wapbl_begin         = wapbl_begin,
                    254:        .wo_wapbl_end           = wapbl_end,
                    255:        .wo_wapbl_junlock_assert= wapbl_junlock_assert,
                    256:
                    257:        /* XXX: the following is only used to say "this is a wapbl buf" */
                    258:        .wo_wapbl_biodone       = wapbl_biodone,
                    259: };
                    260:
1.21      yamt      261: static int
1.39      christos  262: wapbl_sysctl_init(void)
                    263: {
                    264:        int rv;
                    265:        const struct sysctlnode *rnode, *cnode;
                    266:
                    267:        wapbl_sysctl = NULL;
                    268:
                    269:        rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
                    270:                       CTLFLAG_PERMANENT,
                    271:                       CTLTYPE_NODE, "vfs", NULL,
                    272:                       NULL, 0, NULL, 0,
                    273:                       CTL_VFS, CTL_EOL);
                    274:        if (rv)
                    275:                return rv;
                    276:
                    277:        rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode,
                    278:                       CTLFLAG_PERMANENT,
                    279:                       CTLTYPE_NODE, "wapbl",
                    280:                       SYSCTL_DESCR("WAPBL journaling options"),
                    281:                       NULL, 0, NULL, 0,
                    282:                       CTL_CREATE, CTL_EOL);
                    283:        if (rv)
                    284:                return rv;
                    285:
                    286:        rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
                    287:                       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
                    288:                       CTLTYPE_INT, "flush_disk_cache",
                    289:                       SYSCTL_DESCR("flush disk cache"),
                    290:                       NULL, 0, &wapbl_flush_disk_cache, 0,
                    291:                       CTL_CREATE, CTL_EOL);
                    292:        if (rv)
                    293:                return rv;
                    294:
                    295:        rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
                    296:                       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
                    297:                       CTLTYPE_INT, "verbose_commit",
                    298:                       SYSCTL_DESCR("show time and size of wapbl log commits"),
                    299:                       NULL, 0, &wapbl_verbose_commit, 0,
                    300:                       CTL_CREATE, CTL_EOL);
                    301:        return rv;
                    302: }
                    303:
                    304: static void
                    305: wapbl_init(void)
                    306: {
1.51      para      307:
                    308:        pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
                    309:            "wapblentrypl", &pool_allocator_kmem, IPL_VM);
                    310:
1.39      christos  311:        wapbl_sysctl_init();
                    312: }
                    313:
                    314: #ifdef notyet
                    315: static int
                    316: wapbl_fini(bool interface)
                    317: {
1.51      para      318:
1.39      christos  319:        if (aio_sysctl != NULL)
                    320:                 sysctl_teardown(&aio_sysctl);
1.51      para      321:
                    322:        pool_destroy(&wapbl_entry_pool);
                    323:
1.39      christos  324:        return 0;
                    325: }
                    326: #endif
                    327:
                    328: static int
1.15      joerg     329: wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
                    330: {
                    331:        int error, i;
                    332:
                    333:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
                    334:            ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
                    335:
                    336:        /*
                    337:         * Its only valid to reuse the replay log if its
                    338:         * the same as the new log we just opened.
                    339:         */
                    340:        KDASSERT(!wapbl_replay_isopen(wr));
1.47      christos  341:        KASSERT(wl->wl_devvp->v_type == VBLK);
                    342:        KASSERT(wr->wr_devvp->v_type == VBLK);
1.15      joerg     343:        KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
                    344:        KASSERT(wl->wl_logpbn == wr->wr_logpbn);
                    345:        KASSERT(wl->wl_circ_size == wr->wr_circ_size);
                    346:        KASSERT(wl->wl_circ_off == wr->wr_circ_off);
                    347:        KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
                    348:        KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
                    349:
                    350:        wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
                    351:
                    352:        for (i = 0; i < wr->wr_inodescnt; i++)
                    353:                wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
                    354:                    wr->wr_inodes[i].wr_imode);
                    355:
                    356:        /* Make sure new transaction won't overwrite old inodes list */
                    357:        KDASSERT(wapbl_transaction_len(wl) <=
                    358:            wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
                    359:            wr->wr_inodestail));
                    360:
                    361:        wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
                    362:        wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
                    363:            wapbl_transaction_len(wl);
                    364:
                    365:        error = wapbl_write_inodes(wl, &wl->wl_head);
                    366:        if (error)
                    367:                return error;
                    368:
                    369:        KASSERT(wl->wl_head != wl->wl_tail);
                    370:        KASSERT(wl->wl_head != 0);
                    371:
                    372:        return 0;
                    373: }
                    374:
1.2       simonb    375: int
                    376: wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
                    377:        daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
                    378:        wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
                    379: {
                    380:        struct wapbl *wl;
                    381:        struct vnode *devvp;
                    382:        daddr_t logpbn;
                    383:        int error;
1.31      mlelstv   384:        int log_dev_bshift = ilog2(blksize);
1.32      mlelstv   385:        int fs_dev_bshift = log_dev_bshift;
1.2       simonb    386:        int run;
                    387:
                    388:        WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
                    389:            " count=%zu blksize=%zu\n", vp, off, count, blksize));
                    390:
                    391:        if (log_dev_bshift > fs_dev_bshift) {
                    392:                WAPBL_PRINTF(WAPBL_PRINT_OPEN,
                    393:                        ("wapbl: log device's block size cannot be larger "
                    394:                         "than filesystem's\n"));
                    395:                /*
                    396:                 * Not currently implemented, although it could be if
                    397:                 * needed someday.
                    398:                 */
                    399:                return ENOSYS;
                    400:        }
                    401:
                    402:        if (off < 0)
                    403:                return EINVAL;
                    404:
                    405:        if (blksize < DEV_BSIZE)
                    406:                return EINVAL;
                    407:        if (blksize % DEV_BSIZE)
                    408:                return EINVAL;
                    409:
                    410:        /* XXXTODO: verify that the full load is writable */
                    411:
                    412:        /*
                    413:         * XXX check for minimum log size
                    414:         * minimum is governed by minimum amount of space
                    415:         * to complete a transaction. (probably truncate)
                    416:         */
                    417:        /* XXX for now pick something minimal */
                    418:        if ((count * blksize) < MAXPHYS) {
                    419:                return ENOSPC;
                    420:        }
                    421:
                    422:        if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
                    423:                return error;
                    424:        }
                    425:
                    426:        wl = wapbl_calloc(1, sizeof(*wl));
                    427:        rw_init(&wl->wl_rwlock);
                    428:        mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
                    429:        cv_init(&wl->wl_reclaimable_cv, "wapblrec");
                    430:        LIST_INIT(&wl->wl_bufs);
                    431:        SIMPLEQ_INIT(&wl->wl_entries);
                    432:
                    433:        wl->wl_logvp = vp;
                    434:        wl->wl_devvp = devvp;
                    435:        wl->wl_mount = mp;
                    436:        wl->wl_logpbn = logpbn;
                    437:        wl->wl_log_dev_bshift = log_dev_bshift;
                    438:        wl->wl_fs_dev_bshift = fs_dev_bshift;
                    439:
                    440:        wl->wl_flush = flushfn;
                    441:        wl->wl_flush_abort = flushabortfn;
                    442:
                    443:        /* Reserve two log device blocks for the commit headers */
                    444:        wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
1.34      mlelstv   445:        wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
1.2       simonb    446:        /* truncate the log usage to a multiple of log_dev_bshift */
                    447:        wl->wl_circ_size >>= wl->wl_log_dev_bshift;
                    448:        wl->wl_circ_size <<= wl->wl_log_dev_bshift;
                    449:
                    450:        /*
                    451:         * wl_bufbytes_max limits the size of the in memory transaction space.
                    452:         * - Since buffers are allocated and accounted for in units of
                    453:         *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
                    454:         *   (i.e. 1<<PAGE_SHIFT)
                    455:         * - Since the log device has to be written in units of
                    456:         *   1<<wl_log_dev_bshift it is required to be a mulitple of
                    457:         *   1<<wl_log_dev_bshift.
                    458:         * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
                    459:         *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
                    460:         * Therefore it must be multiple of the least common multiple of those
                    461:         * three quantities.  Fortunately, all of those quantities are
                    462:         * guaranteed to be a power of two, and the least common multiple of
                    463:         * a set of numbers which are all powers of two is simply the maximum
                    464:         * of those numbers.  Finally, the maximum logarithm of a power of two
                    465:         * is the same as the log of the maximum power of two.  So we can do
                    466:         * the following operations to size wl_bufbytes_max:
                    467:         */
                    468:
                    469:        /* XXX fix actual number of pages reserved per filesystem. */
                    470:        wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
                    471:
                    472:        /* Round wl_bufbytes_max to the largest power of two constraint */
                    473:        wl->wl_bufbytes_max >>= PAGE_SHIFT;
                    474:        wl->wl_bufbytes_max <<= PAGE_SHIFT;
                    475:        wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
                    476:        wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
                    477:        wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
                    478:        wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
                    479:
                    480:        /* XXX maybe use filesystem fragment size instead of 1024 */
                    481:        /* XXX fix actual number of buffers reserved per filesystem. */
                    482:        wl->wl_bufcount_max = (nbuf / 2) * 1024;
                    483:
                    484:        /* XXX tie this into resource estimation */
1.41      hannken   485:        wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
1.2       simonb    486:
1.51      para      487:        wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
1.2       simonb    488:            wl->wl_dealloclim);
1.51      para      489:        wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
1.2       simonb    490:            wl->wl_dealloclim);
                    491:
                    492:        wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
                    493:
                    494:        /* Initialize the commit header */
                    495:        {
                    496:                struct wapbl_wc_header *wc;
1.14      joerg     497:                size_t len = 1 << wl->wl_log_dev_bshift;
1.2       simonb    498:                wc = wapbl_calloc(1, len);
                    499:                wc->wc_type = WAPBL_WC_HEADER;
                    500:                wc->wc_len = len;
                    501:                wc->wc_circ_off = wl->wl_circ_off;
                    502:                wc->wc_circ_size = wl->wl_circ_size;
                    503:                /* XXX wc->wc_fsid */
                    504:                wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
                    505:                wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
                    506:                wl->wl_wc_header = wc;
1.51      para      507:                wl->wl_wc_scratch = wapbl_alloc(len);
1.2       simonb    508:        }
                    509:
                    510:        /*
                    511:         * if there was an existing set of unlinked but
                    512:         * allocated inodes, preserve it in the new
                    513:         * log.
                    514:         */
                    515:        if (wr && wr->wr_inodescnt) {
1.15      joerg     516:                error = wapbl_start_flush_inodes(wl, wr);
1.2       simonb    517:                if (error)
                    518:                        goto errout;
                    519:        }
                    520:
                    521:        error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
                    522:        if (error) {
                    523:                goto errout;
                    524:        }
                    525:
                    526:        *wlp = wl;
                    527: #if defined(WAPBL_DEBUG)
                    528:        wapbl_debug_wl = wl;
                    529: #endif
                    530:
                    531:        return 0;
                    532:  errout:
                    533:        wapbl_discard(wl);
1.18      yamt      534:        wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
                    535:        wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
                    536:        wapbl_free(wl->wl_deallocblks,
                    537:            sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
                    538:        wapbl_free(wl->wl_dealloclens,
                    539:            sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
1.2       simonb    540:        wapbl_inodetrk_free(wl);
1.18      yamt      541:        wapbl_free(wl, sizeof(*wl));
1.2       simonb    542:
                    543:        return error;
                    544: }
                    545:
                    546: /*
                    547:  * Like wapbl_flush, only discards the transaction
                    548:  * completely
                    549:  */
                    550:
                    551: void
                    552: wapbl_discard(struct wapbl *wl)
                    553: {
                    554:        struct wapbl_entry *we;
                    555:        struct buf *bp;
                    556:        int i;
                    557:
                    558:        /*
                    559:         * XXX we may consider using upgrade here
                    560:         * if we want to call flush from inside a transaction
                    561:         */
                    562:        rw_enter(&wl->wl_rwlock, RW_WRITER);
                    563:        wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
                    564:            wl->wl_dealloccnt);
                    565:
                    566: #ifdef WAPBL_DEBUG_PRINT
                    567:        {
                    568:                pid_t pid = -1;
                    569:                lwpid_t lid = -1;
                    570:                if (curproc)
                    571:                        pid = curproc->p_pid;
                    572:                if (curlwp)
                    573:                        lid = curlwp->l_lid;
                    574: #ifdef WAPBL_DEBUG_BUFBYTES
                    575:                WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    576:                    ("wapbl_discard: thread %d.%d discarding "
                    577:                    "transaction\n"
                    578:                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    579:                    "deallocs=%d inodes=%d\n"
                    580:                    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
                    581:                    "unsynced=%zu\n",
                    582:                    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
                    583:                    wl->wl_bcount, wl->wl_dealloccnt,
                    584:                    wl->wl_inohashcnt, wl->wl_error_count,
                    585:                    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
                    586:                    wl->wl_unsynced_bufbytes));
                    587:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                    588:                        WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    589:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                    590:                             "error = %d, unsynced = %zu\n",
                    591:                             we->we_bufcount, we->we_reclaimable_bytes,
                    592:                             we->we_error, we->we_unsynced_bufbytes));
                    593:                }
                    594: #else /* !WAPBL_DEBUG_BUFBYTES */
                    595:                WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    596:                    ("wapbl_discard: thread %d.%d discarding transaction\n"
                    597:                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    598:                    "deallocs=%d inodes=%d\n"
                    599:                    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
                    600:                    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
                    601:                    wl->wl_bcount, wl->wl_dealloccnt,
                    602:                    wl->wl_inohashcnt, wl->wl_error_count,
                    603:                    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
                    604:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                    605:                        WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    606:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                    607:                             "error = %d\n",
                    608:                             we->we_bufcount, we->we_reclaimable_bytes,
                    609:                             we->we_error));
                    610:                }
                    611: #endif /* !WAPBL_DEBUG_BUFBYTES */
                    612:        }
                    613: #endif /* WAPBL_DEBUG_PRINT */
                    614:
                    615:        for (i = 0; i <= wl->wl_inohashmask; i++) {
                    616:                struct wapbl_ino_head *wih;
                    617:                struct wapbl_ino *wi;
                    618:
                    619:                wih = &wl->wl_inohash[i];
                    620:                while ((wi = LIST_FIRST(wih)) != NULL) {
                    621:                        LIST_REMOVE(wi, wi_hash);
                    622:                        pool_put(&wapbl_ino_pool, wi);
                    623:                        KASSERT(wl->wl_inohashcnt > 0);
                    624:                        wl->wl_inohashcnt--;
                    625:                }
                    626:        }
                    627:
                    628:        /*
                    629:         * clean buffer list
                    630:         */
                    631:        mutex_enter(&bufcache_lock);
                    632:        mutex_enter(&wl->wl_mtx);
                    633:        while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
                    634:                if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
                    635:                        /*
                    636:                         * The buffer will be unlocked and
                    637:                         * removed from the transaction in brelse
                    638:                         */
                    639:                        mutex_exit(&wl->wl_mtx);
                    640:                        brelsel(bp, 0);
                    641:                        mutex_enter(&wl->wl_mtx);
                    642:                }
                    643:        }
                    644:        mutex_exit(&wl->wl_mtx);
                    645:        mutex_exit(&bufcache_lock);
                    646:
                    647:        /*
                    648:         * Remove references to this wl from wl_entries, free any which
                    649:         * no longer have buffers, others will be freed in wapbl_biodone
                    650:         * when they no longer have any buffers.
                    651:         */
                    652:        while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
                    653:                SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
                    654:                /* XXX should we be accumulating wl_error_count
                    655:                 * and increasing reclaimable bytes ? */
                    656:                we->we_wapbl = NULL;
                    657:                if (we->we_bufcount == 0) {
                    658: #ifdef WAPBL_DEBUG_BUFBYTES
                    659:                        KASSERT(we->we_unsynced_bufbytes == 0);
                    660: #endif
1.51      para      661:                        pool_put(&wapbl_entry_pool, we);
1.2       simonb    662:                }
                    663:        }
                    664:
                    665:        /* Discard list of deallocs */
                    666:        wl->wl_dealloccnt = 0;
                    667:        /* XXX should we clear wl_reserved_bytes? */
                    668:
                    669:        KASSERT(wl->wl_bufbytes == 0);
                    670:        KASSERT(wl->wl_bcount == 0);
                    671:        KASSERT(wl->wl_bufcount == 0);
                    672:        KASSERT(LIST_EMPTY(&wl->wl_bufs));
                    673:        KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
                    674:        KASSERT(wl->wl_inohashcnt == 0);
                    675:
                    676:        rw_exit(&wl->wl_rwlock);
                    677: }
                    678:
                    679: int
                    680: wapbl_stop(struct wapbl *wl, int force)
                    681: {
                    682:        struct vnode *vp;
                    683:        int error;
                    684:
                    685:        WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
                    686:        error = wapbl_flush(wl, 1);
                    687:        if (error) {
                    688:                if (force)
                    689:                        wapbl_discard(wl);
                    690:                else
                    691:                        return error;
                    692:        }
                    693:
                    694:        /* Unlinked inodes persist after a flush */
                    695:        if (wl->wl_inohashcnt) {
                    696:                if (force) {
                    697:                        wapbl_discard(wl);
                    698:                } else {
                    699:                        return EBUSY;
                    700:                }
                    701:        }
                    702:
                    703:        KASSERT(wl->wl_bufbytes == 0);
                    704:        KASSERT(wl->wl_bcount == 0);
                    705:        KASSERT(wl->wl_bufcount == 0);
                    706:        KASSERT(LIST_EMPTY(&wl->wl_bufs));
                    707:        KASSERT(wl->wl_dealloccnt == 0);
                    708:        KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
                    709:        KASSERT(wl->wl_inohashcnt == 0);
                    710:
                    711:        vp = wl->wl_logvp;
                    712:
1.18      yamt      713:        wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
                    714:        wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
                    715:        wapbl_free(wl->wl_deallocblks,
                    716:            sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
                    717:        wapbl_free(wl->wl_dealloclens,
                    718:            sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
1.2       simonb    719:        wapbl_inodetrk_free(wl);
                    720:
                    721:        cv_destroy(&wl->wl_reclaimable_cv);
                    722:        mutex_destroy(&wl->wl_mtx);
                    723:        rw_destroy(&wl->wl_rwlock);
1.18      yamt      724:        wapbl_free(wl, sizeof(*wl));
1.2       simonb    725:
                    726:        return 0;
                    727: }
                    728:
                    729: static int
                    730: wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
                    731: {
                    732:        struct pstats *pstats = curlwp->l_proc->p_stats;
                    733:        struct buf *bp;
                    734:        int error;
                    735:
                    736:        KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
                    737:        KASSERT(devvp->v_type == VBLK);
                    738:
                    739:        if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
1.45      rmind     740:                mutex_enter(devvp->v_interlock);
1.2       simonb    741:                devvp->v_numoutput++;
1.45      rmind     742:                mutex_exit(devvp->v_interlock);
1.2       simonb    743:                pstats->p_ru.ru_oublock++;
                    744:        } else {
                    745:                pstats->p_ru.ru_inblock++;
                    746:        }
                    747:
                    748:        bp = getiobuf(devvp, true);
                    749:        bp->b_flags = flags;
                    750:        bp->b_cflags = BC_BUSY; /* silly & dubious */
                    751:        bp->b_dev = devvp->v_rdev;
                    752:        bp->b_data = data;
                    753:        bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
                    754:        bp->b_blkno = pbn;
1.52      chs       755:        BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1.2       simonb    756:
                    757:        WAPBL_PRINTF(WAPBL_PRINT_IO,
1.29      pooka     758:            ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
1.2       simonb    759:            BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
                    760:            bp->b_blkno, bp->b_dev));
                    761:
                    762:        VOP_STRATEGY(devvp, bp);
                    763:
                    764:        error = biowait(bp);
                    765:        putiobuf(bp);
                    766:
                    767:        if (error) {
                    768:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                    769:                    ("wapbl_doio: %s %zu bytes at block %" PRId64
1.29      pooka     770:                    " on dev 0x%"PRIx64" failed with error %d\n",
1.2       simonb    771:                    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
                    772:                     "write" : "read"),
                    773:                    len, pbn, devvp->v_rdev, error));
                    774:        }
                    775:
                    776:        return error;
                    777: }
                    778:
                    779: int
                    780: wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
                    781: {
                    782:
                    783:        return wapbl_doio(data, len, devvp, pbn, B_WRITE);
                    784: }
                    785:
                    786: int
                    787: wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
                    788: {
                    789:
                    790:        return wapbl_doio(data, len, devvp, pbn, B_READ);
                    791: }
                    792:
                    793: /*
                    794:  * Off is byte offset returns new offset for next write
                    795:  * handles log wraparound
                    796:  */
                    797: static int
                    798: wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
                    799: {
                    800:        size_t slen;
                    801:        off_t off = *offp;
                    802:        int error;
1.34      mlelstv   803:        daddr_t pbn;
1.2       simonb    804:
                    805:        KDASSERT(((len >> wl->wl_log_dev_bshift) <<
                    806:            wl->wl_log_dev_bshift) == len);
                    807:
                    808:        if (off < wl->wl_circ_off)
                    809:                off = wl->wl_circ_off;
                    810:        slen = wl->wl_circ_off + wl->wl_circ_size - off;
                    811:        if (slen < len) {
1.34      mlelstv   812:                pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
                    813: #ifdef _KERNEL
                    814:                pbn = btodb(pbn << wl->wl_log_dev_bshift);
                    815: #endif
                    816:                error = wapbl_write(data, slen, wl->wl_devvp, pbn);
1.2       simonb    817:                if (error)
                    818:                        return error;
                    819:                data = (uint8_t *)data + slen;
                    820:                len -= slen;
                    821:                off = wl->wl_circ_off;
                    822:        }
1.34      mlelstv   823:        pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
                    824: #ifdef _KERNEL
                    825:        pbn = btodb(pbn << wl->wl_log_dev_bshift);
                    826: #endif
                    827:        error = wapbl_write(data, len, wl->wl_devvp, pbn);
1.2       simonb    828:        if (error)
                    829:                return error;
                    830:        off += len;
                    831:        if (off >= wl->wl_circ_off + wl->wl_circ_size)
                    832:                off = wl->wl_circ_off;
                    833:        *offp = off;
                    834:        return 0;
                    835: }
                    836:
                    837: /****************************************************************/
                    838:
                    839: int
                    840: wapbl_begin(struct wapbl *wl, const char *file, int line)
                    841: {
                    842:        int doflush;
                    843:        unsigned lockcount;
                    844:
                    845:        KDASSERT(wl);
                    846:
                    847:        /*
                    848:         * XXX this needs to be made much more sophisticated.
                    849:         * perhaps each wapbl_begin could reserve a specified
                    850:         * number of buffers and bytes.
                    851:         */
                    852:        mutex_enter(&wl->wl_mtx);
                    853:        lockcount = wl->wl_lock_count;
                    854:        doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
                    855:                   wl->wl_bufbytes_max / 2) ||
                    856:                  ((wl->wl_bufcount + (lockcount * 10)) >
                    857:                   wl->wl_bufcount_max / 2) ||
1.28      pooka     858:                  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
1.42      hannken   859:                  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
1.2       simonb    860:        mutex_exit(&wl->wl_mtx);
                    861:
                    862:        if (doflush) {
                    863:                WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                    864:                    ("force flush lockcnt=%d bufbytes=%zu "
1.28      pooka     865:                    "(max=%zu) bufcount=%zu (max=%zu) "
                    866:                    "dealloccnt %d (lim=%d)\n",
1.2       simonb    867:                    lockcount, wl->wl_bufbytes,
                    868:                    wl->wl_bufbytes_max, wl->wl_bufcount,
1.28      pooka     869:                    wl->wl_bufcount_max,
                    870:                    wl->wl_dealloccnt, wl->wl_dealloclim));
1.2       simonb    871:        }
                    872:
                    873:        if (doflush) {
                    874:                int error = wapbl_flush(wl, 0);
                    875:                if (error)
                    876:                        return error;
                    877:        }
                    878:
1.23      ad        879:        rw_enter(&wl->wl_rwlock, RW_READER);
1.2       simonb    880:        mutex_enter(&wl->wl_mtx);
                    881:        wl->wl_lock_count++;
                    882:        mutex_exit(&wl->wl_mtx);
                    883:
1.23      ad        884: #if defined(WAPBL_DEBUG_PRINT)
1.2       simonb    885:        WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
                    886:            ("wapbl_begin thread %d.%d with bufcount=%zu "
                    887:            "bufbytes=%zu bcount=%zu at %s:%d\n",
                    888:            curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
                    889:            wl->wl_bufbytes, wl->wl_bcount, file, line));
                    890: #endif
                    891:
                    892:        return 0;
                    893: }
                    894:
                    895: void
                    896: wapbl_end(struct wapbl *wl)
                    897: {
                    898:
1.23      ad        899: #if defined(WAPBL_DEBUG_PRINT)
1.2       simonb    900:        WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
                    901:             ("wapbl_end thread %d.%d with bufcount=%zu "
                    902:              "bufbytes=%zu bcount=%zu\n",
                    903:              curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
                    904:              wl->wl_bufbytes, wl->wl_bcount));
                    905: #endif
                    906:
1.40      bouyer    907: #ifdef DIAGNOSTIC
                    908:        size_t flushsize = wapbl_transaction_len(wl);
                    909:        if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
                    910:                /*
                    911:                 * XXX this could be handled more gracefully, perhaps place
                    912:                 * only a partial transaction in the log and allow the
                    913:                 * remaining to flush without the protection of the journal.
                    914:                 */
                    915:                panic("wapbl_end: current transaction too big to flush\n");
                    916:        }
                    917: #endif
                    918:
1.2       simonb    919:        mutex_enter(&wl->wl_mtx);
                    920:        KASSERT(wl->wl_lock_count > 0);
                    921:        wl->wl_lock_count--;
                    922:        mutex_exit(&wl->wl_mtx);
                    923:
                    924:        rw_exit(&wl->wl_rwlock);
                    925: }
                    926:
                    927: void
                    928: wapbl_add_buf(struct wapbl *wl, struct buf * bp)
                    929: {
                    930:
                    931:        KASSERT(bp->b_cflags & BC_BUSY);
                    932:        KASSERT(bp->b_vp);
                    933:
                    934:        wapbl_jlock_assert(wl);
                    935:
                    936: #if 0
                    937:        /*
                    938:         * XXX this might be an issue for swapfiles.
                    939:         * see uvm_swap.c:1702
                    940:         *
                    941:         * XXX2 why require it then?  leap of semantics?
                    942:         */
                    943:        KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
                    944: #endif
                    945:
                    946:        mutex_enter(&wl->wl_mtx);
                    947:        if (bp->b_flags & B_LOCKED) {
                    948:                LIST_REMOVE(bp, b_wapbllist);
                    949:                WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
                    950:                   ("wapbl_add_buf thread %d.%d re-adding buf %p "
                    951:                    "with %d bytes %d bcount\n",
                    952:                    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
                    953:                    bp->b_bcount));
                    954:        } else {
                    955:                /* unlocked by dirty buffers shouldn't exist */
                    956:                KASSERT(!(bp->b_oflags & BO_DELWRI));
                    957:                wl->wl_bufbytes += bp->b_bufsize;
                    958:                wl->wl_bcount += bp->b_bcount;
                    959:                wl->wl_bufcount++;
                    960:                WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
                    961:                   ("wapbl_add_buf thread %d.%d adding buf %p "
                    962:                    "with %d bytes %d bcount\n",
                    963:                    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
                    964:                    bp->b_bcount));
                    965:        }
                    966:        LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
                    967:        mutex_exit(&wl->wl_mtx);
                    968:
                    969:        bp->b_flags |= B_LOCKED;
                    970: }
                    971:
                    972: static void
                    973: wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
                    974: {
                    975:
                    976:        KASSERT(mutex_owned(&wl->wl_mtx));
                    977:        KASSERT(bp->b_cflags & BC_BUSY);
                    978:        wapbl_jlock_assert(wl);
                    979:
                    980: #if 0
                    981:        /*
                    982:         * XXX this might be an issue for swapfiles.
                    983:         * see uvm_swap.c:1725
                    984:         *
                    985:         * XXXdeux: see above
                    986:         */
                    987:        KASSERT((bp->b_flags & BC_NOCACHE) == 0);
                    988: #endif
                    989:        KASSERT(bp->b_flags & B_LOCKED);
                    990:
                    991:        WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
                    992:           ("wapbl_remove_buf thread %d.%d removing buf %p with "
                    993:            "%d bytes %d bcount\n",
                    994:            curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
                    995:
                    996:        KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
                    997:        wl->wl_bufbytes -= bp->b_bufsize;
                    998:        KASSERT(wl->wl_bcount >= bp->b_bcount);
                    999:        wl->wl_bcount -= bp->b_bcount;
                   1000:        KASSERT(wl->wl_bufcount > 0);
                   1001:        wl->wl_bufcount--;
                   1002:        KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
                   1003:        KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
                   1004:        LIST_REMOVE(bp, b_wapbllist);
                   1005:
                   1006:        bp->b_flags &= ~B_LOCKED;
                   1007: }
                   1008:
                   1009: /* called from brelsel() in vfs_bio among other places */
                   1010: void
                   1011: wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
                   1012: {
                   1013:
                   1014:        mutex_enter(&wl->wl_mtx);
                   1015:        wapbl_remove_buf_locked(wl, bp);
                   1016:        mutex_exit(&wl->wl_mtx);
                   1017: }
                   1018:
                   1019: void
                   1020: wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
                   1021: {
                   1022:
                   1023:        KASSERT(bp->b_cflags & BC_BUSY);
                   1024:
                   1025:        /*
                   1026:         * XXX: why does this depend on B_LOCKED?  otherwise the buf
                   1027:         * is not for a transaction?  if so, why is this called in the
                   1028:         * first place?
                   1029:         */
                   1030:        if (bp->b_flags & B_LOCKED) {
                   1031:                mutex_enter(&wl->wl_mtx);
                   1032:                wl->wl_bufbytes += bp->b_bufsize - oldsz;
                   1033:                wl->wl_bcount += bp->b_bcount - oldcnt;
                   1034:                mutex_exit(&wl->wl_mtx);
                   1035:        }
                   1036: }
                   1037:
                   1038: #endif /* _KERNEL */
                   1039:
                   1040: /****************************************************************/
                   1041: /* Some utility inlines */
                   1042:
                   1043: /* This is used to advance the pointer at old to new value at old+delta */
1.30      uebayasi 1044: static inline off_t
1.2       simonb   1045: wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
                   1046: {
                   1047:        off_t new;
                   1048:
                   1049:        /* Define acceptable ranges for inputs. */
1.46      christos 1050:        KASSERT(delta <= (size_t)size);
                   1051:        KASSERT((old == 0) || ((size_t)old >= off));
                   1052:        KASSERT(old < (off_t)(size + off));
1.2       simonb   1053:
                   1054:        if ((old == 0) && (delta != 0))
                   1055:                new = off + delta;
                   1056:        else if ((old + delta) < (size + off))
                   1057:                new = old + delta;
                   1058:        else
                   1059:                new = (old + delta) - size;
                   1060:
                   1061:        /* Note some interesting axioms */
                   1062:        KASSERT((delta != 0) || (new == old));
                   1063:        KASSERT((delta == 0) || (new != 0));
                   1064:        KASSERT((delta != (size)) || (new == old));
                   1065:
                   1066:        /* Define acceptable ranges for output. */
1.46      christos 1067:        KASSERT((new == 0) || ((size_t)new >= off));
                   1068:        KASSERT((size_t)new < (size + off));
1.2       simonb   1069:        return new;
                   1070: }
                   1071:
/*
 * wapbl_space_used: number of bytes occupied between tail and head in a
 * circular area of avail bytes.  A tail of 0 yields 0 (head is asserted
 * to be 0 as well); head == tail with a non-zero tail yields avail.
 */
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail != 0)
		return ((head + (avail - 1) - tail) % avail) + 1;

	KASSERT(head == 0);
	return 0;
}
                   1082:
/*
 * wapbl_space_free: bytes of a circular area of avail bytes that are not
 * in use, i.e. avail minus wapbl_space_used().
 */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{
	const size_t used = wapbl_space_used(avail, head, tail);

	return avail - used;
}
                   1089:
/*
 * wapbl_advance_head: advance *headp by delta bytes within the circular
 * area described by size and off.  delta must not exceed the free space.
 * If the tail was 0 and the head becomes non-zero, the tail is set to
 * off.  Both *headp and *tailp are written back.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t newhead = *headp;
	off_t newtail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, newhead, newtail));

	newhead = wapbl_advance(size, off, newhead, delta);
	if (newtail == 0 && newhead != 0)
		newtail = off;

	*headp = newhead;
	*tailp = newtail;
}
                   1104:
/*
 * Move the tail of the circular log forward by `delta' bytes.
 *
 * size  - length of the circular area
 * off   - offset handed to wapbl_advance()
 * delta - number of bytes to release; asserted not to exceed the
 *         space in use
 * headp, tailp - in/out head and tail offsets
 *
 * When the advanced tail meets the head the log is considered empty
 * and both offsets are reset to 0.  Both *headp and *tailp are always
 * written.
 *
 * NOTE(review): presumably wapbl_advance() returns its input unchanged
 * for delta == 0.  If so, calling this with delta == 0 on a completely
 * full log (head == tail != 0) resets the log to empty -- confirm that
 * no caller can do that.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	const off_t curhead = *headp;
	off_t newtail;

	KASSERT(delta <= wapbl_space_used(size, curhead, *tailp));
	newtail = wapbl_advance(size, off, *tailp, delta);

	if (newtail == curhead) {
		/* Tail caught up with head: nothing left in the log. */
		*headp = 0;
		*tailp = 0;
		return;
	}
	*headp = curhead;
	*tailp = newtail;
}
                   1120:
                   1121: #ifdef _KERNEL
                   1122:
                   1123: /****************************************************************/
                   1124:
                   1125: /*
                   1126:  * Remove transactions whose buffers are completely flushed to disk.
                   1127:  * Will block until at least minfree space is available.
                   1128:  * only intended to be called from inside wapbl_flush and therefore
                   1129:  * does not protect against commit races with itself or with flush.
                   1130:  */
                   1131: static int
                   1132: wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
                   1133: {
                   1134:        size_t delta;
                   1135:        size_t avail;
                   1136:        off_t head;
                   1137:        off_t tail;
                   1138:        int error = 0;
                   1139:
                   1140:        KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
                   1141:        KASSERT(rw_write_held(&wl->wl_rwlock));
                   1142:
                   1143:        mutex_enter(&wl->wl_mtx);
                   1144:
                   1145:        /*
                   1146:         * First check to see if we have to do a commit
                   1147:         * at all.
                   1148:         */
                   1149:        avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
                   1150:        if (minfree < avail) {
                   1151:                mutex_exit(&wl->wl_mtx);
                   1152:                return 0;
                   1153:        }
                   1154:        minfree -= avail;
                   1155:        while ((wl->wl_error_count == 0) &&
                   1156:            (wl->wl_reclaimable_bytes < minfree)) {
                   1157:                WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
                   1158:                    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
                   1159:                    "minfree=%zd\n",
                   1160:                     &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
                   1161:                    minfree));
                   1162:
                   1163:                cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
                   1164:        }
                   1165:        if (wl->wl_reclaimable_bytes < minfree) {
                   1166:                KASSERT(wl->wl_error_count);
                   1167:                /* XXX maybe get actual error from buffer instead someday? */
                   1168:                error = EIO;
                   1169:        }
                   1170:        head = wl->wl_head;
                   1171:        tail = wl->wl_tail;
                   1172:        delta = wl->wl_reclaimable_bytes;
                   1173:
                   1174:        /* If all of of the entries are flushed, then be sure to keep
                   1175:         * the reserved bytes reserved.  Watch out for discarded transactions,
                   1176:         * which could leave more bytes reserved than are reclaimable.
                   1177:         */
                   1178:        if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
                   1179:            (delta >= wl->wl_reserved_bytes)) {
                   1180:                delta -= wl->wl_reserved_bytes;
                   1181:        }
                   1182:        wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
                   1183:                           &tail);
                   1184:        KDASSERT(wl->wl_reserved_bytes <=
                   1185:                wapbl_space_used(wl->wl_circ_size, head, tail));
                   1186:        mutex_exit(&wl->wl_mtx);
                   1187:
                   1188:        if (error)
                   1189:                return error;
                   1190:
                   1191:        if (waitonly)
                   1192:                return 0;
                   1193:
                   1194:        /*
                   1195:         * This is where head, tail and delta are unprotected
                   1196:         * from races against itself or flush.  This is ok since
                   1197:         * we only call this routine from inside flush itself.
                   1198:         *
                   1199:         * XXX: how can it race against itself when accessed only
                   1200:         * from behind the write-locked rwlock?
                   1201:         */
                   1202:        error = wapbl_write_commit(wl, head, tail);
                   1203:        if (error)
                   1204:                return error;
                   1205:
                   1206:        wl->wl_head = head;
                   1207:        wl->wl_tail = tail;
                   1208:
                   1209:        mutex_enter(&wl->wl_mtx);
                   1210:        KASSERT(wl->wl_reclaimable_bytes >= delta);
                   1211:        wl->wl_reclaimable_bytes -= delta;
                   1212:        mutex_exit(&wl->wl_mtx);
                   1213:        WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
                   1214:            ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
                   1215:            curproc->p_pid, curlwp->l_lid, delta));
                   1216:
                   1217:        return 0;
                   1218: }
                   1219:
                   1220: /****************************************************************/
                   1221:
                         /*
                          * I/O completion handler for a buffer written out by wapbl_flush(),
                          * which installs this function as bp->b_iodone and stores the owning
                          * wapbl_entry in bp->b_private.
                          *
                          * The buffer is released and the entry's count of outstanding
                          * buffers is decremented.  Once an entry has no outstanding buffers,
                          * all consecutive finished entries at the front of wl_entries are
                          * freed, their bytes are added to wl_reclaimable_bytes and waiters
                          * on wl_reclaimable_cv are woken.
                          *
                          * If the entry is no longer attached to a log (we_wapbl is NULL),
                          * only the entry itself is accounted for and freed when its last
                          * buffer completes.
                          */
                   1222: void
                   1223: wapbl_biodone(struct buf *bp)
                   1224: {
                   1225:        struct wapbl_entry *we = bp->b_private;
                   1226:        struct wapbl *wl = we->we_wapbl;
1.53    ! hannken  1227: #ifdef WAPBL_DEBUG_BUFBYTES
                                /* Saved now: bp is released before the accounting uses it. */
        !          1228:        const int bufsize = bp->b_bufsize;
        !          1229: #endif
1.2       simonb   1230:
                   1231:        /*
                   1232:         * Handle possible flushing of buffers after log has been
                   1233:         * decommissioned.
                   1234:         */
                   1235:        if (!wl) {
                   1236:                KASSERT(we->we_bufcount > 0);
                   1237:                we->we_bufcount--;
                   1238: #ifdef WAPBL_DEBUG_BUFBYTES
1.53    ! hannken  1239:                KASSERT(we->we_unsynced_bufbytes >= bufsize);
        !          1240:                we->we_unsynced_bufbytes -= bufsize;
1.2       simonb   1241: #endif
                   1242:
                                        /* Last outstanding buffer: the entry can go away. */
                   1243:                if (we->we_bufcount == 0) {
                   1244: #ifdef WAPBL_DEBUG_BUFBYTES
                   1245:                        KASSERT(we->we_unsynced_bufbytes == 0);
                   1246: #endif
1.51      para     1247:                        pool_put(&wapbl_entry_pool, we);
1.2       simonb   1248:                }
                   1249:
                   1250:                brelse(bp, 0);
                   1251:                return;
                   1252:        }
                   1253:
                   1254: #ifdef ohbother
1.44      uebayasi 1255:        KDASSERT(bp->b_oflags & BO_DONE);
                   1256:        KDASSERT(!(bp->b_oflags & BO_DELWRI));
1.2       simonb   1257:        KDASSERT(bp->b_flags & B_ASYNC);
1.44      uebayasi 1258:        KDASSERT(bp->b_cflags & BC_BUSY);
1.2       simonb   1259:        KDASSERT(!(bp->b_flags & B_LOCKED));
                   1260:        KDASSERT(!(bp->b_flags & B_READ));
1.44      uebayasi 1261:        KDASSERT(!(bp->b_cflags & BC_INVAL));
                   1262:        KDASSERT(!(bp->b_cflags & BC_NOCACHE));
1.2       simonb   1263: #endif
                   1264:
                   1265:        if (bp->b_error) {
                   1266: #ifdef notyet /* Can't currently handle possible dirty buffer reuse */
1.26      apb      1267:                /*
                   1268:                 * XXXpooka: interfaces not fully updated
                   1269:                 * Note: this was not enabled in the original patch
                   1270:                 * against netbsd4 either.  I don't know if comment
                   1271:                 * above is true or not.
                   1272:                 */
1.2       simonb   1273:
                   1274:                /*
                   1275:                 * If an error occurs, report the error and leave the
                   1276:                 * buffer as a delayed write on the LRU queue.
                   1277:                 * restarting the write would likely result in
                   1278:                 * an error spinloop, so let it be done harmlessly
                   1279:                 * by the syncer.
                   1280:                 */
                   1281:                bp->b_flags &= ~(B_DONE);
                   1282:                simple_unlock(&bp->b_interlock);
                   1283:
                   1284:                if (we->we_error == 0) {
                   1285:                        mutex_enter(&wl->wl_mtx);
                   1286:                        wl->wl_error_count++;
                   1287:                        mutex_exit(&wl->wl_mtx);
                   1288:                        cv_broadcast(&wl->wl_reclaimable_cv);
                   1289:                }
                   1290:                we->we_error = bp->b_error;
                   1291:                bp->b_error = 0;
                   1292:                brelse(bp);
                   1293:                return;
                   1294: #else
                   1295:                /* For now, just mark the log permanently errored out */
                   1296:
                                        /*
                                         * Only the first error bumps the count; waiters on
                                         * wl_reclaimable_cv are woken so they can notice it.
                                         */
                   1297:                mutex_enter(&wl->wl_mtx);
                   1298:                if (wl->wl_error_count == 0) {
                   1299:                        wl->wl_error_count++;
                   1300:                        cv_broadcast(&wl->wl_reclaimable_cv);
                   1301:                }
                   1302:                mutex_exit(&wl->wl_mtx);
                   1303: #endif
                   1304:        }
                   1305:
1.53    ! hannken  1306:        /*
        !          1307:         * Release the buffer here. wapbl_flush() may wait for the
        !          1308:         * log to become empty and we better unbusy the buffer before
        !          1309:         * wapbl_flush() returns.
        !          1310:         */
        !          1311:        brelse(bp, 0);
        !          1312:
                                /* bp must not be touched below this point; only we and wl are. */
1.2       simonb   1313:        mutex_enter(&wl->wl_mtx);
                   1314:
                   1315:        KASSERT(we->we_bufcount > 0);
                   1316:        we->we_bufcount--;
                   1317: #ifdef WAPBL_DEBUG_BUFBYTES
1.53    ! hannken  1318:        KASSERT(we->we_unsynced_bufbytes >= bufsize);
        !          1319:        we->we_unsynced_bufbytes -= bufsize;
        !          1320:        KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
        !          1321:        wl->wl_unsynced_bufbytes -= bufsize;
1.2       simonb   1322: #endif
                   1323:
                   1324:        /*
                   1325:         * If the current transaction can be reclaimed, start
                   1326:         * at the beginning and reclaim any consecutive reclaimable
                   1327:         * transactions.  If we successfully reclaim anything,
                   1328:         * then wakeup anyone waiting for the reclaim.
                   1329:         */
                   1330:        if (we->we_bufcount == 0) {
                   1331:                size_t delta = 0;
                   1332:                int errcnt = 0;
                   1333: #ifdef WAPBL_DEBUG_BUFBYTES
                   1334:                KDASSERT(we->we_unsynced_bufbytes == 0);
                   1335: #endif
                   1336:                /*
                   1337:                 * clear any posted error, since the buffer it came from
                   1338:                 * has successfully flushed by now
                   1339:                 */
                                        /*
                                         * Note: `we' is reused as the iterator here, and the
                                         * walk stops at the first entry that still has buffers
                                         * outstanding, so the entry that just completed is only
                                         * freed if every entry ahead of it is finished too.
                                         */
                   1340:                while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
                   1341:                       (we->we_bufcount == 0)) {
                   1342:                        delta += we->we_reclaimable_bytes;
                   1343:                        if (we->we_error)
                   1344:                                errcnt++;
                   1345:                        SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
1.51      para     1346:                        pool_put(&wapbl_entry_pool, we);
1.2       simonb   1347:                }
                   1348:
                   1349:                if (delta) {
                   1350:                        wl->wl_reclaimable_bytes += delta;
                   1351:                        KASSERT(wl->wl_error_count >= errcnt);
                   1352:                        wl->wl_error_count -= errcnt;
                   1353:                        cv_broadcast(&wl->wl_reclaimable_cv);
                   1354:                }
                   1355:        }
                   1356:
                   1357:        mutex_exit(&wl->wl_mtx);
                   1358: }
                   1359:
                   1360: /*
                   1361:  * Write transactions to disk + start I/O for contents
                          *
                          * wl      - the log to flush
                          * waitfor - if non-zero, additionally call wapbl_truncate() for
                          *           wl_circ_size - wl_reserved_bytes before returning; if
                          *           zero and no buffers are queued, return 0 right away
                          *           without taking wl_rwlock.
                          *
                          * Takes wl_rwlock as writer for the duration of the flush.  Returns
                          * 0 on success or the error from wapbl_truncate(), one of the
                          * wapbl_write_*() helpers or wapbl_write_commit(); on error the
                          * wl_flush_abort callback is invoked before returning.
                   1362:  */
                   1363: int
                   1364: wapbl_flush(struct wapbl *wl, int waitfor)
                   1365: {
                   1366:        struct buf *bp;
                   1367:        struct wapbl_entry *we;
                   1368:        off_t off;
                   1369:        off_t head;
                   1370:        off_t tail;
                   1371:        size_t delta = 0;
                   1372:        size_t flushsize;
                   1373:        size_t reserved;
                   1374:        int error = 0;
                   1375:
                   1376:        /*
                   1377:         * Do a quick check to see if a full flush can be skipped
                   1378:         * This assumes that the flush callback does not need to be called
                   1379:         * unless there are other outstanding bufs.
                   1380:         */
                   1381:        if (!waitfor) {
                   1382:                size_t nbufs;
                   1383:                mutex_enter(&wl->wl_mtx);       /* XXX need mutex here to
                   1384:                                                   protect the KASSERTS */
                   1385:                nbufs = wl->wl_bufcount;
                   1386:                KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
                   1387:                KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
                   1388:                mutex_exit(&wl->wl_mtx);
                   1389:                if (nbufs == 0)
                   1390:                        return 0;
                   1391:        }
                   1392:
                   1393:        /*
                   1394:         * XXX we may consider using LK_UPGRADE here
                   1395:         * if we want to call flush from inside a transaction
                   1396:         */
                   1397:        rw_enter(&wl->wl_rwlock, RW_WRITER);
                   1398:        wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
                   1399:            wl->wl_dealloccnt);
                   1400:
                   1401:        /*
                   1402:         * Now that we are fully locked and flushed,
                   1403:         * do another check for nothing to do.
                   1404:         */
                   1405:        if (wl->wl_bufcount == 0) {
                   1406:                goto out;
                   1407:        }
                   1408:
                   1409: #if 0
                   1410:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1411:                     ("wapbl_flush thread %d.%d flushing entries with "
                   1412:                      "bufcount=%zu bufbytes=%zu\n",
                   1413:                      curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
                   1414:                      wl->wl_bufbytes));
                   1415: #endif
                   1416:
                   1417:        /* Calculate amount of space needed to flush */
                   1418:        flushsize = wapbl_transaction_len(wl);
1.39      christos 1419:        if (wapbl_verbose_commit) {
                   1420:                struct timespec ts;
                   1421:                getnanotime(&ts);
1.43      nakayama 1422:                printf("%s: %lld.%09ld this transaction = %zu bytes\n",
1.39      christos 1423:                    __func__, (long long)ts.tv_sec,
                   1424:                    (long)ts.tv_nsec, flushsize);
                   1425:        }
1.2       simonb   1426:
                   1427:        if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
                   1428:                /*
                   1429:                 * XXX this could be handled more gracefully, perhaps place
                   1430:                 * only a partial transaction in the log and allow the
                   1431:                 * remaining to flush without the protection of the journal.
                   1432:                 */
                   1433:                panic("wapbl_flush: current transaction too big to flush\n");
                   1434:        }
                   1435:
                                /* Make room in the log for this transaction. */
                   1436:        error = wapbl_truncate(wl, flushsize, 0);
                   1437:        if (error)
                   1438:                goto out2;
                   1439:
                                /*
                                 * Write the transaction starting at the current head; each
                                 * helper is handed &off so it can move the write position.
                                 */
                   1440:        off = wl->wl_head;
                   1441:        KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
                   1442:                              (off < wl->wl_circ_off + wl->wl_circ_size)));
                   1443:        error = wapbl_write_blocks(wl, &off);
                   1444:        if (error)
                   1445:                goto out2;
                   1446:        error = wapbl_write_revocations(wl, &off);
                   1447:        if (error)
                   1448:                goto out2;
                   1449:        error = wapbl_write_inodes(wl, &off);
                   1450:        if (error)
                   1451:                goto out2;
                   1452:
                   1453:        reserved = 0;
                   1454:        if (wl->wl_inohashcnt)
                   1455:                reserved = wapbl_transaction_inodes_len(wl);
                   1456:
                   1457:        head = wl->wl_head;
                   1458:        tail = wl->wl_tail;
                   1459:
                                /*
                                 * Advance a local copy of the head by the transaction size; it
                                 * must end up where the write helpers left `off'.
                                 */
                   1460:        wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
                   1461:            &head, &tail);
                   1462: #ifdef WAPBL_DEBUG
                   1463:        if (head != off) {
                   1464:                panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
                   1465:                      " off=%"PRIdMAX" flush=%zu\n",
                   1466:                      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
                   1467:                      flushsize);
                   1468:        }
                   1469: #else
                   1470:        KASSERT(head == off);
                   1471: #endif
                   1472:
                   1473:        /* Opportunistically move the tail forward if we can */
                   1474:        if (!wapbl_lazy_truncate) {
                   1475:                mutex_enter(&wl->wl_mtx);
                   1476:                delta = wl->wl_reclaimable_bytes;
                   1477:                mutex_exit(&wl->wl_mtx);
                   1478:                wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
                   1479:                    &head, &tail);
                   1480:        }
                   1481:
                   1482:        error = wapbl_write_commit(wl, head, tail);
                   1483:        if (error)
                   1484:                goto out2;
                   1485:
                                /*
                                 * Entry that tracks this transaction's buffers until their
                                 * writes complete (see wapbl_biodone()).
                                 */
1.51      para     1486:        we = pool_get(&wapbl_entry_pool, PR_WAITOK);
1.2       simonb   1487:
                   1488: #ifdef WAPBL_DEBUG_BUFBYTES
                   1489:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1490:                ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
                   1491:                 " unsynced=%zu"
                   1492:                 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
                   1493:                 "inodes=%d\n",
                   1494:                 curproc->p_pid, curlwp->l_lid, flushsize, delta,
                   1495:                 wapbl_space_used(wl->wl_circ_size, head, tail),
                   1496:                 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
                   1497:                 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
                   1498:                 wl->wl_inohashcnt));
                   1499: #else
                   1500:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1501:                ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
                   1502:                 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
                   1503:                 "inodes=%d\n",
                   1504:                 curproc->p_pid, curlwp->l_lid, flushsize, delta,
                   1505:                 wapbl_space_used(wl->wl_circ_size, head, tail),
                   1506:                 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
                   1507:                 wl->wl_dealloccnt, wl->wl_inohashcnt));
                   1508: #endif
                   1509:
                   1510:
                                /*
                                 * Publish the new log state and queue the entry.  Note the
                                 * lock order used here: bufcache_lock first, then wl_mtx.
                                 */
                   1511:        mutex_enter(&bufcache_lock);
                   1512:        mutex_enter(&wl->wl_mtx);
                   1513:
                   1514:        wl->wl_reserved_bytes = reserved;
                   1515:        wl->wl_head = head;
                   1516:        wl->wl_tail = tail;
                   1517:        KASSERT(wl->wl_reclaimable_bytes >= delta);
                   1518:        wl->wl_reclaimable_bytes -= delta;
                   1519:        wl->wl_dealloccnt = 0;
                   1520: #ifdef WAPBL_DEBUG_BUFBYTES
                   1521:        wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
                   1522: #endif
                   1523:
                   1524:        we->we_wapbl = wl;
                   1525:        we->we_bufcount = wl->wl_bufcount;
                   1526: #ifdef WAPBL_DEBUG_BUFBYTES
                   1527:        we->we_unsynced_bufbytes = wl->wl_bufbytes;
                   1528: #endif
                   1529:        we->we_reclaimable_bytes = flushsize;
                   1530:        we->we_error = 0;
                   1531:        SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
                   1532:
                   1533:        /*
                   1534:         * this flushes bufs in reverse order than they were queued
                   1535:         * it shouldn't matter, but if we care we could use TAILQ instead.
                   1536:         * XXX Note they will get put on the lru queue when they flush
                   1537:         * so we might actually want to change this to preserve order.
                   1538:         */
                   1539:        while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
                                        /* Could not mark it busy: retry from the list head. */
                   1540:                if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
                   1541:                        continue;
                   1542:                }
                   1543:                bp->b_iodone = wapbl_biodone;
                   1544:                bp->b_private = we;
                   1545:                bremfree(bp);
                   1546:                wapbl_remove_buf_locked(wl, bp);
                                        /* Both locks are dropped around the write itself. */
                   1547:                mutex_exit(&wl->wl_mtx);
                   1548:                mutex_exit(&bufcache_lock);
                   1549:                bawrite(bp);
                   1550:                mutex_enter(&bufcache_lock);
                   1551:                mutex_enter(&wl->wl_mtx);
                   1552:        }
                   1553:        mutex_exit(&wl->wl_mtx);
                   1554:        mutex_exit(&bufcache_lock);
                   1555:
                   1556: #if 0
                   1557:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1558:                     ("wapbl_flush thread %d.%d done flushing entries...\n",
                   1559:                     curproc->p_pid, curlwp->l_lid));
                   1560: #endif
                   1561:
                   1562:  out:
                   1563:
                   1564:        /*
                   1565:         * If the waitfor flag is set, don't return until everything is
                   1566:         * fully flushed and the on disk log is empty.
                   1567:         */
                   1568:        if (waitfor) {
                   1569:                error = wapbl_truncate(wl, wl->wl_circ_size -
                   1570:                        wl->wl_reserved_bytes, wapbl_lazy_truncate);
                   1571:        }
                   1572:
                   1573:  out2:
                                /*
                                 * Any failure, including one from the waitfor truncate above,
                                 * is reported to the wl_flush_abort callback.
                                 */
                   1574:        if (error) {
                   1575:                wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
                   1576:                    wl->wl_dealloclens, wl->wl_dealloccnt);
                   1577:        }
                   1578:
                   1579: #ifdef WAPBL_DEBUG_PRINT
                   1580:        if (error) {
                   1581:                pid_t pid = -1;
                   1582:                lwpid_t lid = -1;
                   1583:                if (curproc)
                   1584:                        pid = curproc->p_pid;
                   1585:                if (curlwp)
                   1586:                        lid = curlwp->l_lid;
                   1587:                mutex_enter(&wl->wl_mtx);
                   1588: #ifdef WAPBL_DEBUG_BUFBYTES
                   1589:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1590:                    ("wapbl_flush: thread %d.%d aborted flush: "
                   1591:                    "error = %d\n"
                   1592:                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                   1593:                    "deallocs=%d inodes=%d\n"
                   1594:                    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
                   1595:                    "unsynced=%zu\n",
                   1596:                    pid, lid, error, wl->wl_bufcount,
                   1597:                    wl->wl_bufbytes, wl->wl_bcount,
                   1598:                    wl->wl_dealloccnt, wl->wl_inohashcnt,
                   1599:                    wl->wl_error_count, wl->wl_reclaimable_bytes,
                   1600:                    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
                   1601:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                   1602:                        WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1603:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                   1604:                             "error = %d, unsynced = %zu\n",
                   1605:                             we->we_bufcount, we->we_reclaimable_bytes,
                   1606:                             we->we_error, we->we_unsynced_bufbytes));
                   1607:                }
                   1608: #else
                   1609:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1610:                    ("wapbl_flush: thread %d.%d aborted flush: "
                   1611:                     "error = %d\n"
                   1612:                     "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                   1613:                     "deallocs=%d inodes=%d\n"
                   1614:                     "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
                   1615:                     pid, lid, error, wl->wl_bufcount,
                   1616:                     wl->wl_bufbytes, wl->wl_bcount,
                   1617:                     wl->wl_dealloccnt, wl->wl_inohashcnt,
                   1618:                     wl->wl_error_count, wl->wl_reclaimable_bytes,
                   1619:                     wl->wl_reserved_bytes));
                   1620:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                   1621:                        WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1622:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                   1623:                             "error = %d\n", we->we_bufcount,
                   1624:                             we->we_reclaimable_bytes, we->we_error));
                   1625:                }
                   1626: #endif
                   1627:                mutex_exit(&wl->wl_mtx);
                   1628:        }
                   1629: #endif
                   1630:
                   1631:        rw_exit(&wl->wl_rwlock);
                   1632:        return error;
                   1633: }
                   1634:
                   1635: /****************************************************************/
                   1636:
                   1637: void
                   1638: wapbl_jlock_assert(struct wapbl *wl)
                   1639: {
                   1640:
1.23      ad       1641:        KASSERT(rw_lock_held(&wl->wl_rwlock));
1.2       simonb   1642: }
                   1643:
                         /*
                          * wapbl_junlock_assert: debug check that the journal lock
                          * (wl_rwlock) of wl is not write-held.  Only rw_write_held() is
                          * tested here; a read hold is not checked by this assertion.
                          */
                   1644: void
                   1645: wapbl_junlock_assert(struct wapbl *wl)
                   1646: {
                   1647:
                   1648:        KASSERT(!rw_write_held(&wl->wl_rwlock));
                   1649: }
                   1650:
                   1651: /****************************************************************/
                   1652:
                   1653: /* locks missing */
                   1654: void
                   1655: wapbl_print(struct wapbl *wl,
                   1656:                int full,
                   1657:                void (*pr)(const char *, ...))
                   1658: {
                   1659:        struct buf *bp;
                   1660:        struct wapbl_entry *we;
                   1661:        (*pr)("wapbl %p", wl);
                   1662:        (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
                   1663:              wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
                   1664:        (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
                   1665:              wl->wl_circ_size, wl->wl_circ_off,
                   1666:              (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
                   1667:        (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
                   1668:              wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
                   1669: #ifdef WAPBL_DEBUG_BUFBYTES
                   1670:        (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
                   1671:              "reserved = %zu errcnt = %d unsynced = %zu\n",
                   1672:              wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
                   1673:              wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
                   1674:                                wl->wl_error_count, wl->wl_unsynced_bufbytes);
                   1675: #else
                   1676:        (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
                   1677:              "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
                   1678:              wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
                   1679:                                wl->wl_error_count);
                   1680: #endif
                   1681:        (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
                   1682:              wl->wl_dealloccnt, wl->wl_dealloclim);
                   1683:        (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
                   1684:              wl->wl_inohashcnt, wl->wl_inohashmask);
                   1685:        (*pr)("entries:\n");
                   1686:        SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                   1687: #ifdef WAPBL_DEBUG_BUFBYTES
                   1688:                (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
                   1689:                      "unsynced = %zu\n",
                   1690:                      we->we_bufcount, we->we_reclaimable_bytes,
                   1691:                      we->we_error, we->we_unsynced_bufbytes);
                   1692: #else
                   1693:                (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
                   1694:                      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
                   1695: #endif
                   1696:        }
                   1697:        if (full) {
                   1698:                int cnt = 0;
                   1699:                (*pr)("bufs =");
                   1700:                LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
                   1701:                        if (!LIST_NEXT(bp, b_wapbllist)) {
                   1702:                                (*pr)(" %p", bp);
                   1703:                        } else if ((++cnt % 6) == 0) {
                   1704:                                (*pr)(" %p,\n\t", bp);
                   1705:                        } else {
                   1706:                                (*pr)(" %p,", bp);
                   1707:                        }
                   1708:                }
                   1709:                (*pr)("\n");
                   1710:
                   1711:                (*pr)("dealloced blks = ");
                   1712:                {
                   1713:                        int i;
                   1714:                        cnt = 0;
                   1715:                        for (i = 0; i < wl->wl_dealloccnt; i++) {
                   1716:                                (*pr)(" %"PRId64":%d,",
                   1717:                                      wl->wl_deallocblks[i],
                   1718:                                      wl->wl_dealloclens[i]);
                   1719:                                if ((++cnt % 4) == 0) {
                   1720:                                        (*pr)("\n\t");
                   1721:                                }
                   1722:                        }
                   1723:                }
                   1724:                (*pr)("\n");
                   1725:
                   1726:                (*pr)("registered inodes = ");
                   1727:                {
                   1728:                        int i;
                   1729:                        cnt = 0;
                   1730:                        for (i = 0; i <= wl->wl_inohashmask; i++) {
                   1731:                                struct wapbl_ino_head *wih;
                   1732:                                struct wapbl_ino *wi;
                   1733:
                   1734:                                wih = &wl->wl_inohash[i];
                   1735:                                LIST_FOREACH(wi, wih, wi_hash) {
                   1736:                                        if (wi->wi_ino == 0)
                   1737:                                                continue;
                   1738:                                        (*pr)(" %"PRId32"/0%06"PRIo32",",
                   1739:                                            wi->wi_ino, wi->wi_mode);
                   1740:                                        if ((++cnt % 4) == 0) {
                   1741:                                                (*pr)("\n\t");
                   1742:                                        }
                   1743:                                }
                   1744:                        }
                   1745:                        (*pr)("\n");
                   1746:                }
                   1747:        }
                   1748: }
                   1749:
                         /*
                          * wapbl_dump: print the full state of a journal with printf, by
                          * calling wapbl_print() with full = 1.  Only built when WAPBL_DEBUG
                          * or DDB is defined.
                          *
                          * With WAPBL_DEBUG, a NULL wl falls back to wapbl_debug_wl.  If wl
                          * is still NULL after that, nothing is printed.
                          */
                   1750: #if defined(WAPBL_DEBUG) || defined(DDB)
                   1751: void
                   1752: wapbl_dump(struct wapbl *wl)
                   1753: {
                   1754: #if defined(WAPBL_DEBUG)
                   1755:        if (!wl)
                   1756:                wl = wapbl_debug_wl;
                   1757: #endif
                   1758:        if (!wl)
                   1759:                return;
                   1760:        wapbl_print(wl, 1, printf);
                   1761: }
                   1762: #endif
                   1763:
                   1764: /****************************************************************/
                   1765:
                         /*
                          * wapbl_register_deallocation: record that len units starting at
                          * block blk have been deallocated, by appending the pair to
                          * wl_deallocblks[]/wl_dealloclens[] under wl_mtx.
                          *
                          * The journal lock must be held (wapbl_jlock_assert).  There is no
                          * error return: the kernel panics when wl_dealloccnt has already
                          * reached wl_dealloclim.
                          */
                   1766: void
                   1767: wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
                   1768: {
                   1769:
                   1770:        wapbl_jlock_assert(wl);
                   1771:
1.38      hannken  1772:        mutex_enter(&wl->wl_mtx);
1.2       simonb   1773:        /* XXX should eventually instead tie this into resource estimation */
1.27      pooka    1774:        /*
                   1775:         * XXX this panic needs locking/mutex analysis and the
                   1776:         * ability to cope with the failure.
                   1777:         */
                   1778:        /* XXX this XXX doesn't have enough XXX */
                   1779:        if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
                   1780:                panic("wapbl_register_deallocation: out of resources");
                   1781:
                                /* Block number and length live in two parallel arrays. */
1.2       simonb   1782:        wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
                   1783:        wl->wl_dealloclens[wl->wl_dealloccnt] = len;
                   1784:        wl->wl_dealloccnt++;
                   1785:        WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
                   1786:            ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
1.38      hannken  1787:        mutex_exit(&wl->wl_mtx);
1.2       simonb   1788: }
                   1789:
                   1790: /****************************************************************/
                   1791:
                         /*
                          * wapbl_inodetrk_init: set up inode tracking for wl.  Creates the
                          * per-journal hash table (wl_inohash/wl_inohashmask) with hashinit()
                          * and takes a reference on the global wapbl_ino_pool; the caller
                          * that raises the reference count to 1 initializes the pool.
                          */
                   1792: static void
                   1793: wapbl_inodetrk_init(struct wapbl *wl, u_int size)
                   1794: {
                   1795:
                   1796:        wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
                   1797:        if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
                   1798:                pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
                   1799:                    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
                   1800:        }
                   1801: }
                   1802:
                         /*
                          * wapbl_inodetrk_free: tear down inode tracking for wl.  Expects
                          * that no inode is registered any more (wl_inohashcnt == 0), frees
                          * the hash table and drops the reference on wapbl_ino_pool; the
                          * caller that drops the last reference destroys the pool.
                          */
                   1803: static void
                   1804: wapbl_inodetrk_free(struct wapbl *wl)
                   1805: {
                   1806:
                   1807:        /* XXX this KASSERT needs locking/mutex analysis */
                   1808:        KASSERT(wl->wl_inohashcnt == 0);
                   1809:        hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
                   1810:        if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
                   1811:                pool_destroy(&wapbl_ino_pool);
                   1812:        }
                   1813: }
                   1814:
                         /*
                          * wapbl_inodetrk_get: look up inode number ino in the inode hash
                          * of wl.  The caller must own wl_mtx.
                          *
                          * Returns the tracking entry, or NULL when ino is not registered.
                          */
                   1815: static struct wapbl_ino *
                   1816: wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
                   1817: {
                   1818:        struct wapbl_ino_head *wih;
                   1819:        struct wapbl_ino *wi;
                   1820:
                   1821:        KASSERT(mutex_owned(&wl->wl_mtx));
                   1822:
                                /* The bucket is selected by the low bits of the inode number. */
                   1823:        wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
                   1824:        LIST_FOREACH(wi, wih, wi_hash) {
                   1825:                if (ino == wi->wi_ino)
                   1826:                        return wi;
                   1827:        }
                   1828:        return NULL;
                   1829: }
                   1830:
                         /*
                          * wapbl_register_inode: start tracking inode ino with mode mode
                          * in the inode hash of wl.  Registering an inode number that is
                          * already tracked is a no-op (the stored mode is left unchanged).
                          */
                   1831: void
                   1832: wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
                   1833: {
                   1834:        struct wapbl_ino_head *wih;
                   1835:        struct wapbl_ino *wi;
                   1836:
                                /*
                                 * Allocate before wl_mtx is taken; the entry is handed back
                                 * to the pool below if it turns out not to be needed.
                                 */
                   1837:        wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
                   1838:
                   1839:        mutex_enter(&wl->wl_mtx);
                   1840:        if (wapbl_inodetrk_get(wl, ino) == NULL) {
                   1841:                wi->wi_ino = ino;
                   1842:                wi->wi_mode = mode;
                   1843:                wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
                   1844:                LIST_INSERT_HEAD(wih, wi, wi_hash);
                   1845:                wl->wl_inohashcnt++;
                   1846:                WAPBL_PRINTF(WAPBL_PRINT_INODE,
                   1847:                    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
                   1848:                mutex_exit(&wl->wl_mtx);
                   1849:        } else {
                                        /* Already registered: release the unused entry. */
                   1850:                mutex_exit(&wl->wl_mtx);
                   1851:                pool_put(&wapbl_ino_pool, wi);
                   1852:        }
                   1853: }
                   1854:
                         /*
                          * wapbl_unregister_inode: stop tracking inode ino in wl.  Does
                          * nothing when the inode is not registered.  The mode argument is
                          * not used by this function.
                          */
                   1855: void
                   1856: wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
                   1857: {
                   1858:        struct wapbl_ino *wi;
                   1859:
                   1860:        mutex_enter(&wl->wl_mtx);
                   1861:        wi = wapbl_inodetrk_get(wl, ino);
                   1862:        if (wi) {
                   1863:                WAPBL_PRINTF(WAPBL_PRINT_INODE,
                   1864:                    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
                   1865:                KASSERT(wl->wl_inohashcnt > 0);
                   1866:                wl->wl_inohashcnt--;
                   1867:                LIST_REMOVE(wi, wi_hash);
                   1868:                mutex_exit(&wl->wl_mtx);
                   1869:
                                        /* The entry is returned to the pool after wl_mtx is dropped. */
                   1870:                pool_put(&wapbl_ino_pool, wi);
                   1871:        } else {
                   1872:                mutex_exit(&wl->wl_mtx);
                   1873:        }
                   1874: }
                   1875:
                   1876: /****************************************************************/
                   1877:
                         /*
                          * wapbl_transaction_inodes_len: number of bytes the inode list of
                          * the current transaction occupies in the log.  Each log block
                          * (1 << wl_log_dev_bshift bytes) holds one inodelist header plus
                          * iph inode records; at least one block is always accounted for,
                          * even when no inode is registered.
                          */
1.30      uebayasi 1878: static inline size_t
1.2       simonb   1879: wapbl_transaction_inodes_len(struct wapbl *wl)
                   1880: {
                   1881:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   1882:        int iph;
                   1883:
                   1884:        /* Calculate number of inodes described in a inodelist header */
                   1885:        iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
                   1886:            sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
                   1887:
                   1888:        KASSERT(iph > 0);
                   1889:
1.39      christos 1890:        return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
1.2       simonb   1891: }
                   1892:
                   1893:
                   1894: /*
                          * Calculate amount of space a transaction will take on disk.
                          *
                          * The result is the sum of:
                          *  - wl_bcount, taken as is;
                          *  - one log block per bph journaled buffers (wl_bufcount);
                          *  - one log block per bph pending deallocations (wl_dealloccnt);
                          *  - the inode list size from wapbl_transaction_inodes_len().
                          */
                   1895: static size_t
                   1896: wapbl_transaction_len(struct wapbl *wl)
                   1897: {
                   1898:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   1899:        size_t len;
                   1900:        int bph;
                   1901:
                   1902:        /* Calculate number of blocks described in a blocklist header */
                   1903:        bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
                   1904:            sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
                   1905:
                   1906:        KASSERT(bph > 0);
                   1907:
                   1908:        len = wl->wl_bcount;
1.39      christos 1909:        len += howmany(wl->wl_bufcount, bph) * blocklen;
                   1910:        len += howmany(wl->wl_dealloccnt, bph) * blocklen;
1.2       simonb   1911:        len += wapbl_transaction_inodes_len(wl);
                   1912:
                   1913:        return len;
                   1914: }
                   1915:
                   1916: /*
1.48      yamt     1917:  * wapbl_cache_sync: issue DIOCCACHESYNC
                          *
                          * Asks the device behind wl_devvp to flush its write cache (forced
                          * sync).  Does nothing and returns 0 when wapbl_flush_disk_cache is
                          * zero.  When wapbl_verbose_commit is 2 or more, the time spent in
                          * the ioctl is printed, tagged with msg.
                          *
                          * Returns the error code of the ioctl (0 on success).
                   1918:  */
                   1919: static int
                   1920: wapbl_cache_sync(struct wapbl *wl, const char *msg)
                   1921: {
                   1922:        const bool verbose = wapbl_verbose_commit >= 2;
                   1923:        struct bintime start_time;
                   1924:        int force = 1;
                   1925:        int error;
                   1926:
                   1927:        if (!wapbl_flush_disk_cache) {
                   1928:                return 0;
                   1929:        }
                   1930:        if (verbose) {
                   1931:                bintime(&start_time);
                   1932:        }
                   1933:        error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
                   1934:            FWRITE, FSCRED);
                   1935:        if (error) {
                                        /*
                                         * Print the device number the same way as the
                                         * verbose message below: widened to uintmax_t so
                                         * that the conversion matches the argument.
                                         */
                   1936:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1937:                    ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
                   1938:                    "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
                   1939:        }
                   1940:        if (verbose) {
                   1941:                struct bintime d;
                   1942:                struct timespec ts;
                   1943:
                                        /* Elapsed time = now - start_time. */
                   1944:                bintime(&d);
                   1945:                bintime_sub(&d, &start_time);
                   1946:                bintime2timespec(&d, &ts);
                   1947:                printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
                   1948:                    msg, (uintmax_t)wl->wl_devvp->v_rdev,
                   1949:                    (uintmax_t)ts.tv_sec, ts.tv_nsec);
                   1950:        }
                   1951:        return error;
                   1952: }
                   1953:
                   1954: /*
1.2       simonb   1955:  * Perform commit operation
                   1956:  *
                   1957:  * Note that generation number incrementation needs to
                   1958:  * be protected against racing with other invocations
1.48      yamt     1959:  * of wapbl_write_commit.  This is ok since this routine
1.2       simonb   1960:  * is only invoked from wapbl_flush
                   1961:  */
                   1962: static int
                   1963: wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
                   1964: {
                   1965:        struct wapbl_wc_header *wc = wl->wl_wc_header;
                   1966:        struct timespec ts;
                   1967:        int error;
1.34      mlelstv  1968:        daddr_t pbn;
1.2       simonb   1969:
1.49      yamt     1970:        /*
                   1971:         * flush disk cache to ensure that blocks we've written are actually
                   1972:         * written to the stable storage before the commit header.
                   1973:         *
                   1974:         * XXX Calc checksum here, instead we do this for now
                   1975:         */
1.48      yamt     1976:        wapbl_cache_sync(wl, "1");
1.2       simonb   1977:
                   1978:        wc->wc_head = head;
                   1979:        wc->wc_tail = tail;
                   1980:        wc->wc_checksum = 0;
                   1981:        wc->wc_version = 1;
                   1982:        getnanotime(&ts);
1.17      yamt     1983:        wc->wc_time = ts.tv_sec;
1.2       simonb   1984:        wc->wc_timensec = ts.tv_nsec;
                   1985:
                   1986:        WAPBL_PRINTF(WAPBL_PRINT_WRITE,
                   1987:            ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
                   1988:            (intmax_t)head, (intmax_t)tail));
                   1989:
                   1990:        /*
1.49      yamt     1991:         * write the commit header.
                   1992:         *
1.2       simonb   1993:         * XXX if generation will rollover, then first zero
                   1994:         * over second commit header before trying to write both headers.
                   1995:         */
                   1996:
1.34      mlelstv  1997:        pbn = wl->wl_logpbn + (wc->wc_generation % 2);
                   1998: #ifdef _KERNEL
                   1999:        pbn = btodb(pbn << wc->wc_log_dev_bshift);
                   2000: #endif
                   2001:        error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
1.2       simonb   2002:        if (error)
                   2003:                return error;
                   2004:
1.49      yamt     2005:        /*
                   2006:         * flush disk cache to ensure that the commit header is actually
                   2007:         * written before meta data blocks.
                   2008:         */
1.48      yamt     2009:        wapbl_cache_sync(wl, "2");
1.2       simonb   2010:
                   2011:        /*
                   2012:         * If the generation number was zero, write it out a second time.
                   2013:         * This handles initialization and generation number rollover
                   2014:         */
                   2015:        if (wc->wc_generation++ == 0) {
                   2016:                error = wapbl_write_commit(wl, head, tail);
                   2017:                /*
                   2018:                 * This panic should be able to be removed if we do the
                   2019:                 * zero'ing mentioned above, and we are certain to roll
                   2020:                 * back generation number on failure.
                   2021:                 */
                   2022:                if (error)
                   2023:                        panic("wapbl_write_commit: error writing duplicate "
                   2024:                              "log header: %d\n", error);
                   2025:        }
                   2026:        return 0;
                   2027: }
                   2028:
                   2029: /*
                          * wapbl_write_blocks: write the journaled buffers on wl_bufs to the
                          * log, starting at offset *offp.
                          *
                          * The buffers are written in groups of at most bph.  Each group is
                          * a blocklist header block (type WAPBL_WC_BLOCKS, built in
                          * wl_wc_scratch), followed by the data of every buffer in the
                          * group, followed by zero padding up to a multiple of the log
                          * block size.  The journal lock must be write-held.
                          *
                          * Returns 0 and stores the new offset value in *offp on success.
                          * On failure the error is returned and *offp is left untouched.
                          */
                   2030: static int
                   2031: wapbl_write_blocks(struct wapbl *wl, off_t *offp)
                   2032: {
                   2033:        struct wapbl_wc_blocklist *wc =
                   2034:            (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
                   2035:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   2036:        int bph;
                   2037:        struct buf *bp;
                   2038:        off_t off = *offp;
                   2039:        int error;
1.7       joerg    2040:        size_t padding;
1.2       simonb   2041:
                   2042:        KASSERT(rw_write_held(&wl->wl_rwlock));
                   2043:
                                /* Number of block descriptors that fit in one header block. */
                   2044:        bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
                   2045:            sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
                   2046:
                   2047:        bp = LIST_FIRST(&wl->wl_bufs);
                   2048:
                   2049:        while (bp) {
                   2050:                int cnt;
                                        /* First buffer of the group, for the data pass below. */
                   2051:                struct buf *obp = bp;
                   2052:
                   2053:                KASSERT(bp->b_flags & B_LOCKED);
                   2054:
                                        /* First pass: describe up to bph buffers in the header. */
                   2055:                wc->wc_type = WAPBL_WC_BLOCKS;
                   2056:                wc->wc_len = blocklen;
                   2057:                wc->wc_blkcount = 0;
                   2058:                while (bp && (wc->wc_blkcount < bph)) {
                   2059:                        /*
                   2060:                         * Make sure all the physical block numbers are up to
                   2061:                         * date.  If this is not always true on a given
                   2062:                         * filesystem, then VOP_BMAP must be called.  We
                   2063:                         * could call VOP_BMAP here, or else in the filesystem
                   2064:                         * specific flush callback, although neither of those
                   2065:                         * solutions allow us to take the vnode lock.  If a
                   2066:                         * filesystem requires that we must take the vnode lock
                   2067:                         * to call VOP_BMAP, then we can probably do it in
                   2068:                         * bwrite when the vnode lock should already be held
                   2069:                         * by the invoking code.
                   2070:                         */
                   2071:                        KASSERT((bp->b_vp->v_type == VBLK) ||
                   2072:                                 (bp->b_blkno != bp->b_lblkno));
                   2073:                        KASSERT(bp->b_blkno > 0);
                   2074:
                   2075:                        wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
                   2076:                        wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
                   2077:                        wc->wc_len += bp->b_bcount;
                   2078:                        wc->wc_blkcount++;
                   2079:                        bp = LIST_NEXT(bp, b_wapbllist);
                   2080:                }
                                        /* Round the record length up to a whole log block. */
1.7       joerg    2081:                if (wc->wc_len % blocklen != 0) {
                   2082:                        padding = blocklen - wc->wc_len % blocklen;
                   2083:                        wc->wc_len += padding;
                   2084:                } else {
                   2085:                        padding = 0;
                   2086:                }
                   2087:
1.2       simonb   2088:                WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1.7       joerg    2089:                    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
                   2090:                    wc->wc_len, padding, (intmax_t)off));
1.2       simonb   2091:
                   2092:                error = wapbl_circ_write(wl, wc, blocklen, &off);
                   2093:                if (error)
                   2094:                        return error;
                                        /* Second pass: write the data of the same buffers. */
                   2095:                bp = obp;
                   2096:                cnt = 0;
                   2097:                while (bp && (cnt++ < bph)) {
                   2098:                        error = wapbl_circ_write(wl, bp->b_data,
                   2099:                            bp->b_bcount, &off);
                   2100:                        if (error)
                   2101:                                return error;
                   2102:                        bp = LIST_NEXT(bp, b_wapbllist);
                   2103:                }
1.7       joerg    2104:                if (padding) {
                   2105:                        void *zero;
                   2106:
                                                /* Temporary zero-filled buffer for the padding. */
1.51      para     2107:                        zero = wapbl_alloc(padding);
1.7       joerg    2108:                        memset(zero, 0, padding);
                   2109:                        error = wapbl_circ_write(wl, zero, padding, &off);
1.18      yamt     2110:                        wapbl_free(zero, padding);
1.7       joerg    2111:                        if (error)
                   2112:                                return error;
                   2113:                }
1.2       simonb   2114:        }
                   2115:        *offp = off;
                   2116:        return 0;
                   2117: }
                   2118:
                         /*
                          * wapbl_write_revocations: write the pending deallocations
                          * (wl_deallocblks[]/wl_dealloclens[]) to the log as records of type
                          * WAPBL_WC_REVOCATIONS, starting at offset *offp.  Each record is a
                          * single log block describing up to bph entries.  Nothing is
                          * written when wl_dealloccnt is zero.
                          *
                          * Returns 0 and stores the new offset value in *offp on success.
                          * On failure the error is returned and *offp is left untouched.
                          */
                   2119: static int
                   2120: wapbl_write_revocations(struct wapbl *wl, off_t *offp)
                   2121: {
                   2122:        struct wapbl_wc_blocklist *wc =
                   2123:            (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
                   2124:        int i;
                   2125:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   2126:        int bph;
                   2127:        off_t off = *offp;
                   2128:        int error;
                   2129:
                   2130:        if (wl->wl_dealloccnt == 0)
                   2131:                return 0;
                   2132:
                                /* Number of block descriptors that fit in one header block. */
                   2133:        bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
                   2134:            sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
                   2135:
                   2136:        i = 0;
                   2137:        while (i < wl->wl_dealloccnt) {
                   2138:                wc->wc_type = WAPBL_WC_REVOCATIONS;
                   2139:                wc->wc_len = blocklen;
                   2140:                wc->wc_blkcount = 0;
                   2141:                while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
                   2142:                        wc->wc_blocks[wc->wc_blkcount].wc_daddr =
                   2143:                            wl->wl_deallocblks[i];
                   2144:                        wc->wc_blocks[wc->wc_blkcount].wc_dlen =
                   2145:                            wl->wl_dealloclens[i];
                   2146:                        wc->wc_blkcount++;
                   2147:                        i++;
                   2148:                }
                   2149:                WAPBL_PRINTF(WAPBL_PRINT_WRITE,
                   2150:                    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
                   2151:                    wc->wc_len, (intmax_t)off));
                   2152:                error = wapbl_circ_write(wl, wc, blocklen, &off);
                   2153:                if (error)
                   2154:                        return error;
                   2155:        }
                   2156:        *offp = off;
                   2157:        return 0;
                   2158: }
                   2159:
                         /*
                          * wapbl_write_inodes: write the registered inodes of wl to the log
                          * as records of type WAPBL_WC_INODES, starting at offset *offp.
                          * Each record is a single log block describing up to iph inodes.
                          * At least one record is always written, even when no inode is
                          * registered; wc_clear is set on the first record only.
                          *
                          * Returns 0 and stores the new offset value in *offp on success.
                          * On failure the error is returned and *offp is left untouched.
                          */
                   2160: static int
                   2161: wapbl_write_inodes(struct wapbl *wl, off_t *offp)
                   2162: {
                   2163:        struct wapbl_wc_inodelist *wc =
                   2164:            (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
                   2165:        int i;
1.14      joerg    2166:        int blocklen = 1 << wl->wl_log_dev_bshift;
1.2       simonb   2167:        off_t off = *offp;
                   2168:        int error;
                   2169:
                   2170:        struct wapbl_ino_head *wih;
                   2171:        struct wapbl_ino *wi;
                   2172:        int iph;
                   2173:
                                /* Number of inode records that fit in one header block. */
                   2174:        iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
                   2175:            sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
                   2176:
                                /*
                                 * i counts the inodes emitted so far.  wih/wi form a cursor
                                 * over the hash table that is kept across records: wih is
                                 * the next bucket to visit, wi the next entry in the current
                                 * bucket (NULL when that bucket is exhausted).
                                 */
                   2177:        i = 0;
                   2178:        wih = &wl->wl_inohash[0];
                   2179:        wi = NULL;
                   2180:        do {
                   2181:                wc->wc_type = WAPBL_WC_INODES;
                   2182:                wc->wc_len = blocklen;
                   2183:                wc->wc_inocnt = 0;
                   2184:                wc->wc_clear = (i == 0);
                   2185:                while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
                                                /* Advance to the next non-empty bucket. */
                   2186:                        while (!wi) {
                   2187:                                KASSERT((wih - &wl->wl_inohash[0])
                   2188:                                    <= wl->wl_inohashmask);
                   2189:                                wi = LIST_FIRST(wih++);
                   2190:                        }
                   2191:                        wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
                   2192:                        wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
                   2193:                        wc->wc_inocnt++;
                   2194:                        i++;
                   2195:                        wi = LIST_NEXT(wi, wi_hash);
                   2196:                }
                   2197:                WAPBL_PRINTF(WAPBL_PRINT_WRITE,
                   2198:                    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
                   2199:                    wc->wc_len, (intmax_t)off));
                   2200:                error = wapbl_circ_write(wl, wc, blocklen, &off);
                   2201:                if (error)
                   2202:                        return error;
                   2203:        } while (i < wl->wl_inohashcnt);
                   2204:
                   2205:        *offp = off;
                   2206:        return 0;
                   2207: }
                   2208:
                   2209: #endif /* _KERNEL */
                   2210:
                   2211: /****************************************************************/
                   2212:
                   /*
                    * Replay hash table entry: associates a disk block address (wb_blk)
                    * with an offset in the log (wb_off).  Entries are chained per bucket
                    * through wb_hash.
                    */
                   2213: struct wapbl_blk {
                   2214:        LIST_ENTRY(wapbl_blk) wb_hash;
                   2215:        daddr_t wb_blk;
                   2216:        off_t wb_off; /* Offset of this block in the log */
                   2217: };
                   /* Lower bound applied to the size requested from wapbl_blkhash_init(). */
                   2218: #define        WAPBL_BLKPOOL_MIN 83
                   2219:
                   /*
                    * wapbl_blkhash_init: allocate the block hash table of wr.
                    *
                    * size is the requested table size; values below WAPBL_BLKPOOL_MIN are
                    * raised to it.  wr->wr_blkhash must be null on entry (asserted).
                    * In the kernel the table comes from hashinit(); otherwise it is built
                    * by hand with the bucket count rounded up to a power of two, so that
                    * wr_blkhashmask (count - 1) can be used to mask a block number.
                    */
                   2220: static void
                   2221: wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
                   2222: {
                   2223:        if (size < WAPBL_BLKPOOL_MIN)
                   2224:                size = WAPBL_BLKPOOL_MIN;
                   2225:        KASSERT(wr->wr_blkhash == 0);
                   2226: #ifdef _KERNEL
                   2227:        wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
                   2228: #else /* ! _KERNEL */
                   2229:        /* Manually implement hashinit */
                   2230:        {
1.25      lukem    2231:                unsigned long i, hashsize;
                                        /* Smallest power of two that is >= size. */
1.2       simonb   2232:                for (hashsize = 1; hashsize < size; hashsize <<= 1)
                   2233:                        continue;
1.51      para     2234:                wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
1.37      drochner 2235:                for (i = 0; i < hashsize; i++)
1.2       simonb   2236:                        LIST_INIT(&wr->wr_blkhash[i]);
                   2237:                wr->wr_blkhashmask = hashsize - 1;
                   2238:        }
                   2239: #endif /* ! _KERNEL */
                   2240: }
                   2241:
                   /*
                    * wapbl_blkhash_free: release the bucket array of wr's block hash.
                    *
                    * The table must already be empty (wr_blkhashcnt == 0 is asserted);
                    * individual entries are not freed here.  Outside the kernel the
                    * allocation size is recomputed from wr_blkhashmask + 1 buckets.
                    */
                   2242: static void
                   2243: wapbl_blkhash_free(struct wapbl_replay *wr)
                   2244: {
                   2245:        KASSERT(wr->wr_blkhashcnt == 0);
                   2246: #ifdef _KERNEL
                   2247:        hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
                   2248: #else /* ! _KERNEL */
1.18      yamt     2249:        wapbl_free(wr->wr_blkhash,
                   2250:            (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
1.2       simonb   2251: #endif /* ! _KERNEL */
                   2252: }
                   2253:
                   /*
                    * wapbl_blkhash_get: look up the hash entry for disk block blk.
                    *
                    * The bucket is selected with blk & wr_blkhashmask and its chain is
                    * searched for an exact wb_blk match.  Returns the entry, or a null
                    * pointer when blk is not in the table.
                    */
                   2254: static struct wapbl_blk *
                   2255: wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
                   2256: {
                   2257:        struct wapbl_blk_head *wbh;
                   2258:        struct wapbl_blk *wb;
                   2259:        wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
                   2260:        LIST_FOREACH(wb, wbh, wb_hash) {
                   2261:                if (blk == wb->wb_blk)
                   2262:                        return wb;
                   2263:        }
                   2264:        return 0;
                   2265: }
                   2266:
                   /*
                    * wapbl_blkhash_ins: associate disk block blk with log offset off.
                    *
                    * If blk already has an entry its offset is overwritten; otherwise a
                    * new entry is allocated, inserted at the head of its bucket and
                    * wr_blkhashcnt is incremented.
                    */
                   2267: static void
                   2268: wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
                   2269: {
                   2270:        struct wapbl_blk_head *wbh;
                   2271:        struct wapbl_blk *wb;
                   2272:        wb = wapbl_blkhash_get(wr, blk);
                   2273:        if (wb) {
                   2274:                KASSERT(wb->wb_blk == blk);
                   2275:                wb->wb_off = off;
                   2276:        } else {
                                        /*
                                         * NOTE(review): the result of wapbl_alloc() is used
                                         * unchecked; presumably it cannot fail - confirm.
                                         */
1.51      para     2277:                wb = wapbl_alloc(sizeof(*wb));
1.2       simonb   2278:                wb->wb_blk = blk;
                   2279:                wb->wb_off = off;
                   2280:                wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
                   2281:                LIST_INSERT_HEAD(wbh, wb, wb_hash);
                   2282:                wr->wr_blkhashcnt++;
                   2283:        }
                   2284: }
                   2285:
                   /*
                    * wapbl_blkhash_rem: drop the hash entry for disk block blk, if any.
                    *
                    * A matching entry is unlinked and freed and wr_blkhashcnt is
                    * decremented.  Nothing happens when blk is not in the table.
                    */
                   2286: static void
                   2287: wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
                   2288: {
                   2289:        struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
                   2290:        if (wb) {
                   2291:                KASSERT(wr->wr_blkhashcnt > 0);
                   2292:                wr->wr_blkhashcnt--;
                   2293:                LIST_REMOVE(wb, wb_hash);
1.18      yamt     2294:                wapbl_free(wb, sizeof(*wb));
1.2       simonb   2295:        }
                   2296: }
                   2297:
                   /*
                    * wapbl_blkhash_clear: unlink and free every entry in every bucket.
                    *
                    * The bucket array itself stays allocated.  On return wr_blkhashcnt
                    * is 0 (asserted).
                    */
                   2298: static void
                   2299: wapbl_blkhash_clear(struct wapbl_replay *wr)
                   2300: {
1.25      lukem    2301:        unsigned long i;
                                /* The mask is bucket count - 1, hence the inclusive bound. */
1.2       simonb   2302:        for (i = 0; i <= wr->wr_blkhashmask; i++) {
                   2303:                struct wapbl_blk *wb;
                   2304:
                   2305:                while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
                   2306:                        KASSERT(wr->wr_blkhashcnt > 0);
                   2307:                        wr->wr_blkhashcnt--;
                   2308:                        LIST_REMOVE(wb, wb_hash);
1.18      yamt     2309:                        wapbl_free(wb, sizeof(*wb));
1.2       simonb   2310:                }
                   2311:        }
                   2312:        KASSERT(wr->wr_blkhashcnt == 0);
                   2313: }
                   2314:
                   2315: /****************************************************************/
                   2316:
                   /*
                    * wapbl_circ_read: read len bytes of the circular log area into data,
                    * starting at log offset *offp.
                    *
                    * The circular area covers [wr_circ_off, wr_circ_off + wr_circ_size).
                    * len must be a multiple of the log device block size (asserted).  An
                    * offset below wr_circ_off is treated as wr_circ_off.  A request that
                    * runs past the end of the area is split into two reads, the second
                    * starting again at wr_circ_off.
                    *
                    * Returns 0 and stores the offset following the data in *offp (wrapped
                    * to wr_circ_off when it reaches the end of the area); on failure
                    * returns the error from wapbl_read() and leaves *offp unchanged.
                    */
                   2317: static int
                   2318: wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
                   2319: {
                   2320:        size_t slen;
                   2321:        off_t off = *offp;
                   2322:        int error;
1.34      mlelstv  2323:        daddr_t pbn;
1.2       simonb   2324:
1.14      joerg    2325:        KASSERT(((len >> wr->wr_log_dev_bshift) <<
                   2326:            wr->wr_log_dev_bshift) == len);
1.34      mlelstv  2327:
1.14      joerg    2328:        if (off < wr->wr_circ_off)
                   2329:                off = wr->wr_circ_off;
                                /* Bytes available between off and the end of the circular area. */
                   2330:        slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2       simonb   2331:        if (slen < len) {
                                        /* First part: from off up to the end of the area. */
1.34      mlelstv  2332:                pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
                   2333: #ifdef _KERNEL
                                        /* Convert from log-block units to btodb() units. */
                   2334:                pbn = btodb(pbn << wr->wr_log_dev_bshift);
                   2335: #endif
                   2336:                error = wapbl_read(data, slen, wr->wr_devvp, pbn);
1.2       simonb   2337:                if (error)
                   2338:                        return error;
                   2339:                data = (uint8_t *)data + slen;
                   2340:                len -= slen;
1.14      joerg    2341:                off = wr->wr_circ_off;
1.2       simonb   2342:        }
                                /* Remaining (or only) part. */
1.34      mlelstv  2343:        pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
                   2344: #ifdef _KERNEL
                   2345:        pbn = btodb(pbn << wr->wr_log_dev_bshift);
                   2346: #endif
                   2347:        error = wapbl_read(data, len, wr->wr_devvp, pbn);
1.2       simonb   2348:        if (error)
                   2349:                return error;
                   2350:        off += len;
1.14      joerg    2351:        if (off >= wr->wr_circ_off + wr->wr_circ_size)
                   2352:                off = wr->wr_circ_off;
1.2       simonb   2353:        *offp = off;
                   2354:        return 0;
                   2355: }
                   2356:
                   /*
                    * wapbl_circ_advance: move *offp forward by len bytes within the
                    * circular log area without performing any I/O.
                    *
                    * Uses the same offset arithmetic as wapbl_circ_read(): len must be a
                    * multiple of the log device block size (asserted), an offset below
                    * wr_circ_off is treated as wr_circ_off, and the result wraps back to
                    * wr_circ_off at the end of the area.
                    */
                   2357: static void
                   2358: wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
                   2359: {
                   2360:        size_t slen;
                   2361:        off_t off = *offp;
                   2362:
1.14      joerg    2363:        KASSERT(((len >> wr->wr_log_dev_bshift) <<
                   2364:            wr->wr_log_dev_bshift) == len);
1.2       simonb   2365:
1.14      joerg    2366:        if (off < wr->wr_circ_off)
                   2367:                off = wr->wr_circ_off;
                                /* Bytes left before the end of the circular area. */
                   2368:        slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2       simonb   2369:        if (slen < len) {
                   2370:                len -= slen;
1.14      joerg    2371:                off = wr->wr_circ_off;
1.2       simonb   2372:        }
                   2373:        off += len;
1.14      joerg    2374:        if (off >= wr->wr_circ_off + wr->wr_circ_size)
                   2375:                off = wr->wr_circ_off;
1.2       simonb   2376:        *offp = off;
                   2377: }
                   2378:
                   2379: /****************************************************************/
                   2380:
                   /*
                    * wapbl_replay_start: read the log header and build the replay state.
                    *
                    *   wrp      out: receives the new struct wapbl_replay on success
                    *   vp       vnode holding the log
                    *   off      block number of the log; must be >= 0.  In the kernel it
                    *            is translated with VOP_BMAP(), otherwise used as is
                    *   count    only referenced by the check that is compiled out below
                    *   blksize  log block size; must be a non-zero multiple of DEV_BSIZE
                    *
                    * The first two log blocks are read as two header copies.  The first
                    * must carry WAPBL_WC_HEADER; the copy with the higher generation is
                    * used.  Its geometry is copied into the replay state, the block hash
                    * is sized from the space in use, and wapbl_replay_process() scans the
                    * log from tail to head.
                    *
                    * Returns 0 with *wrp set, or EINVAL (bad off/blksize), EFTYPE (bad
                    * header type), or the error from VOP_BMAP(), wapbl_read() or
                    * wapbl_replay_process().  On failure *wrp is not written and
                    * everything allocated here is released.
                    */
                   2381: int
                   2382: wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
                   2383:        daddr_t off, size_t count, size_t blksize)
                   2384: {
                   2385:        struct wapbl_replay *wr;
                   2386:        int error;
                   2387:        struct vnode *devvp;
                   2388:        daddr_t logpbn;
                   2389:        uint8_t *scratch;
                   2390:        struct wapbl_wc_header *wch;
                   2391:        struct wapbl_wc_header *wch2;
                   2392:        /* Use this until we read the actual log header */
1.31      mlelstv  2393:        int log_dev_bshift = ilog2(blksize);
1.2       simonb   2394:        size_t used;
1.34      mlelstv  2395:        daddr_t pbn;
1.2       simonb   2396:
                   2397:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
                   2398:            ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
                   2399:            vp, off, count, blksize));
                   2400:
                   2401:        if (off < 0)
                   2402:                return EINVAL;
                   2403:
                   2404:        if (blksize < DEV_BSIZE)
                   2405:                return EINVAL;
                   2406:        if (blksize % DEV_BSIZE)
                   2407:                return EINVAL;
                   2408:
                   2409: #ifdef _KERNEL
                   2410: #if 0
                   2411:        /* XXX vp->v_size isn't reliably set for VBLK devices,
                   2412:         * especially root.  However, we might still want to verify
                   2413:         * that the full load is readable */
                   2414:        if ((off + count) * blksize > vp->v_size)
                   2415:                return EINVAL;
                   2416: #endif
                   2417:        if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
                   2418:                return error;
                   2419:        }
                   2420: #else /* ! _KERNEL */
                   2421:        devvp = vp;
                   2422:        logpbn = off;
                   2423: #endif /* ! _KERNEL */
                   2424:
1.51      para     2425:        scratch = wapbl_alloc(MAXBSIZE);
1.2       simonb   2426:
1.34      mlelstv  2427:        pbn = logpbn;
                   2428: #ifdef _KERNEL
                   2429:        pbn = btodb(pbn << log_dev_bshift);
                   2430: #endif
                                /* Read two blocks: both header copies in one request. */
                   2431:        error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
1.2       simonb   2432:        if (error)
                   2433:                goto errout;
                   2434:
                   2435:        wch = (struct wapbl_wc_header *)scratch;
                   2436:        wch2 =
                   2437:            (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
                   2438:        /* XXX verify checksums and magic numbers */
                   2439:        if (wch->wc_type != WAPBL_WC_HEADER) {
                   2440:                printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
                   2441:                error = EFTYPE;
                   2442:                goto errout;
                   2443:        }
                   2444:
                                /*
                                 * NOTE(review): only the first copy's wc_type is checked; the
                                 * second copy is trusted if its generation is higher.
                                 */
                   2445:        if (wch2->wc_generation > wch->wc_generation)
                   2446:                wch = wch2;
                   2447:
                   2448:        wr = wapbl_calloc(1, sizeof(*wr));
                   2449:
                   2450:        wr->wr_logvp = vp;
                   2451:        wr->wr_devvp = devvp;
                   2452:        wr->wr_logpbn = logpbn;
                   2453:
                                /* From here on scratch belongs to wr; wapbl_replay_stop() frees it. */
                   2454:        wr->wr_scratch = scratch;
                   2455:
1.14      joerg    2456:        wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
                   2457:        wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
                   2458:        wr->wr_circ_off = wch->wc_circ_off;
                   2459:        wr->wr_circ_size = wch->wc_circ_size;
                   2460:        wr->wr_generation = wch->wc_generation;
1.2       simonb   2461:
                   2462:        used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
                   2463:
                   2464:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
                   2465:            ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
                   2466:            " len=%"PRId64" used=%zu\n",
                   2467:            wch->wc_head, wch->wc_tail, wch->wc_circ_off,
                   2468:            wch->wc_circ_size, used));
                   2469:
                                /* Size the hash by the number of fs-sized blocks in use. */
                   2470:        wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
1.11      joerg    2471:
1.14      joerg    2472:        error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
1.2       simonb   2473:        if (error) {
                                        /* stop() releases scratch and the hash, free() releases wr. */
                   2474:                wapbl_replay_stop(wr);
                   2475:                wapbl_replay_free(wr);
                   2476:                return error;
                   2477:        }
                   2478:
                   2479:        *wrp = wr;
                   2480:        return 0;
                   2481:
                   2482:  errout:
                                /* Only reached before wr exists, so scratch is all there is to free. */
1.18      yamt     2483:        wapbl_free(scratch, MAXBSIZE);
1.2       simonb   2484:        return error;
                   2485: }
                   2486:
                   /*
                    * wapbl_replay_stop: release the resources used for reading the log.
                    *
                    * Does nothing unless wapbl_replay_isopen(wr) is true.  Otherwise it
                    * frees the scratch buffer, clears wr_logvp, and empties and frees
                    * the block hash.  wr itself and its inode list are left in place;
                    * wapbl_replay_free() releases those.
                    */
                   2487: void
                   2488: wapbl_replay_stop(struct wapbl_replay *wr)
                   2489: {
                   2490:
1.4       joerg    2491:        if (!wapbl_replay_isopen(wr))
                   2492:                return;
                   2493:
1.2       simonb   2494:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
                   2495:
1.18      yamt     2496:        wapbl_free(wr->wr_scratch, MAXBSIZE);
                   2497:        wr->wr_scratch = NULL;
1.2       simonb   2498:
1.18      yamt     2499:        wr->wr_logvp = NULL;
1.2       simonb   2500:
                                /* Entries must be removed first: blkhash_free asserts an empty table. */
                   2501:        wapbl_blkhash_clear(wr);
                   2502:        wapbl_blkhash_free(wr);
                   2503: }
                   2504:
                   /*
                    * wapbl_replay_free: free the inode list (if any) and wr itself.
                    *
                    * wr must no longer be open (checked with KDASSERT); call
                    * wapbl_replay_stop() first.
                    */
                   2505: void
                   2506: wapbl_replay_free(struct wapbl_replay *wr)
                   2507: {
                   2508:
                   2509:        KDASSERT(!wapbl_replay_isopen(wr));
                   2510:
                   2511:        if (wr->wr_inodes)
1.18      yamt     2512:                wapbl_free(wr->wr_inodes,
                   2513:                    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
                   2514:        wapbl_free(wr, sizeof(*wr));
1.2       simonb   2515: }
                   2516:
1.4       joerg    2517: #ifdef _KERNEL
/*
 * wapbl_replay_isopen1: function form of wapbl_replay_isopen(); returns
 * whatever that returns for wr.  Presumably provided so kernel callers
 * have a real function to call - TODO confirm against the header.
 */
1.2       simonb   2518: int
                   2519: wapbl_replay_isopen1(struct wapbl_replay *wr)
                   2520: {
                   2521:
                   2522:        return wapbl_replay_isopen(wr);
                   2523: }
1.4       joerg    2524: #endif
1.2       simonb   2525:
/*
 * wapbl_replay_process_blocks: enter the blocks described by the
 * WAPBL_WC_BLOCKS record in wr->wr_scratch into the block hash.
 *
 * Each record entry is split into pieces of 1 << wr_fs_dev_bshift bytes.
 * Every piece is inserted under its own disk address together with the
 * current log offset *offp, and *offp is then advanced by one piece.
 */
1.10      joerg    2526: static void
                   2527: wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
                   2528: {
                   2529:        struct wapbl_wc_blocklist *wc =
                   2530:            (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.14      joerg    2531:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10      joerg    2532:        int i, j, n;
                   2533:
                   2534:        for (i = 0; i < wc->wc_blkcount; i++) {
                   2535:                /*
                   2536:                 * Enter each physical block into the hashtable independently.
                   2537:                 */
                                        /* n = number of fsblklen-sized pieces in this entry. */
1.14      joerg    2538:                n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10      joerg    2539:                for (j = 0; j < n; j++) {
1.34      mlelstv  2540:                        wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
1.10      joerg    2541:                            *offp);
                   2542:                        wapbl_circ_advance(wr, fsblklen, offp);
                   2543:                }
                   2544:        }
                   2545: }
                   2546:
                   /*
                    * wapbl_replay_process_revocations: apply the WAPBL_WC_REVOCATIONS
                    * record in wr->wr_scratch by removing every block it covers from the
                    * block hash.  Entries are split into 1 << wr_fs_dev_bshift byte
                    * pieces, the same way wapbl_replay_process_blocks() splits them.
                    */
                   2547: static void
                   2548: wapbl_replay_process_revocations(struct wapbl_replay *wr)
                   2549: {
                   2550:        struct wapbl_wc_blocklist *wc =
                   2551:            (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.34      mlelstv  2552:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10      joerg    2553:        int i, j, n;
                   2554:
                   2555:        for (i = 0; i < wc->wc_blkcount; i++) {
                   2556:                /*
                   2557:                 * Remove any blocks found from the hashtable.
                   2558:                 */
1.14      joerg    2559:                n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10      joerg    2560:                for (j = 0; j < n; j++)
1.34      mlelstv  2561:                        wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
1.10      joerg    2562:        }
                   2563: }
                   2564:
                   /*
                    * wapbl_replay_process_inodes: fold the WAPBL_WC_INODES record in
                    * wr->wr_scratch into the inode list kept in wr.
                    *
                    *   oldoff  log offset at which this record starts
                    *   newoff  log offset just after this record
                    *
                    * A record with wc_clear set discards the list collected so far and
                    * records oldoff as wr_inodestail.  wr_inodeshead is always set to
                    * newoff.  The record's wc_inocnt entries are then appended by
                    * allocating a larger array and copying the old contents over.
                    */
                   2565: static void
                   2566: wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
                   2567: {
                   2568:        struct wapbl_wc_inodelist *wc =
                   2569:            (struct wapbl_wc_inodelist *)wr->wr_scratch;
1.18      yamt     2570:        void *new_inodes;
                                /* Size of the current allocation, taken before wr_inodescnt can be reset. */
                   2571:        const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
                   2572:
                   2573:        KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
                   2574:
1.10      joerg    2575:        /*
                   2576:         * Keep track of where we found this so location won't be
                   2577:         * overwritten.
                   2578:         */
                   2579:        if (wc->wc_clear) {
                   2580:                wr->wr_inodestail = oldoff;
                   2581:                wr->wr_inodescnt = 0;
1.12      joerg    2582:                if (wr->wr_inodes != NULL) {
1.18      yamt     2583:                        wapbl_free(wr->wr_inodes, oldsize);
1.12      joerg    2584:                        wr->wr_inodes = NULL;
                   2585:                }
1.10      joerg    2586:        }
                   2587:        wr->wr_inodeshead = newoff;
                   2588:        if (wc->wc_inocnt == 0)
                   2589:                return;
                   2590:
                                /* Grow by allocate-copy-free; wr_inodes is NULL after a clear. */
1.51      para     2591:        new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
1.18      yamt     2592:            sizeof(wr->wr_inodes[0]));
                   2593:        if (wr->wr_inodes != NULL) {
                   2594:                memcpy(new_inodes, wr->wr_inodes, oldsize);
                   2595:                wapbl_free(wr->wr_inodes, oldsize);
                   2596:        }
                   2597:        wr->wr_inodes = new_inodes;
1.10      joerg    2598:        memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
1.18      yamt     2599:            wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
1.10      joerg    2600:        wr->wr_inodescnt += wc->wc_inocnt;
                   2601: }
                   2602:
/*
 * wapbl_replay_process: scan the log from tail to head and rebuild the
 * replay state (block hash and inode list) from the records found.
 *
 * Each iteration reads one log block into wr->wr_scratch and dispatches
 * on its wc_type.  Afterwards the offset reached by the handler (off) is
 * compared with the record start advanced by the record's own wc_len;
 * a difference is reported as corruption.
 *
 * Returns 0 on success.  On a read error, an unknown record type or a
 * length mismatch (the latter two as EFTYPE) the block hash is emptied
 * and the error is returned.
 */
1.2       simonb   2603: static int
1.14      joerg    2604: wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
1.2       simonb   2605: {
                   2606:        off_t off;
                   2607:        int error;
                   2608:
1.14      joerg    2609:        int logblklen = 1 << wr->wr_log_dev_bshift;
1.2       simonb   2610:
                   2611:        wapbl_blkhash_clear(wr);
                   2612:
1.14      joerg    2613:        off = tail;
                   2614:        while (off != head) {
1.2       simonb   2615:                struct wapbl_wc_null *wcn;
                                        /* Start of this record, kept for the length check below. */
                   2616:                off_t saveoff = off;
                   2617:                error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
                   2618:                if (error)
                   2619:                        goto errout;
                   2620:                wcn = (struct wapbl_wc_null *)wr->wr_scratch;
                   2621:                switch (wcn->wc_type) {
                   2622:                case WAPBL_WC_BLOCKS:
                                                /* Advances off past the block data that follows. */
1.10      joerg    2623:                        wapbl_replay_process_blocks(wr, &off);
1.2       simonb   2624:                        break;
                   2625:
                   2626:                case WAPBL_WC_REVOCATIONS:
1.10      joerg    2627:                        wapbl_replay_process_revocations(wr);
1.2       simonb   2628:                        break;
                   2629:
                   2630:                case WAPBL_WC_INODES:
1.10      joerg    2631:                        wapbl_replay_process_inodes(wr, saveoff, off);
1.2       simonb   2632:                        break;
1.10      joerg    2633:
1.2       simonb   2634:                default:
                   2635:                        printf("Unrecognized wapbl type: 0x%08x\n",
                   2636:                               wcn->wc_type);
                   2637:                        error = EFTYPE;
                   2638:                        goto errout;
                   2639:                }
                                        /*
                                         * NOTE(review): wc_len comes from the log and is passed to
                                         * wapbl_circ_advance(), which asserts block alignment.
                                         */
                   2640:                wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
                   2641:                if (off != saveoff) {
                   2642:                        printf("wapbl_replay: corrupted records\n");
                   2643:                        error = EFTYPE;
                   2644:                        goto errout;
                   2645:                }
                   2646:        }
                   2647:        return 0;
                   2648:
                   2649:  errout:
                   2650:        wapbl_blkhash_clear(wr);
                   2651:        return error;
                   2652: }
                   2653:
1.13      joerg    2654: #if 0
/*
 * wapbl_replay_verify: compare the logged copy of each block with the copy
 * currently on fsdevvp and count the differences.
 *
 * This function sits inside an #if 0 region and is not compiled.
 * NOTE(review): it refers to `wch', which is not declared in this
 * function, so it would need fixing before it could be enabled.
 *
 * For every piece of a WAPBL_WC_BLOCKS record whose hash entry points at
 * the current log offset, the logged data and the on-disk data are read
 * into two scratch buffers and compared.  Returns the first read error,
 * EFTYPE if any block differed, or 0.
 */
1.2       simonb   2655: int
                   2656: wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
                   2657: {
                   2658:        off_t off;
                   2659:        int mismatchcnt = 0;
1.14      joerg    2660:        int logblklen = 1 << wr->wr_log_dev_bshift;
                   2661:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.51      para     2662:        void *scratch1 = wapbl_alloc(MAXBSIZE);
                   2663:        void *scratch2 = wapbl_alloc(MAXBSIZE);
1.2       simonb   2664:        int error = 0;
                   2665:
                   2666:        KDASSERT(wapbl_replay_isopen(wr));
                   2667:
                   2668:        off = wch->wc_tail;
                   2669:        while (off != wch->wc_head) {
                   2670:                struct wapbl_wc_null *wcn;
                   2671: #ifdef DEBUG
                   2672:                off_t saveoff = off;
                   2673: #endif
                   2674:                error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
                   2675:                if (error)
                   2676:                        goto out;
                   2677:                wcn = (struct wapbl_wc_null *)wr->wr_scratch;
                   2678:                switch (wcn->wc_type) {
                   2679:                case WAPBL_WC_BLOCKS:
                   2680:                        {
                   2681:                                struct wapbl_wc_blocklist *wc =
                   2682:                                    (struct wapbl_wc_blocklist *)wr->wr_scratch;
                   2683:                                int i;
                   2684:                                for (i = 0; i < wc->wc_blkcount; i++) {
                   2685:                                        int foundcnt = 0;
                   2686:                                        int dirtycnt = 0;
                   2687:                                        int j, n;
                   2688:                                        /*
                   2689:                                         * Check each physical block into the
                   2690:                                         * hashtable independently
                   2691:                                         */
                   2692:                                        n = wc->wc_blocks[i].wc_dlen >>
                   2693:                                            wch->wc_fs_dev_bshift;
                   2694:                                        for (j = 0; j < n; j++) {
                   2695:                                                struct wapbl_blk *wb =
                   2696:                                                   wapbl_blkhash_get(wr,
1.34      mlelstv  2697:                                                   wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
                                                                        /*
                                                                         * Compare only when the hash entry
                                                                         * points at this log offset; otherwise
                                                                         * just skip over the data.
                                                                         */
1.2       simonb   2698:                                                if (wb && (wb->wb_off == off)) {
                   2699:                                                        foundcnt++;
                   2700:                                                        error =
                   2701:                                                            wapbl_circ_read(wr,
                   2702:                                                            scratch1, fsblklen,
                   2703:                                                            &off);
                   2704:                                                        if (error)
                   2705:                                                                goto out;
                   2706:                                                        error =
                   2707:                                                            wapbl_read(scratch2,
                   2708:                                                            fsblklen, fsdevvp,
                   2709:                                                            wb->wb_blk);
                   2710:                                                        if (error)
                   2711:                                                                goto out;
                   2712:                                                        if (memcmp(scratch1,
                   2713:                                                                   scratch2,
                   2714:                                                                   fsblklen)) {
                   2715:                                                                printf(
                   2716:                "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
                   2717:                wb->wb_blk, (intmax_t)off);
                   2718:                                                                dirtycnt++;
                   2719:                                                                mismatchcnt++;
                   2720:                                                        }
                   2721:                                                } else {
                   2722:                                                        wapbl_circ_advance(wr,
                   2723:                                                            fsblklen, &off);
                   2724:                                                }
                   2725:                                        }
                   2726: #if 0
                   2727:                                        /*
                   2728:                                         * If all of the blocks in an entry
                   2729:                                         * are clean, then remove all of its
                   2730:                                         * blocks from the hashtable since they
                   2731:                                         * never will need replay.
                   2732:                                         */
                   2733:                                        if ((foundcnt != 0) &&
                   2734:                                            (dirtycnt == 0)) {
                   2735:                                                off = saveoff;
                   2736:                                                wapbl_circ_advance(wr,
                   2737:                                                    logblklen, &off);
                   2738:                                                for (j = 0; j < n; j++) {
                   2739:                                                        struct wapbl_blk *wb =
                   2740:                                                           wapbl_blkhash_get(wr,
1.34      mlelstv  2741:                                                           wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
1.2       simonb   2742:                                                        if (wb &&
                   2743:                                                          (wb->wb_off == off)) {
                   2744:                                                                wapbl_blkhash_rem(wr, wb->wb_blk);
                   2745:                                                        }
                   2746:                                                        wapbl_circ_advance(wr,
                   2747:                                                            fsblklen, &off);
                   2748:                                                }
                   2749:                                        }
                   2750: #endif
                   2751:                                }
                   2752:                        }
                   2753:                        break;
                   2754:                case WAPBL_WC_REVOCATIONS:
                   2755:                case WAPBL_WC_INODES:
                   2756:                        break;
                   2757:                default:
                   2758:                        KASSERT(0);
                   2759:                }
                   2760: #ifdef DEBUG
                   2761:                wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
                   2762:                KASSERT(off == saveoff);
                   2763: #endif
                   2764:        }
                   2765:  out:
                                /* Shared exit: both scratch buffers are freed on every path. */
1.18      yamt     2766:        wapbl_free(scratch1, MAXBSIZE);
                   2767:        wapbl_free(scratch2, MAXBSIZE);
1.2       simonb   2768:        if (!error && mismatchcnt)
                   2769:                error = EFTYPE;
                   2770:        return error;
                   2771: }
                   2772: #endif
                   2773:
                   2774: int
                   2775: wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
                   2776: {
1.9       joerg    2777:        struct wapbl_blk *wb;
                   2778:        size_t i;
1.2       simonb   2779:        off_t off;
1.9       joerg    2780:        void *scratch;
1.2       simonb   2781:        int error = 0;
1.14      joerg    2782:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2       simonb   2783:
                   2784:        KDASSERT(wapbl_replay_isopen(wr));
                   2785:
1.51      para     2786:        scratch = wapbl_alloc(MAXBSIZE);
1.2       simonb   2787:
1.37      drochner 2788:        for (i = 0; i <= wr->wr_blkhashmask; ++i) {
1.9       joerg    2789:                LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
                   2790:                        off = wb->wb_off;
                   2791:                        error = wapbl_circ_read(wr, scratch, fsblklen, &off);
                   2792:                        if (error)
                   2793:                                break;
                   2794:                        error = wapbl_write(scratch, fsblklen, fsdevvp,
                   2795:                            wb->wb_blk);
                   2796:                        if (error)
                   2797:                                break;
1.2       simonb   2798:                }
                   2799:        }
1.9       joerg    2800:
1.18      yamt     2801:        wapbl_free(scratch, MAXBSIZE);
1.2       simonb   2802:        return error;
                   2803: }
                   2804:
                   2805: int
1.6       joerg    2806: wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
                   2807: {
1.14      joerg    2808:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.6       joerg    2809:
                   2810:        KDASSERT(wapbl_replay_isopen(wr));
                   2811:        KASSERT((len % fsblklen) == 0);
                   2812:
                   2813:        while (len != 0) {
                   2814:                struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
                   2815:                if (wb)
                   2816:                        return 1;
                   2817:                len -= fsblklen;
                   2818:        }
                   2819:        return 0;
                   2820: }
                   2821:
                   2822: int
1.2       simonb   2823: wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
                   2824: {
1.14      joerg    2825:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2       simonb   2826:
                   2827:        KDASSERT(wapbl_replay_isopen(wr));
                   2828:
                   2829:        KASSERT((len % fsblklen) == 0);
                   2830:
                   2831:        while (len != 0) {
                   2832:                struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
                   2833:                if (wb) {
                   2834:                        off_t off = wb->wb_off;
                   2835:                        int error;
                   2836:                        error = wapbl_circ_read(wr, data, fsblklen, &off);
                   2837:                        if (error)
                   2838:                                return error;
                   2839:                }
                   2840:                data = (uint8_t *)data + fsblklen;
                   2841:                len -= fsblklen;
                   2842:                blk++;
                   2843:        }
                   2844:        return 0;
                   2845: }
1.35      pooka    2846:
1.36      pooka    2847: #ifdef _KERNEL
1.35      pooka    2848: /*
                   2849:  * This is not really a module now, but maybe on its way to
                   2850:  * being one some day.
                   2851:  */
                   2852: MODULE(MODULE_CLASS_VFS, wapbl, NULL);
                   2853:
                   2854: static int
                   2855: wapbl_modcmd(modcmd_t cmd, void *arg)
                   2856: {
                   2857:
                   2858:        switch (cmd) {
                   2859:        case MODULE_CMD_INIT:
1.39      christos 2860:                wapbl_init();
1.35      pooka    2861:                return 0;
                   2862:        case MODULE_CMD_FINI:
1.39      christos 2863: #ifdef notyet
                   2864:                return wapbl_fini(true);
                   2865: #endif
1.35      pooka    2866:                return EOPNOTSUPP;
                   2867:        default:
                   2868:                return ENOTTY;
                   2869:        }
                   2870: }
1.36      pooka    2871: #endif /* _KERNEL */

CVSweb <webmaster@jp.NetBSD.org>