[BACK]Return to vfs_wapbl.c CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / sys / kern

Annotation of src/sys/kern/vfs_wapbl.c, Revision 1.78.2.1

1.78.2.1! pgoyette    1: /*     $NetBSD: vfs_wapbl.c,v 1.85 2016/10/28 20:38:12 jdolecek Exp $  */
1.2       simonb      2:
                      3: /*-
1.23      ad          4:  * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
1.2       simonb      5:  * All rights reserved.
                      6:  *
                      7:  * This code is derived from software contributed to The NetBSD Foundation
                      8:  * by Wasabi Systems, Inc.
                      9:  *
                     10:  * Redistribution and use in source and binary forms, with or without
                     11:  * modification, are permitted provided that the following conditions
                     12:  * are met:
                     13:  * 1. Redistributions of source code must retain the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer.
                     15:  * 2. Redistributions in binary form must reproduce the above copyright
                     16:  *    notice, this list of conditions and the following disclaimer in the
                     17:  *    documentation and/or other materials provided with the distribution.
                     18:  *
                     19:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     20:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     21:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     22:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     23:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     24:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     25:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     26:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     27:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     28:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     29:  * POSSIBILITY OF SUCH DAMAGE.
                     30:  */
                     31:
                     32: /*
                     33:  * This implements file system independent write ahead filesystem logging.
                     34:  */
1.4       joerg      35:
                     36: #define WAPBL_INTERNAL
                     37:
1.2       simonb     38: #include <sys/cdefs.h>
1.78.2.1! pgoyette   39: __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.85 2016/10/28 20:38:12 jdolecek Exp $");
1.2       simonb     40:
                     41: #include <sys/param.h>
1.31      mlelstv    42: #include <sys/bitops.h>
1.68      riastrad   43: #include <sys/time.h>
                     44: #include <sys/wapbl.h>
                     45: #include <sys/wapbl_replay.h>
1.2       simonb     46:
                     47: #ifdef _KERNEL
1.68      riastrad   48:
                     49: #include <sys/atomic.h>
                     50: #include <sys/conf.h>
                     51: #include <sys/file.h>
                     52: #include <sys/kauth.h>
                     53: #include <sys/kernel.h>
                     54: #include <sys/module.h>
                     55: #include <sys/mount.h>
                     56: #include <sys/mutex.h>
1.2       simonb     57: #include <sys/namei.h>
                     58: #include <sys/proc.h>
1.68      riastrad   59: #include <sys/resourcevar.h>
1.39      christos   60: #include <sys/sysctl.h>
1.2       simonb     61: #include <sys/uio.h>
                     62: #include <sys/vnode.h>
                     63:
                     64: #include <miscfs/specfs/specdev.h>
                     65:
1.51      para       66: #define        wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
                     67: #define        wapbl_free(a, s) kmem_free((a), (s))
                     68: #define        wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
1.2       simonb     69:
1.39      christos   70: static struct sysctllog *wapbl_sysctl;
                     71: static int wapbl_flush_disk_cache = 1;
                     72: static int wapbl_verbose_commit = 0;
                     73:
1.57      joerg      74: static inline size_t wapbl_space_free(size_t, off_t, off_t);
                     75:
1.2       simonb     76: #else /* !_KERNEL */
1.68      riastrad   77:
1.2       simonb     78: #include <assert.h>
                     79: #include <errno.h>
1.68      riastrad   80: #include <stdbool.h>
1.2       simonb     81: #include <stdio.h>
                     82: #include <stdlib.h>
                     83: #include <string.h>
                     84:
                     85: #define        KDASSERT(x) assert(x)
                     86: #define        KASSERT(x) assert(x)
1.51      para       87: #define        wapbl_alloc(s) malloc(s)
1.18      yamt       88: #define        wapbl_free(a, s) free(a)
1.2       simonb     89: #define        wapbl_calloc(n, s) calloc((n), (s))
                     90:
                     91: #endif /* !_KERNEL */
                     92:
                     93: /*
                     94:  * INTERNAL DATA STRUCTURES
                     95:  */
                     96:
                     97: /*
                     98:  * This structure holds per-mount log information.
                     99:  *
                    100:  * Legend:     a = atomic access only
                    101:  *             r = read-only after init
                    102:  *             l = rwlock held
                    103:  *             m = mutex held
1.38      hannken   104:  *             lm = rwlock held writing or mutex held
1.2       simonb    105:  *             u = unlocked access ok
                    106:  *             b = bufcache_lock held
                    107:  */
1.60      matt      108: LIST_HEAD(wapbl_ino_head, wapbl_ino);
1.2       simonb    109: struct wapbl {
                    110:        struct vnode *wl_logvp; /* r:   log here */
                    111:        struct vnode *wl_devvp; /* r:   log on this device */
                    112:        struct mount *wl_mount; /* r:   mountpoint wl is associated with */
                    113:        daddr_t wl_logpbn;      /* r:   Physical block number of start of log */
                    114:        int wl_log_dev_bshift;  /* r:   logarithm of device block size of log
                    115:                                        device */
                    116:        int wl_fs_dev_bshift;   /* r:   logarithm of device block size of
                    117:                                        filesystem device */
                    118:
1.3       yamt      119:        unsigned wl_lock_count; /* m:   Count of transactions in progress */
1.2       simonb    120:
                    121:        size_t wl_circ_size;    /* r:   Number of bytes in buffer of log */
                    122:        size_t wl_circ_off;     /* r:   Number of bytes reserved at start */
                    123:
                    124:        size_t wl_bufcount_max; /* r:   Number of buffers reserved for log */
                    125:        size_t wl_bufbytes_max; /* r:   Number of buf bytes reserved for log */
                    126:
                    127:        off_t wl_head;          /* l:   Byte offset of log head */
                    128:        off_t wl_tail;          /* l:   Byte offset of log tail */
                    129:        /*
1.71      riastrad  130:         * WAPBL log layout, stored on wl_devvp at wl_logpbn:
                    131:         *
                    132:         *  ___________________ wl_circ_size __________________
                    133:         * /                                                   \
                    134:         * +---------+---------+-------+--------------+--------+
                    135:         * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
                    136:         * +---------+---------+-------+--------------+--------+
                    137:         *       wl_circ_off --^       ^-- wl_head    ^-- wl_tail
                    138:         *
                    139:         * commit0 and commit1 are commit headers.  A commit header has
                    140:         * a generation number, indicating which of the two headers is
                    141:         * more recent, and an assignment of head and tail pointers.
                    142:         * The rest is a circular queue of log records, starting at
                    143:         * the byte offset wl_circ_off.
                    144:         *
                    145:         * E marks empty space for records.
                    146:         * W marks records for block writes issued but waiting.
                    147:         * C marks completed records.
                    148:         *
                    149:         * wapbl_flush writes new records to empty `E' spaces after
                    150:         * wl_head from the current transaction in memory.
                    151:         *
                    152:         * wapbl_truncate advances wl_tail past any completed `C'
                    153:         * records, freeing them up for use.
                    154:         *
                    155:         * head == tail == 0 means log is empty.
                    156:         * head == tail != 0 means log is full.
                    157:         *
                    158:         * See assertions in wapbl_advance() for other boundary
                    159:         * conditions.
                    160:         *
                    161:         * Only wapbl_flush moves the head, except when wapbl_truncate
                    162:         * sets it to 0 to indicate that the log is empty.
                    163:         *
                    164:         * Only wapbl_truncate moves the tail, except when wapbl_flush
                    165:         * sets it to wl_circ_off to indicate that the log is full.
1.2       simonb    166:         */
                    167:
                    168:        struct wapbl_wc_header *wl_wc_header;   /* l:   commit header buffer */
                    169:        void *wl_wc_scratch;    /* l:   scratch space (XXX: why?!?) */
                    170:
                    171:        kmutex_t wl_mtx;        /* u:   short-term lock */
                    172:        krwlock_t wl_rwlock;    /* u:   File system transaction lock */
                    173:
                    174:        /*
                    175:         * Must be held while accessing wl_count or wl_bufs or head or
                    176:         * tail.  XXX(review): which lock? No wl_count member exists here.
                    177:         */
                    178:
                    179:        /*
                    180:         * Callback called from within the flush routine to flush any extra
                    181:         * bits.  Note that flush may be skipped without calling this if
                    182:         * there are no outstanding buffers in the transaction.
                    183:         */
1.5       joerg     184: #if _KERNEL
1.2       simonb    185:        wapbl_flush_fn_t wl_flush;      /* r    */
                    186:        wapbl_flush_fn_t wl_flush_abort;/* r    */
1.5       joerg     187: #endif
1.2       simonb    188:
                    189:        size_t wl_bufbytes;     /* m:   Byte count of pages in wl_bufs */
                    190:        size_t wl_bufcount;     /* m:   Count of buffers in wl_bufs */
                    191:        size_t wl_bcount;       /* m:   Total bcount of wl_bufs */
                    192:
                    193:        LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
                    194:
                    195:        kcondvar_t wl_reclaimable_cv;   /* m (obviously) */
                    196:        size_t wl_reclaimable_bytes; /* m:      Amount of space available for
                    197:                                                reclamation by truncate */
                    198:        int wl_error_count;     /* m:   # of wl_entries with errors */
                    199:        size_t wl_reserved_bytes; /* never truncate log smaller than this */
                    200:
                    201: #ifdef WAPBL_DEBUG_BUFBYTES
                    202:        size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
                    203: #endif
                    204:
1.78.2.1! pgoyette  205: #if _KERNEL
        !           206:        int wl_brperjblock;     /* r:   Block records per journal block */
        !           207: #endif
        !           208:
        !           209:        SIMPLEQ_HEAD(, wapbl_dealloc) wl_dealloclist;   /* lm:  list head */
        !           210:        int wl_dealloccnt;                              /* lm:  total count */
        !           211:        int wl_dealloclim;                              /* r:   max count */
1.2       simonb    212:
                    213:        /* hashtable of inode numbers for allocated but unlinked inodes */
                    214:        /* synch ??? */
1.60      matt      215:        struct wapbl_ino_head *wl_inohash;
1.2       simonb    216:        u_long wl_inohashmask;
                    217:        int wl_inohashcnt;
                    218:
                    219:        SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
                    220:                                                   accounting */
1.54      hannken   221:
                    222:        u_char *wl_buffer;      /* l:   buffer for wapbl_buffered_write() */
                    223:        daddr_t wl_buffer_dblk; /* l:   buffer disk block address */
                    224:        size_t wl_buffer_used;  /* l:   buffer current use */
1.2       simonb    225: };
                    226:
                    227: #ifdef WAPBL_DEBUG_PRINT
                    228: int wapbl_debug_print = WAPBL_DEBUG_PRINT;
                    229: #endif
                    230:
                    231: /****************************************************************/
                    232: #ifdef _KERNEL
                    233:
                    234: #ifdef WAPBL_DEBUG
                    235: struct wapbl *wapbl_debug_wl;
                    236: #endif
                    237:
                    238: static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
                    239: static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
                    240: static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
                    241: static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
                    242: #endif /* _KERNEL */
                    243:
1.14      joerg     244: static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
1.2       simonb    245:
1.30      uebayasi  246: static inline size_t wapbl_space_used(size_t avail, off_t head,
1.2       simonb    247:        off_t tail);
                    248:
                    249: #ifdef _KERNEL
                    250:
1.51      para      251: static struct pool wapbl_entry_pool;
1.78.2.1! pgoyette  252: static struct pool wapbl_dealloc_pool;
1.51      para      253:
1.2       simonb    254: #define        WAPBL_INODETRK_SIZE 83
                    255: static int wapbl_ino_pool_refcount;
                    256: static struct pool wapbl_ino_pool;
                    257: struct wapbl_ino {
                    258:        LIST_ENTRY(wapbl_ino) wi_hash;  /* linkage within a wapbl_ino_head list */
                    259:        ino_t wi_ino;                   /* inode number */
                    260:        mode_t wi_mode;                 /* mode kept with wi_ino; presumably the inode's file mode -- confirm */
                    261: };
                    262:
                    263: static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
                    264: static void wapbl_inodetrk_free(struct wapbl *wl);
                    265: static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
                    266:
                    267: static size_t wapbl_transaction_len(struct wapbl *wl);
1.30      uebayasi  268: static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
1.2       simonb    269:
1.13      joerg     270: #if 0
1.4       joerg     271: int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
                    272: #endif
                    273:
                    274: static int wapbl_replay_isopen1(struct wapbl_replay *);
                    275:
1.2       simonb    276: struct wapbl_ops wapbl_ops = {  /* binds each wo_* hook to a wapbl_* routine */
                    277:        .wo_wapbl_discard       = wapbl_discard,
                    278:        .wo_wapbl_replay_isopen = wapbl_replay_isopen1, /* file-local (static) function */
1.6       joerg     279:        .wo_wapbl_replay_can_read = wapbl_replay_can_read,
1.2       simonb    280:        .wo_wapbl_replay_read   = wapbl_replay_read,
                    281:        .wo_wapbl_add_buf       = wapbl_add_buf,
                    282:        .wo_wapbl_remove_buf    = wapbl_remove_buf,
                    283:        .wo_wapbl_resize_buf    = wapbl_resize_buf,
                    284:        .wo_wapbl_begin         = wapbl_begin,
                    285:        .wo_wapbl_end           = wapbl_end,
                    286:        .wo_wapbl_junlock_assert= wapbl_junlock_assert,
                    287:
                    288:        /* XXX: the following is only used to say "this is a wapbl buf" */
                    289:        .wo_wapbl_biodone       = wapbl_biodone,
                    290: };
                    291:
1.21      yamt      292: static int
1.39      christos  293: wapbl_sysctl_init(void)
                    294: {
                    295:        int rv;
                    296:        const struct sysctlnode *rnode, *cnode;
                    297:        /* Build the "wapbl" sysctl subtree: one node plus two int leaves. */
                    298:        wapbl_sysctl = NULL;
                    299:        /* Node "wapbl" under CTL_VFS; &rnode is passed to receive the new node. */
                    300:        rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
                    301:                       CTLFLAG_PERMANENT,
                    302:                       CTLTYPE_NODE, "wapbl",
                    303:                       SYSCTL_DESCR("WAPBL journaling options"),
                    304:                       NULL, 0, NULL, 0,
1.59      pooka     305:                       CTL_VFS, CTL_CREATE, CTL_EOL);
1.39      christos  306:        if (rv)
                    307:                return rv;      /* first nonzero result is returned as-is */
                    308:        /* Read-write int "flush_disk_cache", backed by wapbl_flush_disk_cache. */
                    309:        rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
                    310:                       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
                    311:                       CTLTYPE_INT, "flush_disk_cache",
                    312:                       SYSCTL_DESCR("flush disk cache"),
                    313:                       NULL, 0, &wapbl_flush_disk_cache, 0,
                    314:                       CTL_CREATE, CTL_EOL);
                    315:        if (rv)
                    316:                return rv;      /* nothing created so far is torn down here */
                    317:        /* Read-write int "verbose_commit", backed by wapbl_verbose_commit. */
                    318:        rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
                    319:                       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
                    320:                       CTLTYPE_INT, "verbose_commit",
                    321:                       SYSCTL_DESCR("show time and size of wapbl log commits"),
                    322:                       NULL, 0, &wapbl_verbose_commit, 0,
                    323:                       CTL_CREATE, CTL_EOL);
                    324:        return rv;      /* result of the last sysctl_createv() call */
                    325: }
                    326:
                    327: static void
                    328: wapbl_init(void)
                    329: {
1.51      para      330:        /* Set up the pools for struct wapbl_entry and struct wapbl_dealloc. */
                    331:        pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
                    332:            "wapblentrypl", &pool_allocator_kmem, IPL_VM);
1.78.2.1! pgoyette  333:        pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0,
        !           334:            "wapbldealloc", &pool_allocator_nointr, IPL_NONE);
1.51      para      335:        /* NOTE(review): the int result below is discarded, so a sysctl setup failure is silent. */
1.39      christos  336:        wapbl_sysctl_init();
                    337: }
                    338:
                    339: static int
1.74      riastrad  340: wapbl_fini(void)
1.39      christos  341: {
1.51      para      342:        /* Tear down the sysctl log only if one is recorded. */
1.63      pgoyette  343:        if (wapbl_sysctl != NULL)
                    344:                 sysctl_teardown(&wapbl_sysctl);
1.51      para      345:        /* Destroy the pools in the reverse of the order wapbl_init() created them. */
1.78.2.1! pgoyette  346:        pool_destroy(&wapbl_dealloc_pool);
1.51      para      347:        pool_destroy(&wapbl_entry_pool);
                    348:        /* Always returns 0. */
1.39      christos  349:        return 0;
                    350: }
                    351:
                    352: static int
1.15      joerg     353: wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
                    354: {
                    355:        int error, i;
                    356:        /* Carry the inode list held by the replay wr over into the new log wl. */
                    357:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
                    358:            ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
                    359:
                    360:        /*
                    361:         * It's only valid to reuse the replay log if it's
                    362:         * the same as the new log we just opened.
                    363:         */
                    364:        KDASSERT(!wapbl_replay_isopen(wr));
1.47      christos  365:        KASSERT(wl->wl_devvp->v_type == VBLK);
                    366:        KASSERT(wr->wr_devvp->v_type == VBLK);
1.15      joerg     367:        KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
                    368:        KASSERT(wl->wl_logpbn == wr->wr_logpbn);
                    369:        KASSERT(wl->wl_circ_size == wr->wr_circ_size);
                    370:        KASSERT(wl->wl_circ_off == wr->wr_circ_off);
                    371:        KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
                    372:        KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
                    373:        /* The new commit header continues from the replayed generation number. */
                    374:        wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
                    375:        /* Register every inode number/mode pair recorded in the replay. */
                    376:        for (i = 0; i < wr->wr_inodescnt; i++)
                    377:                wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
                    378:                    wr->wr_inodes[i].wr_imode);
                    379:
                    380:        /* Make sure new transaction won't overwrite old inodes list */
                    381:        KDASSERT(wapbl_transaction_len(wl) <=
                    382:            wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
                    383:            wr->wr_inodestail));
                    384:        /* Head and tail both start at wr_inodeshead; reserved and reclaimable bytes become the transaction length. */
                    385:        wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
                    386:        wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
                    387:            wapbl_transaction_len(wl);
                    388:        /* Write the inode records; wl_head is passed by address and any error is returned unchanged. */
                    389:        error = wapbl_write_inodes(wl, &wl->wl_head);
                    390:        if (error)
                    391:                return error;
                    392:
                    393:        KASSERT(wl->wl_head != wl->wl_tail);
                    394:        KASSERT(wl->wl_head != 0);
                    395:
                    396:        return 0;
                    397: }
                    398:
1.2       simonb    399: int
                    400: wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
                    401:        daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
                    402:        wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
                    403: {
                    404:        struct wapbl *wl;
                    405:        struct vnode *devvp;
                    406:        daddr_t logpbn;
                    407:        int error;
1.31      mlelstv   408:        int log_dev_bshift = ilog2(blksize);
1.32      mlelstv   409:        int fs_dev_bshift = log_dev_bshift;
1.2       simonb    410:        int run;
                    411:
                    412:        WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
                    413:            " count=%zu blksize=%zu\n", vp, off, count, blksize));
                    414:
                    415:        if (log_dev_bshift > fs_dev_bshift) {
                    416:                WAPBL_PRINTF(WAPBL_PRINT_OPEN,
                    417:                        ("wapbl: log device's block size cannot be larger "
                    418:                         "than filesystem's\n"));
                    419:                /*
                    420:                 * Not currently implemented, although it could be if
                    421:                 * needed someday.
                    422:                 */
                    423:                return ENOSYS;
                    424:        }
                    425:
                    426:        if (off < 0)
                    427:                return EINVAL;
                    428:
                    429:        if (blksize < DEV_BSIZE)
                    430:                return EINVAL;
                    431:        if (blksize % DEV_BSIZE)
                    432:                return EINVAL;
                    433:
                    434:        /* XXXTODO: verify that the full load is writable */
                    435:
                    436:        /*
                    437:         * XXX check for minimum log size
                    438:         * minimum is governed by minimum amount of space
                    439:         * to complete a transaction. (probably truncate)
                    440:         */
                    441:        /* XXX for now pick something minimal */
                    442:        if ((count * blksize) < MAXPHYS) {
                    443:                return ENOSPC;
                    444:        }
                    445:
                    446:        if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
                    447:                return error;
                    448:        }
                    449:
                    450:        wl = wapbl_calloc(1, sizeof(*wl));
                    451:        rw_init(&wl->wl_rwlock);
                    452:        mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
                    453:        cv_init(&wl->wl_reclaimable_cv, "wapblrec");
                    454:        LIST_INIT(&wl->wl_bufs);
                    455:        SIMPLEQ_INIT(&wl->wl_entries);
                    456:
                    457:        wl->wl_logvp = vp;
                    458:        wl->wl_devvp = devvp;
                    459:        wl->wl_mount = mp;
                    460:        wl->wl_logpbn = logpbn;
                    461:        wl->wl_log_dev_bshift = log_dev_bshift;
                    462:        wl->wl_fs_dev_bshift = fs_dev_bshift;
                    463:
                    464:        wl->wl_flush = flushfn;
                    465:        wl->wl_flush_abort = flushabortfn;
                    466:
                    467:        /* Reserve two log device blocks for the commit headers */
                    468:        wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
1.34      mlelstv   469:        wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
1.2       simonb    470:        /* truncate the log usage to a multiple of 1<<wl_log_dev_bshift */
                    471:        wl->wl_circ_size >>= wl->wl_log_dev_bshift;
                    472:        wl->wl_circ_size <<= wl->wl_log_dev_bshift;
                    473:
                    474:        /*
                    475:         * wl_bufbytes_max limits the size of the in memory transaction space.
                    476:         * - Since buffers are allocated and accounted for in units of
                    477:         *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
                    478:         *   (i.e. 1<<PAGE_SHIFT)
                    479:         * - Since the log device has to be written in units of
                    480:         *   1<<wl_log_dev_bshift it is required to be a multiple of
                    481:         *   1<<wl_log_dev_bshift.
                    482:         * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
                    483:         *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
                    484:         * Therefore it must be multiple of the least common multiple of those
                    485:         * three quantities.  Fortunately, all of those quantities are
                    486:         * guaranteed to be a power of two, and the least common multiple of
                    487:         * a set of numbers which are all powers of two is simply the maximum
                    488:         * of those numbers.  Finally, the maximum logarithm of a power of two
                    489:         * is the same as the log of the maximum power of two.  So we can do
                    490:         * the following operations to size wl_bufbytes_max:
                    491:         */
                    492:
                    493:        /* XXX fix actual number of pages reserved per filesystem. */
                    494:        wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
                    495:
                    496:        /* Round wl_bufbytes_max to the largest power of two constraint */
                    497:        wl->wl_bufbytes_max >>= PAGE_SHIFT;
                    498:        wl->wl_bufbytes_max <<= PAGE_SHIFT;
                    499:        wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
                    500:        wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
                    501:        wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
                    502:        wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
                    503:
                    504:        /* XXX maybe use filesystem fragment size instead of 1024 */
                    505:        /* XXX fix actual number of buffers reserved per filesystem. */
                    506:        wl->wl_bufcount_max = (nbuf / 2) * 1024;
                    507:
1.78.2.1! pgoyette  508:        wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift)
        !           509:            - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
        !           510:            sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
        !           511:        KASSERT(wl->wl_brperjblock > 0);
        !           512:
1.2       simonb    513:        /* XXX tie this into resource estimation */
1.41      hannken   514:        wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
1.78.2.1! pgoyette  515:        SIMPLEQ_INIT(&wl->wl_dealloclist);
1.2       simonb    516:
1.54      hannken   517:        wl->wl_buffer = wapbl_alloc(MAXPHYS);
                    518:        wl->wl_buffer_used = 0;
                    519:
1.2       simonb    520:        wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
                    521:
                    522:        /* Initialize the commit header */
                    523:        {
                    524:                struct wapbl_wc_header *wc;
1.14      joerg     525:                size_t len = 1 << wl->wl_log_dev_bshift;
1.2       simonb    526:                wc = wapbl_calloc(1, len);
                    527:                wc->wc_type = WAPBL_WC_HEADER;
                    528:                wc->wc_len = len;
                    529:                wc->wc_circ_off = wl->wl_circ_off;
                    530:                wc->wc_circ_size = wl->wl_circ_size;
                    531:                /* XXX wc->wc_fsid */
                    532:                wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
                    533:                wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
                    534:                wl->wl_wc_header = wc;
1.51      para      535:                wl->wl_wc_scratch = wapbl_alloc(len);
1.2       simonb    536:        }
                    537:
                    538:        /*
                    539:         * if there was an existing set of unlinked but
                    540:         * allocated inodes, preserve it in the new
                    541:         * log.
                    542:         */
                    543:        if (wr && wr->wr_inodescnt) {
1.15      joerg     544:                error = wapbl_start_flush_inodes(wl, wr);
1.2       simonb    545:                if (error)
                    546:                        goto errout;
                    547:        }
                    548:
                    549:        error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
                    550:        if (error) {
                    551:                goto errout;
                    552:        }
                    553:
                    554:        *wlp = wl;
                    555: #if defined(WAPBL_DEBUG)
                    556:        wapbl_debug_wl = wl;
                    557: #endif
                    558:
                    559:        return 0;
                    560:  errout:
                    561:        wapbl_discard(wl);
1.18      yamt      562:        wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
                    563:        wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
1.54      hannken   564:        wapbl_free(wl->wl_buffer, MAXPHYS);
1.2       simonb    565:        wapbl_inodetrk_free(wl);
1.18      yamt      566:        wapbl_free(wl, sizeof(*wl));
1.2       simonb    567:
                    568:        return error;
                    569: }
                    570:
                    571: /*
                    572:  * Like wapbl_flush, only discards the transaction
                    573:  * completely
                          *
                          * Holds wl_rwlock as writer for the whole operation.  Calls the
                          * wl_flush callback, empties the inode hash, releases every buffer
                          * on wl_bufs, detaches all entries from wl and frees the dealloc
                          * list.  The KASSERTs at the end check that nothing is left over.
                    574:  */
                    575:
                    576: void
                    577: wapbl_discard(struct wapbl *wl)
                    578: {
                    579:        struct wapbl_entry *we;
1.78.2.1! pgoyette  580:        struct wapbl_dealloc *wd;
1.2       simonb    581:        struct buf *bp;
                    582:        int i;
                    583:
                    584:        /*
                    585:         * XXX we may consider using upgrade here
                    586:         * if we want to call flush from inside a transaction
                    587:         */
                    588:        rw_enter(&wl->wl_rwlock, RW_WRITER);
                                /* The callback is handed the head of the pending dealloc list. */
1.78.2.1! pgoyette  589:        wl->wl_flush(wl->wl_mount, SIMPLEQ_FIRST(&wl->wl_dealloclist));
1.2       simonb    590:
                    591: #ifdef WAPBL_DEBUG_PRINT
                    592:        {
                    593:                pid_t pid = -1;
                    594:                lwpid_t lid = -1;
                    595:                if (curproc)
                    596:                        pid = curproc->p_pid;
                    597:                if (curlwp)
                    598:                        lid = curlwp->l_lid;
                    599: #ifdef WAPBL_DEBUG_BUFBYTES
                    600:                WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    601:                    ("wapbl_discard: thread %d.%d discarding "
                    602:                    "transaction\n"
                    603:                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    604:                    "deallocs=%d inodes=%d\n"
                    605:                    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
                    606:                    "unsynced=%zu\n",
                    607:                    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
                    608:                    wl->wl_bcount, wl->wl_dealloccnt,
                    609:                    wl->wl_inohashcnt, wl->wl_error_count,
                    610:                    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
                    611:                    wl->wl_unsynced_bufbytes));
                    612:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                    613:                        WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    614:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                    615:                             "error = %d, unsynced = %zu\n",
                    616:                             we->we_bufcount, we->we_reclaimable_bytes,
                    617:                             we->we_error, we->we_unsynced_bufbytes));
                    618:                }
                    619: #else /* !WAPBL_DEBUG_BUFBYTES */
                    620:                WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    621:                    ("wapbl_discard: thread %d.%d discarding transaction\n"
                    622:                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    623:                    "deallocs=%d inodes=%d\n"
                    624:                    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
                    625:                    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
                    626:                    wl->wl_bcount, wl->wl_dealloccnt,
                    627:                    wl->wl_inohashcnt, wl->wl_error_count,
                    628:                    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
                    629:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                    630:                        WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    631:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                    632:                             "error = %d\n",
                    633:                             we->we_bufcount, we->we_reclaimable_bytes,
                    634:                             we->we_error));
                    635:                }
                    636: #endif /* !WAPBL_DEBUG_BUFBYTES */
                    637:        }
                    638: #endif /* WAPBL_DEBUG_PRINT */
                    639:
                                /* Empty each inode hash bucket, returning entries to wapbl_ino_pool. */
                    640:        for (i = 0; i <= wl->wl_inohashmask; i++) {
                    641:                struct wapbl_ino_head *wih;
                    642:                struct wapbl_ino *wi;
                    643:
                    644:                wih = &wl->wl_inohash[i];
                    645:                while ((wi = LIST_FIRST(wih)) != NULL) {
                    646:                        LIST_REMOVE(wi, wi_hash);
                    647:                        pool_put(&wapbl_ino_pool, wi);
                    648:                        KASSERT(wl->wl_inohashcnt > 0);
                    649:                        wl->wl_inohashcnt--;
                    650:                }
                    651:        }
                    652:
                    653:        /*
                    654:         * clean buffer list
                                 *
                                 * bufcache_lock is taken before wl_mtx; wl_mtx is dropped
                                 * around brelsel() while bufcache_lock stays held.
                    655:         */
                    656:        mutex_enter(&bufcache_lock);
                    657:        mutex_enter(&wl->wl_mtx);
                    658:        while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
                                        /*
                                         * A nonzero bbusy() return skips the release; the
                                         * loop then re-reads the list head and tries again.
                                         */
                    659:                if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
                    660:                        /*
                    661:                         * The buffer will be unlocked and
                    662:                         * removed from the transaction in brelse
                    663:                         */
                    664:                        mutex_exit(&wl->wl_mtx);
                    665:                        brelsel(bp, 0);
                    666:                        mutex_enter(&wl->wl_mtx);
                    667:                }
                    668:        }
                    669:        mutex_exit(&wl->wl_mtx);
                    670:        mutex_exit(&bufcache_lock);
                    671:
                    672:        /*
                    673:         * Remove references to this wl from wl_entries, free any which
                    674:         * no longer have buffers, others will be freed in wapbl_biodone
                    675:         * when they no longer have any buffers.
                    676:         */
                    677:        while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
                    678:                SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
                    679:                /* XXX should we be accumulating wl_error_count
                    680:                 * and increasing reclaimable bytes ? */
                    681:                we->we_wapbl = NULL;
                    682:                if (we->we_bufcount == 0) {
                    683: #ifdef WAPBL_DEBUG_BUFBYTES
                    684:                        KASSERT(we->we_unsynced_bufbytes == 0);
                    685: #endif
1.51      para      686:                        pool_put(&wapbl_entry_pool, we);
1.2       simonb    687:                }
                    688:        }
                    689:
                    690:        /* Discard list of deallocs */
1.78.2.1! pgoyette  691:        while ((wd = SIMPLEQ_FIRST(&wl->wl_dealloclist)) != NULL) {
        !           692:                SIMPLEQ_REMOVE_HEAD(&wl->wl_dealloclist, wd_entries);
        !           693:                pool_put(&wapbl_dealloc_pool, wd);
        !           694:                wl->wl_dealloccnt--;
        !           695:        }
        !           696:
1.2       simonb    697:        /* XXX should we clear wl_reserved_bytes? */
                    698:
                                /* Everything tracked by this wl must be gone by now. */
                    699:        KASSERT(wl->wl_bufbytes == 0);
                    700:        KASSERT(wl->wl_bcount == 0);
                    701:        KASSERT(wl->wl_bufcount == 0);
                    702:        KASSERT(LIST_EMPTY(&wl->wl_bufs));
                    703:        KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
                    704:        KASSERT(wl->wl_inohashcnt == 0);
1.78.2.1! pgoyette  705:        KASSERT(SIMPLEQ_EMPTY(&wl->wl_dealloclist));
        !           706:        KASSERT(wl->wl_dealloccnt == 0);
1.2       simonb    707:
                    708:        rw_exit(&wl->wl_rwlock);
                    709: }
                    710:
                         /*
                          * wapbl_stop(wl, force)
                          *
                          *     Flush wl with wapbl_flush and then tear it down, freeing
                          *     the commit header, scratch block, write buffer, inode
                          *     tracking state, locks and wl itself.
                          *
                          *     If the flush fails, or inodes are still tracked after it,
                          *     a nonzero force discards the remaining state with
                          *     wapbl_discard; with force == 0 the flush error (or EBUSY
                          *     for leftover inodes) is returned and wl is left intact.
                          *
                          *     Returns 0 once wl has been freed.
                          */
                    711: int
                    712: wapbl_stop(struct wapbl *wl, int force)
                    713: {
                    714:        int error;
                    715:
                    716:        WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
                    717:        error = wapbl_flush(wl, 1);
                    718:        if (error) {
                    719:                if (force)
                    720:                        wapbl_discard(wl);
                    721:                else
                    722:                        return error;
                    723:        }
                    724:
                    725:        /* Unlinked inodes persist after a flush */
                    726:        if (wl->wl_inohashcnt) {
                    727:                if (force) {
                    728:                        wapbl_discard(wl);
                    729:                } else {
                    730:                        return EBUSY;
                    731:                }
                    732:        }
                    733:
                    734:        KASSERT(wl->wl_bufbytes == 0);
                    735:        KASSERT(wl->wl_bcount == 0);
                    736:        KASSERT(wl->wl_bufcount == 0);
                    737:        KASSERT(LIST_EMPTY(&wl->wl_bufs));
                    738:        KASSERT(wl->wl_dealloccnt == 0);
                    739:        KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
                    740:        KASSERT(wl->wl_inohashcnt == 0);
1.78.2.1! pgoyette  741:        KASSERT(SIMPLEQ_EMPTY(&wl->wl_dealloclist));
                                /* NOTE(review): wl_dealloccnt == 0 is already asserted above. */
        !           742:        KASSERT(wl->wl_dealloccnt == 0);
1.2       simonb    743:
                                /*
                                 * The scratch block is freed first: both sizes are read
                                 * from wl_wc_header->wc_len, so the header must go last.
                                 */
1.18      yamt      744:        wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
                    745:        wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
1.54      hannken   746:        wapbl_free(wl->wl_buffer, MAXPHYS);
1.2       simonb    747:        wapbl_inodetrk_free(wl);
                    748:
                    749:        cv_destroy(&wl->wl_reclaimable_cv);
                    750:        mutex_destroy(&wl->wl_mtx);
                    751:        rw_destroy(&wl->wl_rwlock);
1.18      yamt      752:        wapbl_free(wl, sizeof(*wl));
1.2       simonb    753:
                    754:        return 0;
                    755: }
                    756:
1.71      riastrad  757: /****************************************************************/
                    758: /*
                    759:  * Unbuffered disk I/O
                    760:  */
                    761:
                         /*
                          * wapbl_doio(data, len, devvp, pbn, flags)
                          *
                          *     Perform one synchronous transfer of len bytes between data
                          *     and block pbn of devvp, using a private I/O buffer from
                          *     getiobuf.  flags may contain only B_WRITE and B_READ bits
                          *     and devvp must be a VBLK vnode (both are asserted).
                          *
                          *     data is used directly as the buffer's b_data; nothing is
                          *     copied.  Returns the result of biowait.
                          */
1.2       simonb    762: static int
                    763: wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
                    764: {
                    765:        struct pstats *pstats = curlwp->l_proc->p_stats;
                    766:        struct buf *bp;
                    767:        int error;
                    768:
                    769:        KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
                    770:        KASSERT(devvp->v_type == VBLK);
                    771:
                                /*
                                 * Writes bump the vnode's pending-output count (under
                                 * v_interlock) and the process's output block counter;
                                 * anything else is charged as an input block.
                                 */
                    772:        if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
1.45      rmind     773:                mutex_enter(devvp->v_interlock);
1.2       simonb    774:                devvp->v_numoutput++;
1.45      rmind     775:                mutex_exit(devvp->v_interlock);
1.2       simonb    776:                pstats->p_ru.ru_oublock++;
                    777:        } else {
                    778:                pstats->p_ru.ru_inblock++;
                    779:        }
                    780:
                    781:        bp = getiobuf(devvp, true);
                    782:        bp->b_flags = flags;
                    783:        bp->b_cflags = BC_BUSY; /* silly & dubious */
                    784:        bp->b_dev = devvp->v_rdev;
                    785:        bp->b_data = data;
                    786:        bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
                    787:        bp->b_blkno = pbn;
1.52      chs       788:        BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1.2       simonb    789:
                    790:        WAPBL_PRINTF(WAPBL_PRINT_IO,
1.29      pooka     791:            ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
1.2       simonb    792:            BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
                    793:            bp->b_blkno, bp->b_dev));
                    794:
                    795:        VOP_STRATEGY(devvp, bp);
                    796:
                                /* Wait for completion, then hand the I/O buffer back. */
                    797:        error = biowait(bp);
                    798:        putiobuf(bp);
                    799:
                    800:        if (error) {
                    801:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                    802:                    ("wapbl_doio: %s %zu bytes at block %" PRId64
1.29      pooka     803:                    " on dev 0x%"PRIx64" failed with error %d\n",
1.2       simonb    804:                    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
                    805:                     "write" : "read"),
                    806:                    len, pbn, devvp->v_rdev, error));
                    807:        }
                    808:
                    809:        return error;
                    810: }
                    811:
1.71      riastrad  812: /*
                    813:  * wapbl_write(data, len, devvp, pbn)
                    814:  *
                    815:  *     Synchronously write len bytes from data to physical block pbn
                    816:  *     on devvp.
                          *
                          *     Returns the value returned by wapbl_doio.
                    817:  */
1.2       simonb    818: int
                    819: wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
                    820: {
                    821:
                    822:        return wapbl_doio(data, len, devvp, pbn, B_WRITE);
                    823: }
                    824:
1.71      riastrad  825: /*
                    826:  * wapbl_read(data, len, devvp, pbn)
                    827:  *
                    828:  *     Synchronously read len bytes into data from physical block pbn
                    829:  *     on devvp.
                          *
                          *     Returns the value returned by wapbl_doio.
                    830:  */
1.2       simonb    831: int
                    832: wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
                    833: {
                    834:
                    835:        return wapbl_doio(data, len, devvp, pbn, B_READ);
                    836: }
                    837:
1.71      riastrad  838: /****************************************************************/
                    839: /*
                    840:  * Buffered disk writes -- try to coalesce writes and emit
                    841:  * MAXPHYS-aligned blocks.
                    842:  */
                    843:
1.2       simonb    844: /*
1.71      riastrad  845:  * wapbl_buffered_flush(wl)
                    846:  *
                    847:  *     Flush any buffered writes from wapbl_buffered_write.
                          *
                          *     Returns 0 when nothing is buffered, otherwise the result of
                          *     the write.  The buffer is marked empty even if the write
                          *     fails.
1.54      hannken   848:  */
                    849: static int
                    850: wapbl_buffered_flush(struct wapbl *wl)
                    851: {
                    852:        int error;
                    853:
                    854:        if (wl->wl_buffer_used == 0)
                    855:                return 0;
                    856:
                    857:        error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
                    858:            wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
                    859:        wl->wl_buffer_used = 0;
                    860:
                    861:        return error;
                    862: }
                    863:
                    864: /*
1.71      riastrad  865:  * wapbl_buffered_write(data, len, wl, pbn)
                    866:  *
                    867:  *     Write len bytes from data to physical block pbn on
                    868:  *     wl->wl_devvp.  The write may not complete until
                    869:  *     wapbl_buffered_flush.
                          *
                          *     Returns 0 on success, or the error from flushing or
                          *     writing out the buffer.
1.54      hannken   870:  */
                    871: static int
                    872: wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
                    873: {
                    874:        int error;
                    875:        size_t resid;
                    876:
                    877:        /*
                    878:         * If not adjacent to buffered data flush first.  Disk block
                    879:         * address is always valid for non-empty buffer.
                    880:         */
                    881:        if (wl->wl_buffer_used > 0 &&
                    882:            pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
                    883:                error = wapbl_buffered_flush(wl);
                    884:                if (error)
                    885:                        return error;
                    886:        }
                    887:        /*
                    888:         * If this write goes to an empty buffer we have to
                    889:         * save the disk block address first.
                    890:         */
                    891:        if (wl->wl_buffer_used == 0)
                    892:                wl->wl_buffer_dblk = pbn;
                    893:        /*
                    894:         * Remaining space so this buffer ends on a MAXPHYS boundary.
                    895:         *
                    896:         * Cannot become less than or equal to zero, as the buffer would
                    897:         * then have been flushed on the previous call.
                    898:         */
                    899:        resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
                    900:            wl->wl_buffer_used;
                    901:        KASSERT(resid > 0);
                    902:        KASSERT(dbtob(btodb(resid)) == resid);
                                /*
                                 * Enough data to reach the boundary: fill the buffer up to
                                 * it, write it out now, and restart with an empty buffer
                                 * positioned resid bytes past pbn.
                                 */
                    903:        if (len >= resid) {
                    904:                memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
                    905:                wl->wl_buffer_used += resid;
                    906:                error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
                    907:                    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
                    908:                data = (uint8_t *)data + resid;
                    909:                len -= resid;
                    910:                wl->wl_buffer_dblk = pbn + btodb(resid);
                    911:                wl->wl_buffer_used = 0;
                    912:                if (error)
                    913:                        return error;
                    914:        }
                                /* Whatever is left is only copied; it is written by a later call. */
                    915:        KASSERT(len < MAXPHYS);
                    916:        if (len > 0) {
                    917:                memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
                    918:                wl->wl_buffer_used += len;
                    919:        }
                    920:
                    921:        return 0;
                    922: }
                    923:
                    924: /*
1.71      riastrad  925:  * wapbl_circ_write(wl, data, len, offp)
                    926:  *
                    927:  *     Write len bytes from data to the circular queue of wl, starting
                    928:  *     at linear byte offset *offp, and returning the new linear byte
                    929:  *     offset in *offp.
                    930:  *
                    931:  *     If the starting linear byte offset precedes wl->wl_circ_off,
                    932:  *     the write instead begins at wl->wl_circ_off.  XXX WTF?  This
                    933:  *     should be a KASSERT, not a conditional.
                    934:  *
                    935:  *     The write is buffered in wl and must be flushed with
                    936:  *     wapbl_buffered_flush before it will be submitted to the disk.
                          *
                          *     len must be a multiple of the log device block size
                          *     (checked with KDASSERT).  Returns 0 on success or the error
                          *     from wapbl_buffered_write, in which case *offp is left
                          *     unchanged.
1.2       simonb    937:  */
                    938: static int
                    939: wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
                    940: {
                    941:        size_t slen;
                    942:        off_t off = *offp;
                    943:        int error;
1.34      mlelstv   944:        daddr_t pbn;
1.2       simonb    945:
                    946:        KDASSERT(((len >> wl->wl_log_dev_bshift) <<
                    947:            wl->wl_log_dev_bshift) == len);
                    948:
                    949:        if (off < wl->wl_circ_off)
                    950:                off = wl->wl_circ_off;
                                /* slen is the number of bytes from off to the end of the queue. */
                    951:        slen = wl->wl_circ_off + wl->wl_circ_size - off;
                                /*
                                 * The data does not fit before the end: write the first
                                 * slen bytes there and continue from the start of the queue.
                                 */
                    952:        if (slen < len) {
1.34      mlelstv   953:                pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
                    954: #ifdef _KERNEL
                    955:                pbn = btodb(pbn << wl->wl_log_dev_bshift);
                    956: #endif
1.54      hannken   957:                error = wapbl_buffered_write(data, slen, wl, pbn);
1.2       simonb    958:                if (error)
                    959:                        return error;
                    960:                data = (uint8_t *)data + slen;
                    961:                len -= slen;
                    962:                off = wl->wl_circ_off;
                    963:        }
1.34      mlelstv   964:        pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
                    965: #ifdef _KERNEL
                    966:        pbn = btodb(pbn << wl->wl_log_dev_bshift);
                    967: #endif
1.54      hannken   968:        error = wapbl_buffered_write(data, len, wl, pbn);
1.2       simonb    969:        if (error)
                    970:                return error;
                    971:        off += len;
                                /* Landing exactly on the end wraps back to the start. */
                    972:        if (off >= wl->wl_circ_off + wl->wl_circ_size)
                    973:                off = wl->wl_circ_off;
                    974:        *offp = off;
                    975:        return 0;
                    976: }
                    977:
                    978: /****************************************************************/
1.71      riastrad  979: /*
                    980:  * WAPBL transactions: entering, adding/removing bufs, and exiting
                    981:  */
1.2       simonb    982:
                         /*
                          * wapbl_begin(wl, file, line)
                          *
                          *     Enter a transaction on wl: take wl_rwlock as reader and
                          *     increment wl_lock_count.  Before doing so, call
                          *     wapbl_flush(wl, 0) if any of the thresholds checked below
                          *     is exceeded.
                          *
                          *     file and line are used only by the debug print.
                          *
                          *     Returns 0 on success, or the error from wapbl_flush, in
                          *     which case no lock is held.
                          */
                    983: int
                    984: wapbl_begin(struct wapbl *wl, const char *file, int line)
                    985: {
                    986:        int doflush;
                    987:        unsigned lockcount;
                    988:
                    989:        KDASSERT(wl);
                    990:
                    991:        /*
                    992:         * XXX this needs to be made much more sophisticated.
                    993:         * perhaps each wapbl_begin could reserve a specified
                    994:         * number of buffers and bytes.
                    995:         */
                                /*
                                 * Flush when buffered bytes or buffer count (each padded
                                 * by lockcount * MAXPHYS bytes / lockcount * 10 buffers)
                                 * exceed half their maxima, when the transaction length
                                 * exceeds half the log size, or when pending deallocs
                                 * reach half of wl_dealloclim.
                                 */
                    996:        mutex_enter(&wl->wl_mtx);
                    997:        lockcount = wl->wl_lock_count;
                    998:        doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
                    999:                   wl->wl_bufbytes_max / 2) ||
                   1000:                  ((wl->wl_bufcount + (lockcount * 10)) >
                   1001:                   wl->wl_bufcount_max / 2) ||
1.28      pooka    1002:                  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
1.42      hannken  1003:                  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
1.2       simonb   1004:        mutex_exit(&wl->wl_mtx);
                   1005:
                                /* The counters printed here are read after wl_mtx was dropped. */
                   1006:        if (doflush) {
                   1007:                WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1008:                    ("force flush lockcnt=%d bufbytes=%zu "
1.28      pooka    1009:                    "(max=%zu) bufcount=%zu (max=%zu) "
                   1010:                    "dealloccnt %d (lim=%d)\n",
1.2       simonb   1011:                    lockcount, wl->wl_bufbytes,
                   1012:                    wl->wl_bufbytes_max, wl->wl_bufcount,
1.28      pooka    1013:                    wl->wl_bufcount_max,
                   1014:                    wl->wl_dealloccnt, wl->wl_dealloclim));
1.2       simonb   1015:        }
                   1016:
                   1017:        if (doflush) {
                   1018:                int error = wapbl_flush(wl, 0);
                   1019:                if (error)
                   1020:                        return error;
                   1021:        }
                   1022:
1.23      ad       1023:        rw_enter(&wl->wl_rwlock, RW_READER);
1.2       simonb   1024:        mutex_enter(&wl->wl_mtx);
                   1025:        wl->wl_lock_count++;
                   1026:        mutex_exit(&wl->wl_mtx);
                   1027:
1.23      ad       1028: #if defined(WAPBL_DEBUG_PRINT)
1.2       simonb   1029:        WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
                   1030:            ("wapbl_begin thread %d.%d with bufcount=%zu "
                   1031:            "bufbytes=%zu bcount=%zu at %s:%d\n",
                   1032:            curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
                   1033:            wl->wl_bufbytes, wl->wl_bcount, file, line));
                   1034: #endif
                   1035:
                   1036:        return 0;
                   1037: }
                   1038:
                         /*
                          * wapbl_end(wl)
                          *
                          *     Leave a transaction on wl: decrement wl_lock_count under
                          *     wl_mtx and release wl_rwlock.  Asserts that the current
                          *     transaction still fits in the log space that is not
                          *     reserved.
                          */
                   1039: void
                   1040: wapbl_end(struct wapbl *wl)
                   1041: {
                   1042:
1.23      ad       1043: #if defined(WAPBL_DEBUG_PRINT)
1.2       simonb   1044:        WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
                   1045:             ("wapbl_end thread %d.%d with bufcount=%zu "
                   1046:              "bufbytes=%zu bcount=%zu\n",
                   1047:              curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
                   1048:              wl->wl_bufbytes, wl->wl_bcount));
                   1049: #endif
                   1050:
1.65      riastrad 1051:        /*
                   1052:         * XXX this could be handled more gracefully, perhaps place
                   1053:         * only a partial transaction in the log and allow the
                   1054:         * remaining to flush without the protection of the journal.
                   1055:         */
1.67      riastrad 1056:        KASSERTMSG((wapbl_transaction_len(wl) <=
                   1057:                (wl->wl_circ_size - wl->wl_reserved_bytes)),
1.65      riastrad 1058:            "wapbl_end: current transaction too big to flush");
1.40      bouyer   1059:
1.2       simonb   1060:        mutex_enter(&wl->wl_mtx);
                   1061:        KASSERT(wl->wl_lock_count > 0);
                   1062:        wl->wl_lock_count--;
                   1063:        mutex_exit(&wl->wl_mtx);
                   1064:
                   1065:        rw_exit(&wl->wl_rwlock);
                   1066: }
                   1067:
                         /*
                          * wapbl_add_buf(wl, bp)
                          *
                          *     Put bp at the head of wl->wl_bufs and mark it B_LOCKED.
                          *     A buffer that is already B_LOCKED is unlinked from its
                          *     current list position first and the accounting is left
                          *     alone; otherwise wl_bufbytes, wl_bcount and wl_bufcount
                          *     are increased for it.
                          *
                          *     bp must be BC_BUSY and have a vnode (both are asserted).
                          */
                   1068: void
                   1069: wapbl_add_buf(struct wapbl *wl, struct buf * bp)
                   1070: {
                   1071:
                   1072:        KASSERT(bp->b_cflags & BC_BUSY);
                   1073:        KASSERT(bp->b_vp);
                   1074:
                   1075:        wapbl_jlock_assert(wl);
                   1076:
                   1077: #if 0
                   1078:        /*
                   1079:         * XXX this might be an issue for swapfiles.
                   1080:         * see uvm_swap.c:1702
                   1081:         *
                   1082:         * XXX2 why require it then?  leap of semantics?
                   1083:         */
                   1084:        KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
                   1085: #endif
                   1086:
                   1087:        mutex_enter(&wl->wl_mtx);
                   1088:        if (bp->b_flags & B_LOCKED) {
                   1089:                LIST_REMOVE(bp, b_wapbllist);
                   1090:                WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
                   1091:                   ("wapbl_add_buf thread %d.%d re-adding buf %p "
                   1092:                    "with %d bytes %d bcount\n",
                   1093:                    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
                   1094:                    bp->b_bcount));
                   1095:        } else {
                   1096:                /* unlocked but dirty buffers shouldn't exist */
                   1097:                KASSERT(!(bp->b_oflags & BO_DELWRI));
                   1098:                wl->wl_bufbytes += bp->b_bufsize;
                   1099:                wl->wl_bcount += bp->b_bcount;
                   1100:                wl->wl_bufcount++;
                   1101:                WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
                   1102:                   ("wapbl_add_buf thread %d.%d adding buf %p "
                   1103:                    "with %d bytes %d bcount\n",
                   1104:                    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
                   1105:                    bp->b_bcount));
                   1106:        }
                   1107:        LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
                   1108:        mutex_exit(&wl->wl_mtx);
                   1109:
                                /* B_LOCKED is set only after wl_mtx has been released. */
                   1110:        bp->b_flags |= B_LOCKED;
                   1111: }
                   1112:
                         /*
                          * wapbl_remove_buf_locked(wl, bp)
                          *
                          *     Unlink the B_LOCKED buffer bp from the buffer list of wl,
                          *     subtract its b_bufsize and b_bcount from wl_bufbytes and
                          *     wl_bcount, decrement wl_bufcount, and clear B_LOCKED.
                          *
                          *     => Caller must hold wl->wl_mtx.
                          *     => bp must be BC_BUSY and B_LOCKED.
                          *     => wapbl_jlock_assert(wl) must pass on entry.
                          */
                   1113: static void
                   1114: wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
                   1115: {
                   1116:
                   1117:        KASSERT(mutex_owned(&wl->wl_mtx));
                   1118:        KASSERT(bp->b_cflags & BC_BUSY);
                   1119:        wapbl_jlock_assert(wl);
                   1120:
                   1121: #if 0
                   1122:        /*
                   1123:         * XXX this might be an issue for swapfiles.
                   1124:         * see uvm_swap.c:1725
                   1125:         *
                   1126:         * XXXdeux: see above
                   1127:         */
                                 /* BC_* bits are kept in b_cflags, as in wapbl_add_buf(). */
                   1128:        KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
                   1129: #endif
                   1130:        KASSERT(bp->b_flags & B_LOCKED);
                   1131:
                   1132:        WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
                   1133:           ("wapbl_remove_buf thread %d.%d removing buf %p with "
                   1134:            "%d bytes %d bcount\n",
                   1135:            curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
                   1136:
                   1137:        KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
                   1138:        wl->wl_bufbytes -= bp->b_bufsize;
                   1139:        KASSERT(wl->wl_bcount >= bp->b_bcount);
                   1140:        wl->wl_bcount -= bp->b_bcount;
                   1141:        KASSERT(wl->wl_bufcount > 0);
                   1142:        wl->wl_bufcount--;
                                 /* The three totals must reach zero together. */
                   1143:        KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
                   1144:        KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
                   1145:        LIST_REMOVE(bp, b_wapbllist);
                   1146:
                   1147:        bp->b_flags &= ~B_LOCKED;
                   1148: }
                   1149:
                   1150: /* called from brelsel() in vfs_bio among other places */
                         /*
                          * wapbl_remove_buf(wl, bp)
                          *
                          *     Wrapper around wapbl_remove_buf_locked() that acquires
                          *     wl->wl_mtx for the duration of the call.  The requirements
                          *     on bp asserted by wapbl_remove_buf_locked() apply here too.
                          */
                   1151: void
                   1152: wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
                   1153: {
                   1154:
                   1155:        mutex_enter(&wl->wl_mtx);
                   1156:        wapbl_remove_buf_locked(wl, bp);
                   1157:        mutex_exit(&wl->wl_mtx);
                   1158: }
                   1159:
                         /*
                          * wapbl_resize_buf(wl, bp, oldsz, oldcnt)
                          *
                          *     Adjust wl_bufbytes and wl_bcount by the difference between
                          *     the current b_bufsize/b_bcount of bp and the previous values
                          *     oldsz/oldcnt.  Nothing is changed unless bp is B_LOCKED.
                          *
                          *     => bp must be BC_BUSY.
                          */
                   1160: void
                   1161: wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
                   1162: {
                   1163:
                   1164:        KASSERT(bp->b_cflags & BC_BUSY);
                   1165:
                   1166:        /*
                   1167:         * XXX: why does this depend on B_LOCKED?  otherwise the buf
                   1168:         * is not for a transaction?  if so, why is this called in the
                   1169:         * first place?
                   1170:         */
                   1171:        if (bp->b_flags & B_LOCKED) {
                   1172:                mutex_enter(&wl->wl_mtx);
                   1173:                wl->wl_bufbytes += bp->b_bufsize - oldsz;
                   1174:                wl->wl_bcount += bp->b_bcount - oldcnt;
                   1175:                mutex_exit(&wl->wl_mtx);
                   1176:        }
                   1177: }
                   1178:
                   1179: #endif /* _KERNEL */
                   1180:
                   1181: /****************************************************************/
                   1182: /* Some utility inlines */
                   1183:
1.71      riastrad 1184: /*
                   1185:  * wapbl_space_used(avail, head, tail)
                   1186:  *
                   1187:  *     Number of bytes used in a circular queue of avail total bytes,
                   1188:  *     from tail to head.
                          *
                          *     tail == 0 is treated as an empty queue (head must then be 0
                          *     as well) and yields 0.  For any other tail, head == tail
                          *     yields avail, i.e. the queue is full rather than empty.
                   1189:  */
1.56      joerg    1190: static inline size_t
                   1191: wapbl_space_used(size_t avail, off_t head, off_t tail)
                   1192: {
                   1193:
                   1194:        if (tail == 0) {
                   1195:                KASSERT(head == 0);
                   1196:                return 0;
                   1197:        }
                                 /* -1 before and +1 after the modulus maps head == tail to avail. */
                   1198:        return ((head + (avail - 1) - tail) % avail) + 1;
                   1199: }
                   1200:
                   1201: #ifdef _KERNEL
1.71      riastrad 1202: /*
                   1203:  * wapbl_advance(size, off, oldoff, delta)
                   1204:  *
                   1205:  *     Given a byte offset oldoff into a circular queue of size bytes
                   1206:  *     starting at off, return a new byte offset oldoff + delta into
                   1207:  *     the circular queue.
                          *
                          *     Offset 0 is special-cased: every other accepted offset lies
                          *     in [off, off + size).  Advancing from 0 by a nonzero delta
                          *     starts counting at off, and a sum that would reach
                          *     off + size wraps around by subtracting size.
                          *
                          *     => delta must not exceed size.
                   1208:  */
1.30      uebayasi 1209: static inline off_t
1.60      matt     1210: wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
1.2       simonb   1211: {
1.60      matt     1212:        off_t newoff;
1.2       simonb   1213:
                   1214:        /* Define acceptable ranges for inputs. */
1.46      christos 1215:        KASSERT(delta <= (size_t)size);
1.60      matt     1216:        KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
                   1217:        KASSERT(oldoff < (off_t)(size + off));
1.2       simonb   1218:
                                 /* Start from off, stay in range, or wrap past the end. */
1.60      matt     1219:        if ((oldoff == 0) && (delta != 0))
                   1220:                newoff = off + delta;
                   1221:        else if ((oldoff + delta) < (size + off))
                   1222:                newoff = oldoff + delta;
1.2       simonb   1223:        else
1.60      matt     1224:                newoff = (oldoff + delta) - size;
1.2       simonb   1225:
                   1226:        /* Note some interesting axioms */
1.60      matt     1227:        KASSERT((delta != 0) || (newoff == oldoff));
                   1228:        KASSERT((delta == 0) || (newoff != 0));
                   1229:        KASSERT((delta != (size)) || (newoff == oldoff));
1.2       simonb   1230:
                   1231:        /* Define acceptable ranges for output. */
1.60      matt     1232:        KASSERT((newoff == 0) || ((size_t)newoff >= off));
                   1233:        KASSERT((size_t)newoff < (size + off));
                   1234:        return newoff;
1.2       simonb   1235: }
                   1236:
1.71      riastrad 1237: /*
                   1238:  * wapbl_space_free(avail, head, tail)
                   1239:  *
                   1240:  *     Number of bytes free in a circular queue of avail total bytes,
                   1241:  *     in which everything from tail to head is used.
                          *
                          *     Computed as avail minus wapbl_space_used(avail, head, tail),
                          *     so an empty queue (tail == 0) reports avail bytes free.
                   1242:  */
1.30      uebayasi 1243: static inline size_t
1.2       simonb   1244: wapbl_space_free(size_t avail, off_t head, off_t tail)
                   1245: {
                   1246:
                   1247:        return avail - wapbl_space_used(avail, head, tail);
                   1248: }
                   1249:
1.71      riastrad 1250: /*
                   1251:  * wapbl_advance_head(size, off, delta, headp, tailp)
                   1252:  *
                   1253:  *     In a circular queue of size bytes starting at off, given the
                   1254:  *     old head and tail offsets *headp and *tailp, store the new head
                   1255:  *     and tail offsets in *headp and *tailp resulting from adding
                   1256:  *     delta bytes of data to the head.
                          *
                          *     If the tail was 0 and the new head is nonzero, the tail is
                          *     set to off.
                          *
                          *     => delta must not exceed the free space in the queue.
                   1257:  */
1.30      uebayasi 1258: static inline void
1.2       simonb   1259: wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
                   1260:                   off_t *tailp)
                   1261: {
                   1262:        off_t head = *headp;
                   1263:        off_t tail = *tailp;
                   1264:
                   1265:        KASSERT(delta <= wapbl_space_free(size, head, tail));
                   1266:        head = wapbl_advance(size, off, head, delta);
                                 /* First data in a queue whose tail was 0: the tail starts at off. */
                   1267:        if ((tail == 0) && (head != 0))
                   1268:                tail = off;
                   1269:        *headp = head;
                   1270:        *tailp = tail;
                   1271: }
                   1272:
1.71      riastrad 1273: /*
                   1274:  * wapbl_advance_tail(size, off, delta, headp, tailp)
                   1275:  *
                   1276:  *     In a circular queue of size bytes starting at off, given the
                   1277:  *     old head and tail offsets *headp and *tailp, store the new head
                   1278:  *     and tail offsets in *headp and *tailp resulting from removing
                   1279:  *     delta bytes of data from the tail.
                          *
                          *     If the new tail equals the head, both are reset to 0.
                          *
                          *     => delta must not exceed the used space in the queue.
                   1280:  */
1.30      uebayasi 1281: static inline void
1.2       simonb   1282: wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
                   1283:                   off_t *tailp)
                   1284: {
                   1285:        off_t head = *headp;
                   1286:        off_t tail = *tailp;
                   1287:
                   1288:        KASSERT(delta <= wapbl_space_used(size, head, tail));
                   1289:        tail = wapbl_advance(size, off, tail, delta);
                                 /* The tail caught up with the head: reset both offsets to 0. */
                   1290:        if (head == tail) {
                   1291:                head = tail = 0;
                   1292:        }
                   1293:        *headp = head;
                   1294:        *tailp = tail;
                   1295: }
                   1296:
                   1297:
                   1298: /****************************************************************/
                   1299:
                   1300: /*
1.73      riastrad 1301:  * wapbl_truncate(wl, minfree)
1.71      riastrad 1302:  *
                   1303:  *     Wait until at least minfree bytes are available in the log.
                   1304:  *
1.73      riastrad 1305:  *     If it was necessary to wait for writes to complete,
                   1306:  *     advance the circular queue tail to reflect the new write
                   1307:  *     completions and issue a write commit to the log.
1.71      riastrad 1308:  *
                   1309:  *     => Caller must hold wl->wl_rwlock writer lock.
                          *     => minfree must not exceed wl_circ_size - wl_reserved_bytes.
                          *
                          *     Returns 0 on success (immediately, without a commit, when
                          *     the log already has more than minfree bytes free), EIO if
                          *     wl_error_count became nonzero before enough bytes were
                          *     reclaimable, or the error from wapbl_write_commit().
1.2       simonb   1310:  */
                   1311: static int
1.73      riastrad 1312: wapbl_truncate(struct wapbl *wl, size_t minfree)
1.2       simonb   1313: {
                   1314:        size_t delta;
                   1315:        size_t avail;
                   1316:        off_t head;
                   1317:        off_t tail;
                   1318:        int error = 0;
                   1319:
                   1320:        KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
                   1321:        KASSERT(rw_write_held(&wl->wl_rwlock));
                   1322:
                   1323:        mutex_enter(&wl->wl_mtx);
                   1324:
                   1325:        /*
                   1326:         * First check to see if we have to do a commit
                   1327:         * at all.
                   1328:         */
                   1329:        avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
                   1330:        if (minfree < avail) {
                   1331:                mutex_exit(&wl->wl_mtx);
                   1332:                return 0;
                   1333:        }
                                 /* From here on minfree is the shortfall that must be reclaimed. */
                   1334:        minfree -= avail;
                   1335:        while ((wl->wl_error_count == 0) &&
                   1336:            (wl->wl_reclaimable_bytes < minfree)) {
                   1337:                WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
                   1338:                    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zu "
                   1339:                    "minfree=%zu\n",
                   1340:                     &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
                   1341:                    minfree));
                   1342:
                   1343:                cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
                   1344:        }
                   1345:        if (wl->wl_reclaimable_bytes < minfree) {
                   1346:                KASSERT(wl->wl_error_count);
                   1347:                /* XXX maybe get actual error from buffer instead someday? */
                   1348:                error = EIO;
                   1349:        }
                   1350:        head = wl->wl_head;
                   1351:        tail = wl->wl_tail;
                   1352:        delta = wl->wl_reclaimable_bytes;
                   1353:
                   1354:        /* If all of the entries are flushed, then be sure to keep
                   1355:         * the reserved bytes reserved.  Watch out for discarded transactions,
                   1356:         * which could leave more bytes reserved than are reclaimable.
                   1357:         */
                   1358:        if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
                   1359:            (delta >= wl->wl_reserved_bytes)) {
                   1360:                delta -= wl->wl_reserved_bytes;
                   1361:        }
                   1362:        wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
                   1363:                           &tail);
                   1364:        KDASSERT(wl->wl_reserved_bytes <=
                   1365:                wapbl_space_used(wl->wl_circ_size, head, tail));
                   1366:        mutex_exit(&wl->wl_mtx);
                   1367:
                   1368:        if (error)
                   1369:                return error;
                   1370:
                   1371:        /*
                   1372:         * This is where head, tail and delta are unprotected
                   1373:         * from races against itself or flush.  This is ok since
                   1374:         * we only call this routine from inside flush itself.
                   1375:         *
                   1376:         * XXX: how can it race against itself when accessed only
                   1377:         * from behind the write-locked rwlock?
                   1378:         */
                   1379:        error = wapbl_write_commit(wl, head, tail);
                   1380:        if (error)
                   1381:                return error;
                   1382:
                                 /* Publish the new offsets only after the commit succeeded. */
                   1383:        wl->wl_head = head;
                   1384:        wl->wl_tail = tail;
                   1385:
                   1386:        mutex_enter(&wl->wl_mtx);
                   1387:        KASSERT(wl->wl_reclaimable_bytes >= delta);
                   1388:        wl->wl_reclaimable_bytes -= delta;
                   1389:        mutex_exit(&wl->wl_mtx);
                   1390:        WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
                   1391:            ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
                   1392:            curproc->p_pid, curlwp->l_lid, delta));
                   1393:
                   1394:        return 0;
                   1395: }
                   1396:
                   1397: /****************************************************************/
                   1398:
                         /*
                          * wapbl_biodone(bp)
                          *
                          *     Account for the completion of I/O on bp, whose b_private
                          *     points to the wapbl_entry it belongs to.  (Presumably this
                          *     is installed as the buffer I/O completion callback by the
                          *     flush path; that is not visible here.)
                          *
                          *     The buffer is released with brelse() and the outstanding
                          *     buffer count of the entry is decremented.  When that count
                          *     reaches zero, all entries at the front of wl_entries with no
                          *     outstanding buffers are removed and freed, their
                          *     we_reclaimable_bytes are added to wl_reclaimable_bytes, and
                          *     waiters on wl_reclaimable_cv are woken.
                          *
                          *     If bp completed with an error and wl_error_count is 0, it is
                          *     incremented and waiters are woken.
                          *
                          *     If the entry has no log attached any more (we_wapbl is NULL),
                          *     only the entry itself is updated, and it is freed once its
                          *     last buffer has completed.
                          */
                   1399: void
                   1400: wapbl_biodone(struct buf *bp)
                   1401: {
                   1402:        struct wapbl_entry *we = bp->b_private;
                   1403:        struct wapbl *wl = we->we_wapbl;
1.53      hannken  1404: #ifdef WAPBL_DEBUG_BUFBYTES
                                 /* Saved up front: bp is released before the accounting below. */
                   1405:        const int bufsize = bp->b_bufsize;
                   1406: #endif
1.2       simonb   1407:
                   1408:        /*
                   1409:         * Handle possible flushing of buffers after log has been
                   1410:         * decommissioned.
                   1411:         */
                   1412:        if (!wl) {
                   1413:                KASSERT(we->we_bufcount > 0);
                   1414:                we->we_bufcount--;
                   1415: #ifdef WAPBL_DEBUG_BUFBYTES
1.53      hannken  1416:                KASSERT(we->we_unsynced_bufbytes >= bufsize);
                   1417:                we->we_unsynced_bufbytes -= bufsize;
1.2       simonb   1418: #endif
                   1419:
                   1420:                if (we->we_bufcount == 0) {
                   1421: #ifdef WAPBL_DEBUG_BUFBYTES
                   1422:                        KASSERT(we->we_unsynced_bufbytes == 0);
                   1423: #endif
1.51      para     1424:                        pool_put(&wapbl_entry_pool, we);
1.2       simonb   1425:                }
                   1426:
                   1427:                brelse(bp, 0);
                   1428:                return;
                   1429:        }
                   1430:
                   1431: #ifdef ohbother
1.44      uebayasi 1432:        KDASSERT(bp->b_oflags & BO_DONE);
                   1433:        KDASSERT(!(bp->b_oflags & BO_DELWRI));
1.2       simonb   1434:        KDASSERT(bp->b_flags & B_ASYNC);
1.44      uebayasi 1435:        KDASSERT(bp->b_cflags & BC_BUSY);
1.2       simonb   1436:        KDASSERT(!(bp->b_flags & B_LOCKED));
                   1437:        KDASSERT(!(bp->b_flags & B_READ));
1.44      uebayasi 1438:        KDASSERT(!(bp->b_cflags & BC_INVAL));
                   1439:        KDASSERT(!(bp->b_cflags & BC_NOCACHE));
1.2       simonb   1440: #endif
                   1441:
                   1442:        if (bp->b_error) {
1.26      apb      1443:                /*
1.78      riastrad 1444:                 * If an error occurs, it would be nice to leave the buffer
                   1445:                 * as a delayed write on the LRU queue so that we can retry
                   1446:                 * it later. But buffercache(9) can't handle dirty buffer
                   1447:                 * reuse, so just mark the log permanently errored out.
1.26      apb      1448:                 */
1.2       simonb   1449:                mutex_enter(&wl->wl_mtx);
                   1450:                if (wl->wl_error_count == 0) {
                   1451:                        wl->wl_error_count++;
                   1452:                        cv_broadcast(&wl->wl_reclaimable_cv);
                   1453:                }
                   1454:                mutex_exit(&wl->wl_mtx);
                   1455:        }
                   1456:
1.53      hannken  1457:        /*
                   1458:         * Release the buffer here. wapbl_flush() may wait for the
                   1459:         * log to become empty and we better unbusy the buffer before
                   1460:         * wapbl_flush() returns.
                   1461:         */
                   1462:        brelse(bp, 0);
                   1463:
1.2       simonb   1464:        mutex_enter(&wl->wl_mtx);
                   1465:
                   1466:        KASSERT(we->we_bufcount > 0);
                   1467:        we->we_bufcount--;
                   1468: #ifdef WAPBL_DEBUG_BUFBYTES
1.53      hannken  1469:        KASSERT(we->we_unsynced_bufbytes >= bufsize);
                   1470:        we->we_unsynced_bufbytes -= bufsize;
                   1471:        KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
                   1472:        wl->wl_unsynced_bufbytes -= bufsize;
1.2       simonb   1473: #endif
                   1474:
                   1475:        /*
                   1476:         * If the current transaction can be reclaimed, start
                   1477:         * at the beginning and reclaim any consecutive reclaimable
                   1478:         * transactions.  If we successfully reclaim anything,
                   1479:         * then wakeup anyone waiting for the reclaim.
                   1480:         */
                   1481:        if (we->we_bufcount == 0) {
                   1482:                size_t delta = 0;
                   1483:                int errcnt = 0;
                   1484: #ifdef WAPBL_DEBUG_BUFBYTES
                   1485:                KDASSERT(we->we_unsynced_bufbytes == 0);
                   1486: #endif
                   1487:                /*
                   1488:                 * clear any posted error, since the buffer it came from
                   1489:                 * has successfully flushed by now
                   1490:                 */
                                         /*
                                          * Note that we is reused as the loop cursor, and the
                                          * entries it visits are freed inside the loop.
                                          */
                   1491:                while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
                   1492:                       (we->we_bufcount == 0)) {
                   1493:                        delta += we->we_reclaimable_bytes;
                   1494:                        if (we->we_error)
                   1495:                                errcnt++;
                   1496:                        SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
1.51      para     1497:                        pool_put(&wapbl_entry_pool, we);
1.2       simonb   1498:                }
                   1499:
                   1500:                if (delta) {
                   1501:                        wl->wl_reclaimable_bytes += delta;
                   1502:                        KASSERT(wl->wl_error_count >= errcnt);
                   1503:                        wl->wl_error_count -= errcnt;
                   1504:                        cv_broadcast(&wl->wl_reclaimable_cv);
                   1505:                }
                   1506:        }
                   1507:
                   1508:        mutex_exit(&wl->wl_mtx);
                   1509: }
                   1510:
                   1511: /*
1.71      riastrad 1512:  * wapbl_flush(wl, wait)
                   1513:  *
                   1514:  *     Flush pending block writes, deallocations, and inodes from
                   1515:  *     the current transaction in memory to the log on disk:
                   1516:  *
                   1517:  *     1. Call the file system's wl_flush callback to flush any
                   1518:  *        per-file-system pending updates.
                   1519:  *     2. Wait for enough space in the log for the current transaction.
                   1520:  *     3. Synchronously write the new log records, advancing the
                   1521:  *        circular queue head.
1.77      riastrad 1522:  *     4. Issue the pending block writes asynchronously, now that they
                   1523:  *        are recorded in the log and can be replayed after crash.
                   1524:  *     5. If wait is true, wait for all writes to complete and for the
                   1525:  *        log to become empty.
1.71      riastrad 1526:  *
                   1527:  *     On failure, call the file system's wl_flush_abort callback.
1.2       simonb   1528:  */
                   1529: int
                   1530: wapbl_flush(struct wapbl *wl, int waitfor)
                   1531: {
                   1532:        struct buf *bp;
                   1533:        struct wapbl_entry *we;
                   1534:        off_t off;
                   1535:        off_t head;
                   1536:        off_t tail;
                   1537:        size_t delta = 0;
                   1538:        size_t flushsize;
                   1539:        size_t reserved;
                   1540:        int error = 0;
                   1541:
                   1542:        /*
                   1543:         * Do a quick check to see if a full flush can be skipped
                   1544:         * This assumes that the flush callback does not need to be called
                   1545:         * unless there are other outstanding bufs.
                   1546:         */
                   1547:        if (!waitfor) {
                   1548:                size_t nbufs;
                   1549:                mutex_enter(&wl->wl_mtx);       /* XXX need mutex here to
                   1550:                                                   protect the KASSERTS */
                   1551:                nbufs = wl->wl_bufcount;
                   1552:                KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
                   1553:                KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
                   1554:                mutex_exit(&wl->wl_mtx);
                   1555:                if (nbufs == 0)
                   1556:                        return 0;
                   1557:        }
                   1558:
                   1559:        /*
                   1560:         * XXX we may consider using LK_UPGRADE here
                   1561:         * if we want to call flush from inside a transaction
                   1562:         */
                   1563:        rw_enter(&wl->wl_rwlock, RW_WRITER);
1.78.2.1! pgoyette 1564:        wl->wl_flush(wl->wl_mount, SIMPLEQ_FIRST(&wl->wl_dealloclist));
1.2       simonb   1565:
                   1566:        /*
1.75      riastrad 1567:         * Now that we are exclusively locked and the file system has
                   1568:         * issued any deferred block writes for this transaction, check
                   1569:         * whether there are any blocks to write to the log.  If not,
                   1570:         * skip waiting for space or writing any log entries.
                   1571:         *
                   1572:         * XXX Shouldn't this also check wl_dealloccnt and
                   1573:         * wl_inohashcnt?  Perhaps wl_dealloccnt doesn't matter if the
                   1574:         * file system didn't produce any blocks as a consequence of
                   1575:         * it, but the same does not seem to be so of wl_inohashcnt.
1.2       simonb   1576:         */
                   1577:        if (wl->wl_bufcount == 0) {
1.69      riastrad 1578:                goto wait_out;
1.2       simonb   1579:        }
                   1580:
                   1581: #if 0
                   1582:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1583:                     ("wapbl_flush thread %d.%d flushing entries with "
                   1584:                      "bufcount=%zu bufbytes=%zu\n",
                   1585:                      curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
                   1586:                      wl->wl_bufbytes));
                   1587: #endif
                   1588:
                   1589:        /* Calculate amount of space needed to flush */
                   1590:        flushsize = wapbl_transaction_len(wl);
1.39      christos 1591:        if (wapbl_verbose_commit) {
                   1592:                struct timespec ts;
                   1593:                getnanotime(&ts);
1.43      nakayama 1594:                printf("%s: %lld.%09ld this transaction = %zu bytes\n",
1.39      christos 1595:                    __func__, (long long)ts.tv_sec,
                   1596:                    (long)ts.tv_nsec, flushsize);
                   1597:        }
1.2       simonb   1598:
                   1599:        if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
                   1600:                /*
                   1601:                 * XXX this could be handled more gracefully, perhaps place
                   1602:                 * only a partial transaction in the log and allow the
                   1603:                 * remaining to flush without the protection of the journal.
                   1604:                 */
1.66      riastrad 1605:                panic("wapbl_flush: current transaction too big to flush");
1.2       simonb   1606:        }
                   1607:
1.73      riastrad 1608:        error = wapbl_truncate(wl, flushsize);
1.2       simonb   1609:        if (error)
1.69      riastrad 1610:                goto out;
1.2       simonb   1611:
                   1612:        off = wl->wl_head;
1.70      riastrad 1613:        KASSERT((off == 0) || (off >= wl->wl_circ_off));
                   1614:        KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size));
1.2       simonb   1615:        error = wapbl_write_blocks(wl, &off);
                   1616:        if (error)
1.69      riastrad 1617:                goto out;
1.2       simonb   1618:        error = wapbl_write_revocations(wl, &off);
                   1619:        if (error)
1.69      riastrad 1620:                goto out;
1.2       simonb   1621:        error = wapbl_write_inodes(wl, &off);
                   1622:        if (error)
1.69      riastrad 1623:                goto out;
1.2       simonb   1624:
                   1625:        reserved = 0;
                   1626:        if (wl->wl_inohashcnt)
                   1627:                reserved = wapbl_transaction_inodes_len(wl);
                   1628:
                   1629:        head = wl->wl_head;
                   1630:        tail = wl->wl_tail;
                   1631:
                   1632:        wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
                   1633:            &head, &tail);
1.72      riastrad 1634:
                   1635:        KASSERTMSG(head == off,
                   1636:            "lost head! head=%"PRIdMAX" tail=%" PRIdMAX
                   1637:            " off=%"PRIdMAX" flush=%zu",
                   1638:            (intmax_t)head, (intmax_t)tail, (intmax_t)off,
                   1639:            flushsize);
1.2       simonb   1640:
                   1641:        /* Opportunistically move the tail forward if we can */
1.73      riastrad 1642:        mutex_enter(&wl->wl_mtx);
                   1643:        delta = wl->wl_reclaimable_bytes;
                   1644:        mutex_exit(&wl->wl_mtx);
                   1645:        wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
                   1646:            &head, &tail);
1.2       simonb   1647:
                   1648:        error = wapbl_write_commit(wl, head, tail);
                   1649:        if (error)
1.69      riastrad 1650:                goto out;
1.2       simonb   1651:
1.51      para     1652:        we = pool_get(&wapbl_entry_pool, PR_WAITOK);
1.2       simonb   1653:
                   1654: #ifdef WAPBL_DEBUG_BUFBYTES
                   1655:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1656:                ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
                   1657:                 " unsynced=%zu"
                   1658:                 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
                   1659:                 "inodes=%d\n",
                   1660:                 curproc->p_pid, curlwp->l_lid, flushsize, delta,
                   1661:                 wapbl_space_used(wl->wl_circ_size, head, tail),
                   1662:                 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
                   1663:                 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
                   1664:                 wl->wl_inohashcnt));
                   1665: #else
                   1666:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1667:                ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
                   1668:                 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
                   1669:                 "inodes=%d\n",
                   1670:                 curproc->p_pid, curlwp->l_lid, flushsize, delta,
                   1671:                 wapbl_space_used(wl->wl_circ_size, head, tail),
                   1672:                 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
                   1673:                 wl->wl_dealloccnt, wl->wl_inohashcnt));
                   1674: #endif
                   1675:
                   1676:
                   1677:        mutex_enter(&bufcache_lock);
                   1678:        mutex_enter(&wl->wl_mtx);
                   1679:
                   1680:        wl->wl_reserved_bytes = reserved;
                   1681:        wl->wl_head = head;
                   1682:        wl->wl_tail = tail;
                   1683:        KASSERT(wl->wl_reclaimable_bytes >= delta);
                   1684:        wl->wl_reclaimable_bytes -= delta;
1.78.2.1! pgoyette 1685:        KDASSERT(wl->wl_dealloccnt == 0);
1.2       simonb   1686: #ifdef WAPBL_DEBUG_BUFBYTES
                   1687:        wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
                   1688: #endif
                   1689:
                   1690:        we->we_wapbl = wl;
                   1691:        we->we_bufcount = wl->wl_bufcount;
                   1692: #ifdef WAPBL_DEBUG_BUFBYTES
                   1693:        we->we_unsynced_bufbytes = wl->wl_bufbytes;
                   1694: #endif
                   1695:        we->we_reclaimable_bytes = flushsize;
                   1696:        we->we_error = 0;
                   1697:        SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
                   1698:
                   1699:        /*
                   1700:         * this flushes bufs in reverse order than they were queued
                   1701:         * it shouldn't matter, but if we care we could use TAILQ instead.
                   1702:         * XXX Note they will get put on the lru queue when they flush
                   1703:         * so we might actually want to change this to preserve order.
                   1704:         */
                   1705:        while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
                   1706:                if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
                   1707:                        continue;
                   1708:                }
                   1709:                bp->b_iodone = wapbl_biodone;
                   1710:                bp->b_private = we;
                   1711:                bremfree(bp);
                   1712:                wapbl_remove_buf_locked(wl, bp);
                   1713:                mutex_exit(&wl->wl_mtx);
                   1714:                mutex_exit(&bufcache_lock);
                   1715:                bawrite(bp);
                   1716:                mutex_enter(&bufcache_lock);
                   1717:                mutex_enter(&wl->wl_mtx);
                   1718:        }
                   1719:        mutex_exit(&wl->wl_mtx);
                   1720:        mutex_exit(&bufcache_lock);
                   1721:
                   1722: #if 0
                   1723:        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                   1724:                     ("wapbl_flush thread %d.%d done flushing entries...\n",
                   1725:                     curproc->p_pid, curlwp->l_lid));
                   1726: #endif
                   1727:
1.69      riastrad 1728:  wait_out:
1.2       simonb   1729:
                   1730:        /*
                   1731:         * If the waitfor flag is set, don't return until everything is
                   1732:         * fully flushed and the on disk log is empty.
                   1733:         */
                   1734:        if (waitfor) {
                   1735:                error = wapbl_truncate(wl, wl->wl_circ_size -
1.73      riastrad 1736:                        wl->wl_reserved_bytes);
1.2       simonb   1737:        }
                   1738:
1.69      riastrad 1739:  out:
1.2       simonb   1740:        if (error) {
1.78.2.1! pgoyette 1741:                wl->wl_flush_abort(wl->wl_mount,
        !          1742:                    SIMPLEQ_FIRST(&wl->wl_dealloclist));
1.2       simonb   1743:        }
                   1744:
                   1745: #ifdef WAPBL_DEBUG_PRINT
                   1746:        if (error) {
                   1747:                pid_t pid = -1;
                   1748:                lwpid_t lid = -1;
                   1749:                if (curproc)
                   1750:                        pid = curproc->p_pid;
                   1751:                if (curlwp)
                   1752:                        lid = curlwp->l_lid;
                   1753:                mutex_enter(&wl->wl_mtx);
                   1754: #ifdef WAPBL_DEBUG_BUFBYTES
                   1755:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1756:                    ("wapbl_flush: thread %d.%d aborted flush: "
                   1757:                    "error = %d\n"
                   1758:                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                   1759:                    "deallocs=%d inodes=%d\n"
                   1760:                    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
                   1761:                    "unsynced=%zu\n",
                   1762:                    pid, lid, error, wl->wl_bufcount,
                   1763:                    wl->wl_bufbytes, wl->wl_bcount,
                   1764:                    wl->wl_dealloccnt, wl->wl_inohashcnt,
                   1765:                    wl->wl_error_count, wl->wl_reclaimable_bytes,
                   1766:                    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
                   1767:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                   1768:                        WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1769:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                   1770:                             "error = %d, unsynced = %zu\n",
                   1771:                             we->we_bufcount, we->we_reclaimable_bytes,
                   1772:                             we->we_error, we->we_unsynced_bufbytes));
                   1773:                }
                   1774: #else
                   1775:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1776:                    ("wapbl_flush: thread %d.%d aborted flush: "
                   1777:                     "error = %d\n"
                   1778:                     "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                   1779:                     "deallocs=%d inodes=%d\n"
                   1780:                     "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
                   1781:                     pid, lid, error, wl->wl_bufcount,
                   1782:                     wl->wl_bufbytes, wl->wl_bcount,
                   1783:                     wl->wl_dealloccnt, wl->wl_inohashcnt,
                   1784:                     wl->wl_error_count, wl->wl_reclaimable_bytes,
                   1785:                     wl->wl_reserved_bytes));
                   1786:                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                   1787:                        WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                   1788:                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                   1789:                             "error = %d\n", we->we_bufcount,
                   1790:                             we->we_reclaimable_bytes, we->we_error));
                   1791:                }
                   1792: #endif
                   1793:                mutex_exit(&wl->wl_mtx);
                   1794:        }
                   1795: #endif
                   1796:
                   1797:        rw_exit(&wl->wl_rwlock);
                   1798:        return error;
                   1799: }
                   1800:
                   1801: /****************************************************************/
                   1802:
                         /*
                          * wapbl_jlock_assert(wl)
                          *
                          *     Assert that wl->wl_rwlock is held, as reported by
                          *     rw_lock_held().
                          */
                   1803: void
                   1804: wapbl_jlock_assert(struct wapbl *wl)
                   1805: {
                   1806:
1.23      ad       1807:        KASSERT(rw_lock_held(&wl->wl_rwlock));
1.2       simonb   1808: }
                   1809:
                         /*
                          * wapbl_junlock_assert(wl)
                          *
                          *     Assert that wl->wl_rwlock is not write-held, as reported by
                          *     rw_write_held().  Only the write hold is checked here.
                          */
                   1810: void
                   1811: wapbl_junlock_assert(struct wapbl *wl)
                   1812: {
                   1813:
                   1814:        KASSERT(!rw_write_held(&wl->wl_rwlock));
                   1815: }
                   1816:
                   1817: /****************************************************************/
                   1818:
                   1819: /* locks missing */
                   1820: void
                   1821: wapbl_print(struct wapbl *wl,
                   1822:                int full,
                   1823:                void (*pr)(const char *, ...))
                   1824: {
                   1825:        struct buf *bp;
                   1826:        struct wapbl_entry *we;
                   1827:        (*pr)("wapbl %p", wl);
                   1828:        (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
                   1829:              wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
                   1830:        (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
                   1831:              wl->wl_circ_size, wl->wl_circ_off,
                   1832:              (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
                   1833:        (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
                   1834:              wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
                   1835: #ifdef WAPBL_DEBUG_BUFBYTES
                   1836:        (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
                   1837:              "reserved = %zu errcnt = %d unsynced = %zu\n",
                   1838:              wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
                   1839:              wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
                   1840:                                wl->wl_error_count, wl->wl_unsynced_bufbytes);
                   1841: #else
                   1842:        (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
                   1843:              "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
                   1844:              wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
                   1845:                                wl->wl_error_count);
                   1846: #endif
                   1847:        (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
                   1848:              wl->wl_dealloccnt, wl->wl_dealloclim);
                   1849:        (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
                   1850:              wl->wl_inohashcnt, wl->wl_inohashmask);
                   1851:        (*pr)("entries:\n");
                   1852:        SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                   1853: #ifdef WAPBL_DEBUG_BUFBYTES
                   1854:                (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
                   1855:                      "unsynced = %zu\n",
                   1856:                      we->we_bufcount, we->we_reclaimable_bytes,
                   1857:                      we->we_error, we->we_unsynced_bufbytes);
                   1858: #else
                   1859:                (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
                   1860:                      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
                   1861: #endif
                   1862:        }
                   1863:        if (full) {
                   1864:                int cnt = 0;
                   1865:                (*pr)("bufs =");
                   1866:                LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
                   1867:                        if (!LIST_NEXT(bp, b_wapbllist)) {
                   1868:                                (*pr)(" %p", bp);
                   1869:                        } else if ((++cnt % 6) == 0) {
                   1870:                                (*pr)(" %p,\n\t", bp);
                   1871:                        } else {
                   1872:                                (*pr)(" %p,", bp);
                   1873:                        }
                   1874:                }
                   1875:                (*pr)("\n");
                   1876:
                   1877:                (*pr)("dealloced blks = ");
                   1878:                {
1.78.2.1! pgoyette 1879:                        struct wapbl_dealloc *wd;
1.2       simonb   1880:                        cnt = 0;
1.78.2.1! pgoyette 1881:                        SIMPLEQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) {
1.2       simonb   1882:                                (*pr)(" %"PRId64":%d,",
1.78.2.1! pgoyette 1883:                                      wd->wd_blkno,
        !          1884:                                      wd->wd_len);
1.2       simonb   1885:                                if ((++cnt % 4) == 0) {
                   1886:                                        (*pr)("\n\t");
                   1887:                                }
                   1888:                        }
                   1889:                }
                   1890:                (*pr)("\n");
                   1891:
                   1892:                (*pr)("registered inodes = ");
                   1893:                {
                   1894:                        int i;
                   1895:                        cnt = 0;
                   1896:                        for (i = 0; i <= wl->wl_inohashmask; i++) {
                   1897:                                struct wapbl_ino_head *wih;
                   1898:                                struct wapbl_ino *wi;
                   1899:
                   1900:                                wih = &wl->wl_inohash[i];
                   1901:                                LIST_FOREACH(wi, wih, wi_hash) {
                   1902:                                        if (wi->wi_ino == 0)
                   1903:                                                continue;
1.55      christos 1904:                                        (*pr)(" %"PRIu64"/0%06"PRIo32",",
1.2       simonb   1905:                                            wi->wi_ino, wi->wi_mode);
                   1906:                                        if ((++cnt % 4) == 0) {
                   1907:                                                (*pr)("\n\t");
                   1908:                                        }
                   1909:                                }
                   1910:                        }
                   1911:                        (*pr)("\n");
                   1912:                }
                   1913:        }
                   1914: }
                   1915:
                   1916: #if defined(WAPBL_DEBUG) || defined(DDB)
                         /*
                          * wapbl_dump(wl)
                          *
                          *     Print the full state of wl with printf, via
                          *     wapbl_print(wl, 1, printf).  When built with WAPBL_DEBUG, a
                          *     null wl is replaced by wapbl_debug_wl.  If wl is still null,
                          *     nothing is printed.
                          */
                   1917: void
                   1918: wapbl_dump(struct wapbl *wl)
                   1919: {
                   1920: #if defined(WAPBL_DEBUG)
                   1921:        if (!wl)
                   1922:                wl = wapbl_debug_wl;
                   1923: #endif
                   1924:        if (!wl)
                   1925:                return;
                   1926:        wapbl_print(wl, 1, printf);
                   1927: }
                   1928: #endif
                   1929:
                   1930: /****************************************************************/
                   1931:
                         /*
                          * wapbl_register_deallocation(wl, blk, len, force)
                          *
                          *     Append a deallocation record (block number blk, length len)
                          *     to wl->wl_dealloclist and count it in wl->wl_dealloccnt.
                          *
                          *     If wl_dealloccnt has already reached wl_dealloclim, return
                          *     EAGAIN without registering anything, unless force is true;
                          *     in that case print a message and register the record
                          *     anyway.  Returns 0 when the record was registered.
                          *
                          *     => wl->wl_rwlock must be held (checked by
                          *     wapbl_jlock_assert).
                          */
1.78.2.1! pgoyette 1932: int
        !          1933: wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force)
1.2       simonb   1934: {
1.78.2.1! pgoyette 1935:        struct wapbl_dealloc *wd;
        !          1936:        int error = 0;
1.2       simonb   1937:
                   1938:        wapbl_jlock_assert(wl);
                   1939:
1.38      hannken  1940:        mutex_enter(&wl->wl_mtx);
1.27      pooka    1941:
1.78.2.1! pgoyette 1942:        if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) {
        !          1943:                if (!force) {
        !          1944:                        error = EAGAIN;
        !          1945:                        goto out;
        !          1946:                }
        !          1947:
        !          1948:                /*
        !          1949:                 * Forced registration can only be used when:
        !          1950:                 * 1) the caller can't cope with failure
        !          1951:                 * 2) the path can be triggered only bounded, small
        !          1952:                 *    times per transaction
        !          1953:                 * If this is not fulfilled, and the path would be triggered
        !          1954:                 * many times, this could overflow maximum transaction size
        !          1955:                 * and panic later.
        !          1956:                 */
        !          1957:                printf("%s: forced dealloc registration over limit: %d >= %d\n",
        !          1958:                        wl->wl_mount->mnt_stat.f_mntonname,
        !          1959:                        wl->wl_dealloccnt, wl->wl_dealloclim);
        !          1960:        }
        !          1961:
                                /*
                                 * The count is bumped first and wl_mtx is released around
                                 * the PR_WAITOK allocation (presumably because it may
                                 * sleep -- TODO confirm); the record is linked onto the
                                 * list only after wl_mtx is taken again.
                                 */
1.2       simonb   1962:        wl->wl_dealloccnt++;
1.38      hannken  1963:        mutex_exit(&wl->wl_mtx);
1.78.2.1! pgoyette 1964:
        !          1965:        wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK);
        !          1966:        wd->wd_blkno = blk;
        !          1967:        wd->wd_len = len;
        !          1968:
        !          1969:        mutex_enter(&wl->wl_mtx);
        !          1970:        SIMPLEQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries);
        !          1971:
                                /* Both the success path and the EAGAIN path hold wl_mtx here. */
        !          1972:  out:
        !          1973:        mutex_exit(&wl->wl_mtx);
        !          1974:
        !          1975:        WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
        !          1976:            ("wapbl_register_deallocation: blk=%"PRId64" len=%d error=%d\n",
        !          1977:            blk, len, error));
        !          1978:
        !          1979:        return error;
1.2       simonb   1980: }
                   1981:
                   1982: /****************************************************************/
                   1983:
                         /*
                          * wapbl_inodetrk_init(wl, size)
                          *
                          *     Create the inode tracking hash table of wl with hashinit,
                          *     storing the table in wl_inohash and its mask in
                          *     wl_inohashmask.  The wapbl_ino_pool is reference counted
                          *     through wapbl_ino_pool_refcount and is initialized only by
                          *     the caller that raises the count to 1.
                          */
                   1984: static void
                   1985: wapbl_inodetrk_init(struct wapbl *wl, u_int size)
                   1986: {
                   1987:
                   1988:        wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
                   1989:        if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
                   1990:                pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
                   1991:                    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
                   1992:        }
                   1993: }
                   1994:
                         /*
                          * wapbl_inodetrk_free(wl)
                          *
                          *     Release the inode tracking hash table of wl with hashdone
                          *     and drop one reference on wapbl_ino_pool, destroying the
                          *     pool when the count reaches 0.  Asserts that no inodes are
                          *     still registered (wl_inohashcnt == 0).
                          */
                   1995: static void
                   1996: wapbl_inodetrk_free(struct wapbl *wl)
                   1997: {
                   1998:
                   1999:        /* XXX this KASSERT needs locking/mutex analysis */
                   2000:        KASSERT(wl->wl_inohashcnt == 0);
                   2001:        hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
                   2002:        if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
                   2003:                pool_destroy(&wapbl_ino_pool);
                   2004:        }
                   2005: }
                   2006:
                         /*
                          * wapbl_inodetrk_get(wl, ino)
                          *
                          *     Look up ino in the inode tracking hash of wl.  Returns the
                          *     matching struct wapbl_ino, or a null pointer (written as 0)
                          *     if ino is not registered.
                          *
                          *     => Caller must hold wl->wl_mtx (asserted).
                          */
                   2007: static struct wapbl_ino *
                   2008: wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
                   2009: {
                   2010:        struct wapbl_ino_head *wih;
                   2011:        struct wapbl_ino *wi;
                   2012:
                   2013:        KASSERT(mutex_owned(&wl->wl_mtx));
                   2014:
                                /* Bucket index is the inode number masked by wl_inohashmask. */
                   2015:        wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
                   2016:        LIST_FOREACH(wi, wih, wi_hash) {
                   2017:                if (ino == wi->wi_ino)
                   2018:                        return wi;
                   2019:        }
                   2020:        return 0;
                   2021: }
                   2022:
                         /*
                          * wapbl_register_inode(wl, ino, mode)
                          *
                          *     Record ino, together with mode, in the inode tracking hash
                          *     of wl and increment wl_inohashcnt.  If ino is already
                          *     registered, nothing is changed.
                          */
                   2023: void
                   2024: wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
                   2025: {
                   2026:        struct wapbl_ino_head *wih;
                   2027:        struct wapbl_ino *wi;
                   2028:
                                /* Allocate before taking wl_mtx; freed below if not needed. */
                   2029:        wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
                   2030:
                   2031:        mutex_enter(&wl->wl_mtx);
                   2032:        if (wapbl_inodetrk_get(wl, ino) == NULL) {
                   2033:                wi->wi_ino = ino;
                   2034:                wi->wi_mode = mode;
                   2035:                wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
                   2036:                LIST_INSERT_HEAD(wih, wi, wi_hash);
                   2037:                wl->wl_inohashcnt++;
                   2038:                WAPBL_PRINTF(WAPBL_PRINT_INODE,
                   2039:                    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
                   2040:                mutex_exit(&wl->wl_mtx);
                   2041:        } else {
                                        /* Already registered: return the unused entry to the pool. */
                   2042:                mutex_exit(&wl->wl_mtx);
                   2043:                pool_put(&wapbl_ino_pool, wi);
                   2044:        }
                   2045: }
                   2046:
                         /*
                          * wapbl_unregister_inode(wl, ino, mode)
                          *
                          *     Remove ino from the inode tracking hash of wl, decrement
                          *     wl_inohashcnt and return the entry to wapbl_ino_pool.  Does
                          *     nothing if ino is not registered.  The mode argument is not
                          *     used.
                          */
                   2047: void
                   2048: wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
                   2049: {
                   2050:        struct wapbl_ino *wi;
                   2051:
                   2052:        mutex_enter(&wl->wl_mtx);
                   2053:        wi = wapbl_inodetrk_get(wl, ino);
                   2054:        if (wi) {
                   2055:                WAPBL_PRINTF(WAPBL_PRINT_INODE,
                   2056:                    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
                   2057:                KASSERT(wl->wl_inohashcnt > 0);
                   2058:                wl->wl_inohashcnt--;
                   2059:                LIST_REMOVE(wi, wi_hash);
                   2060:                mutex_exit(&wl->wl_mtx);
                   2061:
                                        /* The entry is freed only after wl_mtx has been released. */
                   2062:                pool_put(&wapbl_ino_pool, wi);
                   2063:        } else {
                   2064:                mutex_exit(&wl->wl_mtx);
                   2065:        }
                   2066: }
                   2067:
                   2068: /****************************************************************/
                   2069:
1.71      riastrad 2070: /*
                   2071:  * wapbl_transaction_inodes_len(wl)
                   2072:  *
                   2073:  *     Calculate the number of bytes required for inode registration
                   2074:  *     log records in wl.
                          *
                          *     The result is a whole number of log device blocks
                          *     (1 << wl_log_dev_bshift bytes each) and is never less than
                          *     one block, even when wl_inohashcnt is zero.
                   2075:  */
1.30      uebayasi 2076: static inline size_t
1.2       simonb   2077: wapbl_transaction_inodes_len(struct wapbl *wl)
                   2078: {
                   2079:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   2080:        int iph;
                   2081:
                   2082:        /*
                                 * Calculate number of inodes described in an inodelist header:
                                 * how many wc_inodes[] elements fit in one block after the
                                 * fixed leading part of struct wapbl_wc_inodelist.
                                 */
                   2083:        iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
                   2084:            sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
                   2085:
                   2086:        KASSERT(iph > 0);
                   2087:
1.39      christos 2088:        return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
1.2       simonb   2089: }
                   2090:
                   2091:
1.71      riastrad 2092: /*
                   2093:  * wapbl_transaction_len(wl)
                   2094:  *
                   2095:  *     Calculate number of bytes required for all log records in wl.
                          *
                          *     This is the sum of wl_bcount, one log block for every
                          *     wl_brperjblock buffers (wl_bufcount, rounded up), one log
                          *     block for every wl_brperjblock deallocations
                          *     (wl_dealloccnt, rounded up), and the result of
                          *     wapbl_transaction_inodes_len.
                   2096:  */
1.2       simonb   2097: static size_t
                   2098: wapbl_transaction_len(struct wapbl *wl)
                   2099: {
                   2100:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   2101:        size_t len;
                   2102:
                   2103:        /*
                                 * Start from wl_bcount, then add the log blocks needed to
                                 * describe the buffers and the deallocations, at
                                 * wl_brperjblock records per block.
                                 */
                   2104:        len = wl->wl_bcount;
1.78.2.1! pgoyette 2105:        len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen;
        !          2106:        len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen;
1.2       simonb   2107:        len += wapbl_transaction_inodes_len(wl);
                   2108:
                   2109:        return len;
                   2110: }
                   2111:
                   2112: /*
1.71      riastrad 2113:  * wapbl_cache_sync(wl, msg)
                   2114:  *
                   2115:  *     Issue DIOCCACHESYNC to wl->wl_devvp.
                   2116:  *
                   2117:  *     If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
                   2118:  *     including msg about the duration of the cache sync.
                          *
                          *     Does nothing and returns 0 when wapbl_flush_disk_cache is
                          *     zero; otherwise returns whatever VOP_IOCTL returned.
1.48      yamt     2119:  */
                   2120: static int
                   2121: wapbl_cache_sync(struct wapbl *wl, const char *msg)
                   2122: {
                   2123:        const bool verbose = wapbl_verbose_commit >= 2;
                   2124:        struct bintime start_time;
                   2125:        int force = 1;
                   2126:        int error;
                   2127:
                   2128:        if (!wapbl_flush_disk_cache) {
                   2129:                return 0;
                   2130:        }
                                /* start_time is set and read only on the verbose path. */
                   2131:        if (verbose) {
                   2132:                bintime(&start_time);
                   2133:        }
                   2134:        error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
                   2135:            FWRITE, FSCRED);
                   2136:        if (error) {
                   2137:                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1.76      riastrad 2138:                    ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
                   2139:                    "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
1.48      yamt     2140:        }
                                /* The elapsed time is reported whether or not the ioctl failed. */
                   2141:        if (verbose) {
                   2142:                struct bintime d;
                   2143:                struct timespec ts;
                   2144:
                   2145:                bintime(&d);
                   2146:                bintime_sub(&d, &start_time);
                   2147:                bintime2timespec(&d, &ts);
                   2148:                printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
                   2149:                    msg, (uintmax_t)wl->wl_devvp->v_rdev,
                   2150:                    (uintmax_t)ts.tv_sec, ts.tv_nsec);
                   2151:        }
                   2152:        return error;
                   2153: }
                   2154:
                   2155: /*
1.71      riastrad 2156:  * wapbl_write_commit(wl, head, tail)
                   2157:  *
                   2158:  *     Issue a disk cache sync to wait for all pending writes to the
                   2159:  *     log to complete, and then synchronously commit the current
                   2160:  *     circular queue head and tail to the log, in the next of two
                   2161:  *     locations for commit headers on disk.
1.2       simonb   2162:  *
1.71      riastrad 2163:  *     Increment the generation number.  If the generation number
                   2164:  *     rolls over to zero, then a subsequent commit would appear to
                   2165:  *     have an older generation than this one -- in that case, issue a
                   2166:  *     duplicate commit to avoid this.
                   2167:  *
                   2168:  *     => Caller must have exclusive access to wl, either by holding
                   2169:  *     wl->wl_rwlock for writer or by being wapbl_start before anyone
                   2170:  *     else has seen wl.
1.2       simonb   2171:  */
                   2172: static int
                   2173: wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
                   2174: {
                   2175:        struct wapbl_wc_header *wc = wl->wl_wc_header;
                   2176:        struct timespec ts;
                   2177:        int error;
1.34      mlelstv  2178:        daddr_t pbn;
1.2       simonb   2179:
1.54      hannken  2180:        error = wapbl_buffered_flush(wl);
                   2181:        if (error)
                   2182:                return error;
1.49      yamt     2183:        /*
                   2184:         * flush disk cache to ensure that blocks we've written are actually
                   2185:         * written to the stable storage before the commit header.
                   2186:         *
                   2187:         * XXX Calc checksum here, instead we do this for now
                   2188:         */
1.48      yamt     2189:        wapbl_cache_sync(wl, "1");
1.2       simonb   2190:
                   2191:        wc->wc_head = head;
                   2192:        wc->wc_tail = tail;
                   2193:        wc->wc_checksum = 0;
                   2194:        wc->wc_version = 1;
                   2195:        getnanotime(&ts);
1.17      yamt     2196:        wc->wc_time = ts.tv_sec;
1.2       simonb   2197:        wc->wc_timensec = ts.tv_nsec;
                   2198:
                   2199:        WAPBL_PRINTF(WAPBL_PRINT_WRITE,
                   2200:            ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
                   2201:            (intmax_t)head, (intmax_t)tail));
                   2202:
                   2203:        /*
1.49      yamt     2204:         * write the commit header.
                   2205:         *
1.2       simonb   2206:         * XXX if generation will rollover, then first zero
                   2207:         * over second commit header before trying to write both headers.
                   2208:         */
                   2209:
1.34      mlelstv  2210:        pbn = wl->wl_logpbn + (wc->wc_generation % 2);
                   2211: #ifdef _KERNEL
                   2212:        pbn = btodb(pbn << wc->wc_log_dev_bshift);
                   2213: #endif
1.54      hannken  2214:        error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
                   2215:        if (error)
                   2216:                return error;
                   2217:        error = wapbl_buffered_flush(wl);
1.2       simonb   2218:        if (error)
                   2219:                return error;
                   2220:
1.49      yamt     2221:        /*
                   2222:         * flush disk cache to ensure that the commit header is actually
                   2223:         * written before meta data blocks.
                   2224:         */
1.48      yamt     2225:        wapbl_cache_sync(wl, "2");
1.2       simonb   2226:
                   2227:        /*
                   2228:         * If the generation number was zero, write it out a second time.
                   2229:         * This handles initialization and generation number rollover
                   2230:         */
                   2231:        if (wc->wc_generation++ == 0) {
                   2232:                error = wapbl_write_commit(wl, head, tail);
                   2233:                /*
                   2234:                 * This panic should be able to be removed if we do the
                   2235:                 * zero'ing mentioned above, and we are certain to roll
                   2236:                 * back generation number on failure.
                   2237:                 */
                   2238:                if (error)
                   2239:                        panic("wapbl_write_commit: error writing duplicate "
1.66      riastrad 2240:                              "log header: %d", error);
1.2       simonb   2241:        }
                   2242:        return 0;
                   2243: }
                   2244:
1.71      riastrad 2245: /*
                   2246:  * wapbl_write_blocks(wl, offp)
                   2247:  *
                   2248:  *     Write all pending physical blocks in the current transaction
                   2249:  *     from wapbl_add_buf to the log on disk, adding to the circular
                   2250:  *     queue head at byte offset *offp, and returning the new head's
                   2251:  *     byte offset in *offp.
                   2252:  */
1.2       simonb   2253: static int
                   2254: wapbl_write_blocks(struct wapbl *wl, off_t *offp)
                   2255: {
                   2256:        struct wapbl_wc_blocklist *wc =
                   2257:            (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
                   2258:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   2259:        struct buf *bp;
                   2260:        off_t off = *offp;
                   2261:        int error;
1.7       joerg    2262:        size_t padding;
1.2       simonb   2263:
                   2264:        KASSERT(rw_write_held(&wl->wl_rwlock));
                   2265:
                   2266:        bp = LIST_FIRST(&wl->wl_bufs);
                   2267:
                   2268:        while (bp) {
                   2269:                int cnt;
                   2270:                struct buf *obp = bp;
                   2271:
                   2272:                KASSERT(bp->b_flags & B_LOCKED);
                   2273:
                   2274:                wc->wc_type = WAPBL_WC_BLOCKS;
                   2275:                wc->wc_len = blocklen;
                   2276:                wc->wc_blkcount = 0;
1.78.2.1! pgoyette 2277:                while (bp && (wc->wc_blkcount < wl->wl_brperjblock)) {
1.2       simonb   2278:                        /*
                   2279:                         * Make sure all the physical block numbers are up to
                   2280:                         * date.  If this is not always true on a given
                   2281:                         * filesystem, then VOP_BMAP must be called.  We
                   2282:                         * could call VOP_BMAP here, or else in the filesystem
                   2283:                         * specific flush callback, although neither of those
                   2284:                         * solutions allow us to take the vnode lock.  If a
                   2285:                         * filesystem requires that we must take the vnode lock
                   2286:                         * to call VOP_BMAP, then we can probably do it in
                   2287:                         * bwrite when the vnode lock should already be held
                   2288:                         * by the invoking code.
                   2289:                         */
                   2290:                        KASSERT((bp->b_vp->v_type == VBLK) ||
                   2291:                                 (bp->b_blkno != bp->b_lblkno));
                   2292:                        KASSERT(bp->b_blkno > 0);
                   2293:
                   2294:                        wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
                   2295:                        wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
                   2296:                        wc->wc_len += bp->b_bcount;
                   2297:                        wc->wc_blkcount++;
                   2298:                        bp = LIST_NEXT(bp, b_wapbllist);
                   2299:                }
1.7       joerg    2300:                if (wc->wc_len % blocklen != 0) {
                   2301:                        padding = blocklen - wc->wc_len % blocklen;
                   2302:                        wc->wc_len += padding;
                   2303:                } else {
                   2304:                        padding = 0;
                   2305:                }
                   2306:
1.2       simonb   2307:                WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1.7       joerg    2308:                    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
                   2309:                    wc->wc_len, padding, (intmax_t)off));
1.2       simonb   2310:
                   2311:                error = wapbl_circ_write(wl, wc, blocklen, &off);
                   2312:                if (error)
                   2313:                        return error;
                   2314:                bp = obp;
                   2315:                cnt = 0;
1.78.2.1! pgoyette 2316:                while (bp && (cnt++ < wl->wl_brperjblock)) {
1.2       simonb   2317:                        error = wapbl_circ_write(wl, bp->b_data,
                   2318:                            bp->b_bcount, &off);
                   2319:                        if (error)
                   2320:                                return error;
                   2321:                        bp = LIST_NEXT(bp, b_wapbllist);
                   2322:                }
1.7       joerg    2323:                if (padding) {
                   2324:                        void *zero;
                   2325:
1.51      para     2326:                        zero = wapbl_alloc(padding);
1.7       joerg    2327:                        memset(zero, 0, padding);
                   2328:                        error = wapbl_circ_write(wl, zero, padding, &off);
1.18      yamt     2329:                        wapbl_free(zero, padding);
1.7       joerg    2330:                        if (error)
                   2331:                                return error;
                   2332:                }
1.2       simonb   2333:        }
                   2334:        *offp = off;
                   2335:        return 0;
                   2336: }
                   2337:
1.71      riastrad 2338: /*
                   2339:  * wapbl_write_revocations(wl, offp)
                   2340:  *
                   2341:  *     Write all pending deallocations in the current transaction from
                   2342:  *     wapbl_register_deallocation to the log on disk, adding to the
                   2343:  *     circular queue's head at byte offset *offp, and returning the
                   2344:  *     new head's byte offset in *offp.
                   2345:  */
1.2       simonb   2346: static int
                   2347: wapbl_write_revocations(struct wapbl *wl, off_t *offp)
                   2348: {
                   2349:        struct wapbl_wc_blocklist *wc =
                   2350:            (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1.78.2.1! pgoyette 2351:        struct wapbl_dealloc *wd, *lwd;
1.2       simonb   2352:        int blocklen = 1<<wl->wl_log_dev_bshift;
                   2353:        off_t off = *offp;
                   2354:        int error;
                   2355:
                   2356:        if (wl->wl_dealloccnt == 0)
                   2357:                return 0;
                   2358:
1.78.2.1! pgoyette 2359:        while ((wd = SIMPLEQ_FIRST(&wl->wl_dealloclist)) != NULL) {
1.2       simonb   2360:                wc->wc_type = WAPBL_WC_REVOCATIONS;
                   2361:                wc->wc_len = blocklen;
                   2362:                wc->wc_blkcount = 0;
1.78.2.1! pgoyette 2363:                while (wd && (wc->wc_blkcount < wl->wl_brperjblock)) {
1.2       simonb   2364:                        wc->wc_blocks[wc->wc_blkcount].wc_daddr =
1.78.2.1! pgoyette 2365:                            wd->wd_blkno;
1.2       simonb   2366:                        wc->wc_blocks[wc->wc_blkcount].wc_dlen =
1.78.2.1! pgoyette 2367:                            wd->wd_len;
1.2       simonb   2368:                        wc->wc_blkcount++;
1.78.2.1! pgoyette 2369:
        !          2370:                        wd = SIMPLEQ_NEXT(wd, wd_entries);
1.2       simonb   2371:                }
                   2372:                WAPBL_PRINTF(WAPBL_PRINT_WRITE,
                   2373:                    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
                   2374:                    wc->wc_len, (intmax_t)off));
                   2375:                error = wapbl_circ_write(wl, wc, blocklen, &off);
                   2376:                if (error)
                   2377:                        return error;
1.78.2.1! pgoyette 2378:
        !          2379:                /* free all successfully written deallocs */
        !          2380:                lwd = wd;
        !          2381:                while ((wd = SIMPLEQ_FIRST(&wl->wl_dealloclist)) != NULL) {
        !          2382:                        if (wd == lwd)
        !          2383:                                break;
        !          2384:                        SIMPLEQ_REMOVE_HEAD(&wl->wl_dealloclist, wd_entries);
        !          2385:                        pool_put(&wapbl_dealloc_pool, wd);
        !          2386:                        wl->wl_dealloccnt--;
        !          2387:                }
1.2       simonb   2388:        }
                   2389:        *offp = off;
                   2390:        return 0;
                   2391: }
                   2392:
1.71      riastrad 2393: /*
                   2394:  * wapbl_write_inodes(wl, offp)
                   2395:  *
                   2396:  *     Write all pending inode allocations in the current transaction
                   2397:  *     from wapbl_register_inode to the log on disk, adding to the
                   2398:  *     circular queue's head at byte offset *offp and returning the
                   2399:  *     new head's byte offset in *offp.
                   2400:  */
1.2       simonb   2401: static int
                   2402: wapbl_write_inodes(struct wapbl *wl, off_t *offp)
                   2403: {
                   2404:        struct wapbl_wc_inodelist *wc =
                   2405:            (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
                   2406:        int i;
1.14      joerg    2407:        int blocklen = 1 << wl->wl_log_dev_bshift;
1.2       simonb   2408:        off_t off = *offp;
                   2409:        int error;
                   2410:
                   2411:        struct wapbl_ino_head *wih;
                   2412:        struct wapbl_ino *wi;
                   2413:        int iph;
                   2414:
                   2415:        iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
                   2416:            sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
                   2417:
                   2418:        i = 0;
                   2419:        wih = &wl->wl_inohash[0];
                   2420:        wi = 0;
                   2421:        do {
                   2422:                wc->wc_type = WAPBL_WC_INODES;
                   2423:                wc->wc_len = blocklen;
                   2424:                wc->wc_inocnt = 0;
                   2425:                wc->wc_clear = (i == 0);
                   2426:                while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
                   2427:                        while (!wi) {
                   2428:                                KASSERT((wih - &wl->wl_inohash[0])
                   2429:                                    <= wl->wl_inohashmask);
                   2430:                                wi = LIST_FIRST(wih++);
                   2431:                        }
                   2432:                        wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
                   2433:                        wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
                   2434:                        wc->wc_inocnt++;
                   2435:                        i++;
                   2436:                        wi = LIST_NEXT(wi, wi_hash);
                   2437:                }
                   2438:                WAPBL_PRINTF(WAPBL_PRINT_WRITE,
                   2439:                    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
                   2440:                    wc->wc_len, (intmax_t)off));
                   2441:                error = wapbl_circ_write(wl, wc, blocklen, &off);
                   2442:                if (error)
                   2443:                        return error;
                   2444:        } while (i < wl->wl_inohashcnt);
                   2445:
                   2446:        *offp = off;
                   2447:        return 0;
                   2448: }
                   2449:
                   2450: #endif /* _KERNEL */
                   2451:
                   2452: /****************************************************************/
                   2453:
                   2454: struct wapbl_blk {
                   2455:        LIST_ENTRY(wapbl_blk) wb_hash;
                   2456:        daddr_t wb_blk;
                   2457:        off_t wb_off; /* Offset of this block in the log */
                   2458: };
                   2459: #define        WAPBL_BLKPOOL_MIN 83
                   2460:
                   2461: static void
                   2462: wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
                   2463: {
                   2464:        if (size < WAPBL_BLKPOOL_MIN)
                   2465:                size = WAPBL_BLKPOOL_MIN;
                   2466:        KASSERT(wr->wr_blkhash == 0);
                   2467: #ifdef _KERNEL
                   2468:        wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
                   2469: #else /* ! _KERNEL */
                   2470:        /* Manually implement hashinit */
                   2471:        {
1.25      lukem    2472:                unsigned long i, hashsize;
1.2       simonb   2473:                for (hashsize = 1; hashsize < size; hashsize <<= 1)
                   2474:                        continue;
1.51      para     2475:                wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
1.37      drochner 2476:                for (i = 0; i < hashsize; i++)
1.2       simonb   2477:                        LIST_INIT(&wr->wr_blkhash[i]);
                   2478:                wr->wr_blkhashmask = hashsize - 1;
                   2479:        }
                   2480: #endif /* ! _KERNEL */
                   2481: }
                   2482:
                   2483: static void
                   2484: wapbl_blkhash_free(struct wapbl_replay *wr)
                   2485: {
                   2486:        KASSERT(wr->wr_blkhashcnt == 0);
                   2487: #ifdef _KERNEL
                   2488:        hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
                   2489: #else /* ! _KERNEL */
1.18      yamt     2490:        wapbl_free(wr->wr_blkhash,
                   2491:            (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
1.2       simonb   2492: #endif /* ! _KERNEL */
                   2493: }
                   2494:
                   2495: static struct wapbl_blk *
                   2496: wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
                   2497: {
                   2498:        struct wapbl_blk_head *wbh;
                   2499:        struct wapbl_blk *wb;
                   2500:        wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
                   2501:        LIST_FOREACH(wb, wbh, wb_hash) {
                   2502:                if (blk == wb->wb_blk)
                   2503:                        return wb;
                   2504:        }
                   2505:        return 0;
                   2506: }
                   2507:
                   2508: static void
                   2509: wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
                   2510: {
                   2511:        struct wapbl_blk_head *wbh;
                   2512:        struct wapbl_blk *wb;
                   2513:        wb = wapbl_blkhash_get(wr, blk);
                   2514:        if (wb) {
                   2515:                KASSERT(wb->wb_blk == blk);
                   2516:                wb->wb_off = off;
                   2517:        } else {
1.51      para     2518:                wb = wapbl_alloc(sizeof(*wb));
1.2       simonb   2519:                wb->wb_blk = blk;
                   2520:                wb->wb_off = off;
                   2521:                wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
                   2522:                LIST_INSERT_HEAD(wbh, wb, wb_hash);
                   2523:                wr->wr_blkhashcnt++;
                   2524:        }
                   2525: }
                   2526:
                   2527: static void
                   2528: wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
                   2529: {
                   2530:        struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
                   2531:        if (wb) {
                   2532:                KASSERT(wr->wr_blkhashcnt > 0);
                   2533:                wr->wr_blkhashcnt--;
                   2534:                LIST_REMOVE(wb, wb_hash);
1.18      yamt     2535:                wapbl_free(wb, sizeof(*wb));
1.2       simonb   2536:        }
                   2537: }
                   2538:
                   2539: static void
                   2540: wapbl_blkhash_clear(struct wapbl_replay *wr)
                   2541: {
1.25      lukem    2542:        unsigned long i;
1.2       simonb   2543:        for (i = 0; i <= wr->wr_blkhashmask; i++) {
                   2544:                struct wapbl_blk *wb;
                   2545:
                   2546:                while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
                   2547:                        KASSERT(wr->wr_blkhashcnt > 0);
                   2548:                        wr->wr_blkhashcnt--;
                   2549:                        LIST_REMOVE(wb, wb_hash);
1.18      yamt     2550:                        wapbl_free(wb, sizeof(*wb));
1.2       simonb   2551:                }
                   2552:        }
                   2553:        KASSERT(wr->wr_blkhashcnt == 0);
                   2554: }
                   2555:
                   2556: /****************************************************************/
                   2557:
1.71      riastrad 2558: /*
                   2559:  * wapbl_circ_read(wr, data, len, offp)
                   2560:  *
                   2561:  *     Read len bytes into data from the circular queue of wr,
                   2562:  *     starting at the linear byte offset *offp, and returning the new
                   2563:  *     linear byte offset in *offp.
                   2564:  *
                   2565:  *     If the starting linear byte offset precedes wr->wr_circ_off,
                   2566:  *     the read instead begins at wr->wr_circ_off.  XXX WTF?  This
                   2567:  *     should be a KASSERT, not a conditional.
                   2568:  */
1.2       simonb   2569: static int
                   2570: wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
                   2571: {
                   2572:        size_t slen;
                   2573:        off_t off = *offp;
                   2574:        int error;
1.34      mlelstv  2575:        daddr_t pbn;
1.2       simonb   2576:
1.14      joerg    2577:        KASSERT(((len >> wr->wr_log_dev_bshift) <<
                   2578:            wr->wr_log_dev_bshift) == len);
1.34      mlelstv  2579:
1.14      joerg    2580:        if (off < wr->wr_circ_off)
                   2581:                off = wr->wr_circ_off;
                   2582:        slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2       simonb   2583:        if (slen < len) {
1.34      mlelstv  2584:                pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
                   2585: #ifdef _KERNEL
                   2586:                pbn = btodb(pbn << wr->wr_log_dev_bshift);
                   2587: #endif
                   2588:                error = wapbl_read(data, slen, wr->wr_devvp, pbn);
1.2       simonb   2589:                if (error)
                   2590:                        return error;
                   2591:                data = (uint8_t *)data + slen;
                   2592:                len -= slen;
1.14      joerg    2593:                off = wr->wr_circ_off;
1.2       simonb   2594:        }
1.34      mlelstv  2595:        pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
                   2596: #ifdef _KERNEL
                   2597:        pbn = btodb(pbn << wr->wr_log_dev_bshift);
                   2598: #endif
                   2599:        error = wapbl_read(data, len, wr->wr_devvp, pbn);
1.2       simonb   2600:        if (error)
                   2601:                return error;
                   2602:        off += len;
1.14      joerg    2603:        if (off >= wr->wr_circ_off + wr->wr_circ_size)
                   2604:                off = wr->wr_circ_off;
1.2       simonb   2605:        *offp = off;
                   2606:        return 0;
                   2607: }
                   2608:
1.71      riastrad 2609: /*
                   2610:  * wapbl_circ_advance(wr, len, offp)
                   2611:  *
                   2612:  *     Compute the linear byte offset of the circular queue of wr that
                   2613:  *     is len bytes past *offp, and store it in *offp.
                   2614:  *
                   2615:  *     This is as if wapbl_circ_read, but without actually reading
                   2616:  *     anything.
                   2617:  *
                   2618:  *     If the starting linear byte offset precedes wr->wr_circ_off, it
                   2619:  *     is taken to be wr->wr_circ_off instead.  XXX WTF?  This should
                   2620:  *     be a KASSERT, not a conditional.
                   2621:  */
1.2       simonb   2622: static void
                   2623: wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
                   2624: {
                   2625:        size_t slen;
                   2626:        off_t off = *offp;
                   2627:
1.14      joerg    2628:        KASSERT(((len >> wr->wr_log_dev_bshift) <<
                   2629:            wr->wr_log_dev_bshift) == len);
1.2       simonb   2630:
1.14      joerg    2631:        if (off < wr->wr_circ_off)
                   2632:                off = wr->wr_circ_off;
                   2633:        slen = wr->wr_circ_off + wr->wr_circ_size - off;
1.2       simonb   2634:        if (slen < len) {
                   2635:                len -= slen;
1.14      joerg    2636:                off = wr->wr_circ_off;
1.2       simonb   2637:        }
                   2638:        off += len;
1.14      joerg    2639:        if (off >= wr->wr_circ_off + wr->wr_circ_size)
                   2640:                off = wr->wr_circ_off;
1.2       simonb   2641:        *offp = off;
                   2642: }
                   2643:
                   2644: /****************************************************************/
                   2645:
                   2646: int
                   2647: wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
                   2648:        daddr_t off, size_t count, size_t blksize)
                   2649: {
                   2650:        struct wapbl_replay *wr;
                   2651:        int error;
                   2652:        struct vnode *devvp;
                   2653:        daddr_t logpbn;
                   2654:        uint8_t *scratch;
                   2655:        struct wapbl_wc_header *wch;
                   2656:        struct wapbl_wc_header *wch2;
                   2657:        /* Use this until we read the actual log header */
1.31      mlelstv  2658:        int log_dev_bshift = ilog2(blksize);
1.2       simonb   2659:        size_t used;
1.34      mlelstv  2660:        daddr_t pbn;
1.2       simonb   2661:
                   2662:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
                   2663:            ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
                   2664:            vp, off, count, blksize));
                   2665:
                   2666:        if (off < 0)
                   2667:                return EINVAL;
                   2668:
                   2669:        if (blksize < DEV_BSIZE)
                   2670:                return EINVAL;
                   2671:        if (blksize % DEV_BSIZE)
                   2672:                return EINVAL;
                   2673:
                   2674: #ifdef _KERNEL
                   2675: #if 0
                   2676:        /* XXX vp->v_size isn't reliably set for VBLK devices,
                   2677:         * especially root.  However, we might still want to verify
                   2678:         * that the full load is readable */
                   2679:        if ((off + count) * blksize > vp->v_size)
                   2680:                return EINVAL;
                   2681: #endif
                   2682:        if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
                   2683:                return error;
                   2684:        }
                   2685: #else /* ! _KERNEL */
                   2686:        devvp = vp;
                   2687:        logpbn = off;
                   2688: #endif /* ! _KERNEL */
                   2689:
1.51      para     2690:        scratch = wapbl_alloc(MAXBSIZE);
1.2       simonb   2691:
1.34      mlelstv  2692:        pbn = logpbn;
                   2693: #ifdef _KERNEL
                   2694:        pbn = btodb(pbn << log_dev_bshift);
                   2695: #endif
                   2696:        error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
1.2       simonb   2697:        if (error)
                   2698:                goto errout;
                   2699:
                   2700:        wch = (struct wapbl_wc_header *)scratch;
                   2701:        wch2 =
                   2702:            (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
                   2703:        /* XXX verify checksums and magic numbers */
                   2704:        if (wch->wc_type != WAPBL_WC_HEADER) {
                   2705:                printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
                   2706:                error = EFTYPE;
                   2707:                goto errout;
                   2708:        }
                   2709:
                   2710:        if (wch2->wc_generation > wch->wc_generation)
                   2711:                wch = wch2;
                   2712:
                   2713:        wr = wapbl_calloc(1, sizeof(*wr));
                   2714:
                   2715:        wr->wr_logvp = vp;
                   2716:        wr->wr_devvp = devvp;
                   2717:        wr->wr_logpbn = logpbn;
                   2718:
                   2719:        wr->wr_scratch = scratch;
                   2720:
1.14      joerg    2721:        wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
                   2722:        wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
                   2723:        wr->wr_circ_off = wch->wc_circ_off;
                   2724:        wr->wr_circ_size = wch->wc_circ_size;
                   2725:        wr->wr_generation = wch->wc_generation;
1.2       simonb   2726:
                   2727:        used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
                   2728:
                   2729:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
                   2730:            ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
                   2731:            " len=%"PRId64" used=%zu\n",
                   2732:            wch->wc_head, wch->wc_tail, wch->wc_circ_off,
                   2733:            wch->wc_circ_size, used));
                   2734:
                   2735:        wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
1.11      joerg    2736:
1.14      joerg    2737:        error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
1.2       simonb   2738:        if (error) {
                   2739:                wapbl_replay_stop(wr);
                   2740:                wapbl_replay_free(wr);
                   2741:                return error;
                   2742:        }
                   2743:
                   2744:        *wrp = wr;
                   2745:        return 0;
                   2746:
                   2747:  errout:
1.18      yamt     2748:        wapbl_free(scratch, MAXBSIZE);
1.2       simonb   2749:        return error;
                   2750: }
                   2751:
                   2752: void
                   2753: wapbl_replay_stop(struct wapbl_replay *wr)
                   2754: {
                                /*
                                 * Release the resources held by an open replay context: the
                                 * MAXBSIZE scratch buffer is freed, wr_logvp is set to NULL and
                                 * the block hash table is cleared and freed.  A context that is
                                 * not open is left untouched.  The wr structure itself is not
                                 * freed here; wapbl_replay_free() does that.
                                 */
                   2755:
1.4       joerg    2756:        if (!wapbl_replay_isopen(wr))
                   2757:                return;
                   2758:
1.2       simonb   2759:        WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
                   2760:
1.18      yamt     2761:        wapbl_free(wr->wr_scratch, MAXBSIZE);
                   2762:        wr->wr_scratch = NULL;
1.2       simonb   2763:
1.18      yamt     2764:        wr->wr_logvp = NULL;
1.2       simonb   2765:
                   2766:        wapbl_blkhash_clear(wr);
                   2767:        wapbl_blkhash_free(wr);
                   2768: }
                   2769:
                   2770: void
                   2771: wapbl_replay_free(struct wapbl_replay *wr)
                   2772: {
                                /*
                                 * Free a replay context: the accumulated inode array (if any)
                                 * and the wr structure itself.  The context must already have
                                 * been stopped; this is checked with KDASSERT.
                                 */
                   2773:
                   2774:        KDASSERT(!wapbl_replay_isopen(wr));
                   2775:
                   2776:        if (wr->wr_inodes)
1.18      yamt     2777:                wapbl_free(wr->wr_inodes,
                   2778:                    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
                   2779:        wapbl_free(wr, sizeof(*wr));
1.2       simonb   2780: }
                   2781:
1.4       joerg    2782: #ifdef _KERNEL
1.2       simonb   2783: int
                   2784: wapbl_replay_isopen1(struct wapbl_replay *wr)
                   2785: {
                                /*
                                 * Kernel-only function form of wapbl_replay_isopen(); returns
                                 * whatever wapbl_replay_isopen(wr) evaluates to.
                                 * NOTE(review): presumably wapbl_replay_isopen is a macro and
                                 * this exists for callers that need a real function -- confirm.
                                 */
                   2786:
                   2787:        return wapbl_replay_isopen(wr);
                   2788: }
1.4       joerg    2789: #endif
1.2       simonb   2790:
1.62      mlelstv  2791: /*
                   2792:  * Calculate the disk address of the i'th entry in the blocklist
                   2793:  * record wc, offset by j blocks of size blen.
                   2794:  *
                   2795:  * wc_daddr is always a kernel disk address in DEV_BSIZE units that
                   2796:  * was written to the journal.
                   2797:  *
                   2798:  * The kernel needs that address plus the offset in DEV_BSIZE units.
                   2799:  *
                   2800:  * Userland needs that address plus the offset in blen units.
                   2801:  *
                   2802:  */
                   2803: static daddr_t
                   2804: wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
                   2805: {
                   2806:        daddr_t pbn;
                   2807:
                   2808: #ifdef _KERNEL
                                /* Kernel: keep DEV_BSIZE units and add the converted offset. */
                   2809:        pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
                   2810: #else
                                /* Userland: rescale the address to blen units, then add j. */
                   2811:        pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
                   2812: #endif
                   2813:
                   2814:        return pbn;
                   2815: }
                   2816:
1.10      joerg    2817: static void
                   2818: wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
                   2819: {
                                /*
                                 * Handle a blocklist record held in wr_scratch.  Every entry is
                                 * split into file system blocks of (1 << wr_fs_dev_bshift)
                                 * bytes; each one is entered into the block hash together with
                                 * the current log offset *offp, and *offp is then advanced by
                                 * one such block.  On return *offp has therefore been moved
                                 * forward once for every block of the record.
                                 */
                   2820:        struct wapbl_wc_blocklist *wc =
                   2821:            (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.14      joerg    2822:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10      joerg    2823:        int i, j, n;
                   2824:
                   2825:        for (i = 0; i < wc->wc_blkcount; i++) {
                   2826:                /*
                   2827:                 * Enter each physical block into the hashtable independently.
                   2828:                 */
1.14      joerg    2829:                n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10      joerg    2830:                for (j = 0; j < n; j++) {
1.62      mlelstv  2831:                        wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen),
1.10      joerg    2832:                            *offp);
                   2833:                        wapbl_circ_advance(wr, fsblklen, offp);
                   2834:                }
                   2835:        }
                   2836: }
                   2837:
                   2838: static void
                   2839: wapbl_replay_process_revocations(struct wapbl_replay *wr)
                   2840: {
                                /*
                                 * Handle a revocation record held in wr_scratch.  It is read
                                 * through the same struct wapbl_wc_blocklist layout as a block
                                 * record; every file system block covered by an entry is
                                 * removed from the block hash.
                                 */
                   2841:        struct wapbl_wc_blocklist *wc =
                   2842:            (struct wapbl_wc_blocklist *)wr->wr_scratch;
1.34      mlelstv  2843:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.10      joerg    2844:        int i, j, n;
                   2845:
                   2846:        for (i = 0; i < wc->wc_blkcount; i++) {
                   2847:                /*
                   2848:                 * Remove any blocks found from the hashtable.
                   2849:                 */
1.14      joerg    2850:                n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
1.10      joerg    2851:                for (j = 0; j < n; j++)
1.62      mlelstv  2852:                        wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen));
1.10      joerg    2853:        }
                   2854: }
                   2855:
                   2856: static void
                   2857: wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
                   2858: {
                                /*
                                 * Handle an inode list record held in wr_scratch.  The record's
                                 * inode entries are appended to wr_inodes; if the record has
                                 * wc_clear set, the list accumulated so far is discarded first
                                 * and wr_inodestail is set to oldoff.  wr_inodeshead is always
                                 * set to newoff.
                                 */
                   2859:        struct wapbl_wc_inodelist *wc =
                   2860:            (struct wapbl_wc_inodelist *)wr->wr_scratch;
1.18      yamt     2861:        void *new_inodes;
                                /*
                                 * Size in bytes of the existing array, captured before
                                 * wr_inodescnt is modified below; used for the frees and for
                                 * copying the old entries.
                                 */
                   2862:        const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
                   2863:
                   2864:        KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
                   2865:
1.10      joerg    2866:        /*
                   2867:         * Keep track of where we found this so location won't be
                   2868:         * overwritten.
                   2869:         */
                   2870:        if (wc->wc_clear) {
                   2871:                wr->wr_inodestail = oldoff;
                   2872:                wr->wr_inodescnt = 0;
1.12      joerg    2873:                if (wr->wr_inodes != NULL) {
1.18      yamt     2874:                        wapbl_free(wr->wr_inodes, oldsize);
1.12      joerg    2875:                        wr->wr_inodes = NULL;
                   2876:                }
1.10      joerg    2877:        }
                   2878:        wr->wr_inodeshead = newoff;
                   2879:        if (wc->wc_inocnt == 0)
                   2880:                return;
                   2881:
                                /*
                                 * Grow the array by allocating a new one, copying the old
                                 * entries over and releasing the old array.
                                 * NOTE(review): the result of wapbl_alloc() is used without a
                                 * NULL check; presumably it cannot fail -- confirm.
                                 */
1.51      para     2882:        new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
1.18      yamt     2883:            sizeof(wr->wr_inodes[0]));
                   2884:        if (wr->wr_inodes != NULL) {
                   2885:                memcpy(new_inodes, wr->wr_inodes, oldsize);
                   2886:                wapbl_free(wr->wr_inodes, oldsize);
                   2887:        }
                   2888:        wr->wr_inodes = new_inodes;
1.10      joerg    2889:        memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
1.18      yamt     2890:            wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
1.10      joerg    2891:        wr->wr_inodescnt += wc->wc_inocnt;
                   2892: }
                   2893:
1.2       simonb   2894: static int
1.14      joerg    2895: wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
1.2       simonb   2896: {
                                /*
                                 * Walk the log records from tail to head and build the replay
                                 * state.  For every record one log block (1 << wr_log_dev_bshift
                                 * bytes) is read into wr_scratch and dispatched on its wc_type.
                                 *
                                 * Returns 0 on success, the error from wapbl_circ_read(), or
                                 * EFTYPE for an unknown record type or a record whose wc_len
                                 * does not match the amount of log consumed.  On any error the
                                 * block hash is cleared again.
                                 */
                   2897:        off_t off;
                   2898:        int error;
                   2899:
1.14      joerg    2900:        int logblklen = 1 << wr->wr_log_dev_bshift;
1.2       simonb   2901:
                   2902:        wapbl_blkhash_clear(wr);
                   2903:
1.14      joerg    2904:        off = tail;
                   2905:        while (off != head) {
1.2       simonb   2906:                struct wapbl_wc_null *wcn;
                                        /* Start of this record, kept for the length check below. */
                   2907:                off_t saveoff = off;
                   2908:                error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
                   2909:                if (error)
                   2910:                        goto errout;
                   2911:                wcn = (struct wapbl_wc_null *)wr->wr_scratch;
                   2912:                switch (wcn->wc_type) {
                   2913:                case WAPBL_WC_BLOCKS:
1.10      joerg    2914:                        wapbl_replay_process_blocks(wr, &off);
1.2       simonb   2915:                        break;
                   2916:
                   2917:                case WAPBL_WC_REVOCATIONS:
1.10      joerg    2918:                        wapbl_replay_process_revocations(wr);
1.2       simonb   2919:                        break;
                   2920:
                   2921:                case WAPBL_WC_INODES:
1.10      joerg    2922:                        wapbl_replay_process_inodes(wr, saveoff, off);
1.2       simonb   2923:                        break;
1.10      joerg    2924:
1.2       simonb   2925:                default:
                   2926:                        printf("Unrecognized wapbl type: 0x%08x\n",
                   2927:                               wcn->wc_type);
                   2928:                        error = EFTYPE;
                   2929:                        goto errout;
                   2930:                }
                                        /*
                                         * The offset reached by processing the record must equal
                                         * the record start advanced by the length stored in the
                                         * record itself.
                                         */
                   2931:                wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
                   2932:                if (off != saveoff) {
                   2933:                        printf("wapbl_replay: corrupted records\n");
                   2934:                        error = EFTYPE;
                   2935:                        goto errout;
                   2936:                }
                   2937:        }
                   2938:        return 0;
                   2939:
                   2940:  errout:
                   2941:        wapbl_blkhash_clear(wr);
                   2942:        return error;
                   2943: }
                   2944:
1.13      joerg    2945: #if 0
1.2       simonb   2946: int
                   2947: wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
                   2948: {
                                /*
                                 * Disabled code (inside #if 0).  Walks the log records and, for
                                 * every block whose hash entry points at the current log offset,
                                 * compares the copy in the log with the copy read from fsdevvp.
                                 * Mismatches are printed and counted.
                                 *
                                 * Returns the first I/O error, EFTYPE if there was no error but
                                 * at least one mismatch, and 0 otherwise.
                                 *
                                 * NOTE(review): wch is used below but is not declared in this
                                 * function; unless it is a file-scope name not visible here,
                                 * this code would not compile if re-enabled -- confirm.
                                 */
                   2949:        off_t off;
                   2950:        int mismatchcnt = 0;
1.14      joerg    2951:        int logblklen = 1 << wr->wr_log_dev_bshift;
                   2952:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.51      para     2953:        void *scratch1 = wapbl_alloc(MAXBSIZE);
                   2954:        void *scratch2 = wapbl_alloc(MAXBSIZE);
1.2       simonb   2955:        int error = 0;
                   2956:
                   2957:        KDASSERT(wapbl_replay_isopen(wr));
                   2958:
                   2959:        off = wch->wc_tail;
                   2960:        while (off != wch->wc_head) {
                   2961:                struct wapbl_wc_null *wcn;
                   2962: #ifdef DEBUG
                   2963:                off_t saveoff = off;
                   2964: #endif
                   2965:                error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
                   2966:                if (error)
                   2967:                        goto out;
                   2968:                wcn = (struct wapbl_wc_null *)wr->wr_scratch;
                   2969:                switch (wcn->wc_type) {
                   2970:                case WAPBL_WC_BLOCKS:
                   2971:                        {
                   2972:                                struct wapbl_wc_blocklist *wc =
                   2973:                                    (struct wapbl_wc_blocklist *)wr->wr_scratch;
                   2974:                                int i;
                   2975:                                for (i = 0; i < wc->wc_blkcount; i++) {
                   2976:                                        int foundcnt = 0;
                   2977:                                        int dirtycnt = 0;
                   2978:                                        int j, n;
                   2979:                                        /*
                   2980:                                         * Check each physical block against the
                   2981:                                         * hashtable independently
                   2982:                                         */
                   2983:                                        n = wc->wc_blocks[i].wc_dlen >>
                   2984:                                            wch->wc_fs_dev_bshift;
                   2985:                                        for (j = 0; j < n; j++) {
                   2986:                                                struct wapbl_blk *wb =
                   2987:                                                   wapbl_blkhash_get(wr,
1.62      mlelstv  2988:                                                   wapbl_block_daddr(wc, i, j, fsblklen));
1.2       simonb   2989:                                                if (wb && (wb->wb_off == off)) {
                   2990:                                                        foundcnt++;
                   2991:                                                        error =
                   2992:                                                            wapbl_circ_read(wr,
                   2993:                                                            scratch1, fsblklen,
                   2994:                                                            &off);
                   2995:                                                        if (error)
                   2996:                                                                goto out;
                   2997:                                                        error =
                   2998:                                                            wapbl_read(scratch2,
                   2999:                                                            fsblklen, fsdevvp,
                   3000:                                                            wb->wb_blk);
                   3001:                                                        if (error)
                   3002:                                                                goto out;
                   3003:                                                        if (memcmp(scratch1,
                   3004:                                                                   scratch2,
                   3005:                                                                   fsblklen)) {
                   3006:                                                                printf(
                   3007:                "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
                   3008:                wb->wb_blk, (intmax_t)off);
                   3009:                                                                dirtycnt++;
                   3010:                                                                mismatchcnt++;
                   3011:                                                        }
                   3012:                                                } else {
                   3013:                                                        wapbl_circ_advance(wr,
                   3014:                                                            fsblklen, &off);
                   3015:                                                }
                   3016:                                        }
                   3017: #if 0
                   3018:                                        /*
                   3019:                                         * If all of the blocks in an entry
                   3020:                                         * are clean, then remove all of its
                   3021:                                         * blocks from the hashtable since they
                   3022:                                         * never will need replay.
                   3023:                                         */
                   3024:                                        if ((foundcnt != 0) &&
                   3025:                                            (dirtycnt == 0)) {
                   3026:                                                off = saveoff;
                   3027:                                                wapbl_circ_advance(wr,
                   3028:                                                    logblklen, &off);
                   3029:                                                for (j = 0; j < n; j++) {
                   3030:                                                        struct wapbl_blk *wb =
                   3031:                                                           wapbl_blkhash_get(wr,
1.62      mlelstv  3032:                                                           wapbl_block_daddr(wc, i, j, fsblklen));
1.2       simonb   3033:                                                        if (wb &&
                   3034:                                                          (wb->wb_off == off)) {
                   3035:                                                                wapbl_blkhash_rem(wr, wb->wb_blk);
                   3036:                                                        }
                   3037:                                                        wapbl_circ_advance(wr,
                   3038:                                                            fsblklen, &off);
                   3039:                                                }
                   3040:                                        }
                   3041: #endif
                   3042:                                }
                   3043:                        }
                   3044:                        break;
                   3045:                case WAPBL_WC_REVOCATIONS:
                   3046:                case WAPBL_WC_INODES:
                   3047:                        break;
                   3048:                default:
                   3049:                        KASSERT(0);
                   3050:                }
                   3051: #ifdef DEBUG
                   3052:                wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
                   3053:                KASSERT(off == saveoff);
                   3054: #endif
                   3055:        }
                   3056:  out:
1.18      yamt     3057:        wapbl_free(scratch1, MAXBSIZE);
                   3058:        wapbl_free(scratch2, MAXBSIZE);
1.2       simonb   3059:        if (!error && mismatchcnt)
                   3060:                error = EFTYPE;
                   3061:        return error;
                   3062: }
                   3063: #endif
                   3064:
                   3065: int
                   3066: wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
                   3067: {
                                /*
                                 * Write every block recorded in the replay block hash to the
                                 * file system device: the block's data is read from the log at
                                 * wb_off and written to fsdevvp at wb_blk.
                                 *
                                 * Returns 0 on success, or the error of the first read or write
                                 * that fails.  No further blocks are processed after a failure.
                                 */
1.9       joerg    3068:        struct wapbl_blk *wb;
                   3069:        size_t i;
1.2       simonb   3070:        off_t off;
1.9       joerg    3071:        void *scratch;
1.2       simonb   3072:        int error = 0;
1.14      joerg    3073:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2       simonb   3074:
                   3075:        KDASSERT(wapbl_replay_isopen(wr));
                   3076:
1.51      para     3077:        scratch = wapbl_alloc(MAXBSIZE);
1.2       simonb   3078:
                                /*
                                 * The breaks below only leave the LIST_FOREACH, so the bucket
                                 * loop has to test error as well.  Otherwise a later bucket
                                 * would keep writing and could overwrite the error with 0.
                                 */
                                for (i = 0; error == 0 && i <= wr->wr_blkhashmask; ++i) {
1.9       joerg    3080:                LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
                   3081:                        off = wb->wb_off;
                   3082:                        error = wapbl_circ_read(wr, scratch, fsblklen, &off);
                   3083:                        if (error)
                   3084:                                break;
                   3085:                        error = wapbl_write(scratch, fsblklen, fsdevvp,
                   3086:                            wb->wb_blk);
                   3087:                        if (error)
                   3088:                                break;
1.2       simonb   3089:                }
                   3090:        }
1.9       joerg    3091:
1.18      yamt     3092:        wapbl_free(scratch, MAXBSIZE);
1.2       simonb   3093:        return error;
                   3094: }
                   3095:
                   3096: int
1.6       joerg    3097: wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
                   3098: {
                                /*
                                 * Report whether any block of the range starting at blk and
                                 * covering len bytes is present in the replay block hash.
                                 * len must be a multiple of the file system block size
                                 * (1 << wr_fs_dev_bshift).
                                 *
                                 * Returns 1 if at least one block of the range is in the hash,
                                 * 0 otherwise.
                                 */
1.14      joerg    3099:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.6       joerg    3100:
                   3101:        KDASSERT(wapbl_replay_isopen(wr));
                   3102:        KASSERT((len % fsblklen) == 0);
                   3103:
                   3104:        while (len != 0) {
                   3105:                struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
                   3106:                if (wb)
                   3107:                        return 1;
                   3108:                len -= fsblklen;
                                        /*
                                         * Advance to the next block of the range, as
                                         * wapbl_replay_read() does; without this only the
                                         * first block would ever be looked up.
                                         */
                                        blk++;
                   3109:        }
                   3110:        return 0;
                   3111: }
                   3112:
                   3113: int
1.2       simonb   3114: wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
                   3115: {
                                /*
                                 * Overlay journalled data onto a buffer.  The range starting at
                                 * blk and covering len bytes is processed one file system block
                                 * (1 << wr_fs_dev_bshift bytes) at a time: a block found in the
                                 * replay block hash has its data read from the log at wb_off
                                 * into the matching part of data; parts of data whose block is
                                 * not in the hash are not written to.
                                 *
                                 * len must be a multiple of the file system block size.
                                 * Returns 0 on success or the error from wapbl_circ_read().
                                 */
1.14      joerg    3116:        int fsblklen = 1 << wr->wr_fs_dev_bshift;
1.2       simonb   3117:
                   3118:        KDASSERT(wapbl_replay_isopen(wr));
                   3119:
                   3120:        KASSERT((len % fsblklen) == 0);
                   3121:
                   3122:        while (len != 0) {
                   3123:                struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
                   3124:                if (wb) {
                   3125:                        off_t off = wb->wb_off;
                   3126:                        int error;
                   3127:                        error = wapbl_circ_read(wr, data, fsblklen, &off);
                   3128:                        if (error)
                   3129:                                return error;
                   3130:                }
                                        /* Step the buffer pointer, length and block number together. */
                   3131:                data = (uint8_t *)data + fsblklen;
                   3132:                len -= fsblklen;
                   3133:                blk++;
                   3134:        }
                   3135:        return 0;
                   3136: }
1.35      pooka    3137:
1.36      pooka    3138: #ifdef _KERNEL
1.64      pgoyette 3139:
1.35      pooka    3140: MODULE(MODULE_CLASS_VFS, wapbl, NULL);
                   3141:
                   3142: static int
                   3143: wapbl_modcmd(modcmd_t cmd, void *arg)
                   3144: {
                                /*
                                 * Module command handler for the wapbl module declared above.
                                 * MODULE_CMD_INIT calls wapbl_init() and returns 0,
                                 * MODULE_CMD_FINI returns the result of wapbl_fini(), and any
                                 * other command returns ENOTTY.  arg is not used.
                                 */
                   3145:
                   3146:        switch (cmd) {
                   3147:        case MODULE_CMD_INIT:
1.39      christos 3148:                wapbl_init();
1.35      pooka    3149:                return 0;
                   3150:        case MODULE_CMD_FINI:
1.74      riastrad 3151:                return wapbl_fini();
1.35      pooka    3152:        default:
                   3153:                return ENOTTY;
                   3154:        }
                   3155: }
1.36      pooka    3156: #endif /* _KERNEL */

CVSweb <webmaster@jp.NetBSD.org>