Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/ufs/ffs/ffs_wapbl.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/ufs/ffs/ffs_wapbl.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.12.2.1 retrieving revision 1.12.2.2 diff -u -p -r1.12.2.1 -r1.12.2.2 --- src/sys/ufs/ffs/ffs_wapbl.c 2009/02/22 20:28:06 1.12.2.1 +++ src/sys/ufs/ffs/ffs_wapbl.c 2009/05/04 08:14:38 1.12.2.2 @@ -0,0 +1,882 @@ +/* $NetBSD: ffs_wapbl.c,v 1.12.2.2 2009/05/04 08:14:38 yamt Exp $ */ + +/*- + * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_wapbl.c,v 1.12.2.2 2009/05/04 08:14:38 yamt Exp $"); + +#define WAPBL_INTERNAL + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#undef WAPBL_DEBUG +#ifdef WAPBL_DEBUG +int ffs_wapbl_debug = 1; +#define DPRINTF(fmt, args...) \ +do { \ + if (ffs_wapbl_debug) \ + printf("%s:%d "fmt, __func__ , __LINE__, ##args); \ +} while (/* CONSTCOND */0) +#else +#define DPRINTF(fmt, args...) \ +do { \ + /* nothing */ \ +} while (/* CONSTCOND */0) +#endif + +static int ffs_superblock_layout(struct fs *); +static int wapbl_log_position(struct mount *, struct fs *, struct vnode *, + daddr_t *, size_t *, size_t *, uint64_t *); +static int wapbl_create_infs_log(struct mount *, struct fs *, struct vnode *, + daddr_t *, size_t *, size_t *, uint64_t *); +static void wapbl_find_log_start(struct mount *, struct vnode *, off_t, + daddr_t *, daddr_t *, size_t *); +static int wapbl_remove_log(struct mount *); +static int wapbl_allocate_log_file(struct mount *, struct vnode *); + +/* + * Return the super block layout format - UFS1 or UFS2. + * WAPBL only works with UFS2 layout (which is still available + * with FFSv1). + * + * XXX Should this be in ufs/ffs/fs.h? Same style of check is + * also used in ffs_alloc.c in a few places. + */ +static int +ffs_superblock_layout(struct fs *fs) +{ + if ((fs->fs_magic == FS_UFS1_MAGIC) && + ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) + return 1; + else + return 2; +} + +/* + * This function is invoked after a log is replayed to + * disk to perform logical cleanup actions as described by + * the log + */ +void +ffs_wapbl_replay_finish(struct mount *mp) +{ + struct wapbl_replay *wr = mp->mnt_wapbl_replay; + int i; + int error; + + if (!wr) + return; + + KDASSERT((mp->mnt_flag & MNT_RDONLY) == 0); + + for (i = 0; i < wr->wr_inodescnt; i++) { + struct vnode *vp; + struct inode *ip; + error = VFS_VGET(mp, wr->wr_inodes[i].wr_inumber, &vp); + if (error) { + printf("ffs_wapbl_replay_finish: " + "unable to cleanup inode %" PRIu32 "\n", + wr->wr_inodes[i].wr_inumber); + continue; + } + ip = VTOI(vp); + KDASSERT(wr->wr_inodes[i].wr_inumber == ip->i_number); +#ifdef WAPBL_DEBUG + printf("ffs_wapbl_replay_finish: " + "cleaning inode %" PRIu64 " size=%" PRIu64 " mode=%o nlink=%d\n", + ip->i_number, ip->i_size, ip->i_mode, ip->i_nlink); +#endif + KASSERT(ip->i_nlink == 0); + + /* + * The journal may have left partially allocated inodes in mode + * zero. This may occur if a crash occurs betweeen the node + * allocation in ffs_nodeallocg and when the node is properly + * initialized in ufs_makeinode. If so, just dallocate them. + */ + if (ip->i_mode == 0) { + UFS_WAPBL_BEGIN(mp); + ffs_vfree(vp, ip->i_number, wr->wr_inodes[i].wr_imode); + UFS_WAPBL_END(mp); + } + vput(vp); + } + wapbl_replay_stop(wr); + wapbl_replay_free(wr); + mp->mnt_wapbl_replay = NULL; +} + +/* Callback for wapbl */ +void +ffs_wapbl_sync_metadata(struct mount *mp, daddr_t *deallocblks, + int *dealloclens, int dealloccnt) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int i, error; + +#ifdef WAPBL_DEBUG_INODES + ufs_wapbl_verify_inodes(mp, "ffs_wapbl_sync_metadata"); +#endif + + for (i = 0; i< dealloccnt; i++) { + /* + * blkfree errors are unreported, might silently fail + * if it cannot read the cylinder group block + */ + ffs_blkfree(fs, ump->um_devvp, + dbtofsb(fs, deallocblks[i]), dealloclens[i], -1); + } + + fs->fs_fmod = 0; + fs->fs_time = time_second; + error = ffs_cgupdate(ump, 0); + KASSERT(error == 0); +} + +void +ffs_wapbl_abort_sync_metadata(struct mount *mp, daddr_t *deallocblks, + int *dealloclens, int dealloccnt) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int i; + + for (i = 0; i < dealloccnt; i++) { + /* + * Since the above blkfree may have failed, this blkalloc might + * fail as well, so don't check its error. Note that if the + * blkfree succeeded above, then this shouldn't fail because + * the buffer will be locked in the current transaction. + */ + ffs_blkalloc_ump(ump, dbtofsb(fs, deallocblks[i]), + dealloclens[i]); + } +} + +static int +wapbl_remove_log(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *vp; + struct inode *ip; + ino_t log_ino; + int error; + + /* If super block layout is too old to support WAPBL, return */ + if (ffs_superblock_layout(fs) < 2) + return 0; + + /* If all the log locators are 0, just clean up */ + if (fs->fs_journallocs[0] == 0 && + fs->fs_journallocs[1] == 0 && + fs->fs_journallocs[2] == 0 && + fs->fs_journallocs[3] == 0) { + DPRINTF("empty locators, just clear\n"); + goto done; + } + + switch (fs->fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_NONE: + /* nothing! */ + DPRINTF("no log\n"); + break; + + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + log_ino = fs->fs_journallocs[UFS_WAPBL_INFS_INO]; + DPRINTF("in-fs log, ino = %" PRId64 "\n",log_ino); + + /* if no existing log inode, just clear all fields and bail */ + if (log_ino == 0) + goto done; + error = VFS_VGET(mp, log_ino, &vp); + if (error != 0) { + printf("ffs_wapbl: vget failed %d\n", + error); + /* clear out log info on error */ + goto done; + } + ip = VTOI(vp); + KASSERT(log_ino == ip->i_number); + if ((ip->i_flags & SF_LOG) == 0) { + printf("ffs_wapbl: try to clear non-log inode " + "%" PRId64 "\n", log_ino); + vput(vp); + /* clear out log info on error */ + goto done; + } + + /* + * remove the log inode by setting its link count back + * to zero and bail. + */ + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + vput(vp); + + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + DPRINTF("end-of-partition log\n"); + /* no extra work required */ + break; + + default: + printf("ffs_wapbl: unknown journal type %d\n", + fs->fs_journal_location); + return EINVAL; + } + + +done: + /* Clear out all previous knowledge of journal */ + fs->fs_journal_version = 0; + fs->fs_journal_location = 0; + fs->fs_journal_flags = 0; + fs->fs_journallocs[0] = 0; + fs->fs_journallocs[1] = 0; + fs->fs_journallocs[2] = 0; + fs->fs_journallocs[3] = 0; + (void) ffs_sbupdate(ump, MNT_WAIT); + + return 0; +} + +int +ffs_wapbl_start(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *devvp = ump->um_devvp; + daddr_t off; + size_t count; + size_t blksize; + uint64_t extradata; + int error; + + if (mp->mnt_wapbl == NULL) { + if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG) { + /* Clear out any existing journal file */ + error = wapbl_remove_log(mp); + if (error != 0) + return error; + } + + if (mp->mnt_flag & MNT_LOG) { + KDASSERT(fs->fs_ronly == 0); + + /* WAPBL needs UFS2 format super block */ + if (ffs_superblock_layout(fs) < 2) { + printf("%s fs superblock in old format, " + "not journaling\n", + VFSTOUFS(mp)->um_fs->fs_fsmnt); + mp->mnt_flag &= ~MNT_LOG; + return EINVAL; + } + + error = wapbl_log_position(mp, fs, devvp, &off, + &count, &blksize, &extradata); + if (error) + return error; + + /* XXX any other consistancy checks here? */ + if (blksize != DEV_BSIZE) { + printf("%s: bad blocksize %zu\n", __func__, + blksize); + return EINVAL; + } + + error = wapbl_start(&mp->mnt_wapbl, mp, devvp, off, + count, blksize, mp->mnt_wapbl_replay, + ffs_wapbl_sync_metadata, + ffs_wapbl_abort_sync_metadata); + if (error) + return error; + + mp->mnt_wapbl_op = &wapbl_ops; + +#ifdef WAPBL_DEBUG + printf("%s: enabling logging\n", fs->fs_fsmnt); +#endif + + if ((fs->fs_flags & FS_DOWAPBL) == 0) { + UFS_WAPBL_BEGIN(mp); + fs->fs_flags |= FS_DOWAPBL; + error = ffs_sbupdate(ump, MNT_WAIT); + if (error) { + UFS_WAPBL_END(mp); + ffs_wapbl_stop(mp, MNT_FORCE); + return error; + } + UFS_WAPBL_END(mp); + error = wapbl_flush(mp->mnt_wapbl, 1); + if (error) { + ffs_wapbl_stop(mp, MNT_FORCE); + return error; + } + } + } else if (fs->fs_flags & FS_DOWAPBL) { + fs->fs_fmod = 1; + fs->fs_flags &= ~FS_DOWAPBL; + } + } + + /* + * It is recommended that you finish replay with logging enabled. + * However, even if logging is not enabled, the remaining log + * replay should be safely recoverable with an fsck, so perform + * it anyway. + */ + if ((fs->fs_ronly == 0) && mp->mnt_wapbl_replay) { + int saveflag = mp->mnt_flag & MNT_RDONLY; + /* + * Make sure MNT_RDONLY is not set so that the inode + * cleanup in ufs_inactive will actually do its work. + */ + mp->mnt_flag &= ~MNT_RDONLY; + ffs_wapbl_replay_finish(mp); + mp->mnt_flag |= saveflag; + KASSERT(fs->fs_ronly == 0); + } + + return 0; +} + +int +ffs_wapbl_stop(struct mount *mp, int force) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int error; + + if (mp->mnt_wapbl) { + KDASSERT(fs->fs_ronly == 0); + + /* + * Make sure turning off FS_DOWAPBL is only removed + * as the only change in the final flush since otherwise + * a transaction may reorder writes. + */ + error = wapbl_flush(mp->mnt_wapbl, 1); + if (error && !force) + return error; + if (error && force) + goto forceout; + error = UFS_WAPBL_BEGIN(mp); + if (error && !force) + return error; + if (error && force) + goto forceout; + KASSERT(fs->fs_flags & FS_DOWAPBL); + + fs->fs_flags &= ~FS_DOWAPBL; + error = ffs_sbupdate(ump, MNT_WAIT); + KASSERT(error == 0); /* XXX a bit drastic! */ + UFS_WAPBL_END(mp); + forceout: + error = wapbl_stop(mp->mnt_wapbl, force); + if (error) { + KASSERT(!force); + fs->fs_flags |= FS_DOWAPBL; + return error; + } + fs->fs_flags &= ~FS_DOWAPBL; /* Repeat in case of forced error */ + mp->mnt_wapbl = NULL; + +#ifdef WAPBL_DEBUG + printf("%s: disabled logging\n", fs->fs_fsmnt); +#endif + } + + return 0; +} + +int +ffs_wapbl_replay_start(struct mount *mp, struct fs *fs, struct vnode *devvp) +{ + int error; + daddr_t off; + size_t count; + size_t blksize; + uint64_t extradata; + + /* + * WAPBL needs UFS2 format super block, if we got here with a + * UFS1 format super block something is amiss... + */ + if (ffs_superblock_layout(fs) < 2) + return EINVAL; + + error = wapbl_log_position(mp, fs, devvp, &off, &count, &blksize, + &extradata); + + if (error) + return error; + + error = wapbl_replay_start(&mp->mnt_wapbl_replay, devvp, off, + count, blksize); + if (error) + return error; + + mp->mnt_wapbl_op = &wapbl_ops; + + return 0; +} + +/* + * If the superblock doesn't already have a recorded journal location + * then we allocate the journal in one of two positions: + * + * - At the end of the partition after the filesystem if there's + * enough space. "Enough space" is defined as >= 1MB of journal + * per 1GB of filesystem or 64MB, whichever is smaller. + * + * - Inside the filesystem. We try to allocate a contiguous journal + * based on the total filesystem size - the target is 1MB of journal + * per 1GB of filesystem, up to a maximum journal size of 64MB. As + * a worst case allowing for fragmentation, we'll allocate a journal + * 1/4 of the desired size but never smaller than 1MB. + * + * XXX In the future if we allow for non-contiguous journal files we + * can tighten the above restrictions. + * + * XXX + * These seems like a lot of duplication both here and in some of + * the userland tools (fsck_ffs, dumpfs, tunefs) with similar + * "switch (fs_journal_location)" constructs. Can we centralise + * this sort of code somehow/somewhere? + */ +static int +wapbl_log_position(struct mount *mp, struct fs *fs, struct vnode *devvp, + daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *extradatap) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct partinfo dpart; + daddr_t logstart, logend, desired_logsize; + size_t blksize; + int error; + + if (fs->fs_journal_version == UFS_WAPBL_VERSION) { + switch (fs->fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + DPRINTF("found existing end-of-partition log\n"); + *startp = fs->fs_journallocs[UFS_WAPBL_EPART_ADDR]; + *countp = fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; + *blksizep = fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]; + DPRINTF(" start = %" PRId64 ", size = %zu, " + "blksize = %zu\n", *startp, *countp, *blksizep); + return 0; + + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + DPRINTF("found existing in-filesystem log\n"); + *startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR]; + *countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + *blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + DPRINTF(" start = %" PRId64 ", size = %zu, " + "blksize = %zu\n", *startp, *countp, *blksizep); + return 0; + + default: + printf("ffs_wapbl: unknown journal type %d\n", + fs->fs_journal_location); + return EINVAL; + } + } + + desired_logsize = + lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE; + DPRINTF("desired log size = %" PRId64 " kB\n", desired_logsize / 1024); + desired_logsize = max(desired_logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); + desired_logsize = min(desired_logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); + DPRINTF("adjusted desired log size = %" PRId64 " kB\n", + desired_logsize / 1024); + + /* Is there space after after filesystem on partition for log? */ + logstart = fsbtodb(fs, fs->fs_size); + error = VOP_IOCTL(devvp, DIOCGPART, &dpart, FREAD, FSCRED); + if (!error) { + logend = dpart.part->p_size; + blksize = dpart.disklab->d_secsize; + } else { + struct dkwedge_info dkw; + error = VOP_IOCTL(devvp, DIOCGWEDGEINFO, &dkw, FREAD, FSCRED); + if (error) + return error; + + blksize = DEV_BSIZE; + logend = dkw.dkw_size; + } + + if ((logend - logstart) * blksize >= desired_logsize) { + KDASSERT(blksize != 0); + DPRINTF("enough space, use end-of-partition log\n"); + + *startp = logstart; + *countp = (logend - logstart); + *blksizep = blksize; + *extradatap = 0; + + /* update superblock with log location */ + fs->fs_journal_version = UFS_WAPBL_VERSION; + fs->fs_journal_location = UFS_WAPBL_JOURNALLOC_END_PARTITION; + fs->fs_journal_flags = 0; + fs->fs_journallocs[UFS_WAPBL_EPART_ADDR] = *startp; + fs->fs_journallocs[UFS_WAPBL_EPART_COUNT] = *countp; + fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ] = *blksizep; + fs->fs_journallocs[UFS_WAPBL_EPART_UNUSED] = *extradatap; + + error = ffs_sbupdate(ump, MNT_WAIT); + return error; + } + DPRINTF("end-of-partition has only %" PRId64 " free\n", + logend - logstart); + + error = wapbl_create_infs_log(mp, fs, devvp, startp, countp, blksizep, + extradatap); + + ffs_sync(mp, MNT_WAIT, FSCRED); + + return error; +} + +/* + * Try to create a journal log inside the filesystem. + */ +static int +wapbl_create_infs_log(struct mount *mp, struct fs *fs, struct vnode *devvp, + daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *extradatap) +{ + struct vnode *vp, *rvp; + struct inode *ip; + int error; + + if ((error = VFS_ROOT(mp, &rvp)) != 0) + return error; + + if ((error = UFS_VALLOC(rvp, 0 | S_IFREG, NOCRED, &vp)) != 0) { + vput(rvp); + return error; + } + vput(rvp); + + vp->v_type = VREG; + ip = VTOI(vp); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = 0 | IFREG; + DIP_ASSIGN(ip, mode, ip->i_mode); + ip->i_flags = SF_LOG; + DIP_ASSIGN(ip, flags, ip->i_flags); + ip->i_nlink = 1; + DIP_ASSIGN(ip, nlink, 1); + ffs_update(vp, NULL, NULL, UPDATE_WAIT); + + if ((error = wapbl_allocate_log_file(mp, vp)) != 0) { + /* + * If we couldn't allocate the space for the log file, + * remove the inode by setting its link count back to + * zero and bail. + */ + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + vput(vp); + + return error; + } + + /* + * Now that we have the place-holder inode for the journal, + * we don't need the vnode ever again. + */ + vput(vp); + + *startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR]; + *countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + *blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + *extradatap = fs->fs_journallocs[UFS_WAPBL_INFS_INO]; + + return 0; +} + +int +wapbl_allocate_log_file(struct mount *mp, struct vnode *vp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + daddr_t addr, indir_addr; + off_t logsize; + size_t size; + int error; + + logsize = 0; + /* check if there's a suggested log size */ + if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG && + fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) + logsize = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + + if (vp->v_size > 0) { + printf("%s: file size (%" PRId64 ") non zero\n", __func__, + vp->v_size); + return EEXIST; + } + wapbl_find_log_start(mp, vp, logsize, &addr, &indir_addr, &size); + if (addr == 0) { + printf("%s: log not allocated, largest extent is " + "%" PRId64 "MB\n", __func__, + lblktosize(fs, size) / (1024 * 1024)); + return ENOSPC; + } + + logsize = lblktosize(fs, size); /* final log size */ + + VTOI(vp)->i_ffs_first_data_blk = addr; + VTOI(vp)->i_ffs_first_indir_blk = indir_addr; + + error = GOP_ALLOC(vp, 0, logsize, B_CONTIG, FSCRED); + if (error) { + printf("%s: GOP_ALLOC error %d\n", __func__, error); + return error; + } + + fs->fs_journal_version = UFS_WAPBL_VERSION; + fs->fs_journal_location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM; + fs->fs_journal_flags = 0; + fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] = + lfragtosize(fs, addr) / DEV_BSIZE; + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] = logsize / DEV_BSIZE; + fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = DEV_BSIZE; + fs->fs_journallocs[UFS_WAPBL_INFS_INO] = VTOI(vp)->i_number; + + error = ffs_sbupdate(ump, MNT_WAIT); + return error; +} + +/* + * Find a suitable location for the journal in the filesystem. + * + * Our strategy here is to look for a contiguous block of free space + * at least "logfile" MB in size (plus room for any indirect blocks). + * We start at the middle of the filesystem and check each cylinder + * group working outwards. If "logfile" MB is not available as a + * single contigous chunk, then return the address and size of the + * largest chunk found. + * + * XXX + * At what stage does the search fail? Is if the largest space we could + * find is less than a quarter the requested space reasonable? If the + * search fails entirely, return a block address if "0" it indicate this. + */ +static void +wapbl_find_log_start(struct mount *mp, struct vnode *vp, off_t logsize, + daddr_t *addr, daddr_t *indir_addr, size_t *size) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *devvp = ump->um_devvp; + struct cg *cgp; + struct buf *bp; + uint8_t *blksfree; + daddr_t blkno, best_addr, start_addr; + daddr_t desired_blks, min_desired_blks; + daddr_t freeblks, best_blks; + int bpcg, cg, error, fixedsize, indir_blks, n, s; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + if (logsize == 0) { + fixedsize = 0; /* We can adjust the size if tight */ + logsize = lfragtosize(fs, fs->fs_dsize) / + UFS_WAPBL_JOURNAL_SCALE; + DPRINTF("suggested log size = %" PRId64 "\n", logsize); + logsize = max(logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); + logsize = min(logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); + DPRINTF("adjusted log size = %" PRId64 "\n", logsize); + } else { + fixedsize = 1; + DPRINTF("fixed log size = %" PRId64 "\n", logsize); + } + + desired_blks = logsize / fs->fs_bsize; + DPRINTF("desired blocks = %" PRId64 "\n", desired_blks); + + /* add in number of indirect blocks needed */ + indir_blks = 0; + if (desired_blks >= NDADDR) { + struct indir indirs[NIADDR + 2]; + int num; + + error = ufs_getlbns(vp, desired_blks, indirs, &num); + if (error) { + printf("%s: ufs_getlbns failed, error %d!\n", + __func__, error); + goto bad; + } + + switch (num) { + case 2: + indir_blks = 1; /* 1st level indirect */ + break; + case 3: + indir_blks = 1 + /* 1st level indirect */ + 1 + /* 2nd level indirect */ + indirs[1].in_off + 1; /* extra 1st level indirect */ + break; + default: + printf("%s: unexpected numlevels %d from ufs_getlbns\n", + __func__, num); + *size = 0; + goto bad; + } + desired_blks += indir_blks; + } + DPRINTF("desired blocks = %" PRId64 " (including indirect)\n", + desired_blks); + + /* + * If a specific size wasn't requested, allow for a smaller log + * if we're really tight for space... + */ + min_desired_blks = desired_blks; + if (!fixedsize) + min_desired_blks = desired_blks / 4; + + /* Look at number of blocks per CG. If it's too small, bail early. */ + bpcg = fragstoblks(fs, fs->fs_fpg); + if (min_desired_blks > bpcg) { + printf("ffs_wapbl: cylinder group size of %" PRId64 " MB " + " is not big enough for journal\n", + lblktosize(fs, bpcg) / (1024 * 1024)); + goto bad; + } + + /* + * Start with the middle cylinder group, and search outwards in + * both directions until we either find the requested log size + * or reach the start/end of the file system. If we reach the + * start/end without finding enough space for the full requested + * log size, use the largest extent found if it is large enough + * to satisfy the our minimum size. + * + * XXX + * Can we just use the cluster contigsum stuff (esp on UFS2) + * here to simplify this search code? + */ + best_addr = 0; + best_blks = 0; + for (cg = fs->fs_ncg / 2, s = 0, n = 1; + best_blks < desired_blks && cg >= 0 && cg < fs->fs_ncg; + s++, n = -n, cg += n * s) { + DPRINTF("check cg %d of %d\n", cg, fs->fs_ncg); + error = bread(devvp, fsbtodb(fs, cgtod(fs, cg)), + fs->fs_cgsize, FSCRED, 0, &bp); + cgp = (struct cg *)bp->b_data; + if (error || !cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) { + brelse(bp, 0); + continue; + } + + blksfree = cg_blksfree(cgp, needswap); + + for (blkno = 0; blkno < bpcg;) { + /* look for next free block */ + /* XXX use scanc() and fragtbl[] here? */ + for (; blkno < bpcg - min_desired_blks; blkno++) + if (ffs_isblock(fs, blksfree, blkno)) + break; + + /* past end of search space in this CG? */ + if (blkno >= bpcg - min_desired_blks) + break; + + /* count how many free blocks in this extent */ + start_addr = blkno; + for (freeblks = 0; blkno < bpcg; blkno++, freeblks++) + if (!ffs_isblock(fs, blksfree, blkno)) + break; + + if (freeblks > best_blks) { + best_blks = freeblks; + best_addr = blkstofrags(fs, start_addr) + + cgbase(fs, cg); + + if (freeblks >= desired_blks) { + DPRINTF("found len %" PRId64 + " at offset %" PRId64 " in gc\n", + freeblks, start_addr); + break; + } + } + } + brelse(bp, 0); + } + DPRINTF("best found len = %" PRId64 ", wanted %" PRId64 + " at addr %" PRId64 "\n", best_blks, desired_blks, best_addr); + + if (best_blks < min_desired_blks) { + *addr = 0; + *indir_addr = 0; + } else { + /* put indirect blocks at start, and data blocks after */ + *addr = best_addr + blkstofrags(fs, indir_blks); + *indir_addr = best_addr; + } + *size = min(desired_blks, best_blks) - indir_blks; + return; + +bad: + *addr = 0; + *indir_addr = 0; + *size = 0; + return; +}