[BACK]Return to journal.c CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / external / mpl / bind / dist / lib / dns

File: [cvs.NetBSD.org] / src / external / mpl / bind / dist / lib / dns / journal.c (download)

Revision 1.1.1.4 (vendor branch), Sun May 24 19:36:41 2020 UTC (3 years, 10 months ago) by christos
Branch: ISC
CVS Tags: bind-9-16-5, bind-9-16-3
Changes since 1.1.1.3: +449 -383 lines

	--- 9.16.3 released ---

5404.	[bug]		'named-checkconf -z' could incorrectly indicate
			success if errors were found in one view but not in a
			subsequent one. [GL #1807]

5403.	[func]		Do not set UDP receive/send buffer sizes - use system
			defaults. [GL #1713]

5402.	[bug]		On FreeBSD, use SO_REUSEPORT_LB instead of SO_REUSEPORT.
			Enable use of SO_REUSEADDR on all platforms which
			support it. [GL !3365]

5401.	[bug]		The number of input queues allocated during dnstap
			initialization was too low, which could prevent some
			dnstap data from being logged. [GL #1795]

5400.	[func]		Add engine support to OpenSSL EdDSA implementation.
			[GL #1763]

5399.	[func]		Add engine support to OpenSSL ECDSA implementation.
			[GL #1534]

5398.	[bug]		Named could fail to restart if a zone with a double
			quote (") in its name was added with 'rndc addzone'.
			[GL #1695]

5397.	[func]		Update PKCS#11 EdDSA implementation to PKCS#11 v3.0.
			Thanks to Aaron Thompson. [GL !3326]

5396.	[func]		When necessary (i.e. in libuv >= 1.37), use the
			UV_UDP_RECVMMSG flag to enable recvmmsg() support in
			libuv. [GL #1797]

5395.	[security]	Further limit the number of queries that can be
			triggered from a request.  Root and TLD servers
			are no longer exempt from max-recursion-queries.
			Fetches for missing name server address records
			are limited to 4 for any domain. (CVE-2020-8616)
			[GL #1388]

5394.	[cleanup]	Named formerly attempted to change the effective UID and
			GID in named_os_openfile(), which could trigger a
			spurious log message if they were already set to the
			desired values. This has been fixed. [GL #1042]
			[GL #1090]

5392.	[bug]		It was possible for named to crash during shutdown
			or reconfiguration if an RPZ zone was still being
			updated. [GL #1779]

5390.	[security]	Replaying a TSIG BADTIME response as a request could
			trigger an assertion failure. (CVE-2020-8617)
			[GL #1703]

5389.	[bug]		Finish PKCS#11 code cleanup, fix a couple of smaller
			bugs and use PKCS#11 v3.0 EdDSA macros and constants.
			Thanks to Aaron Thompson. [GL !3391]

5387.	[func]		Warn about AXFR streams with inconsistent message IDs.
			[GL #1674]

5386.	[cleanup]	Address Coverity warnings in lib/dns/keymgr.c.
			[GL #1737]

5385.	[func]		Make ISC rwlock implementation the default again.
			[GL #1753]

5384.	[bug]		With "dnssec-policy" in effect, "inline-signing" was
			implicitly set to "yes". Now "inline-signing" is only
			set to "yes" if the zone is not dynamic. [GL #1709]

	--- 9.16.2 released ---

5383.	[func]		Add a quota attach function with a callback and clean up
			the isc_quota API. [GL !3280]

5382.	[bug]		Use clock_gettime() instead of gettimeofday() for
			isc_stdtime() function. [GL #1679]

5381.	[bug]		Fix logging API data race by adding rwlock and caching
			logging levels in stdatomic variables to restore
			performance to original levels. [GL #1675] [GL #1717]

5380.	[contrib]	Fix building MySQL DLZ modules against MySQL 8
			libraries. [GL #1678]

5378.	[bug]		Receiving invalid DNS data was triggering an assertion
			failure in nslookup. [GL #1652]

5376.	[bug]		Fix ineffective DNS rebinding protection when BIND is
			configured as a forwarding DNS server. Thanks to Tobias
			Klein. [GL #1574]

5375.	[test]		Fix timing issues in the "kasp" system test. [GL #1669]

5374.	[bug]		Statistics counters tracking recursive clients and
			active connections could underflow. [GL #1087]

5373.	[bug]		Collecting statistics for DNSSEC signing operations
			(change 5254) caused an array of significant size (over
			100 kB) to be allocated for each configured zone. Each
			of these arrays is tracking all possible key IDs; this
			could trigger an out-of-memory condition on servers with
			a high enough number of zones configured. Fixed by
			tracking up to four keys per zone and rotating counters
			when keys are replaced. This fixes the immediate problem
			of high memory usage, but should be improved in a future
			release by growing or shrinking the number of keys to
			track upon key rollover events. [GL #1179]

5372.	[bug]		Fix migration from existing DNSSEC key files
			("auto-dnssec maintain") to "dnssec-policy". [GL #1706]

5371.	[bug]		Improve incremental updates of the RPZ summary
			database to reduce delays that could occur when
			a policy zone update included a large number of
			record deletions. [GL #1447]

5370.	[bug]		Deactivation of a netmgr handle associated with a
			socket could be skipped in some circumstances.
			Fixed by deactivating the netmgr handle before
			scheduling the asynchronous close routine. [GL #1700]

5368.	[bug]		Named failed to restart if 'rndc addzone' names
			contained special characters (e.g. '/'). [GL #1655]

5367.	[bug]		Fixed a flaw in the calculation of the zone database
			size so that "max-journal-size default" uses the correct
			limit. [GL #1661]

	--- 9.16.1 released ---

5366.	[bug]		Fix a race condition with the keymgr when the same
			zone plus dnssec-policy is configured in multiple
			views. [GL #1653]

5365.	[bug]		Algorithm rollover was stuck on submitting DS
			because keymgr thought it would move to an invalid
			state.  Fixed by checking the current key against
			the desired state, not the existing state. [GL #1626]

5364.	[bug]		Algorithm rollover waited too long before introducing
			zone signatures.  It waited to make sure all signatures
			were regenerated, but when introducing a new algorithm,
			all signatures are regenerated immediately.  Only
			add the sign delay if there is a predecessor key.
			[GL #1625]

5363.	[bug]		When changing a dnssec-policy, existing keys with
			properties that no longer match were not being retired.
			[GL #1624]

5361.	[bug]		named might not accept new connections after
			hitting tcp-clients quota. [GL #1643]

5360.	[bug]		delv could fail to load trust anchors in DNSKEY
			format. [GL #1647]

5358.	[bug]		Inline master zones whose master files were touched
			but otherwise unchanged and were subsequently reloaded
			may have stopped re-signing. [GL !3135]

5357.	[bug]		Newly added RRSIG records with expiry times before
			the previous earliest expiry times might not be
			re-signed in time.  This was a side effect of 5315.
			[GL !3137]

	--- 9.16.0 released ---

5356.	[func]		Update dnssec-policy configuration statements:
			- Rename "zone-max-ttl" dnssec-policy option to
			  "max-zone-ttl" for consistency with the existing
			  zone option.
			- Allow for "lifetime unlimited" as a synonym for
			  "lifetime PT0S".
			- Make "key-directory" optional.
			- Warn if specifying a key length does not make
			  sense; fail if key length is out of range for
			  the algorithm.
			- Allow use of mnemonics when specifying key
			  algorithm (e.g. "rsasha256", "ecdsa384", etc.).
			- Make ISO 8601 durations case-insensitive.
			[GL #1598]

5355.	[func]		What was set with --with-tuning=large option in
			older BIND9 versions is now a default, and
			a --with-tuning=small option was added for small
			(e.g. OpenWRT) systems. [GL !2989]

5354.	[bug]		dnssec-policy created new KSK keys for zones in the
			initial stage of signing (with the DS not yet in the
			rumoured or omnipresent states).  Fix by checking the
			key goals rather than the active state when determining
			whether new keys are needed. [GL #1593]

5353.	[doc]		Document port and dscp parameters in forwarders
			configuration option. [GL #914]

5352.	[bug]		Correctly handle catalog zone entries containing
			characters that aren't legal in filenames. [GL #1592]

5351.	[bug]		CDS / CDNSKEY consistency checks failed to handle
			removal records. [GL #1554]

5350.	[bug]		When a view was configured with class CHAOS, the
			server could crash while processing a query for a
			non-existent record. [GL #1540]

5349.	[bug]		Fix a race in task_pause/unpause. [GL #1571]

5348.	[bug]		dnssec-settime -Psync was not being honoured.
			[GL !2925]

	--- 9.15.8 released ---

5347.	[bug]		Fixed a bug that could cause an intermittent crash
			in validator.c when validating a negative cache
			entry. [GL #1561]

5346.	[bug]		Make hazard pointer array allocations dynamic, fixing
			a bug that caused named to crash on machines with more
			than 40 cores. [GL #1493]

5345.	[func]		Key-style trust anchors and DS-style trust anchors
			can now both be used for the same name. [GL #1237]

5344.	[bug]		Handle accept() errors properly in netmgr. [GL !2880]

5343.	[func]		Add statistics counters to the netmgr. [GL #1311]

5342.	[bug]		Disable pktinfo for IPv6 and bind to each interface
			explicitly instead, because libuv doesn't support
			pktinfo control messages. [GL #1558]

5341.	[func]		Simplify passing the bound TCP socket to child
			threads by using isc_uv_export/import functions.
			[GL !2825]

5340.	[bug]		Don't deadlock when binding to a TCP socket fails.
			[GL #1499]

5339.	[bug]		With some libmaxminddb versions, named could erroneously
			match an IP address not belonging to any subnet defined
			in a given GeoIP2 database to one of the existing
			entries in that database. [GL #1552]

5338.	[bug]		Fix line spacing in `rndc secroots`.
			Thanks to Tony Finch. [GL !2478]

5337.	[func]		'named -V' now reports maxminddb and protobuf-c
			versions. [GL !2686]

	--- 9.15.7 released ---

5336.	[bug]		The TCP high-water statistic could report an
			incorrect value on startup. [GL #1392]

5335.	[func]		Make TCP listening code multithreaded. [GL !2659]

5334.	[doc]		Update documentation with dnssec-policy clarifications.
			Also change some defaults. [GL !2711]

5333.	[bug]		Fix duration printing on Solaris when value is not
			an ISO 8601 duration. [GL #1460]

5332.	[func]		Renamed "dnssec-keys" configuration statement
			to the more descriptive "trust-anchors". [GL !2702]

5331.	[func]		Use compiler-provided mechanisms for thread local
			storage, and make the requirement for such mechanisms
			explicit in configure. [GL #1444]

5330.	[bug]		'configure --without-python' was ineffective if
			PYTHON was set in the environment. [GL #1434]

5329.	[bug]		Reconfiguring named caused memory to be leaked when any
			GeoIP2 database was in use. [GL #1445]

5328.	[bug]		rbtdb.c:rdataset_{get,set}ownercase failed to obtain
			a node lock. [GL #1417]

5327.	[func]		Added a statistics counter to track queries
			dropped because the recursive-clients quota was
			exceeded. [GL #1399]

5326.	[bug]		Add Python dependency on 'distutils.core' to configure.
			'distutils.core' is required for installation.
			[GL #1397]

5325.	[bug]		Addressed several issues with TCP connections in
			the netmgr: restored support for TCP connection
			timeouts, restored TCP backlog support, actively
			close all open sockets during shutdown. [GL #1312]

5324.	[bug]		Change the category of some log messages from general
			to the more appropriate category of xfer-in. [GL #1394]

5323.	[bug]		Fix a bug in DNSSEC trust anchor verification.
			[GL !2609]

5322.	[placeholder]

5321.	[bug]		Obtain write lock before updating version->records
			and version->bytes. [GL #1341]

5320.	[cleanup]	Silence TSAN on header->count. [GL #1344]

	--- 9.15.6 released ---

5319.	[func]		Trust anchors can now be configured using DS
			format to represent a key digest, by using the
			new "initial-ds" or "static-ds" keywords in
			the "dnssec-keys" statement.

			Note: DNSKEY-format and DS-format trust anchors
			cannot both be used for the same domain name.
			[GL #622]

5318.	[cleanup]	The DNSSEC validation code has been refactored
			for clarity and to reduce code duplication.
			[GL #622]

5317.	[func]		A new asynchronous network communications system
			based on libuv is now used for listening for
			incoming requests and responding to them. (The
			old isc_socket API remains in use for sending
			iterative queries and processing responses; this
			will be changed too in a later release.)

			This change will make it easier to improve
			performance and implement new protocol layers
			(e.g., DNS over TLS) in the future. [GL #29]

5316.	[func]		A new "dnssec-policy" option has been added to
			named.conf to implement a key and signing policy
			(KASP) for zones. When this option is in use,
			named can generate new keys as needed and
			automatically roll both ZSK and KSK keys. (Note
			that the syntax for this statement differs from
			the dnssec policy used by dnssec-keymgr.)

			See the ARM for configuration details. [GL #1134]

5315.	[bug]		Apply the initial RRSIG expiration spread fixed
			to all dynamically created records in the zone
			including NSEC3. Also fix the signature clusters
			when the server has been offline for a prolonged
			period of time. [GL #1256]

5314.	[func]		Added a new statistics variable "tcp-highwater"
			that reports the maximum number of simultaneous TCP
			clients BIND has handled while running. [GL #1206]

5313.	[bug]		The default GeoIP2 database location did not match
			the ARM.  'named -V' now reports the default
			location. [GL #1301]

5312.	[bug]		Do not flush the cache for `rndc validation status`.
			Thanks to Tony Finch. [GL !2462]

5311.	[cleanup]	Include all views in output of `rndc validation status`.
			Thanks to Tony Finch. [GL !2461]

5310.	[bug]		TCP failures were affecting EDNS statistics. [GL #1059]

5309.	[placeholder]

5308.	[bug]		Don't log DNS_R_UNCHANGED from sync_secure_journal()
			at ERROR level in receive_secure_serial(). [GL #1288]

5307.	[bug]		Fix hang when named-compilezone output is sent to pipe.
			Thanks to Tony Finch. [GL !2481]

5306.	[security]	Set a limit on number of simultaneous pipelined TCP
			queries. (CVE-2019-6477) [GL #1264]

5305.	[bug]		NSEC Aggressive Cache ("synth-from-dnssec") has been
			disabled by default because it was found to have
			a significant performance impact on the recursive
			service. [GL #1265]

5304.	[bug]		"dnskey-sig-validity 0;" was not being accepted.
			[GL #876]

5303.	[placeholder]

5302.	[bug]		Fix checking that "dnstap-output" is defined when
			"dnstap" is specified in a view. [GL #1281]

5301.	[bug]		Detect partial prefixes / incomplete IPv4 address in
			acls. [GL #1143]

5300.	[bug]		dig/mdig/delv: Add a colon after EDNS option names,
			even when the option is empty, to improve
			readability and allow correct parsing of YAML
			output. [GL #1226]

	--- 9.15.5 released ---

5299.	[security]	A flaw in DNSSEC verification when transferring
			mirror zones could allow data to be incorrectly
			marked valid. (CVE-2019-6475) [GL #1252]

5298.	[security]	Named could assert if a forwarder returned a
			referral, rather than resolving the query, when QNAME
			minimization was enabled. (CVE-2019-6476) [GL #1051]

5297.	[bug]		Check whether a previous QNAME minimization fetch
			is still running before starting a new one; return
			SERVFAIL and log an error if so. [GL #1191]

5296.	[placeholder]

5295.	[cleanup]	Split dns_name_copy() calls into dns_name_copy() and
			dns_name_copynf() for those calls that can potentially
			fail and those that should not fail respectively.
			[GL !2265]

5294.	[func]		Fall back to the ACE name on output in locales that do
			not support converting it to Unicode.  [GL #846]

5293.	[bug]		On Windows, named crashed upon any attempt to fetch XML
			statistics from it. [GL #1245]

5292.	[bug]		Queue 'rndc nsec3param' requests while signing inline
			zone changes. [GL #1205]

	--- 9.15.4 released ---

5291.	[placeholder]

5290.	[placeholder]

5289.	[bug]		Address NULL pointer dereference in rpz.c:rpz_detach.
			[GL #1210]

5288.	[bug]		dnssec-must-be-secure was not always honored.
			[GL #1209]

5287.	[placeholder]

5286.	[contrib]	Address potential NULL pointer dereferences in
			dlz_mysqldyn_mod.c. [GL #1207]

5285.	[port]		win32: implement "-T maxudpXXX". [GL #837]

5284.	[func]		Added +unexpected command line option to dig.
			By default, dig won't accept a reply from a source
			other than the one to which it sent the query.
			Invoking dig with +unexpected argument will allow it
			to process replies from unexpected sources.

5283.	[bug]		When a response-policy zone expires, ensure that
			its policies are removed from the RPZ summary
			database. [GL #1146]

5282.	[bug]		Fixed a bug in searching for possible wildcard matches
			for query names in the RPZ summary database. [GL #1146]

5281.	[cleanup]	Don't escape commas when reporting named's command
			line. [GL #1189]

5280.	[protocol]	Add support for displaying EDNS option LLQ. [GL #1201]

5279.	[bug]		When loading, reject zones containing CDS or CDNSKEY
			RRsets at the zone apex if they would cause DNSSEC
			validation failures if published in the parent zone
			as the DS RRset.  [GL #1187]

5278.	[func]		Add YAML output formats for dig, mdig and delv;
			use the "+yaml" option to enable. [GL #1145]

	--- 9.15.3 released ---

5277.	[bug]		Cache DB statistics could underflow when serve-stale
			was in use, because of a bug in counter maintenance
			when RRsets become stale.

			Functions for dumping statistics have been updated
			to dump active, stale, and ancient statistic
			counters.  Ancient RRset counters are prefixed
			with '~'; stale RRset counters are still prefixed
			with '#'. [GL #602]

5276.	[func]		DNSSEC Lookaside Validation (DLV) is now obsolete;
			all code enabling its use has been removed from the
			validator, "delv", and the DNSSEC tools. [GL #7]

5275.	[bug]		Mark DS records included in referral messages
			with trust level "pending" so that they can be
			validated and cached immediately, with no need to
			re-query. [GL #964]

5274.	[bug]		Address potential use after free race when shutting
			down rpz. [GL #1175]

5273.	[bug]		Check that bits [64..71] of a dns64 prefix are zero.
			[GL #1159]

5272.	[cleanup]	Remove isc-config.sh script as the BIND 9 libraries
			are now purely internal. [GL #1123]

5271.	[func]		The normal (non-debugging) output of dnssec-signzone
			and dnssec-verify tools now goes to stdout, instead of
			the combination of stderr and stdout.

5270.	[bug]		'dig +expandaaaa +short' did not work. [GL #1152]

5269.	[port]		cygwin: can return ETIMEDOUT on connect() with a
			non-blocking socket. [GL #1133]

5268.	[placeholder]

5267.	[func]		Allow statistics groups display to be toggle-able.
			[GL #1030]

5266.	[bug]		named-checkconf failed to report dnstap-output
			missing from named.conf when dnstap was specified.
			[GL #1136]

5265.	[bug]		DNS64 and RPZ nodata (CNAME *.) rules interacted badly
			[GL #1106]

5264.	[func]		New DNS Cookie algorithm - siphash24 - has been added
			to BIND 9, and the old HMAC-SHA DNS Cookie algorithms
			have been removed. [GL #605]

	--- 9.15.2 released ---

5263.	[cleanup]	Use atomics and isc_refcount_t wherever possible.
			[GL #1038]

5262.	[func]		Removed support for the legacy GeoIP API. [GL #1112]

5261.	[cleanup]	Remove SO_BSDCOMPAT socket option usage.

5260.	[bug]		dnstap-read was producing malformed output for large
			packets. [GL #1093]

5259.	[func]		New option '-i' for 'named-checkconf' to ignore
			warnings about deprecated options. [GL #1101]

5258.	[func]		Added support for the GeoIP2 API from MaxMind. This
			will be compiled in by default if the "libmaxminddb"
			library is found at compile time, but can be
			suppressed using "configure --disable-geoip".

			Certain geoip ACL settings that were available with
			legacy GeoIP are not available when using GeoIP2.
			[GL #182]

5257.	[bug]		Some statistics data was not being displayed.
			Add shading to the zone tables. [GL #1030]

5256.	[bug]		Ensure that glue records are included in root
			priming responses if "minimal-responses" is not
			set to "yes". [GL #1092]

5255.	[bug]		Errors encountered while reloading inline-signing
			zones could be ignored, causing the zone content to
			be left in an incompletely updated state rather than
			reverted. [GL #1109]

5254.	[func]		Collect metrics to report to the statistics-channel
			DNSSEC signing operations (dnssec-sign) and refresh
			operations (dnssec-refresh) per zone and per keytag.
			[GL #513]

5253.	[port]		Support platforms that don't define ULLONG_MAX.
			[GL #1098]

5252.	[func]		Report if the last 'rndc reload/reconfig' failed in
			rndc status. [GL !2040]

5251.	[bug]		Statistics were broken in x86 Windows builds.
			[GL #1081]

5250.	[func]		The default size for RSA keys is now 2048 bits,
			for both ZSKs and KSKs. [GL #1097]

5249.	[bug]		Fix a possible underflow in recursion clients
			statistics when hitting recursive clients
			soft quota. [GL #1067]

	--- 9.15.1 released ---

5248.	[func]		To clarify the configuration of DNSSEC keys,
			the "managed-keys" and "trusted-keys" options
			have both been deprecated.  The new "dnssec-keys"
			statement can now be used for all trust anchors,
			with the keywords "initial-key" or "static-key"
			to indicate whether the configured trust anchor
			should be used for initialization of RFC 5011 key
			management, or as a permanent trust anchor.

			The "static-key" keyword will generate a warning if
			used for the root zone.

			Configurations using "trusted-keys" or "managed-keys"
			will continue to work with no changes, but will
			generate warnings in the log. In a future release,
			these options will be marked obsolete. [GL #6]

5247.	[cleanup]	The 'cleaning-interval' option has been removed.
			[GL !1731]

5246.	[func]		Log TSIG if appropriate in 'sending notify to' message.
			[GL #1058]

5245.	[cleanup]	Reduce logging level for IXFR up-to-date poll
			responses. [GL #1009]

5244.	[security]	Fixed a race condition in dns_dispatch_getnext()
			that could cause an assertion failure if a
			significant number of incoming packets were
			rejected. (CVE-2019-6471) [GL #942]

5243.	[bug]		Fix a possible race between dispatcher and socket
			code in a high-load cold-cache resolver scenario.
			[GL #943]

5242.	[bug]		In relaxed qname minimization mode, fall back to
			normal resolution when encountering a lame
			delegation, and use _.domain/A queries rather
			than domain/NS. [GL #1055]

5241.	[bug]		Fix Ed448 private and public key ASN.1 prefix blobs.
			[GL #225]

5240.	[bug]		Remove key id calculation for RSAMD5. [GL #996]

5239.	[func]		Change the json-c detection to pkg-config. [GL #855]

5238.	[bug]		Fix a possible deadlock in TCP code. [GL #1046]

5237.	[bug]		Recurse to find the root server list with 'dig +trace'.
			[GL #1028]

5236.	[func]		Add SipHash 2-4 implementation in lib/isc/siphash.c
			and switch isc_hash_function() to use SipHash 2-4.
			[GL #605]

5235.	[cleanup]	Refactor lib/isc/app.c to be thread-safe, unused
			parts of the API have been removed and the
			isc_appctx_t data type has been changed to be
			fully opaque. [GL #1023]

5234.	[port]		arm: just use the compiler's default support for
			yield. [GL #981]

	--- 9.15.0 released ---

5233.	[bug]		Negative trust anchors did not work with "forward only;"
			to validating resolvers. [GL #997]

5232.	[placeholder]

5231.	[protocol]	Add support for displaying CLIENT-TAG and SERVER-TAG.
			[GL #960]

5230.	[protocol]	The SHA-1 hash algorithm is no longer used when
			generating DS and CDS records. [GL #1015]

5229.	[protocol]	Enforce known SSHFP fingerprint lengths. [GL #852]

5228.	[func]		If trusted-keys and managed-keys were configured
			simultaneously for the same name, the key could
			not be rolled automatically. This is now
			a fatal configuration error. [GL #868]

5227.	[placeholder]

5226.	[placeholder]

5225.	[func]		Allow dig to print out AAAA records fully expanded
			with +[no]expandaaaa. [GL #765]

5224.	[bug]		Only test provide-ixfr on TCP streams. [GL #991]

5223.	[bug]		Fixed a race in the filter-aaaa plugin accessing
			the hash table. [GL #1005]

5222.	[bug]		'delv -t ANY' could leak memory. [GL #983]

5221.	[test]		Enable parallel execution of system tests on
			Windows. [GL !4101]

5220.	[cleanup]	Refactor the isc_stat structure to take advantage
			of stdatomic. [GL !1493]

5219.	[bug]		Fixed a race in the filter-aaaa plugin that could
			trigger a crash when returning an instance object
			to the memory pool. [GL #982]

5218.	[bug]		Conditionally include <dlfcn.h>. [GL #995]

5217.	[bug]		Restore key id calculation for RSAMD5. [GL #996]

5216.	[bug]		Fetches-per-zone counter wasn't updated correctly
			when doing qname minimization. [GL #992]

5215.	[bug]		Change #5124 was incomplete; named could still
			return FORMERR instead of SERVFAIL in some cases.
			[GL #990]

5214.	[bug]		win32: named now removes its lock file upon shutdown.
			[GL #979]

5213.	[bug]		win32: Eliminated a race which allowed named.exe running
			as a service to be killed prematurely during shutdown.
			[GL #978]

5212.	[placeholder]

5211.	[bug]		Allow out-of-zone additional data to be included
			in authoritative responses if recursion is allowed
			and "minimal-responses" is disabled.  This behavior
			was inadvertently removed in change #4605. [GL #817]

5210.	[bug]		When dnstap is enabled and recursion is not
			available, incoming queries are now logged
			as "auth". Previously, this depended on whether
			recursion was requested by the client, not on
			whether recursion was available. [GL #963]

5209.	[bug]		When update-check-ksk is true, add_sigs was not
			considering offline keys, leaving record sets signed
			with the incorrect type key. [GL #763]

5208.	[test]		Run valid rdata wire encodings through totext+fromtext
			and tofmttext+fromtext methods to check these methods.
			[GL #899]

5207.	[test]		Check delv and dig TTL values. [GL #965]

5206.	[bug]		Delv could print out bad TTLs. [GL #965]

5205.	[bug]		Enforce that a DS hash exists. [GL #899]

5204.	[test]		Check that dns_rdata_fromtext() produces a record that
			will be accepted by dns_rdata_fromwire(). [GL #852]

5203.	[bug]		Enforce whether key rdata exists or not in KEY,
			DNSKEY, CDNSKEY and RKEY. [GL #899]

5202.	[bug]		<dns/ecs.h> was missing ISC_LANG_ENDDECLS. [GL #976]

5201.	[bug]		Fix a possible deadlock in RPZ update code. [GL #973]

5200.	[security]	tcp-clients settings could be exceeded in some cases,
			which could lead to exhaustion of file descriptors.
			(CVE-2018-5743) [GL #615]

5199.	[security]	In certain configurations, named could crash
			if nxdomain-redirect was in use and a redirected
			query resulted in an NXDOMAIN from the cache.
			(CVE-2019-6467) [GL #880]

5198.	[bug]		If a fetch context was being shut down and, at the same
			time, we returned from qname minimization, an INSIST
			could be hit. [GL #966]

5197.	[bug]		dig could die in best effort mode on multiple SIG(0)
			records. Similarly on multiple OPT and multiple TSIG
			records. [GL #920]

5196.	[bug]		make install failed with --with-dlopen=no. [GL #955]

5195.	[bug]		"allow-update" and "allow-update-forwarding" were
			treated as configuration errors if used at the
			options or view level. [GL #913]

5194.	[bug]		Enforce non empty ZONEMD hash. [GL #899]

5193.	[bug]		EID and NIMLOC failed to do multi-line output
			correctly. [GL #899]

5192.	[placeholder]

5191.	[placeholder]

5190.	[bug]		Ignore trust anchors using disabled algorithms.
			[GL #806]

5189.	[cleanup]	Remove revoked root DNSKEY from bind.keys. [GL #945]

5188.	[func]		The "dnssec-enable" option is deprecated and no
			longer has any effect; DNSSEC responses are
			always enabled. [GL #866]

5187.	[test]		Set time zone before running any tests in dnstap_test.
			[GL #940]

5186.	[cleanup]	More dnssec-keygen manual tidying. [GL !1678]

5185.	[placeholder]

5184.	[bug]		Missing unlocks in sdlz.c. [GL #936]

5183.	[bug]		Reinitialize ECS data before reusing client
			structures. [GL #881]

5182.	[bug]		Fix a high-load race/crash in handling of
			isc_socket_close() in resolver. [GL #834]

5181.	[func]		Add a mechanism for a DLZ module to signal that
			the view's allow-transfer ACL should be used to
			determine whether transfers are allowed. [GL #803]

5180.	[bug]		delv now honors the operating system's preferred
			ephemeral port range. [GL #925]

5179.	[cleanup]	Replace some vague type declarations with the more
			specific dns_secalg_t and dns_dsdigest_t.
			Thanks to Tony Finch. [GL !1498]

5178.	[bug]		Handle EDQUOT (disk quota) and ENOSPC (disk full)
			errors when writing files. [GL #902]

5177.	[func]		Add the ability to specify in named.conf whether a
			response-policy zone's SOA record should be added
			to the additional section (add-soa yes/no). [GL #865]

5176.	[tests]		Remove a dependency on libxml in statschannel system
			test. [GL #926]

5175.	[bug]		Fixed a problem with file input in dnssec-keymgr,
			dnssec-coverage and dnssec-checkds when using
			python3. [GL #882]

5174.	[doc]		Tidy dnssec-keygen manual. [GL !1557]

5173.	[bug]		Fixed a race in socket code that could occur when
			accept, send, or recv were called from an event
			loop but the socket had been closed by another
			thread. [RT #874]

5172.	[bug]		nsupdate now honors the operating system's preferred
			ephemeral port range. [GL #905]

5171.	[func]		named plugins are now installed into a separate
			directory.  Supplying a filename (a string without path
			separators) in a "plugin" configuration stanza now
			causes named to look for that plugin in that directory.
			[GL #878]

5170.	[test]		Added --with-dlz-filesystem to feature-test. [GL !1587]

5169.	[bug]		The presence of certain types in an otherwise
			empty node could cause a crash while processing a
			type ANY query. [GL #901]

5168.	[bug]		Do not crash on shutdown when RPZ fails to load.  Also,
			keep previous version of the database if RPZ fails to
			load. [GL #813]

5167.	[bug]		nxdomain-redirect could sometimes lookup the wrong
			redirect name. [GL #892]

5166.	[placeholder]

5165.	[contrib]	Removed SDB drivers from contrib; they're obsolete.
			[GL #428]

5164.	[bug]		Correct errno to result translation in dlz filesystem
			modules. [GL #884]

5163.	[cleanup]	Out-of-tree builds failed --enable-dnstap. [GL #836]

5162.	[cleanup]	Improve dnssec-keymgr manual. Thanks to Tony Finch.
			[GL !1518]

5161.	[bug]		Do not require the SEP bit to be set for mirror zone
			trust anchors. [GL #873]

5160.	[contrib]	Added DNAME support to the DLZ LDAP schema. Also
			fixed a compilation bug affecting several DLZ
			modules. [GL #872]

5159.	[bug]		dnssec-coverage was incorrectly ignoring
			names specified on the command line without
			trailing dots. [GL !1478]

5158.	[protocol]	Add support for AMTRELAY and ZONEMD. [GL #867]

5157.	[bug]		Nslookup now errors out if there are extra command
			line arguments. [GL #207]

5156.	[doc]		Extended and refined the section of the ARM describing
			mirror zones. [GL #774]

5155.	[func]		"named -V" now outputs the default paths to
			named.conf, rndc.conf, bind.keys, and other
			files used or created by named and other tools, so
			that the correct paths to these files can quickly be
			determined regardless of the configure settings
			used when BIND was built. [GL #859]

5154.	[bug]		dig: process_opt could be called twice on the same
			message leading to an assertion failure. [GL #860]

5153.	[func]		Zone transfer statistics (size, number of records, and
			number of messages) are now logged for outgoing
			transfers as well as incoming ones. [GL #513]

5152.	[func]		Improved logging of DNSSEC key events:
			- Zone signing and DNSKEY maintenance events are
			  now logged to the "dnssec" category
			- Messages are now logged when DNSSEC keys are
			  published, activated, inactivated, deleted,
			  or revoked.
			[GL #714]

5151.	[func]		Options that have been marked as obsolete in
			named.conf for a very long time are now fatal
			configuration errors. [GL #358]

5150.	[cleanup]	Remove the ability to compile BIND with assertions
			disabled. [GL #735]

5149.	[func]		"rndc dumpdb" now prints a line above a stale RRset
			indicating how long the data will be retained in the
			cache for emergency use. [GL #101]

5148.	[bug]		named did not sign the TKEY response. [GL #821]

5147.	[bug]		dnssec-keymgr: Add a five-minute margin to better
			handle key events close to 'now'. [GL #848]

5146.	[placeholder]

5145.	[func]		Use atomics instead of locked variables for isc_quota
			and isc_counter. [GL !1389]

5144.	[bug]		dig now returns a non-zero exit code when a TCP
			connection is prematurely closed by a peer more than
			once for the same lookup.  [GL #820]

5143.	[bug]		dnssec-keymgr and dnssec-coverage failed to find
			key files for zone names ending in ".". [GL #560]

5142.	[cleanup]	Removed "configure --disable-rpz-nsip" and
			"--disable-rpz-nsdname" options. "nsip-enable"
			and "nsdname-enable" both now default to yes,
			regardless of compile-time settings. [GL #824]

5141.	[security]	Zone transfer controls for writable DLZ zones were
			not effective as the allowzonexfr method was not being
			called for such zones. (CVE-2019-6465) [GL #790]

5140.	[bug]		Don't immediately mark existing keys as inactive and
			deleted when running dnssec-keymgr for the first
			time. [GL #117]

5139.	[bug]		If possible, don't use forwarders when priming.
			This ensures we can get root server IP addresses
			from priming query response glue, which may not
			be present if the forwarding server is returning
			minimal responses. [GL #752]

5138.	[bug]		Under some circumstances named could hit an assertion
			failure when doing qname minimization when using
			forwarders. [GL #797]

5137.	[func]		named now logs messages whenever a mirror zone becomes
			usable or unusable for resolution purposes. [GL #818]

5136.	[cleanup]	Check in named-checkconf that allow-update and
			allow-update-forwarding are not set at the
			view/options level; fix documentation. [GL #512]

5135.	[port]		sparc: Use smt_pause() instead of pause. [GL #816]

5134.	[bug]		win32: WSAStartup was not called before getservbyname
			was called. [GL #590]

5133.	[bug]		'rndc managed-keys' didn't handle class and view
			correctly and failed to add new lines between each
			view. [GL !1327]

5132.	[bug]		Fix race condition in cleanup part of dns_dt_create().
			[GL !1323]

5131.	[cleanup]	Address Coverity warnings. [GL #801]

5130.	[cleanup]	Remove support for l10n message catalogs. [GL #709]

5129.	[contrib]	sdlz_helper.c:build_querylist was not properly
			splitting the query string. [GL #798]

5128.	[bug]		Refreshkeytime was not being updated for managed
			keys zones. [GL #784]

5127.	[bug]		rcode.c:maybe_numeric failed to handle NUL in text
			regions. [GL #807]

5126.	[bug]		Named incorrectly accepted empty base64 and hex encoded
			fields when reading master files. [GL #807]

5125.	[bug]		Allow for up to 100 records or 64k of data when caching
			a negative response. [GL #804]

5124.	[bug]		Named could incorrectly return FORMERR rather than
			SERVFAIL. [GL #804]

5123.	[bug]		dig could hang indefinitely after encountering an error
			before creating a TCP socket. [GL #692]

5122.	[bug]		In a "forward first;" configuration, a forwarder
			timeout did not prevent that forwarder from being
			queried again after falling back to full recursive
			resolution. [GL #315]

5121.	[contrib]	dlz_stub_driver.c fails to return ISC_R_NOTFOUND on
			non-matching zone names. [GL !1299]

5120.	[placeholder]

5119.	[placeholder]

5118.	[security]	Named could crash if it is managing a key with
			`managed-keys` and the authoritative zone is rolling
			the key to an unsupported algorithm. (CVE-2018-5745)
			[GL #780]

5117.	[placeholder]

5116.	[bug]		Named/named-checkconf triggered an assertion when
			a mirror zone's name is bad. [GL #778]

5115.	[bug]		Allow unsupported algorithms in zone when not used for
			signing with dnssec-signzone. [GL #783]

5114.	[func]		Include a 'reconfig/reload in progress' status line
			in rndc status, use it in tests.

5113.	[port]		Fixed a Windows build error.

5112.	[bug]		Named/named-checkconf could dump core if there was
			a missing masters clause and a bad notify clause.
			[GL #779]

5111.	[bug]		Occluded DNSKEY records could make it into the
			delegating NSEC/NSEC3 bitmap. [GL #742]

5110.	[security]	Named leaked memory if there were multiple Key Tag
			EDNS options present. (CVE-2018-5744) [GL #772]

5109.	[cleanup]	Remove support for RSAMD5 algorithm. [GL #628]

/*	$NetBSD: journal.c,v 1.1.1.4 2020/05/24 19:36:41 christos Exp $	*/

/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

#include <errno.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdlib.h>
#include <unistd.h>

#include <isc/file.h>
#include <isc/mem.h>
#include <isc/print.h>
#include <isc/stdio.h>
#include <isc/string.h>
#include <isc/util.h>

#include <dns/compress.h>
#include <dns/db.h>
#include <dns/dbiterator.h>
#include <dns/diff.h>
#include <dns/fixedname.h>
#include <dns/journal.h>
#include <dns/log.h>
#include <dns/rdataset.h>
#include <dns/rdatasetiter.h>
#include <dns/result.h>
#include <dns/soa.h>

/*! \file
 * \brief Journaling.
 *
 * A journal file consists of
 *
 *   \li A fixed-size header of type journal_rawheader_t.
 *
 *   \li The index.  This is an unordered array of index entries
 *     of type journal_rawpos_t giving the locations
 *     of some arbitrary subset of the journal's addressable
 *     transactions.  The index entries are used as hints to
 *     speed up the process of locating a transaction with a given
 *     serial number.  Unused index entries have an "offset"
 *     field of zero.  The size of the index can vary between
 *     journal files, but does not change during the lifetime
 *     of a file.  The size can be zero.
 *
 *   \li The journal data.  This  consists of one or more transactions.
 *     Each transaction begins with a transaction header of type
 *     journal_rawxhdr_t.  The transaction header is followed by a
 *     sequence of RRs, similar in structure to an IXFR difference
 *     sequence (RFC1995).  That is, the pre-transaction SOA,
 *     zero or more other deleted RRs, the post-transaction SOA,
 *     and zero or more other added RRs.  Unlike in IXFR, each RR
 *     is prefixed with a 32-bit length.
 *
 *     The journal data part grows as new transactions are
 *     appended to the file.  Only those transactions
 *     whose serial number is current-(2^31-1) to current
 *     are considered "addressable" and may be pointed
 *     to from the header or index.  They may be preceded
 *     by old transactions that are no longer addressable,
 *     and they may be followed by transactions that were
 *     appended to the journal but never committed by updating
 *     the "end" position in the header.  The latter will
 *     be overwritten when new transactions are added.
 */

/**************************************************************************/
/*
 * Miscellaneous utilities.
 */

/*
 * Leading arguments shared by the isc_log_write() calls in this file:
 * the DNS log context, the general category and the journal module.
 * The caller appends the log level and the message.
 */
#define JOURNAL_COMMON_LOGARGS \
	dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL

/* Same as JOURNAL_COMMON_LOGARGS with a debug level of 'n' appended. */
#define JOURNAL_DEBUG_LOGARGS(n) JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)

/*%
 * Store 'code' in the caller's local variable 'result' and jump to the
 * caller's 'failure' label unless 'code' is ISC_R_SUCCESS.  The enclosing
 * function must therefore declare 'result' and provide a 'failure:' label.
 *
 * It would be non-sensical (or at least obtuse) to use FAIL() with an
 * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
 * from complaining about "end-of-loop code not reached".
 */
#define FAIL(code)                           \
	do {                                 \
		result = (code);             \
		if (result != ISC_R_SUCCESS) \
			goto failure;        \
	} while (0)

/*
 * Evaluate 'op' once, store its value in the caller's local variable
 * 'result', and jump to the caller's 'failure' label if it is anything
 * other than ISC_R_SUCCESS.
 */
#define CHECK(op)                            \
	do {                                 \
		result = (op);               \
		if (result != ISC_R_SUCCESS) \
			goto failure;        \
	} while (0)

/*
 * Bit in the on-disk header's 'flags' byte that mirrors the in-core
 * 'serialset' boolean of journal_header_t.
 */
#define JOURNAL_SERIALSET 0x01U

/* Forward declaration; index_to_disk() is defined elsewhere in this file. */
static isc_result_t
index_to_disk(dns_journal_t *);

/*
 * Decode the four bytes at 'p' as an unsigned 32-bit integer stored
 * most-significant byte first (big-endian) and return its value.
 *
 * Each byte is widened to uint32_t before it is shifted: an unsigned char
 * operand is promoted to (signed) int, and shifting a value of 0x80 or
 * more left by 24 bits would overflow that int, which is undefined
 * behavior.
 */
static inline uint32_t
decode_uint32(unsigned char *p) {
	return (((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
		((uint32_t)p[2] << 8) | (uint32_t)p[3]);
}

/*
 * Store 'val' into the four bytes at 'p', most-significant byte first
 * (big-endian).
 */
static inline void
encode_uint32(uint32_t val, unsigned char *p) {
	unsigned int i;

	for (i = 0; i < 4; i++) {
		/* Byte 0 takes bits 31..24, byte 3 takes bits 7..0. */
		p[i] = (uint8_t)(val >> (24 - 8 * i));
	}
}

/*
 * Create a diff tuple for the SOA record at the origin of 'db'.
 *
 * The SOA rdataset is looked up at the database origin in version 'ver';
 * its first rdata, its TTL and its owner name (with the owner case taken
 * from the rdataset) are passed to dns_difftuple_create() together with
 * 'op', 'mctx' and 'tp'.
 *
 * Returns the result of dns_difftuple_create() when the SOA was found.
 * If the origin node, the SOA rdataset or its first rdata cannot be
 * obtained, an unexpected-error message ("missing SOA") is emitted and
 * the failing lookup's result is returned.
 */
isc_result_t
dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
		      dns_diffop_t op, dns_difftuple_t **tp) {
	isc_result_t result;
	dns_dbnode_t *node;
	dns_rdataset_t rdataset;
	dns_rdata_t rdata = DNS_RDATA_INIT;
	dns_fixedname_t fixed;
	dns_name_t *zonename;

	zonename = dns_fixedname_initname(&fixed);
	dns_name_copynf(dns_db_origin(db), zonename);

	node = NULL;
	result = dns_db_findnode(db, zonename, false, &node);
	if (result != ISC_R_SUCCESS) {
		goto nonode;
	}

	dns_rdataset_init(&rdataset);
	result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
				     (isc_stdtime_t)0, &rdataset, NULL);
	if (result != ISC_R_SUCCESS) {
		goto freenode;
	}

	result = dns_rdataset_first(&rdataset);
	if (result != ISC_R_SUCCESS) {
		/*
		 * The rdataset was associated by the successful
		 * dns_db_findrdataset() call above; release it before
		 * bailing out so that it is not leaked.
		 */
		dns_rdataset_disassociate(&rdataset);
		goto freenode;
	}

	dns_rdataset_current(&rdataset, &rdata);
	dns_rdataset_getownercase(&rdataset, zonename);

	result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl, &rdata,
				      tp);

	dns_rdataset_disassociate(&rdataset);
	dns_db_detachnode(db, &node);
	return (result);

freenode:
	dns_db_detachnode(db, &node);
nonode:
	UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
	return (result);
}

/* Journaling */

/*%
 * On-disk representation of a "pointer" to a journal entry.
 * These are used in the journal header to locate the beginning
 * and end of the journal, and in the journal index to locate
 * other transactions.
 *
 * Both fields hold 32-bit integers that are converted to and from
 * their in-core form with decode_uint32() and encode_uint32().
 */
typedef struct {
	unsigned char serial[4]; /*%< SOA serial before update. */
	/*
	 * XXXRTH  Should offset be 8 bytes?
	 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
	 * XXXAG  ... but we will not be able to seek >2G anyway on many
	 *            platforms as long as we are using fseek() rather
	 *            than lseek().
	 */
	unsigned char offset[4]; /*%< Offset from beginning of file. */
} journal_rawpos_t;

/*%
 * The header is of a fixed size, with some spare room for future
 * extensions.  journal_file_create() asserts that
 * sizeof(journal_rawheader_t) equals this value.
 */
#define JOURNAL_HEADER_SIZE 64 /* Bytes. */

/*%
 * The on-disk representation of the journal header.
 * All numbers are stored in big-endian order.
 *
 * The union with 'pad' makes the type occupy JOURNAL_HEADER_SIZE bytes
 * regardless of how many of them the named fields use.
 */
typedef union {
	struct {
		/*% File format version ID. */
		unsigned char format[16];
		/*% Position of the first addressable transaction */
		journal_rawpos_t begin;
		/*% Position of the next (yet nonexistent) transaction. */
		journal_rawpos_t end;
		/*% Number of index entries following the header. */
		unsigned char index_size[4];
		/*% Source serial number. */
		unsigned char sourceserial[4];
		/*% Flag bits; JOURNAL_SERIALSET is stored here. */
		unsigned char flags;
	} h;
	/* Pad the header to a fixed size. */
	unsigned char pad[JOURNAL_HEADER_SIZE];
} journal_rawheader_t;

/*%
 * The on-disk representation of the transaction header.
 * There is one of these at the beginning of each transaction.
 * Each field is a 32-bit integer read with decode_uint32() and
 * written with encode_uint32().
 */
typedef struct {
	unsigned char size[4];	  /*%< In bytes, excluding header. */
	unsigned char serial0[4]; /*%< SOA serial before update. */
	unsigned char serial1[4]; /*%< SOA serial after update. */
} journal_rawxhdr_t;

/*%
 * The on-disk representation of the RR header.
 * There is one of these at the beginning of each RR.
 * The size is a 32-bit integer read with decode_uint32().
 */
typedef struct {
	unsigned char size[4]; /*%< In bytes, excluding header. */
} journal_rawrrhdr_t;

/*%
 * The in-core representation of a journal position (the decoded form
 * of journal_rawpos_t): a serial number paired with a file offset.
 * An offset of zero marks the position as invalid/unused; see
 * POS_VALID().
 */
typedef struct {
	uint32_t serial;     /*%< SOA serial at this position. */
	isc_offset_t offset; /*%< Offset from beginning of file. */
} journal_pos_t;

/*
 * A journal_pos_t is considered valid when its offset is non-zero.
 * POS_INVALIDATE() clears both the offset and the serial.  Both macros
 * take the structure itself (not a pointer); POS_INVALIDATE() evaluates
 * its argument twice.
 */
#define POS_VALID(pos)	    ((pos).offset != 0)
#define POS_INVALIDATE(pos) ((pos).offset = 0, (pos).serial = 0)

/*%
 * The in-core representation of the journal header; the decoded
 * counterpart of journal_rawheader_t.
 */
typedef struct {
	unsigned char format[16]; /*%< File format version ID. */
	journal_pos_t begin;	  /*%< First addressable transaction. */
	journal_pos_t end;	  /*%< Next (yet nonexistent) transaction. */
	uint32_t index_size;	  /*%< Number of index entries. */
	uint32_t sourceserial;	  /*%< Source serial number. */
	bool serialset;		  /*%< JOURNAL_SERIALSET flag from disk. */
} journal_header_t;

/*%
 * The in-core representation of the transaction header; the decoded
 * counterpart of journal_rawxhdr_t.
 */

typedef struct {
	uint32_t size;	  /*%< In bytes, excluding header. */
	uint32_t serial0; /*%< SOA serial before update. */
	uint32_t serial1; /*%< SOA serial after update. */
} journal_xhdr_t;

/*%
 * The in-core representation of the RR header; the decoded counterpart
 * of journal_rawrrhdr_t.
 */
typedef struct {
	uint32_t size; /*%< In bytes, excluding header. */
} journal_rrhdr_t;

/*%
 * Initial contents to store in the header of a newly created
 * journal file.
 *
 * The header starts with the magic string ";BIND LOG V9\n"
 * to identify the file as a BIND 9 journal file.  An ASCII
 * identification string is used rather than a binary magic
 * number to be consistent with BIND 8 (BIND 8 journal files
 * are ASCII text files).
 *
 * The remaining initializers are, in order: 'begin', 'end',
 * 'index_size', 'sourceserial' and 'serialset', all zero.
 * journal_open() also compares a file's 'format' field against
 * this object's to recognize the journal format.
 */

static journal_header_t initial_journal_header = {
	";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0, 0, 0
};

/*
 * True when the header pointed to by 'h' has identical 'begin' and 'end'
 * offsets, i.e. it addresses no transactions.
 */
#define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)

/*
 * Life-cycle state of a dns_journal_t.
 *
 * journal_open() starts a journal in JOURNAL_STATE_INVALID and, on
 * success, leaves it in JOURNAL_STATE_WRITE or JOURNAL_STATE_READ
 * depending on whether it was opened writable.
 * dns_journal_begin_transaction() accepts a journal in the WRITE or
 * INLINE state and moves it to JOURNAL_STATE_TRANSACTION.
 */
typedef enum {
	JOURNAL_STATE_INVALID,
	JOURNAL_STATE_READ,
	JOURNAL_STATE_WRITE,
	JOURNAL_STATE_TRANSACTION,
	JOURNAL_STATE_INLINE
} journal_state_t;

/*
 * An open journal file together with its decoded header and index, the
 * bookkeeping for a transaction being written, and the state of an
 * iteration over its contents.
 */
struct dns_journal {
	unsigned int magic; /*%< JOUR */
	isc_mem_t *mctx;    /*%< Memory context */
	journal_state_t state;
	char *filename;		 /*%< Journal file name */
	FILE *fp;		 /*%< File handle */
	isc_offset_t offset;	 /*%< Current file offset; -1 after
				  * journal_open() until a seek */
	journal_header_t header; /*%< In-core journal header */
	unsigned char *rawindex; /*%< In-core buffer for journal index
				  * in on-disk format */
	journal_pos_t *index;	 /*%< In-core journal index */

	/*% Current transaction state (when writing). */
	struct {
		unsigned int n_soa;   /*%< Number of SOAs seen */
		journal_pos_t pos[2]; /*%< Begin/end position */
	} x;

	/*% Iteration state (when reading). */
	struct {
		/* These define the part of the journal we iterate over. */
		journal_pos_t bpos; /*%< Position before first, */
		journal_pos_t epos; /*%< and after last transaction */
		/* The rest is iterator state. */
		uint32_t current_serial; /*%< Current SOA serial */
		isc_buffer_t source;	 /*%< Data from disk */
		isc_buffer_t target;	 /*%< Data from _fromwire check */
		dns_decompress_t dctx;	 /*%< Dummy decompression ctx */
		dns_name_t name;	 /*%< Current domain name */
		dns_rdata_t rdata;	 /*%< Current rdata */
		uint32_t ttl;		 /*%< Current TTL */
		unsigned int xsize;	 /*%< Size of transaction data */
		unsigned int xpos;	 /*%< Current position in it */
		isc_result_t result;	 /*%< Result of last call */
	} it;
};

/*
 * Magic number ('J','O','U','R') stored in dns_journal_t.magic, and the
 * validity test built on it that the REQUIRE() checks in this file use.
 */
#define DNS_JOURNAL_MAGIC    ISC_MAGIC('J', 'O', 'U', 'R')
#define DNS_JOURNAL_VALID(t) ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)

/*
 * Convert the on-disk position 'raw' into its in-core form 'cooked'.
 */
static void
journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
	uint32_t serial = decode_uint32(raw->serial);
	uint32_t offset = decode_uint32(raw->offset);

	cooked->serial = serial;
	cooked->offset = offset;
}

/*
 * Convert the in-core position 'cooked' into its on-disk form 'raw'.
 * The offset is narrowed to 32 bits, the width of the on-disk field.
 */
static void
journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
	uint32_t serial = cooked->serial;
	uint32_t offset = (uint32_t)cooked->offset;

	encode_uint32(serial, raw->serial);
	encode_uint32(offset, raw->offset);
}

/*
 * Convert the on-disk journal header 'raw' into its in-core form
 * 'cooked'.
 */
static void
journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
	const unsigned char flags = raw->h.flags;

	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));

	/* The format ID is an opaque byte string; copy it verbatim. */
	memmove(cooked->format, raw->h.format, sizeof(cooked->format));

	cooked->index_size = decode_uint32(raw->h.index_size);
	cooked->sourceserial = decode_uint32(raw->h.sourceserial);
	cooked->serialset = (flags & JOURNAL_SERIALSET) ? true : false;

	journal_pos_decode(&raw->h.begin, &cooked->begin);
	journal_pos_decode(&raw->h.end, &cooked->end);
}

/*
 * Convert the in-core journal header 'cooked' into its on-disk form
 * 'raw'.  Every byte of 'raw' not covered by a named field is zero.
 */
static void
journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));

	/* Clear the whole fixed-size header before filling in fields. */
	memset(raw->pad, 0, sizeof(raw->pad));

	memmove(raw->h.format, cooked->format, sizeof(raw->h.format));
	encode_uint32(cooked->index_size, raw->h.index_size);
	encode_uint32(cooked->sourceserial, raw->h.sourceserial);
	journal_pos_encode(&raw->h.begin, &cooked->begin);
	journal_pos_encode(&raw->h.end, &cooked->end);

	raw->h.flags = cooked->serialset ? JOURNAL_SERIALSET : 0;
}

/*
 * Journal file I/O subroutines, with error checking and reporting.
 */
/*
 * Position the journal's file handle at absolute byte 'offset' and
 * record that offset in j->offset.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_UNEXPECTED (after logging the
 * underlying error) if the seek fails.
 */
static isc_result_t
journal_seek(dns_journal_t *j, uint32_t offset) {
	isc_result_t result = isc_stdio_seek(j->fp, (off_t)offset, SEEK_SET);

	if (result == ISC_R_SUCCESS) {
		j->offset = offset;
		return (ISC_R_SUCCESS);
	}

	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, "%s: seek: %s",
		      j->filename, isc_result_totext(result));
	return (ISC_R_UNEXPECTED);
}

/*
 * Read 'nbytes' bytes from the journal's current file position into
 * 'mem' and advance j->offset accordingly.
 *
 * Returns ISC_R_SUCCESS, ISC_R_NOMORE when isc_stdio_read() reports
 * ISC_R_EOF, or ISC_R_UNEXPECTED (after logging) for any other error.
 */
static isc_result_t
journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
	isc_result_t result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);

	if (result == ISC_R_SUCCESS) {
		j->offset += (isc_offset_t)nbytes;
		return (ISC_R_SUCCESS);
	}

	/* End of file is an expected condition and is not logged. */
	if (result == ISC_R_EOF) {
		return (ISC_R_NOMORE);
	}

	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, "%s: read: %s",
		      j->filename, isc_result_totext(result));
	return (ISC_R_UNEXPECTED);
}

/*
 * Write 'nbytes' bytes from 'mem' at the journal's current file
 * position and advance j->offset accordingly.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_UNEXPECTED (after logging the
 * underlying error) if the write fails.
 */
static isc_result_t
journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
	isc_result_t result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);

	if (result == ISC_R_SUCCESS) {
		j->offset += (isc_offset_t)nbytes;
		return (ISC_R_SUCCESS);
	}

	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, "%s: write: %s",
		      j->filename, isc_result_totext(result));
	return (ISC_R_UNEXPECTED);
}

/*
 * Flush the journal's stdio buffer and then sync the file.
 *
 * Returns ISC_R_SUCCESS if both steps succeed.  If either fails the
 * error is logged and ISC_R_UNEXPECTED is returned; the sync is not
 * attempted when the flush fails.
 */
static isc_result_t
journal_fsync(dns_journal_t *j) {
	isc_result_t result;

	result = isc_stdio_flush(j->fp);
	if (result == ISC_R_SUCCESS) {
		result = isc_stdio_sync(j->fp);
		if (result == ISC_R_SUCCESS) {
			return (ISC_R_SUCCESS);
		}
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: fsync: %s", j->filename,
			      isc_result_totext(result));
	} else {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: flush: %s", j->filename,
			      isc_result_totext(result));
	}

	return (ISC_R_UNEXPECTED);
}

/*
 * Read/write a transaction header at the current file position.
 */

/*
 * Read a transaction header at the current file position and decode it
 * into '*xhdr'.  On any journal_read() failure that result is returned
 * and '*xhdr' is left untouched.
 */
static isc_result_t
journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
	journal_rawxhdr_t wire;
	isc_result_t result = journal_read(j, &wire, sizeof(wire));

	if (result == ISC_R_SUCCESS) {
		xhdr->size = decode_uint32(wire.size);
		xhdr->serial0 = decode_uint32(wire.serial0);
		xhdr->serial1 = decode_uint32(wire.serial1);
	}

	return (result);
}

/*
 * Encode a transaction header with the given 'size', 'serial0' and
 * 'serial1' and write it at the current file position.  Returns the
 * result of journal_write().
 */
static isc_result_t
journal_write_xhdr(dns_journal_t *j, uint32_t size, uint32_t serial0,
		   uint32_t serial1) {
	journal_rawxhdr_t wire;

	encode_uint32(serial1, wire.serial1);
	encode_uint32(serial0, wire.serial0);
	encode_uint32(size, wire.size);

	return (journal_write(j, &wire, sizeof(wire)));
}

/*
 * Read an RR header at the current file position and decode it into
 * '*rrhdr'.  On any journal_read() failure that result is returned and
 * '*rrhdr' is left untouched.
 */

static isc_result_t
journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
	journal_rawrrhdr_t wire;
	isc_result_t result = journal_read(j, &wire, sizeof(wire));

	if (result == ISC_R_SUCCESS) {
		rrhdr->size = decode_uint32(wire.size);
	}

	return (result);
}

/*
 * Create (or truncate) the file 'filename' and fill it with an empty
 * journal: the initial header followed by a zero-filled index of 56
 * entries.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_UNEXPECTED (after logging) if the
 * file cannot be opened, written or closed.  After a write or close
 * failure the partially written file is removed.
 */
static isc_result_t
journal_file_create(isc_mem_t *mctx, const char *filename) {
	FILE *out = NULL;
	isc_result_t result;
	journal_header_t cooked;
	journal_rawheader_t raw;
	int nentries = 56; /* XXX configurable */
	int nbytes;
	void *image; /* Header plus empty index, as written to disk. */

	INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);

	result = isc_stdio_open(filename, "wb", &out);
	if (result != ISC_R_SUCCESS) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: create: %s", filename,
			      isc_result_totext(result));
		return (ISC_R_UNEXPECTED);
	}

	/* Build the on-disk header from the template. */
	cooked = initial_journal_header;
	cooked.index_size = nentries;
	journal_header_encode(&cooked, &raw);

	nbytes = sizeof(journal_rawheader_t) +
		 nentries * sizeof(journal_rawpos_t);

	/*
	 * Zero the whole image so that every index entry starts out
	 * unused, then overlay the encoded header at the front.
	 */
	image = isc_mem_get(mctx, nbytes);
	memset(image, 0, nbytes);
	memmove(image, &raw, sizeof(raw));

	result = isc_stdio_write(image, 1, (size_t)nbytes, out, NULL);
	if (result != ISC_R_SUCCESS) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: write: %s", filename,
			      isc_result_totext(result));
		(void)isc_stdio_close(out);
		(void)isc_file_remove(filename);
		isc_mem_put(mctx, image, nbytes);
		return (ISC_R_UNEXPECTED);
	}
	isc_mem_put(mctx, image, nbytes);

	result = isc_stdio_close(out);
	if (result != ISC_R_SUCCESS) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: close: %s", filename,
			      isc_result_totext(result));
		(void)isc_file_remove(filename);
		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}

/*
 * Open the journal file 'filename' and return a new journal object in
 * '*journalp' (which must point to NULL on entry).
 *
 * The file is opened "rb+" when 'writable' is true and "rb" otherwise.
 * If it does not exist and 'create' is true, an empty journal is created
 * with journal_file_create() and opened "rb+"; if it does not exist and
 * 'create' is false, ISC_R_NOTFOUND is returned.
 *
 * After opening, the header is read and its format field compared with
 * that of initial_journal_header, the index (if any) is read and decoded,
 * and the iterator members are initialized.
 *
 * Returns ISC_R_SUCCESS, ISC_R_NOTFOUND as described above,
 * ISC_R_UNEXPECTED for an open failure or an unrecognized format, or
 * whatever journal_file_create(), journal_seek() or journal_read()
 * returned.  On failure everything allocated here is released and
 * '*journalp' is not modified.
 */
static isc_result_t
journal_open(isc_mem_t *mctx, const char *filename, bool writable, bool create,
	     dns_journal_t **journalp) {
	FILE *fp = NULL;
	isc_result_t result;
	journal_rawheader_t rawheader;
	dns_journal_t *j;

	INSIST(journalp != NULL && *journalp == NULL);
	j = isc_mem_get(mctx, sizeof(*j));

	/*
	 * Put every member that the 'failure' path inspects into a known
	 * state before anything can fail.
	 */
	j->mctx = NULL;
	isc_mem_attach(mctx, &j->mctx);
	j->state = JOURNAL_STATE_INVALID;
	j->fp = NULL;
	j->filename = isc_mem_strdup(mctx, filename);
	j->index = NULL;
	j->rawindex = NULL;

	if (j->filename == NULL) {
		FAIL(ISC_R_NOMEMORY);
	}

	result = isc_stdio_open(j->filename, writable ? "rb+" : "rb", &fp);

	if (result == ISC_R_FILENOTFOUND) {
		if (create) {
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(1),
				      "journal file %s does not exist, "
				      "creating it",
				      j->filename);
			CHECK(journal_file_create(mctx, filename));
			/*
			 * Retry.
			 */
			result = isc_stdio_open(j->filename, "rb+", &fp);
		} else {
			FAIL(ISC_R_NOTFOUND);
		}
	}
	if (result != ISC_R_SUCCESS) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: open: %s", j->filename,
			      isc_result_totext(result));
		FAIL(ISC_R_UNEXPECTED);
	}

	j->fp = fp;

	/*
	 * Set magic early so that seek/read can succeed.
	 */
	j->magic = DNS_JOURNAL_MAGIC;

	CHECK(journal_seek(j, 0));
	CHECK(journal_read(j, &rawheader, sizeof(rawheader)));

	/* Reject files that do not start with the expected format ID. */
	if (memcmp(rawheader.h.format, initial_journal_header.format,
		   sizeof(initial_journal_header.format)) != 0)
	{
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: journal format not recognized", j->filename);
		FAIL(ISC_R_UNEXPECTED);
	}
	journal_header_decode(&rawheader, &j->header);

	/*
	 * If there is an index, read the raw index into a dynamically
	 * allocated buffer and then convert it into a cooked index.
	 */
	if (j->header.index_size != 0) {
		unsigned int i;
		unsigned int rawbytes;
		unsigned char *p;

		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
		j->rawindex = isc_mem_get(mctx, rawbytes);

		CHECK(journal_read(j, j->rawindex, rawbytes));

		j->index = isc_mem_get(mctx, j->header.index_size *
						     sizeof(journal_pos_t));

		/* Each raw entry is a 4-byte serial then a 4-byte offset. */
		p = j->rawindex;
		for (i = 0; i < j->header.index_size; i++) {
			j->index[i].serial = decode_uint32(p);
			p += 4;
			j->index[i].offset = decode_uint32(p);
			p += 4;
		}
		INSIST(p == j->rawindex + rawbytes);
	}
	j->offset = -1; /* Invalid, must seek explicitly. */

	/*
	 * Initialize the iterator.
	 */
	dns_name_init(&j->it.name, NULL);
	dns_rdata_init(&j->it.rdata);

	/*
	 * Set up empty initial buffers for unchecked and checked
	 * wire format RR data.  They will be reallocated
	 * later.
	 */
	isc_buffer_init(&j->it.source, NULL, 0);
	isc_buffer_init(&j->it.target, NULL, 0);
	dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);

	j->state = writable ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;

	*journalp = j;
	return (ISC_R_SUCCESS);

failure:
	/*
	 * Undo whatever was set up above.  'rawindex' and 'index' can only
	 * be non-NULL after the header has been decoded, so
	 * j->header.index_size is valid whenever it is used here.
	 */
	j->magic = 0;
	if (j->rawindex != NULL) {
		isc_mem_put(j->mctx, j->rawindex,
			    j->header.index_size * sizeof(journal_rawpos_t));
	}
	if (j->index != NULL) {
		isc_mem_put(j->mctx, j->index,
			    j->header.index_size * sizeof(journal_pos_t));
	}
	if (j->filename != NULL) {
		isc_mem_free(j->mctx, j->filename);
	}
	if (j->fp != NULL) {
		(void)isc_stdio_close(j->fp);
	}
	isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
	return (result);
}

/*
 * Open the journal 'filename' according to 'mode' and return it in
 * '*journalp'.
 *
 * The journal is opened writable when 'mode' contains DNS_JOURNAL_WRITE
 * or DNS_JOURNAL_CREATE, and is created if missing when 'mode' contains
 * DNS_JOURNAL_CREATE.
 *
 * If the first attempt returns ISC_R_NOTFOUND, a second attempt is made
 * on a backup file whose name is 'filename' with a trailing ".jnl" (if
 * present) replaced by ".jbk"; for that attempt the file is created if
 * missing whenever the journal is being opened writable.
 *
 * Returns the result of journal_open(), or ISC_R_NOSPACE if the backup
 * file name cannot be formatted into the local buffer.
 */
isc_result_t
dns_journal_open(isc_mem_t *mctx, const char *filename, unsigned int mode,
		 dns_journal_t **journalp) {
	isc_result_t result;
	size_t namelen;
	char backup[1024];
	bool writable, create;
	int n;

	create = ((mode & DNS_JOURNAL_CREATE) != 0);
	writable = ((mode & (DNS_JOURNAL_WRITE | DNS_JOURNAL_CREATE)) != 0);

	result = journal_open(mctx, filename, writable, create, journalp);
	if (result == ISC_R_NOTFOUND) {
		namelen = strlen(filename);
		if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0)
		{
			namelen -= 4;
		}

		/*
		 * snprintf() returns an int (negative on error), so keep
		 * it in an int rather than in the isc_result_t and test
		 * for both failure and truncation explicitly.
		 */
		n = snprintf(backup, sizeof(backup), "%.*s.jbk", (int)namelen,
			     filename);
		if (n < 0 || (size_t)n >= sizeof(backup)) {
			return (ISC_R_NOSPACE);
		}
		result = journal_open(mctx, backup, writable, writable,
				      journalp);
	}
	return (result);
}

/*
 * A comparison function defining the sorting order for
 * entries in the IXFR-style journal file.
 *
 * The IXFR format requires that deletions are sorted before
 * additions, and within either one, SOA records are sorted
 * before others.
 *
 * Also sort the non-SOA records by type as a courtesy to the
 * server receiving the IXFR - it may help reduce the amount of
 * rdataset merging it has to do.
 */
static int
ixfr_order(const void *av, const void *bv) {
	dns_difftuple_t const *const *ap = av;
	dns_difftuple_t const *const *bp = bv;
	dns_difftuple_t const *a = *ap;
	dns_difftuple_t const *b = *bp;
	int r;
	int bop = 0, aop = 0;

	switch (a->op) {
	case DNS_DIFFOP_DEL:
	case DNS_DIFFOP_DELRESIGN:
		aop = 1;
		break;
	case DNS_DIFFOP_ADD:
	case DNS_DIFFOP_ADDRESIGN:
		aop = 0;
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	switch (b->op) {
	case DNS_DIFFOP_DEL:
	case DNS_DIFFOP_DELRESIGN:
		bop = 1;
		break;
	case DNS_DIFFOP_ADD:
	case DNS_DIFFOP_ADDRESIGN:
		bop = 0;
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	r = bop - aop;
	if (r != 0) {
		return (r);
	}

	r = (b->rdata.type == dns_rdatatype_soa) -
	    (a->rdata.type == dns_rdatatype_soa);
	if (r != 0) {
		return (r);
	}

	r = (a->rdata.type - b->rdata.type);
	return (r);
}

/*
 * Advance '*pos' to the next journal transaction.
 *
 * The transaction header at pos->offset is read to learn the
 * transaction's size and ending serial; '*pos' is then moved past the
 * header and the transaction data, and its serial becomes that ending
 * serial.  '*pos' is only modified when ISC_R_SUCCESS is returned.
 *
 * Requires:
 *	*pos refers to a valid journal transaction.
 *
 * Ensures:
 *	When ISC_R_SUCCESS is returned,
 *	*pos refers to the next journal transaction.
 *
 * Returns one of:
 *
 *    ISC_R_SUCCESS
 *    ISC_R_NOMORE 	*pos pointed at the last transaction
 *    ISC_R_UNEXPECTED	the serial stored in the transaction header does
 *			not match pos->serial, or the new offset wrapped
 *    Other results due to file errors are possible.
 */
static isc_result_t
journal_next(dns_journal_t *j, journal_pos_t *pos) {
	isc_result_t result;
	journal_xhdr_t xhdr;
	REQUIRE(DNS_JOURNAL_VALID(j));

	result = journal_seek(j, pos->offset);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	/* Already at the journal's end serial: nothing follows. */
	if (pos->serial == j->header.end.serial) {
		return (ISC_R_NOMORE);
	}
	/*
	 * Read the header of the current transaction.
	 * This will return ISC_R_NOMORE if we are at EOF.
	 */
	result = journal_read_xhdr(j, &xhdr);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	/*
	 * Check serial number consistency.
	 */
	if (xhdr.serial0 != pos->serial) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: journal file corrupt: "
			      "expected serial %u, got %u",
			      j->filename, pos->serial, xhdr.serial0);
		return (ISC_R_UNEXPECTED);
	}

	/*
	 * Check for offset wraparound.
	 *
	 * NOTE(review): this only fires if the sum wraps within
	 * isc_offset_t; where that type is wider than the 32-bit on-disk
	 * offset, an offset that no longer fits on disk would presumably
	 * pass unnoticed - confirm.
	 */
	if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) +
			   xhdr.size) < pos->offset)
	{
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: offset too large", j->filename);
		return (ISC_R_UNEXPECTED);
	}

	/* Skip the transaction header and the transaction data. */
	pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
	pos->serial = xhdr.serial1;
	return (ISC_R_SUCCESS);
}

/*
 * Look through the index of journal 'j' for an entry that is a better
 * starting point than '*best_guess' for locating 'serial', and replace
 * '*best_guess' with it if one exists.
 *
 * An entry is "better" when its serial number is not greater than
 * 'serial' yet greater than the serial of the current best guess.
 * Unused index entries are ignored.
 */
static void
index_find(dns_journal_t *j, uint32_t serial, journal_pos_t *best_guess) {
	unsigned int n;

	if (j->index == NULL) {
		return;
	}

	for (n = 0; n < j->header.index_size; n++) {
		const journal_pos_t *entry = &j->index[n];

		if (!POS_VALID(*entry)) {
			continue;
		}
		if (!DNS_SERIAL_GE(serial, entry->serial)) {
			continue;
		}
		if (DNS_SERIAL_GT(entry->serial, best_guess->serial)) {
			*best_guess = *entry;
		}
	}
}

/*
 * Record '*pos' in the first unused slot of the journal's index.
 *
 * When every slot is taken, room is made by keeping only the
 * even-numbered entries, packing them into the front of the index and
 * marking the remainder unused.  Repeating this thins out old entries
 * exponentially over time, so the index always holds a much larger
 * share of recent serial numbers than of old ones.  That is deliberate:
 * most index searches are for outgoing IXFR, and IXFR tends to request
 * recent versions more often than old ones.
 *
 * Does nothing if the journal has no index.
 */
static void
index_add(dns_journal_t *j, journal_pos_t *pos) {
	unsigned int size;
	unsigned int slot = 0;

	if (j->index == NULL) {
		return;
	}
	size = j->header.index_size;

	/*
	 * Look for the first unused slot.
	 */
	while (slot < size && POS_VALID(j->index[slot])) {
		slot++;
	}

	if (slot == size) {
		/*
		 * The index is full: keep every other entry, packed at
		 * the front, and free up the rest.
		 */
		unsigned int src;
		unsigned int dst = 0;

		for (src = 0; src < size; src += 2) {
			j->index[dst++] = j->index[src];
		}
		slot = dst; /* First slot past the surviving entries. */
		for (; dst < size; dst++) {
			POS_INVALIDATE(j->index[dst]);
		}
	}
	INSIST(slot < size);
	INSIST(!POS_VALID(j->index[slot]));

	/*
	 * Store the new index entry.
	 */
	j->index[slot] = *pos;
}

/*
 * Mark as unused every index entry whose serial number is not strictly
 * less than 'serial' (in serial-number arithmetic); such entries could
 * become ambiguous once a new transaction numbered 'serial' is added.
 *
 * Does nothing if the journal has no index.
 */
static void
index_invalidate(dns_journal_t *j, uint32_t serial) {
	unsigned int n;

	if (j->index == NULL) {
		return;
	}

	for (n = 0; n < j->header.index_size; n++) {
		if (DNS_SERIAL_GT(serial, j->index[n].serial)) {
			/* Strictly older than 'serial': keep it. */
			continue;
		}
		POS_INVALIDATE(j->index[n]);
	}
}

/*
 * Try to find a transaction with initial serial number 'serial'
 * in the journal 'j'.
 *
 * If found, store its position at '*pos' and return ISC_R_SUCCESS.
 *
 * If 'serial' is current (= the ending serial number of the
 * last transaction in the journal), set '*pos' to
 * the position immediately following the last transaction and
 * return ISC_R_SUCCESS.
 *
 * If 'serial' is within the range of addressable serial numbers
 * covered by the journal but that particular serial number is missing
 * (from the journal, not just from the index), return ISC_R_NOTFOUND.
 *
 * If 'serial' is outside the range of addressable serial numbers
 * covered by the journal, return ISC_R_RANGE.
 *
 */
static isc_result_t
journal_find(dns_journal_t *j, uint32_t serial, journal_pos_t *pos) {
	journal_pos_t cursor;

	REQUIRE(DNS_JOURNAL_VALID(j));

	/* Reject serials that lie before the start or after the end. */
	if (DNS_SERIAL_GT(j->header.begin.serial, serial) ||
	    DNS_SERIAL_GT(serial, j->header.end.serial))
	{
		return (ISC_R_RANGE);
	}

	/* The current serial maps to the position just past the journal. */
	if (serial == j->header.end.serial) {
		*pos = j->header.end;
		return (ISC_R_SUCCESS);
	}

	/*
	 * Start from the beginning of the journal, let the index move
	 * the starting point if it can, then step through transactions
	 * one at a time.
	 */
	cursor = j->header.begin;
	index_find(j, serial, &cursor);

	for (;;) {
		isc_result_t result;

		if (cursor.serial == serial) {
			break;
		}
		if (DNS_SERIAL_GT(cursor.serial, serial)) {
			/* Stepped past it: the serial is not present. */
			return (ISC_R_NOTFOUND);
		}
		result = journal_next(j, &cursor);
		if (result != ISC_R_SUCCESS) {
			return (result);
		}
	}

	*pos = cursor;
	return (ISC_R_SUCCESS);
}

/*
 * Open a new transaction on journal 'j': seek to where it will be
 * stored, reserve room for its header, and switch the journal to
 * JOURNAL_STATE_TRANSACTION.
 */
isc_result_t
dns_journal_begin_transaction(dns_journal_t *j) {
	isc_result_t result;
	journal_rawxhdr_t placeholder;
	uint32_t start;

	REQUIRE(DNS_JOURNAL_VALID(j));
	REQUIRE(j->state == JOURNAL_STATE_WRITE ||
		j->state == JOURNAL_STATE_INLINE);

	/*
	 * An empty journal starts right after the file header and the
	 * index; otherwise the transaction is appended at the current end.
	 */
	if (!JOURNAL_EMPTY(&j->header)) {
		start = j->header.end.offset;
	} else {
		start = sizeof(journal_rawheader_t) +
			j->header.index_size * sizeof(journal_rawpos_t);
	}

	j->x.n_soa = 0;
	j->x.pos[0].offset = start;
	j->x.pos[1].offset = start; /* Provisional; advanced below. */

	result = journal_seek(j, start);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	/*
	 * Reserve space with an all-zero transaction header; the real
	 * values are written once the transaction is complete.
	 */
	memset(&placeholder, 0, sizeof(placeholder));
	result = journal_write(j, &placeholder, sizeof(placeholder));
	if (result != ISC_R_SUCCESS) {
		return (result);
	}
	j->x.pos[1].offset = j->offset;

	j->state = JOURNAL_STATE_TRANSACTION;
	return (ISC_R_SUCCESS);
}

/*
 * Append the tuples of 'diff' to the transaction currently open on
 * journal 'j'.
 *
 * Requires:
 *	'j' is a valid journal in JOURNAL_STATE_TRANSACTION.
 *	'diff' is a valid diff.
 *
 * Returns:
 *	ISC_R_SUCCESS	the RRs were written to the journal file.
 *	ISC_R_NOSPACE	the encoded diff would be DNS_JOURNAL_SIZE_MAX
 *			bytes or more; nothing is written.
 *	Otherwise the result of the failing journal_write().
 */
isc_result_t
dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
	dns_difftuple_t *t;
	isc_buffer_t buffer;
	void *mem = NULL;
	uint64_t size;
	isc_result_t result;
	isc_region_t used;

	/*
	 * Validate 'j' before its fields are read, as the other journal
	 * entry points do.
	 */
	REQUIRE(DNS_JOURNAL_VALID(j));
	REQUIRE(DNS_DIFF_VALID(diff));
	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);

	isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
	(void)dns_diff_print(diff, NULL);

	/*
	 * Pass 1: determine the buffer size needed, and
	 * keep track of SOA serial numbers.
	 */
	size = 0;
	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
	     t = ISC_LIST_NEXT(t, link)) {
		if (t->rdata.type == dns_rdatatype_soa) {
			/*
			 * Only the first two SOA serials are recorded, but
			 * every SOA is counted so that a later check can
			 * detect a malformed transaction.
			 */
			if (j->x.n_soa < 2) {
				j->x.pos[j->x.n_soa].serial =
					dns_soa_getserial(&t->rdata);
			}
			j->x.n_soa++;
		}
		size += sizeof(journal_rawrrhdr_t);
		size += t->name.length; /* XXX should have access macro? */
		/*
		 * Fixed RR fields written in pass 2:
		 * type (2) + class (2) + TTL (4) + rdata length (2).
		 */
		size += 10;
		size += t->rdata.length;
	}

	if (size >= DNS_JOURNAL_SIZE_MAX) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "dns_journal_writediff: %s: journal entry "
			      "too big to be stored: %" PRIu64 " bytes",
			      j->filename, size);
		return (ISC_R_NOSPACE);
	}

	mem = isc_mem_get(j->mctx, size);

	isc_buffer_init(&buffer, mem, size);

	/*
	 * Pass 2.  Write RRs to buffer.
	 */
	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
	     t = ISC_LIST_NEXT(t, link)) {
		/*
		 * Write the RR header.
		 */
		isc_buffer_putuint32(&buffer,
				     t->name.length + 10 + t->rdata.length);
		/*
		 * Write the owner name, RR header, and RR data.
		 */
		isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
		isc_buffer_putuint16(&buffer, t->rdata.type);
		isc_buffer_putuint16(&buffer, t->rdata.rdclass);
		isc_buffer_putuint32(&buffer, t->ttl);
		INSIST(t->rdata.length < 65536);
		isc_buffer_putuint16(&buffer, (uint16_t)t->rdata.length);
		INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
		isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
	}

	/* Pass 2 must have produced exactly what pass 1 predicted. */
	isc_buffer_usedregion(&buffer, &used);
	INSIST(used.length == size);

	j->x.pos[1].offset += used.length;

	/*
	 * Write the buffer contents to the journal file.
	 */
	CHECK(journal_write(j, used.base, used.length));

	result = ISC_R_SUCCESS;

failure:
	if (mem != NULL) {
		isc_mem_put(j->mctx, mem, size);
	}
	return (result);
}

/*
 * Finish the operation currently open on journal 'j'.
 *
 * In JOURNAL_STATE_INLINE only the journal header is rewritten and
 * synced, and the state returns to JOURNAL_STATE_WRITE.
 *
 * In JOURNAL_STATE_TRANSACTION the transaction is checked (exactly two
 * SOAs seen, serial increased, first serial equal to the journal's end
 * serial when the journal is not empty, size below
 * DNS_JOURNAL_SIZE_MAX), its header is filled in, and the journal
 * header and index are updated and synced.
 *
 * Returns ISC_R_SUCCESS, ISC_R_UNEXPECTED for a malformed or oversized
 * transaction, or the result of a failing journal helper.  The failure
 * paths do not change j->state.
 */
isc_result_t
dns_journal_commit(dns_journal_t *j) {
	isc_result_t result;
	journal_rawheader_t rawheader;
	uint64_t total;

	REQUIRE(DNS_JOURNAL_VALID(j));
	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION ||
		j->state == JOURNAL_STATE_INLINE);

	/*
	 * Just write out an updated header.
	 */
	if (j->state == JOURNAL_STATE_INLINE) {
		CHECK(journal_fsync(j));
		journal_header_encode(&j->header, &rawheader);
		CHECK(journal_seek(j, 0));
		CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
		CHECK(journal_fsync(j));
		j->state = JOURNAL_STATE_WRITE;
		return (ISC_R_SUCCESS);
	}

	/*
	 * Perform some basic consistency checks.
	 */
	if (j->x.n_soa != 2) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: malformed transaction: %d SOAs", j->filename,
			      j->x.n_soa);
		return (ISC_R_UNEXPECTED);
	}
	if (!DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial)) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: malformed transaction: serial number "
			      "did not increase",
			      j->filename);
		return (ISC_R_UNEXPECTED);
	}
	if (!JOURNAL_EMPTY(&j->header)) {
		/* The new transaction must continue from the journal's end. */
		if (j->x.pos[0].serial != j->header.end.serial) {
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
				      "malformed transaction: "
				      "%s last serial %u != "
				      "transaction first serial %u",
				      j->filename, j->header.end.serial,
				      j->x.pos[0].serial);
			return (ISC_R_UNEXPECTED);
		}
	}

	/*
	 * We currently don't support huge journal entries.
	 */
	total = j->x.pos[1].offset - j->x.pos[0].offset;
	if (total >= DNS_JOURNAL_SIZE_MAX) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "transaction too big to be stored in journal: "
			      "%" PRIu64 "b (max is %" PRIu64 "b)",
			      total, (uint64_t)DNS_JOURNAL_SIZE_MAX);
		return (ISC_R_UNEXPECTED);
	}

	/*
	 * Some old journal entries may become non-addressable
	 * when we increment the current serial number.  Purge them
	 * by stepping header.begin forward to the first addressable
	 * transaction.  Also purge them from the index.
	 */
	if (!JOURNAL_EMPTY(&j->header)) {
		while (!DNS_SERIAL_GT(j->x.pos[1].serial,
				      j->header.begin.serial)) {
			CHECK(journal_next(j, &j->header.begin));
		}
		index_invalidate(j, j->x.pos[1].serial);
	}
#ifdef notyet
	if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
		force_dump(...);
	}
#endif /* ifdef notyet */

	/*
	 * Commit the transaction data to stable storage.
	 */
	CHECK(journal_fsync(j));

	/*
	 * The INLINE case returned above, so only the TRANSACTION
	 * state can reach this point.
	 */
	if (j->state == JOURNAL_STATE_TRANSACTION) {
		isc_offset_t offset;
		/* Size of the transaction body, excluding its own header. */
		offset = (j->x.pos[1].offset - j->x.pos[0].offset) -
			 sizeof(journal_rawxhdr_t);
		/*
		 * Update the transaction header.
		 */
		CHECK(journal_seek(j, j->x.pos[0].offset));
		CHECK(journal_write_xhdr(j, offset, j->x.pos[0].serial,
					 j->x.pos[1].serial));
	}

	/*
	 * Update the journal header.
	 */
	if (JOURNAL_EMPTY(&j->header)) {
		j->header.begin = j->x.pos[0];
	}
	j->header.end = j->x.pos[1];
	journal_header_encode(&j->header, &rawheader);
	CHECK(journal_seek(j, 0));
	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));

	/*
	 * Update the index.
	 */
	index_add(j, &j->x.pos[0]);

	/*
	 * Convert the index into on-disk format and write
	 * it to disk.
	 */
	CHECK(index_to_disk(j));

	/*
	 * Commit the header to stable storage.
	 */
	CHECK(journal_fsync(j));

	/*
	 * We no longer have a transaction open.
	 */
	j->state = JOURNAL_STATE_WRITE;

	result = ISC_R_SUCCESS;

failure:
	return (result);
}

/*
 * Write 'diff' to journal 'j' as one complete transaction: sort the
 * diff with ixfr_order, begin a transaction, write the diff and commit.
 * The first step that fails determines the return value and the
 * remaining steps are skipped.
 */
isc_result_t
dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
	isc_result_t result;

	result = dns_diff_sort(diff, ixfr_order);
	if (result == ISC_R_SUCCESS) {
		result = dns_journal_begin_transaction(j);
	}
	if (result == ISC_R_SUCCESS) {
		result = dns_journal_writediff(j, diff);
	}
	if (result == ISC_R_SUCCESS) {
		result = dns_journal_commit(j);
	}
	return (result);
}

/*
 * Destroy the journal '*journalp', releasing its index arrays, iterator
 * buffers, filename, open file and the journal structure itself, and
 * set '*journalp' to NULL.
 *
 * Requires:
 *	'journalp' is not NULL and '*journalp' is a valid journal.
 */
void
dns_journal_destroy(dns_journal_t **journalp) {
	dns_journal_t *j = NULL;

	/*
	 * Check the arguments before dereferencing 'journalp' or
	 * modifying the caller's pointer.
	 */
	REQUIRE(journalp != NULL);
	REQUIRE(DNS_JOURNAL_VALID(*journalp));

	j = *journalp;
	*journalp = NULL;

	j->it.result = ISC_R_FAILURE;
	dns_name_invalidate(&j->it.name);
	dns_decompress_invalidate(&j->it.dctx);
	if (j->rawindex != NULL) {
		isc_mem_put(j->mctx, j->rawindex,
			    j->header.index_size * sizeof(journal_rawpos_t));
	}
	if (j->index != NULL) {
		isc_mem_put(j->mctx, j->index,
			    j->header.index_size * sizeof(journal_pos_t));
	}
	if (j->it.target.base != NULL) {
		isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
	}
	if (j->it.source.base != NULL) {
		isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
	}
	if (j->filename != NULL) {
		isc_mem_free(j->mctx, j->filename);
	}
	if (j->fp != NULL) {
		/* Best effort: a close error cannot be reported from here. */
		(void)isc_stdio_close(j->fp);
	}
	/* Clear the magic so stale pointers fail DNS_JOURNAL_VALID(). */
	j->magic = 0;
	isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
}

/*
 * Roll the open journal 'j' into the database 'db'.
 * A new database version will be created.
 */

/* XXX Share code with incoming IXFR? */

/*
 * 'options' is tested for DNS_JOURNALOPT_RESIGN, which selects the
 * DELRESIGN/ADDRESIGN diff operations instead of DEL/ADD.
 *
 * Returns ISC_R_SUCCESS if the journal was applied, DNS_R_UPTODATE if
 * the database serial already equals the journal's last serial, or the
 * result of the first failing step.  The new version is committed only
 * when the result is ISC_R_SUCCESS.
 */
static isc_result_t
roll_forward(dns_journal_t *j, dns_db_t *db, unsigned int options) {
	isc_buffer_t source; /* Transaction data from disk */
	isc_buffer_t target; /* Ditto after _fromwire check */
	uint32_t db_serial;  /* Database SOA serial */
	uint32_t end_serial; /* Last journal SOA serial */
	isc_result_t result;
	dns_dbversion_t *ver = NULL;
	journal_pos_t pos;
	dns_diff_t diff;
	unsigned int n_soa = 0;
	unsigned int n_put = 0;
	dns_diffop_t op;

	REQUIRE(DNS_JOURNAL_VALID(j));
	REQUIRE(DNS_DB_VALID(db));

	dns_diff_init(j->mctx, &diff);

	/*
	 * Set up empty initial buffers for unchecked and checked
	 * wire format transaction data.  They will be reallocated
	 * later.
	 */
	isc_buffer_init(&source, NULL, 0);
	isc_buffer_init(&target, NULL, 0);

	/*
	 * Create the new database version.
	 */
	CHECK(dns_db_newversion(db, &ver));

	/*
	 * Get the current database SOA serial number.
	 */
	CHECK(dns_db_getsoaserial(db, ver, &db_serial));

	/*
	 * Locate a journal entry for the current database serial.
	 */
	CHECK(journal_find(j, db_serial, &pos));
	/*
	 * XXX do more drastic things, like marking zone stale,
	 * if this fails?
	 */
	/*
	 * XXXRTH  The zone code should probably mark the zone as bad and
	 *         scream loudly into the log if this is a dynamic update
	 *	   log reply that failed.
	 */

	end_serial = dns_journal_last_serial(j);
	if (db_serial == end_serial) {
		/* Nothing to replay; CHECK() jumps to the cleanup code. */
		CHECK(DNS_R_UPTODATE);
	}

	CHECK(dns_journal_iter_init(j, db_serial, end_serial));

	for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
	     result = dns_journal_next_rr(j))
	{
		dns_name_t *name;
		uint32_t ttl;
		dns_rdata_t *rdata;
		dns_difftuple_t *tuple = NULL;

		name = NULL;
		rdata = NULL;
		dns_journal_current_rr(j, &name, &ttl, &rdata);

		/*
		 * n_soa tracks the position within a transaction:
		 * 1 after the first SOA (RRs are treated as deletions),
		 * 2 after the second SOA (RRs are treated as additions).
		 * A third SOA begins the next transaction.
		 */
		if (rdata->type == dns_rdatatype_soa) {
			n_soa++;
			if (n_soa == 2) {
				/* Used only in the log messages below. */
				db_serial = j->it.current_serial;
			}
		}

		if (n_soa == 3) {
			n_soa = 1;
		}
		if (n_soa == 0) {
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
				      "%s: journal file corrupt: missing "
				      "initial SOA",
				      j->filename);
			FAIL(ISC_R_UNEXPECTED);
		}
		if ((options & DNS_JOURNALOPT_RESIGN) != 0) {
			op = (n_soa == 1) ? DNS_DIFFOP_DELRESIGN
					  : DNS_DIFFOP_ADDRESIGN;
		} else {
			op = (n_soa == 1) ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD;
		}

		CHECK(dns_difftuple_create(diff.mctx, op, name, ttl, rdata,
					   &tuple));
		dns_diff_append(&diff, &tuple);

		/*
		 * Apply the accumulated tuples in batches, once more
		 * than 100 have been collected.
		 */
		if (++n_put > 100) {
			isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
				      "%s: applying diff to database (%u)",
				      j->filename, db_serial);
			(void)dns_diff_print(&diff, NULL);
			CHECK(dns_diff_apply(&diff, db, ver));
			dns_diff_clear(&diff);
			n_put = 0;
		}
	}
	/* ISC_R_NOMORE is the normal end of the iteration. */
	if (result == ISC_R_NOMORE) {
		result = ISC_R_SUCCESS;
	}
	CHECK(result);

	/* Apply whatever is left over from the last partial batch. */
	if (n_put != 0) {
		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
			      "%s: applying final diff to database (%u)",
			      j->filename, db_serial);
		(void)dns_diff_print(&diff, NULL);
		CHECK(dns_diff_apply(&diff, db, ver));
		dns_diff_clear(&diff);
	}

failure:
	/* Commit the new version only on complete success. */
	if (ver != NULL) {
		dns_db_closeversion(db, &ver,
				    result == ISC_R_SUCCESS ? true : false);
	}

	/*
	 * NOTE(review): nothing visible in this function gives 'source'
	 * or 'target' a non-NULL base, so these releases look unreachable
	 * here - confirm before relying on them.
	 */
	if (source.base != NULL) {
		isc_mem_put(j->mctx, source.base, source.length);
	}
	if (target.base != NULL) {
		isc_mem_put(j->mctx, target.base, target.length);
	}

	dns_diff_clear(&diff);

	INSIST(ver == NULL);

	return (result);
}

/*
 * Open the journal file 'filename' for reading and roll it forward
 * into 'db'.  Returns DNS_R_NOJOURNAL when the file cannot be found
 * and DNS_R_UPTODATE when the journal is empty; otherwise the result
 * of opening the journal or of roll_forward().
 */
isc_result_t
dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db, unsigned int options,
			const char *filename) {
	dns_journal_t *j = NULL;
	isc_result_t result;

	REQUIRE(DNS_DB_VALID(db));
	REQUIRE(filename != NULL);

	result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
	if (result != ISC_R_SUCCESS) {
		if (result != ISC_R_NOTFOUND) {
			return (result);
		}
		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file, but "
							"that's OK");
		return (DNS_R_NOJOURNAL);
	}

	/* An empty journal has nothing to apply. */
	if (!JOURNAL_EMPTY(&j->header)) {
		result = roll_forward(j, db, options);
	} else {
		result = DNS_R_UPTODATE;
	}

	dns_journal_destroy(&j);

	return (result);
}

/*
 * Print the contents of the journal file 'filename' to 'file'.
 *
 * The journal is opened read-only and iterated from its first to its
 * last serial.  Each RR becomes a diff tuple (a deletion after the
 * first SOA of a transaction, an addition after the second) and the
 * tuples are printed with dns_diff_print() in batches.
 *
 * Returns DNS_R_NOJOURNAL if the file cannot be found, the open error
 * if opening fails otherwise, and else the result of the iteration or
 * printing.
 */
isc_result_t
dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
	dns_journal_t *j;
	isc_buffer_t source;   /* Transaction data from disk */
	isc_buffer_t target;   /* Ditto after _fromwire check */
	uint32_t start_serial; /* Database SOA serial */
	uint32_t end_serial;   /* Last journal SOA serial */
	isc_result_t result;
	dns_diff_t diff;
	unsigned int n_soa = 0;
	unsigned int n_put = 0;

	REQUIRE(filename != NULL);

	j = NULL;
	result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
	if (result == ISC_R_NOTFOUND) {
		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
		return (DNS_R_NOJOURNAL);
	}

	if (result != ISC_R_SUCCESS) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "journal open failure: %s: %s",
			      isc_result_totext(result), filename);
		return (result);
	}

	if (j->header.serialset) {
		fprintf(file, "Source serial = %u\n", j->header.sourceserial);
	}
	dns_diff_init(j->mctx, &diff);

	/*
	 * Set up empty initial buffers for unchecked and checked
	 * wire format transaction data.  They will be reallocated
	 * later.
	 */
	isc_buffer_init(&source, NULL, 0);
	isc_buffer_init(&target, NULL, 0);

	start_serial = dns_journal_first_serial(j);
	end_serial = dns_journal_last_serial(j);

	CHECK(dns_journal_iter_init(j, start_serial, end_serial));

	for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
	     result = dns_journal_next_rr(j))
	{
		dns_name_t *name;
		uint32_t ttl;
		dns_rdata_t *rdata;
		dns_difftuple_t *tuple = NULL;

		name = NULL;
		rdata = NULL;
		dns_journal_current_rr(j, &name, &ttl, &rdata);

		/*
		 * n_soa is 1 after the first SOA of a transaction and 2
		 * after the second; a third SOA starts the next one.
		 */
		if (rdata->type == dns_rdatatype_soa) {
			n_soa++;
		}

		if (n_soa == 3) {
			n_soa = 1;
		}
		if (n_soa == 0) {
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
				      "%s: journal file corrupt: missing "
				      "initial SOA",
				      j->filename);
			FAIL(ISC_R_UNEXPECTED);
		}
		CHECK(dns_difftuple_create(
			diff.mctx, n_soa == 1 ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
			name, ttl, rdata, &tuple));
		dns_diff_append(&diff, &tuple);

		/* Print in batches, once more than 100 tuples are queued. */
		if (++n_put > 100) {
			result = dns_diff_print(&diff, file);
			dns_diff_clear(&diff);
			n_put = 0;
			if (result != ISC_R_SUCCESS) {
				break;
			}
		}
	}
	/* ISC_R_NOMORE is the normal end of the iteration. */
	if (result == ISC_R_NOMORE) {
		result = ISC_R_SUCCESS;
	}
	CHECK(result);

	/* Print whatever is left over from the last partial batch. */
	if (n_put != 0) {
		result = dns_diff_print(&diff, file);
		dns_diff_clear(&diff);
	}
	goto cleanup;

failure:
	/*
	 * Every CHECK()/FAIL() above lands here, so this message is
	 * logged for any failure on those paths.
	 */
	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
		      "%s: cannot print: journal file corrupt", j->filename);

cleanup:
	/*
	 * NOTE(review): nothing visible in this function gives 'source'
	 * or 'target' a non-NULL base - confirm these are still needed.
	 */
	if (source.base != NULL) {
		isc_mem_put(j->mctx, source.base, source.length);
	}
	if (target.base != NULL) {
		isc_mem_put(j->mctx, target.base, target.length);
	}

	dns_diff_clear(&diff);
	dns_journal_destroy(&j);

	return (result);
}

/**************************************************************************/
/*
 * Miscellaneous accessors.
 */
/* Report whether journal 'j' holds no transactions. */
bool
dns_journal_empty(dns_journal_t *j) {
	const bool empty = JOURNAL_EMPTY(&j->header);

	return (empty);
}

/* Return the serial number recorded at the beginning of journal 'j'. */
uint32_t
dns_journal_first_serial(dns_journal_t *j) {
	const uint32_t first = j->header.begin.serial;

	return (first);
}

/* Return the serial number recorded at the end of journal 'j'. */
uint32_t
dns_journal_last_serial(dns_journal_t *j) {
	const uint32_t last = j->header.end.serial;

	return (last);
}

/*
 * Record 'sourceserial' in the header of journal 'j' and mark it as
 * set.  A journal in JOURNAL_STATE_WRITE moves to JOURNAL_STATE_INLINE;
 * the other permitted states are left as they are.
 */
void
dns_journal_set_sourceserial(dns_journal_t *j, uint32_t sourceserial) {
	REQUIRE(j->state == JOURNAL_STATE_WRITE ||
		j->state == JOURNAL_STATE_INLINE ||
		j->state == JOURNAL_STATE_TRANSACTION);

	j->header.serialset = true;
	j->header.sourceserial = sourceserial;

	if (j->state != JOURNAL_STATE_WRITE) {
		return;
	}
	j->state = JOURNAL_STATE_INLINE;
}

/*
 * If a source serial has been set in the header of journal 'j', store
 * it in '*sourceserial' and return true.  Otherwise return false and
 * leave '*sourceserial' untouched.
 */
bool
dns_journal_get_sourceserial(dns_journal_t *j, uint32_t *sourceserial) {
	REQUIRE(sourceserial != NULL);

	if (j->header.serialset) {
		*sourceserial = j->header.sourceserial;
		return (true);
	}
	return (false);
}

/**************************************************************************/
/*
 * Iteration support.
 *
 * When serving an outgoing IXFR, we transmit a part of the journal starting
 * at the serial number in the IXFR request and ending at the serial
 * number that is current when the IXFR request arrives.  The ending
 * serial number is not necessarily at the end of the journal:
 * the journal may grow while the IXFR is in progress, but we stop
 * when we reach the serial number that was current when the IXFR started.
 */

static isc_result_t
read_one_rr(dns_journal_t *j);

/*
 * Make sure the buffer 'b' has at least 'size' bytes
 * allocated, and clear it.
 *
 * Requires:
 *	Either b->base is NULL, or it points to b->length bytes of memory
 *	previously allocated by isc_mem_get().
 */

static isc_result_t
size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
	void *fresh;

	/* Already large enough: keep the memory, just reset the buffer. */
	if (b->length >= size) {
		isc_buffer_clear(b);
		return (ISC_R_SUCCESS);
	}

	fresh = isc_mem_get(mctx, size);
	if (fresh == NULL) {
		return (ISC_R_NOMEMORY);
	}

	/* Release the old, smaller allocation before adopting the new one. */
	if (b->base != NULL) {
		isc_mem_put(mctx, b->base, b->length);
	}
	b->base = fresh;
	b->length = size;

	isc_buffer_clear(b);
	return (ISC_R_SUCCESS);
}

/*
 * Prepare the iterator of journal 'j' to cover the transactions from
 * 'begin_serial' to 'end_serial' by looking up both positions.  The
 * outcome is stored in j->it.result and returned.
 */
isc_result_t
dns_journal_iter_init(dns_journal_t *j, uint32_t begin_serial,
		      uint32_t end_serial) {
	isc_result_t result;

	result = journal_find(j, begin_serial, &j->it.bpos);
	if (result == ISC_R_SUCCESS) {
		INSIST(j->it.bpos.serial == begin_serial);

		result = journal_find(j, end_serial, &j->it.epos);
		if (result == ISC_R_SUCCESS) {
			INSIST(j->it.epos.serial == end_serial);
		}
	}

	j->it.result = result;
	return (result);
}

/*
 * Position the iterator of journal 'j' on the first RR of the range
 * selected by dns_journal_iter_init() and read that RR.
 *
 * The outcome is always stored in j->it.result, as
 * dns_journal_next_rr() does, so that dns_journal_current_rr() cannot
 * be used after a failed or exhausted first read.
 *
 * Returns the result of journal_seek() if seeking fails, otherwise the
 * result of read_one_rr().
 */
isc_result_t
dns_journal_first_rr(dns_journal_t *j) {
	isc_result_t result;

	/*
	 * Seek to the beginning of the first transaction we are
	 * interested in.
	 */
	CHECK(journal_seek(j, j->it.bpos.offset));
	j->it.current_serial = j->it.bpos.serial;

	j->it.xsize = 0; /* We have no transaction data yet... */
	j->it.xpos = 0;	 /* ...and haven't used any of it. */

	result = read_one_rr(j);

failure:
	/*
	 * read_one_rr() has early returns that do not touch
	 * j->it.result, and a seek failure never reaches it at all;
	 * record the outcome here for every path.
	 */
	j->it.result = result;
	return (result);
}

/*
 * Read the next RR of the current iteration from journal 'j' into the
 * iterator state (j->it.name, j->it.ttl, j->it.rdata), reading a new
 * transaction header first when a transaction boundary was reached.
 *
 * Returns ISC_R_SUCCESS when an RR was read, ISC_R_NOMORE when the file
 * offset has reached the end position of the iteration, and an error
 * otherwise.
 *
 * NOTE(review): the two early returns at the top do not update
 * j->it.result; only paths through the 'failure' label do.
 */
static isc_result_t
read_one_rr(dns_journal_t *j) {
	isc_result_t result;

	dns_rdatatype_t rdtype;
	dns_rdataclass_t rdclass;
	unsigned int rdlen;
	uint32_t ttl;
	journal_xhdr_t xhdr;
	journal_rrhdr_t rrhdr;

	/* Reading past the end position means the sizes on disk are bad. */
	if (j->offset > j->it.epos.offset) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: journal corrupt: possible integer overflow",
			      j->filename);
		return (ISC_R_UNEXPECTED);
	}
	if (j->offset == j->it.epos.offset) {
		return (ISC_R_NOMORE);
	}
	if (j->it.xpos == j->it.xsize) {
		/*
		 * We are at a transaction boundary.
		 * Read another transaction header.
		 */
		CHECK(journal_read_xhdr(j, &xhdr));
		if (xhdr.size == 0) {
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
				      "%s: journal corrupt: empty transaction",
				      j->filename);
			FAIL(ISC_R_UNEXPECTED);
		}
		/* Each transaction must start where the previous one ended. */
		if (xhdr.serial0 != j->it.current_serial) {
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
				      "%s: journal file corrupt: "
				      "expected serial %u, got %u",
				      j->filename, j->it.current_serial,
				      xhdr.serial0);
			FAIL(ISC_R_UNEXPECTED);
		}
		j->it.xsize = xhdr.size;
		j->it.xpos = 0;
	}
	/*
	 * Read an RR.
	 */
	CHECK(journal_read_rrhdr(j, &rrhdr));
	/*
	 * Perform a sanity check on the journal RR size.
	 * The smallest possible RR has a 1-byte owner name
	 * and a 10-byte header.  The largest possible
	 * RR has 65535 bytes of data, a header, and a maximum-
	 * size owner name, well below 70 k total.
	 */
	if (rrhdr.size < 1 + 10 || rrhdr.size > 70000) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: journal corrupt: impossible RR size "
			      "(%d bytes)",
			      j->filename, rrhdr.size);
		FAIL(ISC_R_UNEXPECTED);
	}

	CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
	CHECK(journal_read(j, j->it.source.base, rrhdr.size));
	isc_buffer_add(&j->it.source, rrhdr.size);

	/*
	 * The target buffer is made the same size
	 * as the source buffer, with the assumption that when
	 * no compression is present, the output of dns_*_fromwire()
	 * is no larger than the input.
	 */
	CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));

	/*
	 * Parse the owner name.  We don't know where it
	 * ends yet, so we make the entire "remaining"
	 * part of the buffer "active".
	 */
	isc_buffer_setactive(&j->it.source,
			     j->it.source.used - j->it.source.current);
	CHECK(dns_name_fromwire(&j->it.name, &j->it.source, &j->it.dctx, 0,
				&j->it.target));

	/*
	 * Check that the RR header is there, and parse it.
	 */
	if (isc_buffer_remaininglength(&j->it.source) < 10) {
		FAIL(DNS_R_FORMERR);
	}

	rdtype = isc_buffer_getuint16(&j->it.source);
	rdclass = isc_buffer_getuint16(&j->it.source);
	ttl = isc_buffer_getuint32(&j->it.source);
	rdlen = isc_buffer_getuint16(&j->it.source);

	/*
	 * Parse the rdata.
	 */
	/* The rdata must account for exactly the rest of the record. */
	if (isc_buffer_remaininglength(&j->it.source) != rdlen) {
		FAIL(DNS_R_FORMERR);
	}
	isc_buffer_setactive(&j->it.source, rdlen);
	dns_rdata_reset(&j->it.rdata);
	CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass, rdtype, &j->it.source,
				 &j->it.dctx, 0, &j->it.target));
	j->it.ttl = ttl;

	/* Account for the consumed RR header and body in the transaction. */
	j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
	if (rdtype == dns_rdatatype_soa) {
		/* XXX could do additional consistency checks here */
		j->it.current_serial = dns_soa_getserial(&j->it.rdata);
	}

	result = ISC_R_SUCCESS;

failure:
	j->it.result = result;
	return (result);
}

/*
 * Advance the iterator of journal 'j' to the next RR, record the
 * outcome in j->it.result and return it.
 */
isc_result_t
dns_journal_next_rr(dns_journal_t *j) {
	isc_result_t result = read_one_rr(j);

	j->it.result = result;
	return (result);
}

/*
 * Hand out pointers to the owner name and rdata of the RR the iterator
 * of journal 'j' is positioned on, together with its TTL.  The last
 * iterator operation must have succeeded.
 */
void
dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, uint32_t *ttl,
		       dns_rdata_t **rdata) {
	REQUIRE(j->it.result == ISC_R_SUCCESS);

	*rdata = &j->it.rdata;
	*name = &j->it.name;
	*ttl = j->it.ttl;
}

/**************************************************************************/
/*
 * Generating diffs from databases
 */

/*
 * Construct a diff containing all the RRs at the current name of the
 * database iterator 'dbit' in database 'db', version 'ver'.
 * Set '*name' to the current name, and append the diff to 'diff'.
 * All new tuples will have the operation 'op'.
 *
 * Requires: 'name' must have buffer large enough to hold the name.
 * Typically, a dns_fixedname_t would be used.
 */
static isc_result_t
get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
	      dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
	      dns_diff_t *diff) {
	dns_difftuple_t *tuple = NULL;
	dns_rdatasetiter_t *rdsiter = NULL;
	dns_dbnode_t *node = NULL;
	isc_result_t result;

	/* Fetch the node (and its name) the database iterator is on. */
	result = dns_dbiterator_current(dbit, &node, name);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
	if (result != ISC_R_SUCCESS) {
		goto cleanup_node;
	}

	/* Outer walk: every rdataset at this node. */
	result = dns_rdatasetiter_first(rdsiter);
	while (result == ISC_R_SUCCESS) {
		dns_rdataset_t rdataset;

		dns_rdataset_init(&rdataset);
		dns_rdatasetiter_current(rdsiter, &rdataset);

		/* Inner walk: one tuple per rdata in the rdataset. */
		result = dns_rdataset_first(&rdataset);
		while (result == ISC_R_SUCCESS) {
			dns_rdata_t rdata = DNS_RDATA_INIT;

			dns_rdataset_current(&rdataset, &rdata);
			result = dns_difftuple_create(diff->mctx, op, name,
						      rdataset.ttl, &rdata,
						      &tuple);
			if (result != ISC_R_SUCCESS) {
				dns_rdataset_disassociate(&rdataset);
				goto cleanup_iterator;
			}
			dns_diff_append(diff, &tuple);

			result = dns_rdataset_next(&rdataset);
		}
		dns_rdataset_disassociate(&rdataset);

		/* Anything other than a clean end of the rdataset fails. */
		if (result != ISC_R_NOMORE) {
			goto cleanup_iterator;
		}

		result = dns_rdatasetiter_next(rdsiter);
	}

	/* A clean end of the rdataset iteration means success. */
	if (result == ISC_R_NOMORE) {
		result = ISC_R_SUCCESS;
	}

cleanup_iterator:
	dns_rdatasetiter_destroy(&rdsiter);

cleanup_node:
	dns_db_detachnode(db, &node);

	return (result);
}

/*
 * Comparison function for use by dns_diff_subtract when sorting
 * the diffs to be subtracted.  The sort keys are the rdata type
 * and the rdata itself.  The owner name is ignored, because
 * it is known to be the same for all tuples.
 */
static int
rdata_order(const void *av, const void *bv) {
	dns_difftuple_t const *const *ap = av;
	dns_difftuple_t const *const *bp = bv;
	dns_difftuple_t const *a = *ap;
	dns_difftuple_t const *b = *bp;
	int r;
	r = (b->rdata.type - a->rdata.type);
	if (r != 0) {
		return (r);
	}
	r = dns_rdata_compare(&a->rdata, &b->rdata);
	return (r);
}

/*
 * Merge the two diffs diff[0] and diff[1] into 'r', cancelling tuples
 * whose type and rdata appear in both.  Both input diffs are sorted
 * with rdata_order first and are empty on success.
 */
static isc_result_t
dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
	isc_result_t result;
	dns_difftuple_t *p[2];
	int i, t;

	CHECK(dns_diff_sort(&diff[0], rdata_order));
	CHECK(dns_diff_sort(&diff[1], rdata_order));

	for (;;) {
		p[0] = ISC_LIST_HEAD(diff[0].tuples);
		p[1] = ISC_LIST_HEAD(diff[1].tuples);
		if (p[0] == NULL && p[1] == NULL) {
			break;
		}

		/*
		 * One list is exhausted: whatever is at the head of the
		 * other list goes to the result unchanged.
		 */
		if (p[1] == NULL) {
			ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
			ISC_LIST_APPEND(r->tuples, p[0], link);
			continue;
		}
		if (p[0] == NULL) {
			ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
			ISC_LIST_APPEND(r->tuples, p[1], link);
			continue;
		}

		/*
		 * Both heads exist: move the one that sorts first.
		 */
		t = rdata_order(&p[0], &p[1]);
		if (t < 0) {
			ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
			ISC_LIST_APPEND(r->tuples, p[0], link);
			continue;
		}
		if (t > 0) {
			ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
			ISC_LIST_APPEND(r->tuples, p[1], link);
			continue;
		}
		INSIST(t == 0);

		/*
		 * Identical RRs in both databases.  If only the TTL
		 * differs, both tuples are kept in the result; if the
		 * TTL is the same too, both are dropped.
		 */
		if (p[0]->ttl != p[1]->ttl) {
			for (i = 0; i < 2; i++) {
				ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
				ISC_LIST_APPEND(r->tuples, p[i], link);
			}
		} else {
			for (i = 0; i < 2; i++) {
				ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
				dns_difftuple_free(&p[i]);
			}
		}
	}
	result = ISC_R_SUCCESS;
failure:
	return (result);
}

/*
 * Compare the names of database 'dba' (version 'dbvera') with those of
 * 'dbb' (version 'dbverb') and append the differences to 'resultdiff'.
 *
 * Both databases are walked with iterators created with 'options'.
 * RRs found in 'dba' become DNS_DIFFOP_ADD tuples and RRs found in
 * 'dbb' become DNS_DIFFOP_DEL tuples.  A name present in only one
 * database contributes all of its tuples; for a name present in both,
 * dns_diff_subtract() decides what reaches the result.
 *
 * NOTE(review): the merge appears to assume both iterators yield names
 * in dns_name_compare() order - confirm against the database iterator
 * contract.
 */
static isc_result_t
diff_namespace(dns_db_t *dba, dns_dbversion_t *dbvera, dns_db_t *dbb,
	       dns_dbversion_t *dbverb, unsigned int options,
	       dns_diff_t *resultdiff) {
	dns_db_t *db[2];
	dns_dbversion_t *ver[2];
	dns_dbiterator_t *dbit[2] = { NULL, NULL };
	bool have[2] = { false, false };
	dns_fixedname_t fixname[2];
	isc_result_t result, itresult[2];
	dns_diff_t diff[2];
	int i, t;

	db[0] = dba, db[1] = dbb;
	ver[0] = dbvera, ver[1] = dbverb;

	dns_diff_init(resultdiff->mctx, &diff[0]);
	dns_diff_init(resultdiff->mctx, &diff[1]);

	dns_fixedname_init(&fixname[0]);
	dns_fixedname_init(&fixname[1]);

	result = dns_db_createiterator(db[0], options, &dbit[0]);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}
	result = dns_db_createiterator(db[1], options, &dbit[1]);
	if (result != ISC_R_SUCCESS) {
		goto cleanup_iterator;
	}

	itresult[0] = dns_dbiterator_first(dbit[0]);
	itresult[1] = dns_dbiterator_first(dbit[1]);

	for (;;) {
		/*
		 * Refill: for each side without a pending name, load the
		 * tuples of its iterator's current name and advance it.
		 */
		for (i = 0; i < 2; i++) {
			if (!have[i] && itresult[i] == ISC_R_SUCCESS) {
				CHECK(get_name_diff(
					db[i], ver[i], 0, dbit[i],
					dns_fixedname_name(&fixname[i]),
					i == 0 ? DNS_DIFFOP_ADD
					       : DNS_DIFFOP_DEL,
					&diff[i]));
				itresult[i] = dns_dbiterator_next(dbit[i]);
				have[i] = true;
			}
		}

		/* Neither side could be refilled: the walk is finished. */
		if (!have[0] && !have[1]) {
			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
			break;
		}

		/*
		 * Only one side has a pending name: all of its tuples
		 * go to the result.
		 */
		for (i = 0; i < 2; i++) {
			if (!have[!i]) {
				ISC_LIST_APPENDLIST(resultdiff->tuples,
						    diff[i].tuples, link);
				INSIST(ISC_LIST_EMPTY(diff[i].tuples));
				have[i] = false;
				goto next;
			}
		}

		/*
		 * Both sides have a pending name.  Emit the one that
		 * compares lower and keep the other pending.
		 */
		t = dns_name_compare(dns_fixedname_name(&fixname[0]),
				     dns_fixedname_name(&fixname[1]));
		if (t < 0) {
			ISC_LIST_APPENDLIST(resultdiff->tuples, diff[0].tuples,
					    link);
			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
			have[0] = false;
			continue;
		}
		if (t > 0) {
			ISC_LIST_APPENDLIST(resultdiff->tuples, diff[1].tuples,
					    link);
			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
			have[1] = false;
			continue;
		}
		/* Same name on both sides: compare the RRs themselves. */
		INSIST(t == 0);
		CHECK(dns_diff_subtract(diff, resultdiff));
		INSIST(ISC_LIST_EMPTY(diff[0].tuples));
		INSIST(ISC_LIST_EMPTY(diff[1].tuples));
		have[0] = have[1] = false;
	next:;
	}
	/* Any iterator result other than ISC_R_NOMORE is a failure. */
	if (itresult[0] != ISC_R_NOMORE) {
		FAIL(itresult[0]);
	}
	if (itresult[1] != ISC_R_NOMORE) {
		FAIL(itresult[1]);
	}

	INSIST(ISC_LIST_EMPTY(diff[0].tuples));
	INSIST(ISC_LIST_EMPTY(diff[1].tuples));

failure:
	dns_dbiterator_destroy(&dbit[1]);

cleanup_iterator:
	dns_dbiterator_destroy(&dbit[0]);
	dns_diff_clear(&diff[0]);
	dns_diff_clear(&diff[1]);
	return (result);
}

/*
 * Compare the databases 'dba' and 'dbb' and generate a journal
 * entry containing the changes to make 'dba' from 'dbb' (note
 * the order).  This journal entry will consist of a single,
 * possibly very large transaction.
 */
isc_result_t
dns_db_diff(isc_mem_t *mctx, dns_db_t *dba, dns_dbversion_t *dbvera,
	    dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) {
	dns_diff_t scratch;
	isc_result_t result;

	/*
	 * Run dns_db_diffx() with a temporary diff that is discarded
	 * once the call returns.
	 */
	dns_diff_init(mctx, &scratch);
	result = dns_db_diffx(&scratch, dba, dbvera, dbb, dbverb, filename);
	dns_diff_clear(&scratch);

	return (result);
}

/*
 * Collect into 'diff' the differences between 'dba' and 'dbb', first
 * with DNS_DB_NONSEC3 and then with DNS_DB_NSEC3ONLY.  If 'filename' is
 * not NULL, a journal is created there and a non-empty diff is written
 * to it as one transaction.
 */
isc_result_t
dns_db_diffx(dns_diff_t *diff, dns_db_t *dba, dns_dbversion_t *dbvera,
	     dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) {
	dns_journal_t *journal = NULL;
	isc_result_t result;

	if (filename != NULL) {
		result = dns_journal_open(diff->mctx, filename,
					  DNS_JOURNAL_CREATE, &journal);
		if (result != ISC_R_SUCCESS) {
			return (result);
		}
	}

	result = diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NONSEC3,
				diff);
	if (result == ISC_R_SUCCESS) {
		result = diff_namespace(dba, dbvera, dbb, dbverb,
					DNS_DB_NSEC3ONLY, diff);
	}

	/* Only touch the journal when both passes succeeded. */
	if (result == ISC_R_SUCCESS && journal != NULL) {
		if (!ISC_LIST_EMPTY(diff->tuples)) {
			result = dns_journal_write_transaction(journal, diff);
		} else {
			isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
		}
	}

	if (journal != NULL) {
		dns_journal_destroy(&journal);
	}
	return (result);
}

/*
 * Compact the journal file 'filename' by writing a new journal that
 * keeps only the tail of the old one, then renaming it into place.
 *
 * The new journal is built in "<base>.jnw" and, for the two stage
 * rename, the old one is parked in "<base>.jbk", where <base> is
 * 'filename' with a trailing ".jnl" stripped (if present).  If
 * 'filename' cannot be found, the ".jbk" backup is used as the source
 * instead.
 *
 * 'serial' must lie between the journal's begin and end serials; the
 * transactions from 'serial' onwards are always retained.  Older
 * transactions are retained only as far as 'target_size' allows.
 *
 * Returns:
 *	ISC_R_SUCCESS	the journal was compacted, or it was empty or
 *			already smaller than the (adjusted) target size
 *	ISC_R_RANGE	'serial' is outside the journal's serial range
 *	ISC_R_FAILURE	a rename() needed to install the new journal failed
 *	other results	propagated unchanged from the journal helpers
 */
isc_result_t
dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial,
		    uint32_t target_size) {
	unsigned int i;
	journal_pos_t best_guess;
	journal_pos_t current_pos;
	dns_journal_t *j1 = NULL;
	dns_journal_t *j2 = NULL;
	journal_rawheader_t rawheader;
	unsigned int copy_length;
	size_t namelen;
	char *buf = NULL;
	unsigned int size = 0;
	isc_result_t result;
	unsigned int indexend;
	char newname[PATH_MAX];
	char backup[PATH_MAX];
	bool is_backup = false;
	int n;

	REQUIRE(filename != NULL);

	/*
	 * Derive the temporary and backup file names from 'filename',
	 * replacing a trailing ".jnl" if there is one.
	 */
	namelen = strlen(filename);
	if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0) {
		namelen -= 4;
	}

	/*
	 * snprintf() returns an int (negative on error), so keep it out
	 * of 'result' and reject both errors and truncation.
	 */
	n = snprintf(newname, sizeof(newname), "%.*s.jnw", (int)namelen,
		     filename);
	RUNTIME_CHECK(n >= 0 && (size_t)n < sizeof(newname));

	n = snprintf(backup, sizeof(backup), "%.*s.jbk", (int)namelen,
		     filename);
	RUNTIME_CHECK(n >= 0 && (size_t)n < sizeof(backup));

	/*
	 * Open the existing journal; fall back to the backup copy if the
	 * journal itself is missing.
	 */
	result = journal_open(mctx, filename, false, false, &j1);
	if (result == ISC_R_NOTFOUND) {
		is_backup = true;
		result = journal_open(mctx, backup, false, false, &j1);
	}
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	if (JOURNAL_EMPTY(&j1->header)) {
		dns_journal_destroy(&j1);
		return (ISC_R_SUCCESS);
	}

	if (DNS_SERIAL_GT(j1->header.begin.serial, serial) ||
	    DNS_SERIAL_GT(serial, j1->header.end.serial))
	{
		dns_journal_destroy(&j1);
		return (ISC_R_RANGE);
	}

	/*
	 * Cope with very small target sizes.
	 */
	indexend = sizeof(journal_rawheader_t) +
		   j1->header.index_size * sizeof(journal_rawpos_t);
	if (target_size < DNS_JOURNAL_SIZE_MIN) {
		target_size = DNS_JOURNAL_SIZE_MIN;
	}
	if (target_size < indexend * 2) {
		target_size = target_size / 2 + indexend;
	}

	/*
	 * See if there is any work to do.
	 */
	if ((uint32_t)j1->header.end.offset < target_size) {
		dns_journal_destroy(&j1);
		return (ISC_R_SUCCESS);
	}

	CHECK(journal_open(mctx, newname, true, true, &j2));

	/*
	 * Remove overhead so space test below can succeed.
	 */
	if (target_size >= indexend) {
		target_size -= indexend;
	}

	/*
	 * Find if we can create enough free space.  Start from the
	 * latest indexed position that is not after 'serial' and still
	 * leaves at least half of the target size to copy.
	 */
	best_guess = j1->header.begin;
	for (i = 0; i < j1->header.index_size; i++) {
		if (POS_VALID(j1->index[i]) &&
		    DNS_SERIAL_GE(serial, j1->index[i].serial) &&
		    ((uint32_t)(j1->header.end.offset - j1->index[i].offset) >=
		     target_size / 2) &&
		    j1->index[i].offset > best_guess.offset)
		{
			best_guess = j1->index[i];
		}
	}

	/*
	 * Refine the indexed guess by walking forward one transaction at
	 * a time while the same conditions continue to hold.
	 */
	current_pos = best_guess;
	while (current_pos.serial != serial) {
		CHECK(journal_next(j1, &current_pos));
		if (current_pos.serial == j1->header.end.serial) {
			break;
		}

		if (DNS_SERIAL_GE(serial, current_pos.serial) &&
		    ((uint32_t)(j1->header.end.offset - current_pos.offset) >=
		     (target_size / 2)) &&
		    current_pos.offset > best_guess.offset)
		{
			best_guess = current_pos;
		} else {
			break;
		}
	}

	INSIST(best_guess.serial != j1->header.end.serial);
	if (best_guess.serial != serial) {
		CHECK(journal_next(j1, &best_guess));
	}

	/*
	 * We should now be roughly half target_size provided
	 * we did not reach 'serial'.  If not we will just copy
	 * all uncommitted deltas regardless of the size.
	 */
	copy_length = j1->header.end.offset - best_guess.offset;

	if (copy_length != 0) {
		/*
		 * Copy everything from best_guess to the end of the old
		 * journal into the new file, starting right after the
		 * space set aside for the header and index, using a
		 * bounce buffer of at most 64 kilobytes.
		 */
		size = 64 * 1024;
		if (copy_length < size) {
			size = copy_length;
		}
		buf = isc_mem_get(mctx, size);

		CHECK(journal_seek(j1, best_guess.offset));
		CHECK(journal_seek(j2, indexend));
		for (i = 0; i < copy_length; i += size) {
			unsigned int len = (copy_length - i) > size
						   ? size
						   : (copy_length - i);
			CHECK(journal_read(j1, buf, len));
			CHECK(journal_write(j2, buf, len));
		}

		CHECK(journal_fsync(j2));

		/*
		 * Compute new header.
		 */
		j2->header.begin.serial = best_guess.serial;
		j2->header.begin.offset = indexend;
		j2->header.end.serial = j1->header.end.serial;
		j2->header.end.offset = indexend + copy_length;
		j2->header.sourceserial = j1->header.sourceserial;
		j2->header.serialset = j1->header.serialset;

		/*
		 * Update the journal header.
		 */
		journal_header_encode(&j2->header, &rawheader);
		CHECK(journal_seek(j2, 0));
		CHECK(journal_write(j2, &rawheader, sizeof(rawheader)));
		CHECK(journal_fsync(j2));

		/*
		 * Build new index.
		 */
		current_pos = j2->header.begin;
		while (current_pos.serial != j2->header.end.serial) {
			index_add(j2, &current_pos);
			CHECK(journal_next(j2, &current_pos));
		}

		/*
		 * Write index.
		 */
		CHECK(index_to_disk(j2));
		CHECK(journal_fsync(j2));
	}

	/*
	 * Close both journals before trying to rename files (this is
	 * necessary on WIN32).
	 */
	dns_journal_destroy(&j1);
	dns_journal_destroy(&j2);

	/*
	 * With a UFS file system this should just succeed and be atomic.
	 * Any IXFR outs will just continue and the old journal will be
	 * removed on final close.
	 *
	 * With MSDOS / NTFS we need to do a two stage rename, triggered
	 * by EEXIST.  (If any IXFR's are running in other threads, however,
	 * this will fail, and the journal will not be compacted.  But
	 * if so, hopefully they'll be finished by the next time we
	 * compact.)
	 */
	if (rename(newname, filename) == -1) {
		/*
		 * The two stage rename is only attempted for EEXIST, and
		 * never when the source was the backup file itself, since
		 * the first stage removes that backup.
		 */
		if (errno != EEXIST || is_backup) {
			result = ISC_R_FAILURE;
			goto failure;
		}

		result = isc_file_remove(backup);
		if (result != ISC_R_SUCCESS && result != ISC_R_FILENOTFOUND) {
			goto failure;
		}

		/*
		 * Move the old journal out of the way, then move the new
		 * one into place.  The second rename is not attempted if
		 * the first one fails.
		 */
		if (rename(filename, backup) == -1 ||
		    rename(newname, filename) == -1)
		{
			result = ISC_R_FAILURE;
			goto failure;
		}
		(void)isc_file_remove(backup);
	}

	result = ISC_R_SUCCESS;

failure:
	if (buf != NULL) {
		isc_mem_put(mctx, buf, size);
	}
	if (j1 != NULL) {
		dns_journal_destroy(&j1);
	}
	if (j2 != NULL) {
		dns_journal_destroy(&j2);
	}
	/*
	 * Remove the temporary file only after the journals are closed:
	 * as with rename() above, removing a file that is still open can
	 * fail on WIN32 and would leave a stale ".jnw" file behind.  After
	 * a successful rename the file no longer exists and this is a
	 * harmless no-op.
	 */
	(void)isc_file_remove(newname);
	return (result);
}

/*
 * Serialize the in-memory index of journal 'j' into its raw index
 * buffer and write that buffer to the file immediately after the raw
 * journal header.  A journal with an index size of zero is left
 * untouched and ISC_R_SUCCESS is returned.
 *
 * Any failure from journal_seek() or journal_write() is returned
 * unchanged.
 */
static isc_result_t
index_to_disk(dns_journal_t *j) {
	isc_result_t result;
	unsigned char *cursor;
	unsigned int nbytes;
	unsigned int slot;

	/* Nothing to write when the journal carries no index. */
	if (j->header.index_size == 0) {
		return (ISC_R_SUCCESS);
	}

	nbytes = j->header.index_size * sizeof(journal_rawpos_t);

	/*
	 * Each index entry is encoded as two consecutive 32-bit fields:
	 * the serial followed by the offset.
	 */
	cursor = j->rawindex;
	for (slot = 0; slot < j->header.index_size; slot++) {
		const journal_pos_t *entry = &j->index[slot];

		encode_uint32(entry->serial, cursor);
		cursor += 4;
		encode_uint32(entry->offset, cursor);
		cursor += 4;
	}
	/* The encoded entries must fill the raw index exactly. */
	INSIST(cursor == j->rawindex + nbytes);

	result = journal_seek(j, sizeof(journal_rawheader_t));
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	return (journal_write(j, j->rawindex, nbytes));
}