[BACK]Return to blockio.S CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / sys / arch / arm / arm

File: [cvs.NetBSD.org] / src / sys / arch / arm / arm / blockio.S (download)

Revision 1.5, Thu Aug 15 01:38:16 2002 UTC (21 years, 7 months ago) by briggs
Branch: MAIN
CVS Tags: yamt-x86pmap-base4, yamt-x86pmap-base3, yamt-x86pmap-base2, yamt-x86pmap-base, yamt-x86pmap, yamt-vop-base3, yamt-vop-base2, yamt-vop-base, yamt-vop, yamt-uio_vmspace-base5, yamt-uio_vmspace, yamt-splraiseipl-base5, yamt-splraiseipl-base4, yamt-splraiseipl-base3, yamt-splraiseipl-base2, yamt-splraiseipl-base, yamt-splraiseipl, yamt-readahead-pervnode, yamt-readahead-perfile, yamt-readahead-base3, yamt-readahead-base2, yamt-readahead-base, yamt-readahead, yamt-pf42-baseX, yamt-pf42-base4, yamt-pf42-base3, yamt-pf42-base2, yamt-pf42-base, yamt-pf42, yamt-pdpolicy-base9, yamt-pdpolicy-base8, yamt-pdpolicy-base7, yamt-pdpolicy-base6, yamt-pdpolicy-base5, yamt-pdpolicy-base4, yamt-pdpolicy-base3, yamt-pdpolicy-base2, yamt-pdpolicy-base, yamt-pdpolicy, yamt-pagecache-tag8, yamt-pagecache-base8, yamt-pagecache-base7, yamt-pagecache-base6, yamt-pagecache-base5, yamt-pagecache-base4, yamt-pagecache-base3, yamt-pagecache-base2, yamt-pagecache-base, yamt-nfs-mp-base9, yamt-nfs-mp-base8, yamt-nfs-mp-base7, yamt-nfs-mp-base6, yamt-nfs-mp-base5, yamt-nfs-mp-base4, yamt-nfs-mp-base3, yamt-nfs-mp-base2, yamt-nfs-mp-base11, yamt-nfs-mp-base10, yamt-nfs-mp-base, yamt-nfs-mp, yamt-lazymbuf-base15, yamt-lazymbuf-base14, yamt-lazymbuf, yamt-kmem-base3, yamt-kmem-base2, yamt-kmem-base, yamt-kmem, yamt-km-base4, yamt-km-base3, yamt-km-base2, yamt-km-base, yamt-km, yamt-idlelwp-base8, yamt-idlelwp, wrstuden-revivesa-base-4, wrstuden-revivesa-base-3, wrstuden-revivesa-base-2, wrstuden-revivesa-base-1, wrstuden-revivesa-base, wrstuden-revivesa, wrstuden-fixsa-newbase, wrstuden-fixsa-base-1, wrstuden-fixsa-base, wrstuden-fixsa, vmlocking2-base3, vmlocking2-base2, vmlocking2-base1, vmlocking2, vmlocking-nbase, vmlocking-base, vmlocking, uebayasi-xip-base7, uebayasi-xip-base6, uebayasi-xip-base5, uebayasi-xip-base4, uebayasi-xip-base3, uebayasi-xip-base2, uebayasi-xip-base1, uebayasi-xip-base, uebayasi-xip, thorpej-vnode-attr-base, thorpej-vnode-attr, thorpej-atomic-base, 
thorpej-atomic, simonb-wapbl-nbase, simonb-wapbl-base, simonb-wapbl, simonb-timecounters-base, simonb-timecounters, simonb-timcounters-final, rpaulo-netinet-merge-pcb-base, rpaulo-netinet-merge-pcb, rmind-uvmplock-nbase, rmind-uvmplock-base, rmind-uvmplock, riastradh-drm2-base2, riastradh-drm2-base1, riastradh-drm2-base, riastradh-drm2, reinoud-bufcleanup-nbase, reinoud-bufcleanup-base, reinoud-bufcleanup, ppcoea-renovation-base, ppcoea-renovation, post-newlock2-merge, peter-altq-base, peter-altq, nick-net80211-sync-base, nick-net80211-sync, nick-hppapmap-base4, nick-hppapmap-base3, nick-hppapmap-base2, nick-hppapmap-base, nick-hppapmap, nick-csl-alignment-base5, nick-csl-alignment-base, nick-csl-alignment, newlock2-nbase, newlock2-base, newlock2, netbsd-6-base, netbsd-6-1-RELEASE, netbsd-6-1-RC4, netbsd-6-1-RC3, netbsd-6-1-RC2, netbsd-6-1-RC1, netbsd-6-1-5-RELEASE, netbsd-6-1-4-RELEASE, netbsd-6-1-3-RELEASE, netbsd-6-1-2-RELEASE, netbsd-6-1-1-RELEASE, netbsd-6-1, netbsd-6-0-RELEASE, netbsd-6-0-RC2, netbsd-6-0-RC1, netbsd-6-0-6-RELEASE, netbsd-6-0-5-RELEASE, netbsd-6-0-4-RELEASE, netbsd-6-0-3-RELEASE, netbsd-6-0-2-RELEASE, netbsd-6-0-1-RELEASE, netbsd-6-0, netbsd-6, netbsd-5-base, netbsd-5-2-RELEASE, netbsd-5-2-RC1, netbsd-5-2-3-RELEASE, netbsd-5-2-2-RELEASE, netbsd-5-2-1-RELEASE, netbsd-5-2, netbsd-5-1-RELEASE, netbsd-5-1-RC4, netbsd-5-1-RC3, netbsd-5-1-RC2, netbsd-5-1-RC1, netbsd-5-1-5-RELEASE, netbsd-5-1-4-RELEASE, netbsd-5-1-3-RELEASE, netbsd-5-1-2-RELEASE, netbsd-5-1-1-RELEASE, netbsd-5-1, netbsd-5-0-RELEASE, netbsd-5-0-RC4, netbsd-5-0-RC3, netbsd-5-0-RC2, netbsd-5-0-RC1, netbsd-5-0-2-RELEASE, netbsd-5-0-1-RELEASE, netbsd-5-0, netbsd-5, netbsd-4-base, netbsd-4-0-RELEASE, netbsd-4-0-RC5, netbsd-4-0-RC4, netbsd-4-0-RC3, netbsd-4-0-RC2, netbsd-4-0-RC1, netbsd-4-0-1-RELEASE, netbsd-4-0, netbsd-4, netbsd-3-base, netbsd-3-1-RELEASE, netbsd-3-1-RC4, netbsd-3-1-RC3, netbsd-3-1-RC2, netbsd-3-1-RC1, netbsd-3-1-1-RELEASE, netbsd-3-1, netbsd-3-0-RELEASE, netbsd-3-0-RC6, 
netbsd-3-0-RC5, netbsd-3-0-RC4, netbsd-3-0-RC3, netbsd-3-0-RC2, netbsd-3-0-RC1, netbsd-3-0-3-RELEASE, netbsd-3-0-2-RELEASE, netbsd-3-0-1-RELEASE, netbsd-3-0, netbsd-3, netbsd-2-base, netbsd-2-1-RELEASE, netbsd-2-1-RC6, netbsd-2-1-RC5, netbsd-2-1-RC4, netbsd-2-1-RC3, netbsd-2-1-RC2, netbsd-2-1-RC1, netbsd-2-1, netbsd-2-0-base, netbsd-2-0-RELEASE, netbsd-2-0-RC5, netbsd-2-0-RC4, netbsd-2-0-RC3, netbsd-2-0-RC2, netbsd-2-0-RC1, netbsd-2-0-3-RELEASE, netbsd-2-0-2-RELEASE, netbsd-2-0-1-RELEASE, netbsd-2-0, netbsd-2, nathanw_sa_before_merge, nathanw_sa_base, mjf-ufs-trans-base, mjf-ufs-trans, mjf-devfs2-base, mjf-devfs2, mjf-devfs-base, mjf-devfs, matt-premerge-20091211, matt-nb6-plus-nbase, matt-nb6-plus-base, matt-nb6-plus, matt-nb5-pq3-base, matt-nb5-pq3, matt-nb5-mips64-u2-k2-k4-k7-k8-k9, matt-nb5-mips64-u1-k1-k5, matt-nb5-mips64-premerge-20101231, matt-nb5-mips64-premerge-20091211, matt-nb5-mips64-k15, matt-nb4-mips64-k7-u2a-k9b, matt-nb4-arm-base, matt-nb4-arm, matt-mips64-premerge-20101231, matt-mips64-base2, matt-mips64-base, matt-mips64, matt-armv6-prevmlocking, matt-armv6-nbase, matt-armv6-base, matt-armv6, ktrace-lwp-base, ktrace-lwp, kqueue-beforemerge, kqueue-base, kqueue-aftermerge, khorben-n900, kent-audio2-base, kent-audio2, kent-audio1-beforemerge, kent-audio1-base, kent-audio1, keiichi-mipv6-nbase, keiichi-mipv6-base, keiichi-mipv6, jymxensuspend-base, jym-xensuspend-nbase, jym-xensuspend-base, jym-xensuspend, jruoho-x86intr-base, jruoho-x86intr, jmcneill-usbmp-pre-base2, jmcneill-usbmp-base9, jmcneill-usbmp-base8, jmcneill-usbmp-base7, jmcneill-usbmp-base6, jmcneill-usbmp-base5, jmcneill-usbmp-base4, jmcneill-usbmp-base3, jmcneill-usbmp-base2, jmcneill-usbmp-base10, jmcneill-usbmp-base, jmcneill-usbmp, jmcneill-pm-base, jmcneill-pm, jmcneill-base, jmcneill-audiomp3-base, jmcneill-audiomp3, hpcarm-cleanup-nbase, hpcarm-cleanup-base, hpcarm-cleanup, haad-nbase2, haad-dm-base2, haad-dm-base1, haad-dm-base, haad-dm, gmcgarry_ucred_base, gmcgarry_ucred, 
gmcgarry_ctxsw_base, gmcgarry_ctxsw, gehenna-devsw-base, gdamore-uart-base, gdamore-uart, fvdl_fs64_base, elad-kernelauth-base, elad-kernelauth, cube-autoconf-base, cube-autoconf, chris-arm-intr-rework-base7, chris-arm-intr-rework-base6, chris-arm-intr-rework-base5, chris-arm-intr-rework-base4, chris-arm-intr-rework-base3, chris-arm-intr-rework-base2, chris-arm-intr-rework-base, chris-arm-intr-rework, cherry-xenmp-base, cherry-xenmp, chap-midi-nbase, chap-midi-base, chap-midi, bouyer-xeni386-nbase, bouyer-xeni386-merge1, bouyer-xeni386-base, bouyer-xeni386, bouyer-xenamd64-base2, bouyer-xenamd64-base, bouyer-xenamd64, bouyer-quota2-nbase, bouyer-quota2-base, bouyer-quota2, bjh21-hydra-base, bjh21-hydra, agc-symver-base, agc-symver, ad-socklock-base1, ad-audiomp2-base, ad-audiomp2, ad-audiomp-base, ad-audiomp, abandoned-netbsd-4-base, abandoned-netbsd-4
Branch point for: yamt-pagecache, tls-maxphys, rmind-smpnet, matt-nb5-mips64
Changes since 1.4: +58 -58 lines

Use local label names (.Lfoo vs. (Lfoo or foo))

/*	$NetBSD: blockio.S,v 1.5 2002/08/15 01:38:16 briggs Exp $	*/

/*
 * Copyright (c) 2001 Ben Harris.
 * Copyright (c) 1994 Mark Brinicombe.
 * Copyright (c) 1994 Brini.
 * All rights reserved.
 *
 * This code is derived from software written for Brini by Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Brini.
 * 4. The name of the company nor the name of the author may be used to
 *    endorse or promote products derived from this software without specific
 *    prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * RiscBSD kernel project
 *
 * blockio.S
 *
 * optimised block read/write from/to IO routines.
 *
 * Created      : 08/10/94
 * Modified	: 22/01/99  -- R.Earnshaw
 *			       Faster, and small tweaks for StrongARM 	
 */

#include <machine/asm.h>

RCSID("$NetBSD: blockio.S,v 1.5 2002/08/15 01:38:16 briggs Exp $")

/*
 * Read bytes from an I/O address into a block of memory
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(read_multi_1)
	mov	ip, sp
	stmfd	sp!, {fp, ip, lr, pc}	/* push frame: fp, caller's sp, lr, pc */
	sub	fp, ip, #4		/* fp -> highest word of that frame */
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lrm1_l4		/* less than 4 bytes */
	ands	r12, r1, #3		/* r12 = destination misalignment */
	beq	.Lrm1_main		/* aligned destination */
	rsb	r12, r12, #4		/* r12 = 1..3 bytes up to word boundary */
	cmp	r12, #2			/* lt: 1 byte, eq: 2 bytes, gt: 3 bytes */
	ldrb	r3, [r0]		/* first byte is always needed */
	strb	r3, [r1], #1
	ldrgeb	r3, [r0]		/* second byte when r12 >= 2 */
	strgeb	r3, [r1], #1
	ldrgtb	r3, [r0]		/* third byte when r12 == 3 */
	strgtb	r3, [r1], #1
	subs	r2, r2, r12		/* take the alignment bytes off the count */
	blt	.Lrm1_l4		/* fewer than 4 bytes remain */
.Lrm1_main:
.Lrm1loop:
	/*
	 * Four byte reads from the (non-advancing) I/O address, packed
	 * with the first byte read in bits 7:0 and the last in bits 31:24,
	 * then stored to memory as a single word.
	 */
	ldrb	r3, [r0]
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #8
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #16
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #24
	str	r3, [r1], #4
	subs	r2, r2, #4
	bge	.Lrm1loop
.Lrm1_l4:
	adds	r2, r2, #4		/* r2 = bytes still to transfer (0..3) */
	ldmeqdb	fp, {fp, sp, pc}	/* none left: unwind frame and return */
	cmp	r2, #2			/* lt: 1 byte, eq: 2 bytes, gt: 3 bytes */
	ldrb	r3, [r0]
	strb	r3, [r1], #1
	ldrgeb	r3, [r0]
	strgeb	r3, [r1], #1
	ldrgtb	r3, [r0]
	strgtb	r3, [r1], #1
	ldmdb	fp, {fp, sp, pc}	/* restore fp and sp, return via saved lr */

/*
 * Write bytes to an I/O address from a block of memory
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(write_multi_1)
	/* Build a stack frame; fp ends up addressing the saved-pc slot. */
	mov	ip, sp
	stmfd	sp!, {fp, ip, lr, pc}
	sub	fp, ip, #4

	/* Bias the count by -4: negative now means "under one word left". */
	subs	r2, r2, #4
	blt	.Lwm1_tail

	/* Bring the source pointer to a word boundary, 1-3 bytes at most. */
	ands	ip, r1, #3
	beq	.Lwm1_words		/* source is already aligned */
	rsb	ip, ip, #4		/* ip = bytes up to the boundary */
	cmp	ip, #2
	ldrb	r3, [r1], #1		/* unconditionally */
	strb	r3, [r0]
	ldrgeb	r3, [r1], #1		/* only if ip >= 2 */
	strgeb	r3, [r0]
	ldrgtb	r3, [r1], #1		/* only if ip == 3 */
	strgtb	r3, [r0]
	subs	r2, r2, ip
	blt	.Lwm1_tail

	/*
	 * Bulk loop: fetch one word from memory and write its four bytes
	 * to the fixed I/O address, bits 7:0 first and bits 31:24 last.
	 */
.Lwm1_words:
	ldr	r3, [r1], #4
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	subs	r2, r2, #4
	bge	.Lwm1_words

	/* Tail: remove the bias, then write the 0-3 bytes that remain. */
.Lwm1_tail:
	adds	r2, r2, #4
	ldmeqdb	fp, {fp, sp, pc}	/* nothing left: unwind and return */
	cmp	r2, #2
	ldrb	r3, [r1], #1		/* unconditionally */
	strb	r3, [r0]
	ldrgeb	r3, [r1], #1		/* only if r2 >= 2 */
	strgeb	r3, [r0]
	ldrgtb	r3, [r1], #1		/* only if r2 == 3 */
	strgtb	r3, [r0]
	ldmdb	fp, {fp, sp, pc}	/* unwind and return */

/*
 * Reads short ints (16 bits) from an I/O address into a block of memory
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 */

ENTRY(insw)
	/* A zero or negative count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/*
	 * The paired path requires an even count and a word-aligned
	 * buffer.  tsteq only executes when the count test found bit 0
	 * clear, so Z is set afterwards only if both conditions hold.
	 */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Linsw_pairs

	/* General path: one 16-bit item per pass, stored as two bytes. */
.Linsw_single:
	ldr	r3, [r0]
	subs	r2, r2, #1		/* count down; flags consumed by bgt */
	strb	r3, [r1], #1		/* bits 7:0 of the value read */
	mov	r3, r3, lsr #8
	strb	r3, [r1], #1		/* bits 15:8 of the value read */
	bgt	.Linsw_single

	mov	pc, lr

	/* Paired path: two 16-bit items per pass, stored as one word. */
.Linsw_pairs:
	ldr	r3, [r0, #2]		/* offset word load; depends on how the
					 * CPU treats non-aligned word accesses */
	ldr	ip, [r0]
	mov	r3, r3, lsr #16		/* upper half of first load, moved down */
	orr	r3, r3, ip, lsl #16	/* lower half of second load goes on top */
	str	r3, [r1], #4
	subs	r2, r2, #2
	bgt	.Linsw_pairs

	mov	pc, lr


/*
 * Writes short ints (16 bits) from a block of memory to an I/O address
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 */

ENTRY(outsw)
	/* A zero or negative count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/*
	 * The paired path requires an even count and a word-aligned
	 * source buffer; Z is set after the two tests only if both hold.
	 */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Loutsw_pairs

	/*
	 * General path: assemble one 16-bit item from two bytes, copy it
	 * into both halves of the word, and write that word to the port.
	 */
.Loutsw_single:
	ldrb	r3, [r1], #1		/* bits 7:0 */
	ldrb	ip, [r1], #1		/* bits 15:8 */
	subs	r2, r2, #1		/* count down; flags consumed by bgt */
	orr	r3, r3, ip, lsl #8
	orr	r3, r3, r3, lsl #16	/* same item in upper and lower half */
	str	r3, [r0]
	bgt	.Loutsw_single

	mov	pc, lr

	/*
	 * Paired path: one memory word (H)(L) holds two items.  Three
	 * exclusive-ORs turn it into (L)(L) and (H)(H), which are written
	 * to the port in that order.  This yields the same two words as
	 * the longer form
	 *	mov ip, r3, lsl #16 ; orr ip, ip, ip, lsr #16
	 *	mov ip, r3, lsr #16 ; orr ip, ip, ip, lsl #16
	 */
.Loutsw_pairs:
	ldr	r3, [r1], #4		/* r3 = (H)(L) */
	subs	r2, r2, #2		/* count down; flags consumed by bgt */

	eor	ip, r3, r3, lsr #16	/* ip = (H)(H^L) */
	eor	r3, r3, ip, lsl #16	/* r3 = (H^H^L)(L) = (L)(L) */
	eor	ip, ip, r3, lsr #16	/* ip = (H)(H^L^L) = (H)(H) */

	str	r3, [r0]		/* low item first */
	str	ip, [r0]		/* then the high item */

	bgt	.Loutsw_pairs

	mov	pc, lr

/*
 * reads short ints (16 bits) from an I/O address into a block of memory
 * with a length guaranteed to be a multiple of 16 bytes
 * with a word aligned destination address
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 */

ENTRY(insw16)
	/* A zero or negative count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/*
	 * The unrolled loop needs a count that is a multiple of 8 and a
	 * word-aligned buffer.  Anything else is handed to insw with the
	 * arguments and lr still untouched.
	 */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(insw)

	/* r4, r5 and lr are used as scratch below, so save them. */
	stmfd	sp!, {r4,r5,lr}

	/*
	 * Each pass builds four words in r3, r4, r5 and ip.  Every word
	 * comes from two port reads: an offset-by-2 word load (which
	 * depends on how the CPU treats non-aligned word accesses) whose
	 * upper half is moved down, and a plain load whose lower half is
	 * placed on top.
	 */
.Linsw16_block:
	ldr	r3, [r0, #2]
	ldr	lr, [r0]
	mov	r3, r3, lsr #16
	orr	r3, r3, lr, lsl #16	/* word 0 */

	ldr	r4, [r0, #2]
	ldr	lr, [r0]
	mov	r4, r4, lsr #16
	orr	r4, r4, lr, lsl #16	/* word 1 */

	ldr	r5, [r0, #2]
	ldr	lr, [r0]
	mov	r5, r5, lsr #16
	orr	r5, r5, lr, lsl #16	/* word 2 */

	ldr	ip, [r0, #2]
	ldr	lr, [r0]
	mov	ip, ip, lsr #16
	orr	ip, ip, lr, lsl #16	/* word 3 */

	stmia	r1!, {r3-r5,ip}		/* store all four, advance buffer */
	subs	r2, r2, #8		/* eight items consumed */
	bgt	.Linsw16_block

	ldmfd	sp!, {r4,r5,pc}		/* restore and return via saved lr */


/*
 * Writes short ints (16 bits) from a block of memory to an I/O address
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 */

ENTRY(outsw16)
	/* A zero or negative count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/*
	 * The unrolled loop needs a count that is a multiple of 8 and a
	 * word-aligned source buffer.  Anything else is handed to outsw
	 * with the arguments and lr still untouched.
	 */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(outsw)

	/* r4, r5 and lr are used as scratch below, so save them. */
	stmfd	sp!, {r4,r5,lr}

	/*
	 * Each pass loads four words and, for each word (A)(B), uses three
	 * exclusive-ORs to form (B)(B) in r3 and (A)(A) in the source
	 * register, then writes (B)(B) followed by (A)(A) to the port.
	 * This yields the same two words as the longer form
	 *	mov r3, rN, lsl #16 ; orr r3, r3, r3, lsr #16
	 *	mov r3, rN, lsr #16 ; orr r3, r3, r3, lsl #16
	 */
.Loutsw16_block:
	ldmia	r1!, {r4,r5,ip,lr}

	eor	r3, r4, r4, lsl #16	/* r3 = (A^B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r4, [r0]

	eor	r3, r5, r5, lsl #16	/* r3 = (A^B)(B) */
	eor	r5, r5, r3, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r5, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r5, [r0]

	eor	r3, ip, ip, lsl #16	/* r3 = (A^B)(B) */
	eor	ip, ip, r3, lsr #16	/* ip = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, ip, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	ip, [r0]

	eor	r3, lr, lr, lsl #16	/* r3 = (A^B)(B) */
	eor	lr, lr, r3, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, lr, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	lr, [r0]

	subs	r2, r2, #8		/* eight items consumed */
	bgt	.Loutsw16_block

	ldmfd	sp!, {r4,r5,pc}		/* restore and return via saved lr */

/*
 * reads short ints (16 bits) from an I/O address into a block of memory
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words.
 * The destination address should be word aligned.
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 */

ENTRY(inswm8)
/*
 * Transfers r2 16-bit items in chunks of 8, 4, 2 and finally 1.  Every
 * chunk reads consecutive words starting at r0 (r0 itself is never
 * advanced) and keeps only the low 16 bits of each word read.
 */
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the destination address is not word aligned, let insw do the work;
   r0-r2 and lr are still untouched at this point */

	tst	r1, #0x00000003

	bne	_C_LABEL(insw)

/* Word aligned insw */

	stmfd	sp!, {r4-r9,lr}		/* r4-r9 and lr are scratch below */

	mov	lr, #0xff000000
	orr	lr, lr, #0x00ff0000	/* lr = 0xffff0000, mask for bic */

.Linswm8_loop8:
	cmp	r2, #8
	bcc	.Linswm8_l8		/* fewer than 8 items left */

	ldmia	r0, {r3-r9,ip}		/* eight words, one item in each */

	/* Pack pairs: earlier item in the low half, later in the high */
	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16
	bic	r7, r7, lr
	orr	r5, r7, r8, lsl #16
	bic	r9, r9, lr
	orr	r6, r9, ip, lsl #16

	stmia	r1!, {r3-r6}		/* store four packed words */

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Linswm8_loop8
	beq	.Linswm8_l1		/* count hit zero: done */

.Linswm8_l8:
	cmp	r2, #4
	bcc	.Linswm8_l4		/* fewer than 4 items left */

	ldmia	r0, {r3-r6}		/* four words, one item in each */

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16

	stmia	r1!, {r3-r4}		/* store two packed words */

	subs	r2, r2, #0x00000004
	beq	.Linswm8_l1		/* count hit zero: done */

.Linswm8_l4:
	cmp	r2, #2
	bcc	.Linswm8_l2		/* fewer than 2 items left */

	ldmia	r0, {r3-r4}		/* two words, one item in each */

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	str	r3, [r1], #0x0004	/* store one packed word */

	subs	r2, r2, #0x00000002
	beq	.Linswm8_l1		/* count hit zero: done */

.Linswm8_l2:
	cmp	r2, #1
	bcc	.Linswm8_l1		/* nothing left */

	ldr	r3, [r0]		/* last single item */
	subs	r2, r2, #0x00000001	/* Test in load delay slot */
					/* XXX, why don't we use result?  */

	strb	r3, [r1], #0x0001	/* bits 7:0 */
	mov	r3, r3, lsr #8
	strb	r3, [r1], #0x0001	/* bits 15:8 */


.Linswm8_l1:
	ldmfd	sp!, {r4-r9,pc}		/* And go home */

/*
 * write short ints (16 bits) to an I/O address from a block of memory
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words.
 * The source address should be word aligned.
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 */

ENTRY(outswm8)
/*
 * Transfers r2 16-bit items in chunks of 8, 4, 2 and finally 1.  Each
 * memory word (A)(B) holds two items and is expanded into the two words
 * (B)(B) and (A)(A); these are written to consecutive words starting at
 * r0 (r0 itself is never advanced), (B)(B) at the lower address.
 */
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the source address is not word aligned, let outsw do the work;
   r0-r2 and lr are still untouched at this point */

	tst	r1, #0x00000003

	bne	_C_LABEL(outsw)

/* Word aligned outsw */

	stmfd	sp!, {r4-r8,lr}		/* r4-r8 and lr are scratch below */

.Loutswm8_loop8:
	cmp	r2, #8
	bcc	.Loutswm8_l8		/* fewer than 8 items left */

	ldmia	r1!, {r3,r5,r7,ip}	/* four words = eight items */

	eor	r4, r3, r3, lsr #16	/* r4 = (A)(A^B) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^A^B)(B) = (B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */

	eor	r6, r5, r5, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r5, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r7, r7, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r7, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	eor	lr, ip, ip, lsr #16	/* lr = (A)(A^B) */
	eor	ip, ip, lr, lsl #16	/* ip = (A^A^B)(B) = (B)(B) */
	eor	lr, lr, ip, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r3-r8,ip,lr}	/* eight words, in register-number order */

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Loutswm8_loop8
	beq	.Loutswm8_l1		/* count hit zero: done */

.Loutswm8_l8:
	cmp	r2, #4
	bcc	.Loutswm8_l4		/* fewer than 4 items left */

	ldmia	r1!, {r3-r4}		/* two words = four items */

	eor	r6, r3, r3, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r3, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r4, r4, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r4, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r5-r8}		/* four words out */

	subs	r2, r2, #0x00000004
	beq	.Loutswm8_l1		/* count hit zero: done */

.Loutswm8_l4:
	cmp	r2, #2
	bcc	.Loutswm8_l2		/* fewer than 2 items left */

	ldr	r3, [r1], #0x0004	/* r3 = (A)(B) */
	subs	r2, r2, #0x00000002	/* Done test in Load delay slot */

	eor	r5, r3, r3, lsr #16	/* r5 = (A)(A^B)*/
	eor	r4, r3, r5, lsl #16	/* r4 = (A^A^B)(B) = (B)(B) */
	eor	r5, r5, r4, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r4, r5}		/* two words out; flags from subs survive */

	beq	.Loutswm8_l1		/* count hit zero: done */

.Loutswm8_l2:
	cmp	r2, #1
	bcc	.Loutswm8_l1		/* nothing left */

	ldrb	r3, [r1], #0x0001	/* bits 7:0 of the last item */
	ldrb	r4, [r1], #0x0001	/* bits 15:8 of the last item */
	subs	r2, r2, #0x00000001	/* Done test in load delay slot */
					/* XXX This test isn't used?  */
	orr	r3, r3, r4, lsl #8
	orr	r3, r3, r3, lsl #16	/* same item in both halves */
	str	r3, [r0]

.Loutswm8_l1:
	ldmfd	sp!, {r4-r8,pc}		/* And go home */