[BACK]Return to memcpy.S CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / lib / libc / arch / sparc64 / string

File: [cvs.NetBSD.org] / src / lib / libc / arch / sparc64 / string / Attic / memcpy.S (download)

Revision 1.5, Tue Jul 12 07:51:33 2011 UTC (11 years, 5 months ago) by mrg
Branch: MAIN
CVS Tags: yamt-pagecache-tag8, yamt-pagecache-base8, yamt-pagecache-base7, yamt-pagecache-base6, yamt-pagecache-base5, yamt-pagecache-base4, yamt-pagecache-base3, yamt-pagecache-base2, yamt-pagecache-base, netbsd-6-base, netbsd-6-1-RELEASE, netbsd-6-1-RC4, netbsd-6-1-RC3, netbsd-6-1-RC2, netbsd-6-1-RC1, netbsd-6-1-5-RELEASE, netbsd-6-1-4-RELEASE, netbsd-6-1-3-RELEASE, netbsd-6-1-2-RELEASE, netbsd-6-1-1-RELEASE, netbsd-6-1, netbsd-6-0-RELEASE, netbsd-6-0-RC2, netbsd-6-0-RC1, netbsd-6-0-6-RELEASE, netbsd-6-0-5-RELEASE, netbsd-6-0-4-RELEASE, netbsd-6-0-3-RELEASE, netbsd-6-0-2-RELEASE, netbsd-6-0-1-RELEASE, netbsd-6-0, netbsd-6, matt-nb6-plus-nbase, matt-nb6-plus-base, matt-nb6-plus
Branch point for: yamt-pagecache, tls-maxphys
Changes since 1.4: +4 -3 lines

rename sparc64 BLOCK_SIZE and BLOCK_ALIGN to have SPARC64_ prefixes.
for the assembler files, define the old names to the new names
since using the new names cause ugliness due to longer identifer
names, and reduces churn.

fixes build issues in dtv and vaguely makes <machine/psl.h> slightly
less name-space invasive.

/*	$NetBSD: memcpy.S,v 1.5 2011/07/12 07:51:33 mrg Exp $	*/

/*
 * Copyright (c) 2001	Eduardo E. Horvath
 *
 * This software was developed by the Computer Systems Engineering group
 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
 * contributed to Berkeley.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <machine/asm.h>
#ifndef _LOCORE
#define _LOCORE
#endif
#include <machine/ctlreg.h>
#include <machine/frame.h>
#include <machine/psl.h>

#if defined(LIBC_SCCS) && !defined(lint)
	RCSID("$NetBSD: memcpy.S,v 1.5 2011/07/12 07:51:33 mrg Exp $")
#endif  /* LIBC_SCCS and not lint */

#define	EMPTY	nop
#define	NOTREACHED	ta	1

#define	BCOPY_SMALL	16
#define	BLOCK_SIZE	SPARC64_BLOCK_SIZE
#define	BLOCK_ALIGN	SPARC64_BLOCK_ALIGN

#if 0
#define ASI_STORE	ASI_BLK_COMMIT_P
#else
#define ASI_STORE	ASI_BLK_P
#endif

#ifndef	_ALIGN
#define _ALIGN	.align 8
#endif

#if 1
/*
 * kernel bcopy/memcpy
 * Assumes regions do not overlap; has no useful return value.
 *
 * Must not use %g7 (see copyin/copyout above).
 */
ENTRY(memcpy) /* dest, src, size */
	/*
	 * Swap args for bcopy.  Gcc generates calls to memcpy for
	 * structure assignments.
	 */
	mov	%o0, %o3
	mov	%o1, %o0
	mov	%o3, %o1
#endif
ENTRY(bcopy) /* src, dest, size */
#ifdef DEBUG
	set	pmapdebug, %o4
	ld	[%o4], %o4
	btst	0x80, %o4	! PDB_COPY
	bz,pt	%icc, 3f
	 nop
	save	%sp, -CC64FSZ, %sp
	mov	%i0, %o1
	set	2f, %o0
	mov	%i1, %o2
	call	printf
	 mov	%i2, %o3
!	ta	1; nop
	restore
	.data
2:	.asciz	"bcopy(%p->%p,%x)\n"
	_ALIGN
	.text
3:
#endif
	/*
	 * Check for overlaps and punt.
	 *
	 * If src <= dest <= src+len we have a problem.
	 */

	sub	%o1, %o0, %o3

	cmp	%o3, %o2
	blu,pn	%xcc, Lovbcopy
	 cmp	%o2, BCOPY_SMALL
Lbcopy_start:
	bge,pt	%xcc, 2f	! if >= this many, go be fancy.
	 cmp	%o2, 256

	mov	%o1, %o5	! Save memcpy return value
	/*
	 * Not much to copy, just do it a byte at a time.
	 */
	deccc	%o2		! while (--len >= 0)
	bl	1f
	 EMPTY
0:
	inc	%o0
	ldsb	[%o0 - 1], %o4	!	(++dst)[-1] = *src++;
	stb	%o4, [%o1]
	deccc	%o2
	bge	0b
	 inc	%o1
1:
	retl
	 mov	%o5, %o0
	NOTREACHED

	/*
	 * Overlapping bcopies -- punt.
	 */
Lovbcopy:

	/*
	 * Since src comes before dst, and the regions might overlap,
	 * we have to do the copy starting at the end and working backwards.
	 *
	 * We could optimize this, but it almost never happens.
	 */
	mov	%o1, %o5	! Retval
	add	%o2, %o0, %o0	! src += len
	add	%o2, %o1, %o1	! dst += len
	
	deccc	%o2
	bl,pn	%xcc, 1f
	 dec	%o0
0:
	dec	%o1
	ldsb	[%o0], %o4
	dec	%o0
	
	deccc	%o2
	bge,pt	%xcc, 0b
	 stb	%o4, [%o1]
1:
	retl
	 mov	%o5, %o0

	/*
	 * Plenty of data to copy, so try to do it optimally.
	 */
2:
#if 1
	! If it is big enough, use VIS instructions
	bge	Lbcopy_block
	 nop
#endif
Lbcopy_fancy:

	!!
	!! First align the output to a 8-byte entity
	!! 

	save	%sp, -CC64FSZ, %sp
	
	mov	%i0, %o0
	mov	%i1, %o1
	
	mov	%i2, %o2
	btst	1, %o1
	
	bz,pt	%icc, 4f
	 btst	2, %o1
	ldub	[%o0], %o4				! Load 1st byte
	
	deccc	1, %o2
	ble,pn	%xcc, Lbcopy_finish			! XXXX
	 inc	1, %o0
	
	stb	%o4, [%o1]				! Store 1st byte
	inc	1, %o1					! Update address
	btst	2, %o1
4:	
	bz,pt	%icc, 4f
	
	 btst	1, %o0
	bz,a	1f
	 lduh	[%o0], %o4				! Load short

	ldub	[%o0], %o4				! Load bytes
	
	ldub	[%o0+1], %o3
	sllx	%o4, 8, %o4
	or	%o3, %o4, %o4
	
1:	
	deccc	2, %o2
	ble,pn	%xcc, Lbcopy_finish			! XXXX
	 inc	2, %o0
	sth	%o4, [%o1]				! Store 1st short
	
	inc	2, %o1
4:
	btst	4, %o1
	bz,pt	%xcc, 4f
	
	 btst	3, %o0
	bz,a,pt	%xcc, 1f
	 lduw	[%o0], %o4				! Load word -1

	btst	1, %o0
	bz,a,pt	%icc, 2f
	 lduh	[%o0], %o4
	
	ldub	[%o0], %o4
	
	lduh	[%o0+1], %o3
	sllx	%o4, 16, %o4
	or	%o4, %o3, %o4
	
	ldub	[%o0+3], %o3
	sllx	%o4, 8, %o4
	ba,pt	%icc, 1f
	 or	%o4, %o3, %o4
	
2:
	lduh	[%o0+2], %o3
	sllx	%o4, 16, %o4
	or	%o4, %o3, %o4
	
1:	
	deccc	4, %o2
	ble,pn	%xcc, Lbcopy_finish		! XXXX
	 inc	4, %o0
	
	st	%o4, [%o1]				! Store word
	inc	4, %o1
4:
	!!
	!! We are now 32-bit aligned in the dest.
	!!
Lbcopy__common:	

	and	%o0, 7, %o4				! Shift amount
	andn	%o0, 7, %o0				! Source addr
	
	brz,pt	%o4, Lbcopy_noshift8			! No shift version...

	 sllx	%o4, 3, %o4				! In bits
	mov	8<<3, %o3
	
	ldx	[%o0], %l0				! Load word -1
	sub	%o3, %o4, %o3				! Reverse shift
	deccc	16*8, %o2				! Have enough room?
	
	sllx	%l0, %o4, %l0
	bl,pn	%xcc, 2f
	 and	%o3, 0x38, %o3
Lbcopy_unrolled8:

	/*
	 * This is about as close to optimal as you can get, since
	 * the shifts require EU0 and cannot be paired, and you have
	 * 3 dependent operations on the data.
	 */ 

!	ldx	[%o0+0*8], %l0				! Already done
!	sllx	%l0, %o4, %l0				! Already done
	ldx	[%o0+1*8], %l1
	ldx	[%o0+2*8], %l2
	ldx	[%o0+3*8], %l3
	ldx	[%o0+4*8], %l4
	ldx	[%o0+5*8], %l5
	ldx	[%o0+6*8], %l6
#if 1
	ba,pt	%icc, 1f
	 ldx	[%o0+7*8], %l7
	.align	8
1:
	srlx	%l1, %o3, %g1
	inc	8*8, %o0
	
	sllx	%l1, %o4, %l1
	or	%g1, %l0, %o5
	ldx	[%o0+0*8], %l0
	
	stx	%o5, [%o1+0*8]
	srlx	%l2, %o3, %g1

	sllx	%l2, %o4, %l2
	or	%g1, %l1, %o5
	ldx	[%o0+1*8], %l1
	
	stx	%o5, [%o1+1*8]
	srlx	%l3, %o3, %g1
	
	sllx	%l3, %o4, %l3
	or	%g1, %l2, %o5
	ldx	[%o0+2*8], %l2
	
	stx	%o5, [%o1+2*8]
	srlx	%l4, %o3, %g1
	
	sllx	%l4, %o4, %l4	
	or	%g1, %l3, %o5
	ldx	[%o0+3*8], %l3
	
	stx	%o5, [%o1+3*8]
	srlx	%l5, %o3, %g1
	
	sllx	%l5, %o4, %l5
	or	%g1, %l4, %o5
	ldx	[%o0+4*8], %l4
	
	stx	%o5, [%o1+4*8]
	srlx	%l6, %o3, %g1
	
	sllx	%l6, %o4, %l6
	or	%g1, %l5, %o5
	ldx	[%o0+5*8], %l5
	
	stx	%o5, [%o1+5*8]
	srlx	%l7, %o3, %g1
	
	sllx	%l7, %o4, %l7
	or	%g1, %l6, %o5
	ldx	[%o0+6*8], %l6
	
	stx	%o5, [%o1+6*8]
	srlx	%l0, %o3, %g1
	deccc	8*8, %o2				! Have enough room?
	
	sllx	%l0, %o4, %l0				! Next loop
	or	%g1, %l7, %o5
	ldx	[%o0+7*8], %l7
	
	stx	%o5, [%o1+7*8]
	bge,pt	%xcc, 1b
	 inc	8*8, %o1

Lbcopy_unrolled8_cleanup:	
	!!
	!! Finished 8 byte block, unload the regs.
	!! 
	srlx	%l1, %o3, %g1
	inc	7*8, %o0
	
	sllx	%l1, %o4, %l1
	or	%g1, %l0, %o5
		
	stx	%o5, [%o1+0*8]
	srlx	%l2, %o3, %g1
	
	sllx	%l2, %o4, %l2
	or	%g1, %l1, %o5
		
	stx	%o5, [%o1+1*8]
	srlx	%l3, %o3, %g1
	
	sllx	%l3, %o4, %l3
	or	%g1, %l2, %o5
		
	stx	%o5, [%o1+2*8]
	srlx	%l4, %o3, %g1
	
	sllx	%l4, %o4, %l4	
	or	%g1, %l3, %o5
		
	stx	%o5, [%o1+3*8]
	srlx	%l5, %o3, %g1
	
	sllx	%l5, %o4, %l5
	or	%g1, %l4, %o5
		
	stx	%o5, [%o1+4*8]
	srlx	%l6, %o3, %g1
	
	sllx	%l6, %o4, %l6
	or	%g1, %l5, %o5
		
	stx	%o5, [%o1+5*8]
	srlx	%l7, %o3, %g1
	
	sllx	%l7, %o4, %l7
	or	%g1, %l6, %o5
		
	stx	%o5, [%o1+6*8]
	inc	7*8, %o1
	
	mov	%l7, %l0				! Save our unused data
	dec	7*8, %o2
#else
	/*
	 * This version also handles aligned copies at almost the
	 * same speed.  It should take the same number of cycles
	 * as the previous version, but is slightly slower, probably
	 * due to i$ issues.
	 */
	ldx	[%o0+7*8], %l7
	ba,pt	%icc, 1f
	 clr	%g1
	.align 32
1:
	srlx	%l1, %o3, %g1
	bz,pn	%xcc, 3f
	 inc	8*8, %o0

	sllx	%l1, %o4, %l1
	or	%g1, %l0, %o5
	ba,pt	%icc, 4f
	ldx	[%o0+0*8], %l0
	
	nop
3:
	mov	%l0, %o5
	ldx	[%o0+0*8], %l0
	
4:	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+0*8]
	srlx	%l2, %o3, %g1

	sllx	%l2, %o4, %l2
3:	
	or	%g1, %l1, %o5
	ldx	[%o0+1*8], %l1
	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+1*8]
	srlx	%l3, %o3, %g1
	
	sllx	%l3, %o4, %l3
3:	
	or	%g1, %l2, %o5
	ldx	[%o0+2*8], %l2
	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+2*8]
	srlx	%l4, %o3, %g1
	
	sllx	%l4, %o4, %l4
3:	
	or	%g1, %l3, %o5
	ldx	[%o0+3*8], %l3
	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+3*8]
	srlx	%l5, %o3, %g1
	
	sllx	%l5, %o4, %l5
3:	
	or	%g1, %l4, %o5
	ldx	[%o0+4*8], %l4
	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+4*8]
	srlx	%l6, %o3, %g1
	
	sllx	%l6, %o4, %l6
3:	
	or	%g1, %l5, %o5
	ldx	[%o0+5*8], %l5
	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+5*8]
	srlx	%l7, %o3, %g1
	
	sllx	%l7, %o4, %l7
3:	
	or	%g1, %l6, %o5
	ldx	[%o0+6*8], %l6
	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+6*8]
	srlx	%l0, %o3, %g1
	
	sllx	%l0, %o4, %l0				! Next loop
3:	
	or	%g1, %l7, %o5
	ldx	[%o0+7*8], %l7
	deccc	8*8, %o2				! Have enough room?
	
	stx	%o5, [%o1+7*8]
	inc	8*8, %o1
	bge,pt	%xcc, 1b
	 tst	%o4


	!!
	!! Now unload all those regs
	!! 
Lbcopy_unrolled8_cleanup:	
	srlx	%l1, %o3, %g1
	bz,pn	%xcc, 3f
	 inc	7*8, %o0				! Point at the last load

	sllx	%l1, %o4, %l1
	ba,pt	%icc, 4f
	 or	%g1, %l0, %o5
	
3:
	mov	%l0, %o5
	
4:	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+0*8]
	srlx	%l2, %o3, %g1

	sllx	%l2, %o4, %l2
3:	
	or	%g1, %l1, %o5
	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+1*8]
	srlx	%l3, %o3, %g1
	
	sllx	%l3, %o4, %l3
3:	
	or	%g1, %l2, %o5
	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+2*8]
	srlx	%l4, %o3, %g1
	
	sllx	%l4, %o4, %l4
3:	
	or	%g1, %l3, %o5
	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+3*8]
	srlx	%l5, %o3, %g1
	
	sllx	%l5, %o4, %l5
3:	
	or	%g1, %l4, %o5
	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+4*8]
	srlx	%l6, %o3, %g1
	
	sllx	%l6, %o4, %l6
3:	
	or	%g1, %l5, %o5
	
	bz,pn	%icc, 3f
	stx	%o5, [%o1+5*8]
	srlx	%l7, %o3, %g1
	
	sllx	%l7, %o4, %l7
3:	
	or	%g1, %l6, %o5
	mov	%l7, %l0				! Shuffle to %l0
	
	stx	%o5, [%o1+6*8]
	or	%g1, %l7, %o5
	dec	7*8, %o2
	
	inc	7*8, %o1				! Point at last store
#endif
2:
	inccc	16*8, %o2
	bz,pn	%icc, Lbcopy_complete
	
	!! Unrolled 8 times
Lbcopy_aligned8:	
!	ldx	[%o0], %l0				! Already done
!	sllx	%l0, %o4, %l0				! Shift high word
	
	 deccc	8, %o2					! Pre-decrement
	bl,pn	%xcc, Lbcopy_finish
1:
	ldx	[%o0+8], %l1				! Load word 0
	inc	8, %o0
	
	srlx	%l1, %o3, %o5
	or	%o5, %l0, %o5				! Combine
	
	stx	%o5, [%o1]				! Store result
	 inc	8, %o1
	
	deccc	8, %o2
	bge,pn	%xcc, 1b
	 sllx	%l1, %o4, %l0	

	btst	7, %o2					! Done?
	bz,pt	%xcc, Lbcopy_complete

	!!
	!! Loadup the last dregs into %l0 and shift it into place
	!! 
	 srlx	%o3, 3, %o5				! # bytes in %l0
	dec	8, %o5					!  - 8
	!! n-8 - (by - 8) -> n - by
	subcc	%o2, %o5, %g0				! # bytes we need
	ble,pt	%icc, Lbcopy_finish
	 nop
	ldx	[%o0+8], %l1				! Need another word
	srlx	%l1, %o3, %l1
	ba,pt	%icc, Lbcopy_finish
	 or	%l0, %l1, %l0				! All loaded up.
	
Lbcopy_noshift8:
	deccc	8*8, %o2				! Have enough room?
	bl,pn	%xcc, 2f
	 nop
	ba,pt	%icc, 1f
	 nop
	.align	32
1:	
	ldx	[%o0+0*8], %l0
	ldx	[%o0+1*8], %l1
	ldx	[%o0+2*8], %l2
	ldx	[%o0+3*8], %l3
	stx	%l0, [%o1+0*8]
	stx	%l1, [%o1+1*8]
	stx	%l2, [%o1+2*8]
	stx	%l3, [%o1+3*8]

	
	ldx	[%o0+4*8], %l4
	ldx	[%o0+5*8], %l5
	ldx	[%o0+6*8], %l6
	ldx	[%o0+7*8], %l7
	inc	8*8, %o0
	stx	%l4, [%o1+4*8]
	stx	%l5, [%o1+5*8]
	deccc	8*8, %o2
	stx	%l6, [%o1+6*8]
	stx	%l7, [%o1+7*8]
	stx	%l2, [%o1+2*8]
	bge,pt	%xcc, 1b
	 inc	8*8, %o1
2:
	inc	8*8, %o2
1:	
	deccc	8, %o2
	bl,pn	%icc, 1f				! < 0 --> sub word
	 nop
	ldx	[%o0], %o5
	inc	8, %o0
	stx	%o5, [%o1]
	bg,pt	%icc, 1b				! Exactly 0 --> done
	 inc	8, %o1
1:
	btst	7, %o2					! Done?
	bz,pt	%xcc, Lbcopy_complete
	 clr	%o4
	ldx	[%o0], %l0
Lbcopy_finish:
	
	brz,pn	%o2, 2f					! 100% complete?
	 cmp	%o2, 8					! Exactly 8 bytes?
	bz,a,pn	%xcc, 2f
	 stx	%l0, [%o1]

	btst	4, %o2					! Word store?
	bz	%xcc, 1f
	 srlx	%l0, 32, %o5				! Shift high word down
	stw	%o5, [%o1]
	inc	4, %o1
	mov	%l0, %o5				! Operate on the low bits
1:
	btst	2, %o2
	mov	%o5, %l0
	bz	1f
	 srlx	%l0, 16, %o5
	
	sth	%o5, [%o1]				! Store short
	inc	2, %o1
	mov	%l0, %o5				! Operate on low bytes
1:
	mov	%o5, %l0
	btst	1, %o2					! Byte aligned?
	bz	2f
	 srlx	%l0, 8, %o5

	stb	%o5, [%o1]				! Store last byte
	inc	1, %o1					! Update address
2:	
Lbcopy_complete:
#if 0
	!!
	!! verify copy success.
	!! 

	mov	%i0, %o2
	mov	%i1, %o4
	mov	%i2, %l4
0:	
	ldub	[%o2], %o1
	inc	%o2
	ldub	[%o4], %o3
	inc	%o4
	cmp	%o3, %o1
	bnz	1f
	 dec	%l4
	brnz	%l4, 0b
	 nop
	ba	2f
	 nop

1:
	set	0f, %o0
	call	printf
	 sub	%i2, %l4, %o5
	set	1f, %o0
	mov	%i0, %o1
	mov	%i1, %o2
	call	printf
	 mov	%i2, %o3
	ta	1
	.data
0:	.asciz	"bcopy failed: %x@%p != %x@%p byte %d\n"
1:	.asciz	"bcopy(%p, %p, %lx)\n"
	_ALIGN
	.text
2:	
#endif
	ret
	 restore %i1, %g0, %o0

#if 1

/*
 * Block copy.  Useful for >256 byte copies.
 *
 * Benchmarking has shown this always seems to be slower than
 * the integer version, so this is disabled.  Maybe someone will
 * figure out why sometime.
 */
	
Lbcopy_block:
#ifdef _KERNEL
/*
 * Kernel:
 *
 * Here we use VIS instructions to do a block clear of a page.
 * But before we can do that we need to save and enable the FPU.
 * The last owner of the FPU registers is fpproc, and
 * fpproc->p_md.md_fpstate is the current fpstate.  If that's not
 * null, call savefpstate() with it to store our current fp state.
 *
 * Next, allocate an aligned fpstate on the stack.  We will properly
 * nest calls on a particular stack so this should not be a problem.
 *
 * Now we grab either curproc (or if we're on the interrupt stack
 * proc0).  We stash its existing fpstate in a local register and
 * put our new fpstate in curproc->p_md.md_fpstate.  We point
 * fpproc at curproc (or proc0) and enable the FPU.
 *
 * If we are ever preempted, our FPU state will be saved in our
 * fpstate.  Then, when we're resumed and we take an FPDISABLED
 * trap, the trap handler will be able to fish our FPU state out
 * of curproc (or proc0).
 *
 * On exiting this routine we undo the damage: restore the original
 * pointer to curproc->p_md.md_fpstate, clear our fpproc, and disable
 * the MMU.
 *
 *
 * Register usage, Kernel only (after save):
 *
 * %i0		src
 * %i1		dest
 * %i2		size
 *
 * %l0		XXXX DEBUG old fpstate
 * %l1		fpproc (hi bits only)
 * %l2		orig fpproc
 * %l3		orig fpstate
 * %l5		curproc
 * %l6		old fpstate
 *
 * Register ussage, Kernel and user:
 *
 * %g1		src (retval for memcpy)
 *
 * %o0		src
 * %o1		dest
 * %o2		end dest
 * %o5		last safe fetchable address
 */

#if 1
	ENABLE_FPU(0)
#else
	save	%sp, -(CC64FSZ+FS_SIZE+BLOCK_SIZE), %sp	! Allocate an fpstate
	sethi	%hi(FPPROC), %l1
	LDPTR	[%l1 + %lo(FPPROC)], %l2		! Load fpproc
	add	%sp, (CC64FSZ+STKB+BLOCK_SIZE-1), %l0	! Calculate pointer to fpstate
	brz,pt	%l2, 1f					! fpproc == NULL?
	 andn	%l0, BLOCK_ALIGN, %l0			! And make it block aligned
	LDPTR	[%l2 + P_FPSTATE], %l3
	brz,pn	%l3, 1f					! Make sure we have an fpstate
	 mov	%l3, %o0
	call	_C_LABEL(savefpstate)			! Save the old fpstate
	 set	EINTSTACK-STKB, %l4			! Are we on intr stack?
	cmp	%sp, %l4
	bgu,pt	%xcc, 1f
	 set	INTSTACK-STKB, %l4
	cmp	%sp, %l4
	blu	%xcc, 1f
0:
	 sethi	%hi(_C_LABEL(proc0)), %l4		! Yes, use proc0
	ba,pt	%xcc, 2f				! XXXX needs to change to CPUs idle proc
	 or	%l4, %lo(_C_LABEL(proc0)), %l5
1:
	sethi	%hi(CURPROC), %l4			! Use curproc
	LDPTR	[%l4 + %lo(CURPROC)], %l5
	brz,pn	%l5, 0b					! If curproc is NULL need to use proc0
	 nop
2:
	LDPTR	[%l5 + P_FPSTATE], %l6			! Save old fpstate
	STPTR	%l0, [%l5 + P_FPSTATE]			! Insert new fpstate
	STPTR	%l5, [%l1 + %lo(FPPROC)]		! Set new fpproc
	wr	%g0, FPRS_FEF, %fprs			! Enable FPU
#endif
	mov	%i0, %o0				! Src addr.
	mov	%i1, %o1				! Store our dest ptr here.
	mov	%i2, %o2				! Len counter
#endif

	!!
	!! First align the output to a 64-bit entity
	!! 

	mov	%o1, %g1				! memcpy retval
	add	%o0, %o2, %o5				! End of source block

	andn	%o0, 7, %o3				! Start of block
	dec	%o5
	fzero	%f0

	andn	%o5, BLOCK_ALIGN, %o5			! Last safe addr.
	ldd	[%o3], %f2				! Load 1st word

	dec	8, %o3					! Move %o3 1 word back
	btst	1, %o1
	bz	4f
	
	 mov	-7, %o4					! Lowest src addr possible
	alignaddr %o0, %o4, %o4				! Base addr for load.

	cmp	%o3, %o4
	be,pt	%xcc, 1f				! Already loaded?
	 mov	%o4, %o3
	fmovd	%f2, %f0				! No. Shift
	ldd	[%o3+8], %f2				! And load
1:	

	faligndata	%f0, %f2, %f4			! Isolate 1st byte

	stda	%f4, [%o1] ASI_FL8_P			! Store 1st byte
	inc	1, %o1					! Update address
	inc	1, %o0
	dec	1, %o2
4:	
	btst	2, %o1
	bz	4f

	 mov	-6, %o4					! Calculate src - 6
	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.

	cmp	%o3, %o4				! Addresses same?
	be,pt	%xcc, 1f
	 mov	%o4, %o3
	fmovd	%f2, %f0				! Shuffle data
	ldd	[%o3+8], %f2				! Load word 0
1:	
	faligndata %f0, %f2, %f4			! Move 1st short low part of f8

	stda	%f4, [%o1] ASI_FL16_P			! Store 1st short
	dec	2, %o2
	inc	2, %o1
	inc	2, %o0
4:
	brz,pn	%o2, Lbcopy_blockfinish			! XXXX

	 btst	4, %o1
	bz	4f

	mov	-4, %o4
	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.

	cmp	%o3, %o4				! Addresses same?
	beq,pt	%xcc, 1f
	 mov	%o4, %o3
	fmovd	%f2, %f0				! Shuffle data
	ldd	[%o3+8], %f2				! Load word 0
1:	
	faligndata %f0, %f2, %f4			! Move 1st short low part of f8

	st	%f5, [%o1]				! Store word
	dec	4, %o2
	inc	4, %o1
	inc	4, %o0
4:
	brz,pn	%o2, Lbcopy_blockfinish			! XXXX
	!!
	!! We are now 32-bit aligned in the dest.
	!!
Lbcopy_block_common:	

	 mov	-0, %o4
	alignaddr %o0, %o4, %o4				! base - shift

	cmp	%o3, %o4				! Addresses same?
	beq,pt	%xcc, 1f
	 mov	%o4, %o3
	fmovd	%f2, %f0				! Shuffle data
	ldd	[%o3+8], %f2				! Load word 0
1:	
	add	%o3, 8, %o0				! now use %o0 for src
	
	!!
	!! Continue until our dest is block aligned
	!! 
Lbcopy_block_aligned8:	
1:
	brz	%o2, Lbcopy_blockfinish
	 btst	BLOCK_ALIGN, %o1			! Block aligned?
	bz	1f
	
	 faligndata %f0, %f2, %f4			! Generate result
	deccc	8, %o2
	ble,pn	%icc, Lbcopy_blockfinish		! Should never happen
	 fmovd	%f4, %f48
	
	std	%f4, [%o1]				! Store result
	inc	8, %o1
	
	fmovd	%f2, %f0
	inc	8, %o0
	ba,pt	%xcc, 1b				! Not yet.
	 ldd	[%o0], %f2				! Load next part
Lbcopy_block_aligned64:	
1:

/*
 * 64-byte aligned -- ready for block operations.
 *
 * Here we have the destination block aligned, but the
 * source pointer may not be.  Sub-word alignment will
 * be handled by faligndata instructions.  But the source
 * can still be potentially aligned to 8 different words
 * in our 64-bit block, so we have 8 different copy routines.
 *
 * Once we figure out our source alignment, we branch
 * to the appropriate copy routine, which sets up the
 * alignment for faligndata and loads (sets) the values
 * into the source registers and does the copy loop.
 *
 * When were down to less than 1 block to store, we
 * exit the copy loop and execute cleanup code.
 *
 * Block loads and stores are not properly interlocked.
 * Stores save one reg/cycle, so you can start overwriting
 * registers the cycle after the store is issued.  
 * 
 * Block loads require a block load to a different register
 * block or a membar #Sync before accessing the loaded
 * data.
 *	
 * Since the faligndata instructions may be offset as far
 * as 7 registers into a block (if you are shifting source 
 * 7 -> dest 0), you need 3 source register blocks for full 
 * performance: one you are copying, one you are loading, 
 * and one for interlocking.  Otherwise, we would need to
 * sprinkle the code with membar #Sync and lose the advantage
 * of running faligndata in parallel with block stores.  This 
 * means we are fetching a full 128 bytes ahead of the stores.  
 * We need to make sure the prefetch does not inadvertently 
 * cross a page boundary and fault on data that we will never 
 * store.
 *
 */
#if 1
	and	%o0, BLOCK_ALIGN, %o3
	srax	%o3, 3, %o3				! Isolate the offset

	brz	%o3, L100				! 0->0
	 btst	4, %o3
	bnz	%xcc, 4f
	 btst	2, %o3
	bnz	%xcc, 2f
	 btst	1, %o3
	ba,pt	%xcc, L101				! 0->1
	 nop	/* XXX spitfire bug */
2:
	bz	%xcc, L102				! 0->2
	 nop
	ba,pt	%xcc, L103				! 0->3
	 nop	/* XXX spitfire bug */
4:	
	bnz	%xcc, 2f
	 btst	1, %o3
	bz	%xcc, L104				! 0->4
	 nop
	ba,pt	%xcc, L105				! 0->5
	 nop	/* XXX spitfire bug */
2:
	bz	%xcc, L106				! 0->6
	 nop
	ba,pt	%xcc, L107				! 0->7
	 nop	/* XXX spitfire bug */
#else

	!!
	!! Isolate the word offset, which just happens to be
	!! the slot in our jump table.
	!!
	!! This is 6 insns, most of which cannot be paired,
	!! which is about the same as the above version.
	!!
	rd	%pc, %o4
1:	
	and	%o0, 0x31, %o3
	add	%o3, (Lbcopy_block_jmp - 1b), %o3
	jmpl	%o4 + %o3, %g0
	 nop

	!!
	!! Jump table
	!!
	
Lbcopy_block_jmp:
	ba,a,pt	%xcc, L100
	 nop
	ba,a,pt	%xcc, L101
	 nop
	ba,a,pt	%xcc, L102
	 nop
	ba,a,pt	%xcc, L103
	 nop
	ba,a,pt	%xcc, L104
	 nop
	ba,a,pt	%xcc, L105
	 nop
	ba,a,pt	%xcc, L106
	 nop
	ba,a,pt	%xcc, L107
	 nop
#endif

	!!
	!! Source is block aligned.
	!!
	!! Just load a block and go.
	!!
L100:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:	
	.asciz	"L100"
	.align	8
2:	
#endif
	fmovd	%f0 , %f62
	ldda	[%o0] ASI_BLK_P, %f0
	inc	BLOCK_SIZE, %o0
	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f
	 ldda	[%o0] ASI_BLK_P, %f16
	ba,pt	%icc, 3f
	 membar #Sync
	
	.align	32					! ICache align.
3:
	faligndata	%f62, %f0, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f0, %f2, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f2, %f4, %f36
	cmp	%o0, %o5
	faligndata	%f4, %f6, %f38
	faligndata	%f6, %f8, %f40
	faligndata	%f8, %f10, %f42
	faligndata	%f10, %f12, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f12, %f14, %f46
	
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:	
	stda	%f32, [%o1] ASI_STORE
	faligndata	%f14, %f16, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f16, %f18, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f18, %f20, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f20, %f22, %f38
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f40
	faligndata	%f24, %f26, %f42
	faligndata	%f26, %f28, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f28, %f30, %f46
	
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	stda	%f32, [%o1] ASI_STORE
	faligndata	%f30, %f48, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f48, %f50, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f52, %f54, %f38
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f40
	faligndata	%f56, %f58, %f42
	faligndata	%f58, %f60, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f60, %f62, %f46
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16			! Increment is at top
	membar	#Sync
2:	
	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1
	
	!!
	!! Source at BLOCK_ALIGN+8
	!!
	!! We need to load almost 1 complete block by hand.
	!! 
L101:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:	
	.asciz	"L101"
	.align	8
2:	
#endif
!	fmovd	%f0, %f0				! Hoist fmovd
	ldd	[%o0], %f2
	inc	8, %o0
	ldd	[%o0], %f4
	inc	8, %o0
	ldd	[%o0], %f6
	inc	8, %o0
	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0
	
	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
3:	
	faligndata	%f0, %f2, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f34
	cmp	%o0, %o5
	faligndata	%f4, %f6, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f6, %f8, %f38
	faligndata	%f8, %f10, %f40
	faligndata	%f10, %f12, %f42
	faligndata	%f12, %f14, %f44
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f14, %f16, %f46

	stda	%f32, [%o1] ASI_STORE
	
	faligndata	%f16, %f18, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f20, %f22, %f36
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f24, %f26, %f40
	faligndata	%f26, %f28, %f42
	faligndata	%f28, %f30, %f44
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:	
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f30, %f48, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f48, %f50, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f52, %f54, %f36
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f56, %f58, %f40
	faligndata	%f58, %f60, %f42
	faligndata	%f60, %f62, %f44
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:	
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f62, %f0, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1

	!!
	!! Source at BLOCK_ALIGN+16
	!!
	!! We need to load 6 doubles by hand.
	!! 
L102:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:	
	.asciz	"L102"
	.align	8
2:	
#endif
	ldd	[%o0], %f4
	inc	8, %o0
	fmovd	%f0, %f2				! Hoist fmovd
	ldd	[%o0], %f6
	inc	8, %o0
	
	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0
	
	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
3:	
	faligndata	%f2, %f4, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f4, %f6, %f34
	cmp	%o0, %o5
	faligndata	%f6, %f8, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f8, %f10, %f38
	faligndata	%f10, %f12, %f40
	faligndata	%f12, %f14, %f42
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f44

	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f16, %f18, %f46
	
	stda	%f32, [%o1] ASI_STORE

	faligndata	%f18, %f20, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f20, %f22, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f22, %f24, %f36
	cmp	%o0, %o5
	faligndata	%f24, %f26, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f26, %f28, %f40
	faligndata	%f28, %f30, %f42
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:	
	faligndata	%f30, %f48, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f48, %f50, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f50, %f52, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f52, %f54, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f54, %f56, %f36
	cmp	%o0, %o5
	faligndata	%f56, %f58, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f58, %f60, %f40
	faligndata	%f60, %f62, %f42
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:	
	faligndata	%f62, %f0, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f0, %f2, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1
	
	!!
	!! Source at BLOCK_ALIGN+24
	!!
	!! We need to load 5 doubles by hand.
	!! 
L103:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:	
	.asciz	"L103"
	.align	8
2:	
#endif
	fmovd	%f0, %f4
	ldd	[%o0], %f6
	inc	8, %o0
	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:	
	inc	BLOCK_SIZE, %o0
3:	
	faligndata	%f4, %f6, %f32
	cmp	%o0, %o5
	faligndata	%f6, %f8, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f8, %f10, %f36
	faligndata	%f10, %f12, %f38
	faligndata	%f12, %f14, %f40
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f16, %f18, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f18, %f20, %f46
	
	stda	%f32, [%o1] ASI_STORE

	faligndata	%f20, %f22, %f32
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f24, %f26, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f26, %f28, %f38
	faligndata	%f28, %f30, %f40
	ble,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:	
	faligndata	%f30, %f48, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f48, %f50, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f50, %f52, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f52, %f54, %f32
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f56, %f58, %f36
	faligndata	%f58, %f60, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f60, %f62, %f40
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:	
	faligndata	%f62, %f0, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f0, %f2, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f2, %f4, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1

	!!
	!! Source at BLOCK_ALIGN+32
	!!
	!! We need to load 4 doubles by hand.
	!! 
L104:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:	
	.asciz	"L104"
	.align	8
2:	
#endif
	fmovd	%f0, %f6
	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0
	
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:	
	inc	BLOCK_SIZE, %o0
3:	
	faligndata	%f6, %f8, %f32
	cmp	%o0, %o5
	faligndata	%f8, %f10, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f10, %f12, %f36
	faligndata	%f12, %f14, %f38
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f40
	faligndata	%f16, %f18, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f20, %f22, %f46
	
	stda	%f32, [%o1] ASI_STORE

	faligndata	%f22, %f24, %f32
	cmp	%o0, %o5
	faligndata	%f24, %f26, %f34
	faligndata	%f26, %f28, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f28, %f30, %f38
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:	
	faligndata	%f30, %f48, %f40
	dec	BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f52, %f54, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f54, %f56, %f32
	cmp	%o0, %o5
	faligndata	%f56, %f58, %f34
	faligndata	%f58, %f60, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f60, %f62, %f38
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:	
	faligndata	%f62, %f0, %f40
	dec	BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f4, %f6, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1

	!!
	!! Source at BLOCK_ALIGN+40
	!!
	!! We need to load 3 doubles by hand.
	!! 
L105:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:	
	.asciz	"L105"
	.align	8
2:	
#endif
	fmovd	%f0, %f8
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0
	
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:	
	inc	BLOCK_SIZE, %o0
3:	
	faligndata	%f8, %f10, %f32
	cmp	%o0, %o5
	faligndata	%f10, %f12, %f34
	faligndata	%f12, %f14, %f36
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f42
	faligndata	%f20, %f22, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f22, %f24, %f46
	
	stda	%f32, [%o1] ASI_STORE

	faligndata	%f24, %f26, %f32
	cmp	%o0, %o5
	faligndata	%f26, %f28, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f28, %f30, %f36
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f48, %f50, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f42
	faligndata	%f52, %f54, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f54, %f56, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f56, %f58, %f32
	cmp	%o0, %o5
	faligndata	%f58, %f60, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f60, %f62, %f36
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f0, %f2, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f42
	faligndata	%f4, %f6, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f6, %f8, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1


	!!
	!! Source at BLOCK_ALIGN+48
	!!
	!! We need to load 2 doubles by hand.
	!! 
L106:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:	
	.asciz	"L106"
	.align	8
2:	
#endif
	fmovd	%f0, %f10
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0
	
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:	
	inc	BLOCK_SIZE, %o0
3:	
	faligndata	%f10, %f12, %f32
	cmp	%o0, %o5
	faligndata	%f12, %f14, %f34
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f38
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f40
	faligndata	%f20, %f22, %f42
	faligndata	%f22, %f24, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f24, %f26, %f46
	
	stda	%f32, [%o1] ASI_STORE

	faligndata	%f26, %f28, %f32
	cmp	%o0, %o5
	faligndata	%f28, %f30, %f34
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f40
	faligndata	%f52, %f54, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f54, %f56, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f56, %f58, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f58, %f60, %f32
	cmp	%o0, %o5
	faligndata	%f60, %f62, %f34
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f2, %f4, %f40
	faligndata	%f4, %f6, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f6, %f8, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f8, %f10, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1


	!!
	!! Source at BLOCK_ALIGN+56
	!!
	!! We need to load 1 double by hand.
	!! 
L107:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:	
	.asciz	"L107"
	.align	8
2:	
#endif
	fmovd	%f0, %f12
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:	
	inc	BLOCK_SIZE, %o0
3:	
	faligndata	%f12, %f14, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f36
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f38
	faligndata	%f20, %f22, %f40
	faligndata	%f22, %f24, %f42
	faligndata	%f24, %f26, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f26, %f28, %f46
	
	stda	%f32, [%o1] ASI_STORE

	faligndata	%f28, %f30, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f38
	faligndata	%f52, %f54, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f54, %f56, %f42
	faligndata	%f56, %f58, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f58, %f60, %f46
	
	stda	%f32, [%o1] ASI_STORE

	faligndata	%f60, %f62, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f2, %f4, %f38
	faligndata	%f4, %f6, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f6, %f8, %f42
	faligndata	%f8, %f10, %f44

	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f10, %f12, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1
	
Lbcopy_blockdone:
	inc	BLOCK_SIZE, %o2				! Fixup our overcommit
	membar	#Sync					! Finish any pending loads
#define	FINISH_REG(f)				\
	deccc	8, %o2;				\
	bl,a	Lbcopy_blockfinish;		\
	 fmovd	f, %f48;			\
	std	f, [%o1];			\
	inc	8, %o1

	FINISH_REG(%f32)
	FINISH_REG(%f34)
	FINISH_REG(%f36)
	FINISH_REG(%f38)
	FINISH_REG(%f40)
	FINISH_REG(%f42)
	FINISH_REG(%f44)
	FINISH_REG(%f46)
	FINISH_REG(%f48)
#undef FINISH_REG
	!! 
	!! The low 3 bits have the sub-word bits needed to be
	!! stored [because (x-8)&0x7 == x].
	!!
Lbcopy_blockfinish:
	brz,pn	%o2, 2f					! 100% complete?
	 fmovd	%f48, %f4
	cmp	%o2, 8					! Exactly 8 bytes?
	bz,a,pn	%xcc, 2f
	 std	%f4, [%o1]

	btst	4, %o2					! Word store?
	bz	%xcc, 1f
	 nop
	st	%f4, [%o1]
	inc	4, %o1
1:
	btst	2, %o2
	fzero	%f0
	bz	1f

	 mov	-6, %o4
	alignaddr %o1, %o4, %g0

	faligndata %f0, %f4, %f8
	
	stda	%f8, [%o1] ASI_FL16_P			! Store short
	inc	2, %o1
1:
	btst	1, %o2					! Byte aligned?
	bz	2f

	 mov	-7, %o0					! Calculate dest - 7
	alignaddr %o1, %o0, %g0				! Calculate shift mask and dest.

	faligndata %f0, %f4, %f8			! Move 1st byte to low part of f8

	stda	%f8, [%o1] ASI_FL8_P			! Store 1st byte
	inc	1, %o1					! Update address
2:
	membar	#Sync
#if 0
	!!
	!! verify copy success.
	!! 

	mov	%i0, %o2
	mov	%i1, %o4
	mov	%i2, %l4
0:	
	ldub	[%o2], %o1
	inc	%o2
	ldub	[%o4], %o3
	inc	%o4
	cmp	%o3, %o1
	bnz	1f
	 dec	%l4
	brnz	%l4, 0b
	 nop
	ba	2f
	 nop

1:
	set	block_disable, %o0
	stx	%o0, [%o0]
	
	set	0f, %o0
	call	prom_printf
	 sub	%i2, %l4, %o5
	set	1f, %o0
	mov	%i0, %o1
	mov	%i1, %o2
	call	prom_printf
	 mov	%i2, %o3
	ta	1
	.data
	_ALIGN
block_disable:	.xword	0
0:	.asciz	"bcopy failed: %x@%p != %x@%p byte %d\r\n"
1:	.asciz	"bcopy(%p, %p, %lx)\r\n"
	_ALIGN
	.text
2:	
#endif
#ifdef _KERNEL		

	set 1f, %o0
	mov	%i0, %o1
	mov	%i1, %o2
	call	printf
	mov	%i2, %o3
	
	.data
	_ALIGN
1:	.asciz "block exit (%p, %p, %d)\n"
	_ALIGN
	.text
/*
 * Weve saved our possible fpstate, now disable the fpu
 * and continue with life.
 */
#if 1
	RESTORE_FPU
#else
#ifdef DEBUG
	LDPTR	[%l1 + %lo(FPPROC)], %l7
	cmp	%l7, %l5
!	tnz	1		! fpproc has changed!
	LDPTR	[%l5 + P_FPSTATE], %l7
	cmp	%l7, %l0
	tnz	1		! fpstate has changed!
#endif
	andcc	%l2, %l3, %g0				! If (fpproc && fpstate)
	STPTR	%l2, [%l1 + %lo(FPPROC)]		! Restore old fproc
	bz,pt	%xcc, 1f				! Skip if no fpstate
	 STPTR	%l6, [%l5 + P_FPSTATE]			! Restore old fpstate
	
	call	_C_LABEL(loadfpstate)			! Re-load orig fpstate
	 mov	%l3, %o0
1:
#endif
	set 1f, %o0
	mov	%i0, %o1
	mov	%i1, %o2
	call	printf
	mov	%i2, %o3
	
	.data
	_ALIGN
1:	.asciz "block done (%p, %p, %d)\n"
	_ALIGN
	.text

	
	ret
	 restore	%g1, 0, %o0			! Return DEST for memcpy
#endif
	retl
	 mov	%g1, %o0
#endif