Annotation of src/lib/libc/arch/sparc64/string/memcpy.S, Revision 1.5
1.5 ! mrg 1: /* $NetBSD: memcpy.S,v 1.4 2011/05/25 02:11:16 christos Exp $ */
1.1 eeh 2:
3: /*
4: * Copyright (c) 2001 Eduardo E. Horvath
5: *
6: * This software was developed by the Computer Systems Engineering group
7: * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
8: * contributed to Berkeley.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
18: * 3. All advertising materials mentioning features or use of this software
19: * must display the following acknowledgement:
20: * This product includes software developed by the University of
21: * California, Berkeley and its contributors.
22: * 4. Neither the name of the University nor the names of its contributors
23: * may be used to endorse or promote products derived from this software
24: * without specific prior written permission.
25: *
26: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36: * SUCH DAMAGE.
37: *
38: */
39:
40: #include <machine/asm.h>
41: #ifndef _LOCORE
42: #define _LOCORE
43: #endif
44: #include <machine/ctlreg.h>
45: #include <machine/frame.h>
46: #include <machine/psl.h>
47:
48: #if defined(LIBC_SCCS) && !defined(lint)
1.5 ! mrg 49: RCSID("$NetBSD: memcpy.S,v 1.4 2011/05/25 02:11:16 christos Exp $")
1.1 eeh 50: #endif /* LIBC_SCCS and not lint */
51:
52: #define EMPTY nop
53: #define NOTREACHED ta 1
54:
55: #define BCOPY_SMALL 16
1.5 ! mrg 56: #define BLOCK_SIZE SPARC64_BLOCK_SIZE
! 57: #define BLOCK_ALIGN SPARC64_BLOCK_ALIGN
1.1 eeh 58:
59: #if 0
60: #define ASI_STORE ASI_BLK_COMMIT_P
61: #else
62: #define ASI_STORE ASI_BLK_P
63: #endif
64:
1.3 christos 65: #ifndef _ALIGN
66: #define _ALIGN .align 8
67: #endif
68:
1.1 eeh 69: #if 1
70: /*
71: * kernel bcopy/memcpy
72: * Assumes regions do not overlap; has no useful return value.
73: *
74: * Must not use %g7 (see copyin/copyout above).
75: */
76: ENTRY(memcpy) /* dest, src, size */
77: /*
78: * Swap args for bcopy. Gcc generates calls to memcpy for
79: * structure assignments.
80: */
81: mov %o0, %o3 ! %o3 = dest (scratch during swap)
82: mov %o1, %o0 ! %o0 = src, matching bcopy arg order
83: mov %o3, %o1 ! %o1 = dest; fall through into bcopy below
84: #endif
85: ENTRY(bcopy) /* src, dest, size */
86: #ifdef DEBUG
87: set pmapdebug, %o4
88: ld [%o4], %o4
89: btst 0x80, %o4 ! PDB_COPY
90: bz,pt %icc, 3f
91: nop
92: save %sp, -CC64FSZ, %sp
93: mov %i0, %o1
94: set 2f, %o0
95: mov %i1, %o2
96: call printf
97: mov %i2, %o3
98: ! ta 1; nop
99: restore
100: .data
101: 2: .asciz "bcopy(%p->%p,%x)\n"
102: _ALIGN
103: .text
104: 3:
105: #endif
106: /*
107: * Check for overlaps and punt.
108: *
109: * If src <= dest <= src+len we have a problem.
110: */
111:
112: sub %o1, %o0, %o3 ! %o3 = dest - src (unsigned distance)
113:
114: cmp %o3, %o2 ! distance < len --> dest lies inside [src, src+len)
1.2 eeh 115: blu,pn %xcc, Lovbcopy
1.1 eeh 116: cmp %o2, BCOPY_SMALL ! (delay slot) small-copy threshold test
117: Lbcopy_start:
118: bge,pt %xcc, 2f ! if >= this many, go be fancy.
119: cmp %o2, 256
120:
121: mov %o1, %o5 ! Save memcpy return value
122: /*
123: * Not much to copy, just do it a byte at a time.
124: */
125: deccc %o2 ! while (--len >= 0)
126: bl 1f
127: EMPTY
128: 0:
129: inc %o0
130: ldsb [%o0 - 1], %o4 ! (++dst)[-1] = *src++;
131: stb %o4, [%o1]
132: deccc %o2
133: bge 0b
134: inc %o1 ! (delay slot) advance dest
135: 1:
136: retl
137: mov %o5, %o0 ! (delay slot) return saved dest
138: NOTREACHED
139:
139:
140: /*
141: * Overlapping bcopies -- punt.
142: */
143: Lovbcopy:
144:
145: /*
146: * Since src comes before dst, and the regions might overlap,
147: * we have to do the copy starting at the end and working backwards.
148: *
149: * We could optimize this, but it almost never happens.
150: */
151: mov %o1, %o5 ! Retval
152: add %o2, %o0, %o0 ! src += len
153: add %o2, %o1, %o1 ! dst += len
154:
155: deccc %o2
156: bl,pn %xcc, 1f
157: dec %o0 ! (delay slot) point at last src byte
158: 0:
!! Copy backwards one byte at a time, last byte first.
159: dec %o1
160: ldsb [%o0], %o4
161: dec %o0
162:
163: deccc %o2
164: bge,pt %xcc, 0b
165: stb %o4, [%o1] ! (delay slot) store byte
166: 1:
167: retl
168: mov %o5, %o0 ! (delay slot) return saved dest
169:
170: /*
171: * Plenty of data to copy, so try to do it optimally.
172: */
173: 2:
174: #if 1
175: ! If it is big enough, use VIS instructions
176: bge Lbcopy_block ! >= 256 bytes (cc from "cmp %o2, 256" above)
177: nop
178: #endif
179: Lbcopy_fancy:
180:
181: !!
182: !! First align the output to a 8-byte entity
183: !!
!! Peel off a leading byte, halfword, then word as needed so the
!! destination becomes 8-byte aligned; each peel re-checks remaining len.
184:
185: save %sp, -CC64FSZ, %sp
186:
187: mov %i0, %o0 ! %o0 = src
188: mov %i1, %o1 ! %o1 = dest
189:
190: mov %i2, %o2 ! %o2 = len
191: btst 1, %o1
192:
193: bz,pt %icc, 4f
194: btst 2, %o1 ! (delay slot) test for next peel
195: ldub [%o0], %o4 ! Load 1st byte
196:
197: deccc 1, %o2
198: ble,pn %xcc, Lbcopy_finish ! XXXX
199: inc 1, %o0 ! (delay slot) advance src
200:
201: stb %o4, [%o1] ! Store 1st byte
202: inc 1, %o1 ! Update address
203: btst 2, %o1
204: 4:
205: bz,pt %icc, 4f
206:
207: btst 1, %o0
208: bz,a 1f
209: lduh [%o0], %o4 ! Load short
210:
211: ldub [%o0], %o4 ! Load bytes
212:
213: ldub [%o0+1], %o3
214: sllx %o4, 8, %o4
215: or %o3, %o4, %o4 ! assemble halfword from 2 bytes
216:
217: 1:
218: deccc 2, %o2
219: ble,pn %xcc, Lbcopy_finish ! XXXX
220: inc 2, %o0
221: sth %o4, [%o1] ! Store 1st short
222:
223: inc 2, %o1
224: 4:
225: btst 4, %o1
226: bz,pt %xcc, 4f
227:
228: btst 3, %o0
229: bz,a,pt %xcc, 1f
230: lduw [%o0], %o4 ! Load word -1
231:
232: btst 1, %o0
233: bz,a,pt %icc, 2f
234: lduh [%o0], %o4
235:
236: ldub [%o0], %o4 ! src is odd: build word from byte+half+byte
237:
238: lduh [%o0+1], %o3
239: sllx %o4, 16, %o4
240: or %o4, %o3, %o4
241:
242: ldub [%o0+3], %o3
243: sllx %o4, 8, %o4
244: ba,pt %icc, 1f
245: or %o4, %o3, %o4 ! (delay slot) finish word assembly
246:
247: 2:
248: lduh [%o0+2], %o3 ! src halfword aligned: two halves
249: sllx %o4, 16, %o4
250: or %o4, %o3, %o4
251:
252: 1:
253: deccc 4, %o2
254: ble,pn %xcc, Lbcopy_finish ! XXXX
255: inc 4, %o0
256:
257: st %o4, [%o1] ! Store word
258: inc 4, %o1
259: 4:
260: !!
261: !! We are now 32-bit aligned in the dest.
262: !!
263: Lbcopy__common:
264:
!! Set up for doubleword copies: %o4 = left-shift (bits src is past an
!! 8-byte boundary), %o3 = complementary right-shift; %l0 carries the
!! high part of the previous source doubleword across iterations.
265: and %o0, 7, %o4 ! Shift amount
266: andn %o0, 7, %o0 ! Source addr
267:
268: brz,pt %o4, Lbcopy_noshift8 ! No shift version...
269:
270: sllx %o4, 3, %o4 ! In bits
271: mov 8<<3, %o3
272:
273: ldx [%o0], %l0 ! Load word -1
274: sub %o3, %o4, %o3 ! Reverse shift
275: deccc 16*8, %o2 ! Have enough room?
276:
277: sllx %l0, %o4, %l0
278: bl,pn %xcc, 2f
279: and %o3, 0x38, %o3 ! (delay slot) mask shift count to 0..56
280: Lbcopy_unrolled8:
281:
!! Shifted copy, unrolled 8 doublewords per iteration: each output word
!! is (prev << %o4) | (next >> %o3); %l0-%l7 pipeline the source words.
282: /*
283: * This is about as close to optimal as you can get, since
284: * the shifts require EU0 and cannot be paired, and you have
285: * 3 dependent operations on the data.
286: */
287:
288: ! ldx [%o0+0*8], %l0 ! Already done
289: ! sllx %l0, %o4, %l0 ! Already done
290: ldx [%o0+1*8], %l1
291: ldx [%o0+2*8], %l2
292: ldx [%o0+3*8], %l3
293: ldx [%o0+4*8], %l4
294: ldx [%o0+5*8], %l5
295: ldx [%o0+6*8], %l6
296: #if 1
297: ba,pt %icc, 1f
298: ldx [%o0+7*8], %l7
299: .align 8
300: 1:
301: srlx %l1, %o3, %g1
302: inc 8*8, %o0
303:
304: sllx %l1, %o4, %l1
305: or %g1, %l0, %o5
306: ldx [%o0+0*8], %l0
307:
308: stx %o5, [%o1+0*8]
309: srlx %l2, %o3, %g1
310:
311: sllx %l2, %o4, %l2
312: or %g1, %l1, %o5
313: ldx [%o0+1*8], %l1
314:
315: stx %o5, [%o1+1*8]
316: srlx %l3, %o3, %g1
317:
318: sllx %l3, %o4, %l3
319: or %g1, %l2, %o5
320: ldx [%o0+2*8], %l2
321:
322: stx %o5, [%o1+2*8]
323: srlx %l4, %o3, %g1
324:
325: sllx %l4, %o4, %l4
326: or %g1, %l3, %o5
327: ldx [%o0+3*8], %l3
328:
329: stx %o5, [%o1+3*8]
330: srlx %l5, %o3, %g1
331:
332: sllx %l5, %o4, %l5
333: or %g1, %l4, %o5
334: ldx [%o0+4*8], %l4
335:
336: stx %o5, [%o1+4*8]
337: srlx %l6, %o3, %g1
338:
339: sllx %l6, %o4, %l6
340: or %g1, %l5, %o5
341: ldx [%o0+5*8], %l5
342:
343: stx %o5, [%o1+5*8]
344: srlx %l7, %o3, %g1
345:
346: sllx %l7, %o4, %l7
347: or %g1, %l6, %o5
348: ldx [%o0+6*8], %l6
349:
350: stx %o5, [%o1+6*8]
351: srlx %l0, %o3, %g1
352: deccc 8*8, %o2 ! Have enough room?
353:
354: sllx %l0, %o4, %l0 ! Next loop
355: or %g1, %l7, %o5
356: ldx [%o0+7*8], %l7
357:
358: stx %o5, [%o1+7*8]
359: bge,pt %xcc, 1b
360: inc 8*8, %o1
361:
362: Lbcopy_unrolled8_cleanup:
363: !!
364: !! Finished 8 byte block, unload the regs.
365: !!
366: srlx %l1, %o3, %g1
367: inc 7*8, %o0
368:
369: sllx %l1, %o4, %l1
370: or %g1, %l0, %o5
371:
372: stx %o5, [%o1+0*8]
373: srlx %l2, %o3, %g1
374:
375: sllx %l2, %o4, %l2
376: or %g1, %l1, %o5
377:
378: stx %o5, [%o1+1*8]
379: srlx %l3, %o3, %g1
380:
381: sllx %l3, %o4, %l3
382: or %g1, %l2, %o5
383:
384: stx %o5, [%o1+2*8]
385: srlx %l4, %o3, %g1
386:
387: sllx %l4, %o4, %l4
388: or %g1, %l3, %o5
389:
390: stx %o5, [%o1+3*8]
391: srlx %l5, %o3, %g1
392:
393: sllx %l5, %o4, %l5
394: or %g1, %l4, %o5
395:
396: stx %o5, [%o1+4*8]
397: srlx %l6, %o3, %g1
398:
399: sllx %l6, %o4, %l6
400: or %g1, %l5, %o5
401:
402: stx %o5, [%o1+5*8]
403: srlx %l7, %o3, %g1
404:
405: sllx %l7, %o4, %l7
406: or %g1, %l6, %o5
407:
408: stx %o5, [%o1+6*8]
409: inc 7*8, %o1
410:
411: mov %l7, %l0 ! Save our unused data
412: dec 7*8, %o2
413: #else
414: /*
415: * This version also handles aligned copies at almost the
416: * same speed. It should take the same number of cycles
417: * as the previous version, but is slightly slower, probably
418: * due to i$ issues.
419: */
420: ldx [%o0+7*8], %l7
421: ba,pt %icc, 1f
422: clr %g1
423: .align 32
424: 1:
425: srlx %l1, %o3, %g1
426: bz,pn %xcc, 3f
427: inc 8*8, %o0
428:
429: sllx %l1, %o4, %l1
430: or %g1, %l0, %o5
431: ba,pt %icc, 4f
432: ldx [%o0+0*8], %l0
433:
434: nop
435: 3:
436: mov %l0, %o5
437: ldx [%o0+0*8], %l0
438:
439: 4:
440: bz,pn %icc, 3f
441: stx %o5, [%o1+0*8]
442: srlx %l2, %o3, %g1
443:
444: sllx %l2, %o4, %l2
445: 3:
446: or %g1, %l1, %o5
447: ldx [%o0+1*8], %l1
448:
449: bz,pn %icc, 3f
450: stx %o5, [%o1+1*8]
451: srlx %l3, %o3, %g1
452:
453: sllx %l3, %o4, %l3
454: 3:
455: or %g1, %l2, %o5
456: ldx [%o0+2*8], %l2
457:
458: bz,pn %icc, 3f
459: stx %o5, [%o1+2*8]
460: srlx %l4, %o3, %g1
461:
462: sllx %l4, %o4, %l4
463: 3:
464: or %g1, %l3, %o5
465: ldx [%o0+3*8], %l3
466:
467: bz,pn %icc, 3f
468: stx %o5, [%o1+3*8]
469: srlx %l5, %o3, %g1
470:
471: sllx %l5, %o4, %l5
472: 3:
473: or %g1, %l4, %o5
474: ldx [%o0+4*8], %l4
475:
476: bz,pn %icc, 3f
477: stx %o5, [%o1+4*8]
478: srlx %l6, %o3, %g1
479:
480: sllx %l6, %o4, %l6
481: 3:
482: or %g1, %l5, %o5
483: ldx [%o0+5*8], %l5
484:
485: bz,pn %icc, 3f
486: stx %o5, [%o1+5*8]
487: srlx %l7, %o3, %g1
488:
489: sllx %l7, %o4, %l7
490: 3:
491: or %g1, %l6, %o5
492: ldx [%o0+6*8], %l6
493:
494: bz,pn %icc, 3f
495: stx %o5, [%o1+6*8]
496: srlx %l0, %o3, %g1
497:
498: sllx %l0, %o4, %l0 ! Next loop
499: 3:
500: or %g1, %l7, %o5
501: ldx [%o0+7*8], %l7
502: deccc 8*8, %o2 ! Have enough room?
503:
504: stx %o5, [%o1+7*8]
505: inc 8*8, %o1
506: bge,pt %xcc, 1b
507: tst %o4
508:
509:
510: !!
511: !! Now unload all those regs
512: !!
513: Lbcopy_unrolled8_cleanup:
514: srlx %l1, %o3, %g1
515: bz,pn %xcc, 3f
516: inc 7*8, %o0 ! Point at the last load
517:
518: sllx %l1, %o4, %l1
519: ba,pt %icc, 4f
520: or %g1, %l0, %o5
521:
522: 3:
523: mov %l0, %o5
524:
525: 4:
526: bz,pn %icc, 3f
527: stx %o5, [%o1+0*8]
528: srlx %l2, %o3, %g1
529:
530: sllx %l2, %o4, %l2
531: 3:
532: or %g1, %l1, %o5
533:
534: bz,pn %icc, 3f
535: stx %o5, [%o1+1*8]
536: srlx %l3, %o3, %g1
537:
538: sllx %l3, %o4, %l3
539: 3:
540: or %g1, %l2, %o5
541:
542: bz,pn %icc, 3f
543: stx %o5, [%o1+2*8]
544: srlx %l4, %o3, %g1
545:
546: sllx %l4, %o4, %l4
547: 3:
548: or %g1, %l3, %o5
549:
550: bz,pn %icc, 3f
551: stx %o5, [%o1+3*8]
552: srlx %l5, %o3, %g1
553:
554: sllx %l5, %o4, %l5
555: 3:
556: or %g1, %l4, %o5
557:
558: bz,pn %icc, 3f
559: stx %o5, [%o1+4*8]
560: srlx %l6, %o3, %g1
561:
562: sllx %l6, %o4, %l6
563: 3:
564: or %g1, %l5, %o5
565:
566: bz,pn %icc, 3f
567: stx %o5, [%o1+5*8]
568: srlx %l7, %o3, %g1
569:
570: sllx %l7, %o4, %l7
571: 3:
572: or %g1, %l6, %o5
573: mov %l7, %l0 ! Shuffle to %l0
574:
575: stx %o5, [%o1+6*8]
576: or %g1, %l7, %o5
577: dec 7*8, %o2
578:
579: inc 7*8, %o1 ! Point at last store
580: #endif
581: 2:
582: inccc 16*8, %o2 ! undo the unrolled-loop reservation
583: bz,pn %icc, Lbcopy_complete
584:
585: !! Unrolled 8 times
!! Residual shifted copy, one doubleword per iteration.
586: Lbcopy_aligned8:
587: ! ldx [%o0], %l0 ! Already done
588: ! sllx %l0, %o4, %l0 ! Shift high word
589:
590: deccc 8, %o2 ! Pre-decrement
591: bl,pn %xcc, Lbcopy_finish
592: 1:
593: ldx [%o0+8], %l1 ! Load word 0
594: inc 8, %o0
595:
596: srlx %l1, %o3, %o5
597: or %o5, %l0, %o5 ! Combine
598:
599: stx %o5, [%o1] ! Store result
600: inc 8, %o1
601:
602: deccc 8, %o2
603: bge,pn %xcc, 1b
604: sllx %l1, %o4, %l0 ! (delay slot) carry high part forward
605:
606: btst 7, %o2 ! Done?
607: bz,pt %xcc, Lbcopy_complete
608:
609: !!
610: !! Loadup the last dregs into %l0 and shift it into place
611: !!
612: srlx %o3, 3, %o5 ! # bytes in %l0
613: dec 8, %o5 ! - 8
614: !! n-8 - (by - 8) -> n - by
615: subcc %o2, %o5, %g0 ! # bytes we need
616: ble,pt %icc, Lbcopy_finish
617: nop
618: ldx [%o0+8], %l1 ! Need another word
619: srlx %l1, %o3, %l1
620: ba,pt %icc, Lbcopy_finish
621: or %l0, %l1, %l0 ! All loaded up.
623: Lbcopy_noshift8:
!! Source and dest are mutually 8-byte aligned: straight doubleword
!! copy, unrolled 8 doublewords (64 bytes) per iteration.
!! (Removed a stray duplicate "stx %l2, [%o1+2*8]" that redundantly
!! re-stored word 2 each iteration -- word 2 is already stored above.)
624: deccc 8*8, %o2 ! Have enough room?
625: bl,pn %xcc, 2f
626: nop
627: ba,pt %icc, 1f
628: nop
629: .align 32
630: 1:
631: ldx [%o0+0*8], %l0
632: ldx [%o0+1*8], %l1
633: ldx [%o0+2*8], %l2
634: ldx [%o0+3*8], %l3
635: stx %l0, [%o1+0*8]
636: stx %l1, [%o1+1*8]
637: stx %l2, [%o1+2*8]
638: stx %l3, [%o1+3*8]
639:
640:
641: ldx [%o0+4*8], %l4
642: ldx [%o0+5*8], %l5
643: ldx [%o0+6*8], %l6
644: ldx [%o0+7*8], %l7
645: inc 8*8, %o0
646: stx %l4, [%o1+4*8]
647: stx %l5, [%o1+5*8]
648: deccc 8*8, %o2 ! condition codes for bge below
649: stx %l6, [%o1+6*8]
650: stx %l7, [%o1+7*8]
652: bge,pt %xcc, 1b
653: inc 8*8, %o1 ! (delay slot) advance dest
654: 2:
655: inc 8*8, %o2 ! restore count reserved for the unrolled loop
656: 1:
!! Copy remaining full doublewords one at a time.
657: deccc 8, %o2
658: bl,pn %icc, 1f ! < 0 --> sub word
659: nop
660: ldx [%o0], %o5
661: inc 8, %o0
662: stx %o5, [%o1]
663: bg,pt %icc, 1b ! Exactly 0 --> done
664: inc 8, %o1 ! (delay slot) advance dest
665: 1:
666: btst 7, %o2 ! Done?
667: bz,pt %xcc, Lbcopy_complete
668: clr %o4 ! (delay slot) no shift for finish code
669: ldx [%o0], %l0 ! last partial doubleword for Lbcopy_finish
670: Lbcopy_finish:
671:
!! Store the final 1-8 bytes held in %l0, high byte first, peeling a
!! word, then a halfword, then a byte as dictated by the low len bits.
672: brz,pn %o2, 2f ! 100% complete?
673: cmp %o2, 8 ! Exactly 8 bytes?
674: bz,a,pn %xcc, 2f
675: stx %l0, [%o1] ! (annulled delay slot) store full doubleword
676:
677: btst 4, %o2 ! Word store?
678: bz %xcc, 1f
679: srlx %l0, 32, %o5 ! Shift high word down
680: stw %o5, [%o1]
681: inc 4, %o1
682: mov %l0, %o5 ! Operate on the low bits
683: 1:
684: btst 2, %o2
685: mov %o5, %l0
686: bz 1f
687: srlx %l0, 16, %o5 ! (delay slot) next halfword to store
688:
689: sth %o5, [%o1] ! Store short
690: inc 2, %o1
691: mov %l0, %o5 ! Operate on low bytes
692: 1:
693: mov %o5, %l0
694: btst 1, %o2 ! Byte aligned?
695: bz 2f
696: srlx %l0, 8, %o5 ! (delay slot) last byte to store
697:
698: stb %o5, [%o1] ! Store last byte
699: inc 1, %o1 ! Update address
700: 2:
701: Lbcopy_complete:
702: #if 0
703: !!
704: !! verify copy success.
705: !!
706:
707: mov %i0, %o2
708: mov %i1, %o4
709: mov %i2, %l4
710: 0:
711: ldub [%o2], %o1
712: inc %o2
713: ldub [%o4], %o3
714: inc %o4
715: cmp %o3, %o1
716: bnz 1f
717: dec %l4
718: brnz %l4, 0b
719: nop
720: ba 2f
721: nop
722:
723: 1:
724: set 0f, %o0
725: call printf
726: sub %i2, %l4, %o5
727: set 1f, %o0
728: mov %i0, %o1
729: mov %i1, %o2
730: call printf
731: mov %i2, %o3
732: ta 1
733: .data
734: 0: .asciz "bcopy failed: %x@%p != %x@%p byte %d\n"
735: 1: .asciz "bcopy(%p, %p, %lx)\n"
1.4 christos 736: _ALIGN
1.1 eeh 737: .text
738: 2:
739: #endif
740: ret
741: restore %i1, %g0, %o0 ! return original dest (%i1) in %o0
742:
743: #if 1
744:
745: /*
746: * Block copy. Useful for >256 byte copies.
747: *
748: * Benchmarking has shown this always seems to be slower than
749: * the integer version, so this is disabled. Maybe someone will
750: * figure out why sometime.
751: */
752:
753: Lbcopy_block:
754: #ifdef _KERNEL
755: /*
756: * Kernel:
757: *
758: * Here we use VIS instructions to do a block copy.
759: * But before we can do that we need to save and enable the FPU.
760: * The last owner of the FPU registers is fpproc, and
761: * fpproc->p_md.md_fpstate is the current fpstate. If that's not
762: * null, call savefpstate() with it to store our current fp state.
763: *
764: * Next, allocate an aligned fpstate on the stack. We will properly
765: * nest calls on a particular stack so this should not be a problem.
766: *
767: * Now we grab either curproc (or if we're on the interrupt stack
768: * proc0). We stash its existing fpstate in a local register and
769: * put our new fpstate in curproc->p_md.md_fpstate. We point
770: * fpproc at curproc (or proc0) and enable the FPU.
771: *
772: * If we are ever preempted, our FPU state will be saved in our
773: * fpstate. Then, when we're resumed and we take an FPDISABLED
774: * trap, the trap handler will be able to fish our FPU state out
775: * of curproc (or proc0).
776: *
777: * On exiting this routine we undo the damage: restore the original
778: * pointer to curproc->p_md.md_fpstate, clear our fpproc, and disable
779: * the MMU.
780: *
781: *
782: * Register usage, Kernel only (after save):
783: *
784: * %i0 src
785: * %i1 dest
786: * %i2 size
787: *
788: * %l0 XXXX DEBUG old fpstate
789: * %l1 fpproc (hi bits only)
790: * %l2 orig fpproc
791: * %l3 orig fpstate
792: * %l5 curproc
793: * %l6 old fpstate
794: *
795: * Register usage, Kernel and user:
796: *
797: * %g1 src (retval for memcpy)
798: *
799: * %o0 src
800: * %o1 dest
801: * %o2 end dest
802: * %o5 last safe fetchable address
803: */
804:
805: #if 1
806: ENABLE_FPU(0)
807: #else
808: save %sp, -(CC64FSZ+FS_SIZE+BLOCK_SIZE), %sp ! Allocate an fpstate
809: sethi %hi(FPPROC), %l1
810: LDPTR [%l1 + %lo(FPPROC)], %l2 ! Load fpproc
811: add %sp, (CC64FSZ+STKB+BLOCK_SIZE-1), %l0 ! Calculate pointer to fpstate
812: brz,pt %l2, 1f ! fpproc == NULL?
813: andn %l0, BLOCK_ALIGN, %l0 ! And make it block aligned
814: LDPTR [%l2 + P_FPSTATE], %l3
815: brz,pn %l3, 1f ! Make sure we have an fpstate
816: mov %l3, %o0
817: call _C_LABEL(savefpstate) ! Save the old fpstate
818: set EINTSTACK-STKB, %l4 ! Are we on intr stack?
819: cmp %sp, %l4
820: bgu,pt %xcc, 1f
821: set INTSTACK-STKB, %l4
822: cmp %sp, %l4
823: blu %xcc, 1f
824: 0:
825: sethi %hi(_C_LABEL(proc0)), %l4 ! Yes, use proc0
826: ba,pt %xcc, 2f ! XXXX needs to change to CPUs idle proc
827: or %l4, %lo(_C_LABEL(proc0)), %l5
828: 1:
829: sethi %hi(CURPROC), %l4 ! Use curproc
830: LDPTR [%l4 + %lo(CURPROC)], %l5
831: brz,pn %l5, 0b ! If curproc is NULL need to use proc0
832: nop
833: 2:
834: LDPTR [%l5 + P_FPSTATE], %l6 ! Save old fpstate
835: STPTR %l0, [%l5 + P_FPSTATE] ! Insert new fpstate
836: STPTR %l5, [%l1 + %lo(FPPROC)] ! Set new fpproc
837: wr %g0, FPRS_FEF, %fprs ! Enable FPU
838: #endif
839: mov %i0, %o0 ! Src addr.
840: mov %i1, %o1 ! Store our dest ptr here.
841: mov %i2, %o2 ! Len counter
842: #endif
843:
844: !!
845: !! First align the output to a 64-bit entity
846: !!
847:
848: mov %o1, %g1 ! memcpy retval
849: add %o0, %o2, %o5 ! End of source block
850:
851: andn %o0, 7, %o3 ! Start of block
852: dec %o5
853: fzero %f0
854:
855: andn %o5, BLOCK_ALIGN, %o5 ! Last safe addr.
856: ldd [%o3], %f2 ! Load 1st word
857:
858: dec 8, %o3 ! Move %o3 1 word back
859: btst 1, %o1
860: bz 4f
861:
862: mov -7, %o4 ! Lowest src addr possible
863: alignaddr %o0, %o4, %o4 ! Base addr for load.
864:
865: cmp %o3, %o4
866: be,pt %xcc, 1f ! Already loaded?
867: mov %o4, %o3
868: fmovd %f2, %f0 ! No. Shift
869: ldd [%o3+8], %f2 ! And load
870: 1:
871:
872: faligndata %f0, %f2, %f4 ! Isolate 1st byte
873:
874: stda %f4, [%o1] ASI_FL8_P ! Store 1st byte
875: inc 1, %o1 ! Update address
876: inc 1, %o0
877: dec 1, %o2
878: 4:
879: btst 2, %o1
880: bz 4f
881:
882: mov -6, %o4 ! Calculate src - 6
883: alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
884:
885: cmp %o3, %o4 ! Addresses same?
886: be,pt %xcc, 1f
887: mov %o4, %o3
888: fmovd %f2, %f0 ! Shuffle data
889: ldd [%o3+8], %f2 ! Load word 0
890: 1:
891: faligndata %f0, %f2, %f4 ! Move 1st short low part of f8
892:
893: stda %f4, [%o1] ASI_FL16_P ! Store 1st short
894: dec 2, %o2
895: inc 2, %o1
896: inc 2, %o0
897: 4:
898: brz,pn %o2, Lbcopy_blockfinish ! XXXX
899:
900: btst 4, %o1
901: bz 4f
902:
903: mov -4, %o4
904: alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
905:
906: cmp %o3, %o4 ! Addresses same?
907: beq,pt %xcc, 1f
908: mov %o4, %o3
909: fmovd %f2, %f0 ! Shuffle data
910: ldd [%o3+8], %f2 ! Load word 0
911: 1:
912: faligndata %f0, %f2, %f4 ! Move 1st short low part of f8
913:
914: st %f5, [%o1] ! Store word
915: dec 4, %o2
916: inc 4, %o1
917: inc 4, %o0
918: 4:
919: brz,pn %o2, Lbcopy_blockfinish ! XXXX
920: !!
921: !! We are now 32-bit aligned in the dest.
922: !!
923: Lbcopy_block_common:
924:
925: mov -0, %o4
926: alignaddr %o0, %o4, %o4 ! base - shift
927:
928: cmp %o3, %o4 ! Addresses same?
929: beq,pt %xcc, 1f
930: mov %o4, %o3
931: fmovd %f2, %f0 ! Shuffle data
932: ldd [%o3+8], %f2 ! Load word 0
933: 1:
934: add %o3, 8, %o0 ! now use %o0 for src
935:
936: !!
937: !! Continue until our dest is block aligned
938: !!
939: Lbcopy_block_aligned8:
940: 1:
941: brz %o2, Lbcopy_blockfinish
942: btst BLOCK_ALIGN, %o1 ! Block aligned?
943: bz 1f
944:
945: faligndata %f0, %f2, %f4 ! Generate result
946: deccc 8, %o2
947: ble,pn %icc, Lbcopy_blockfinish ! Should never happen
948: fmovd %f4, %f48
949:
950: std %f4, [%o1] ! Store result
951: inc 8, %o1
952:
953: fmovd %f2, %f0
954: inc 8, %o0
955: ba,pt %xcc, 1b ! Not yet.
956: ldd [%o0], %f2 ! Load next part
957: Lbcopy_block_aligned64:
958: 1:
959:
960: /*
961: * 64-byte aligned -- ready for block operations.
962: *
963: * Here we have the destination block aligned, but the
964: * source pointer may not be. Sub-word alignment will
965: * be handled by faligndata instructions. But the source
966: * can still be potentially aligned to 8 different words
967: * in our 64-bit block, so we have 8 different copy routines.
968: *
969: * Once we figure out our source alignment, we branch
970: * to the appropriate copy routine, which sets up the
971: * alignment for faligndata and loads (sets) the values
972: * into the source registers and does the copy loop.
973: *
974: * When were down to less than 1 block to store, we
975: * exit the copy loop and execute cleanup code.
976: *
977: * Block loads and stores are not properly interlocked.
978: * Stores save one reg/cycle, so you can start overwriting
979: * registers the cycle after the store is issued.
980: *
981: * Block loads require a block load to a different register
982: * block or a membar #Sync before accessing the loaded
983: * data.
984: *
985: * Since the faligndata instructions may be offset as far
986: * as 7 registers into a block (if you are shifting source
987: * 7 -> dest 0), you need 3 source register blocks for full
988: * performance: one you are copying, one you are loading,
989: * and one for interlocking. Otherwise, we would need to
990: * sprinkle the code with membar #Sync and lose the advantage
991: * of running faligndata in parallel with block stores. This
992: * means we are fetching a full 128 bytes ahead of the stores.
993: * We need to make sure the prefetch does not inadvertently
994: * cross a page boundary and fault on data that we will never
995: * store.
996: *
997: */
998: #if 1
999: and %o0, BLOCK_ALIGN, %o3
1000: srax %o3, 3, %o3 ! Isolate the offset
1001:
1002: brz %o3, L100 ! 0->0
1003: btst 4, %o3
1004: bnz %xcc, 4f
1005: btst 2, %o3
1006: bnz %xcc, 2f
1007: btst 1, %o3
1008: ba,pt %xcc, L101 ! 0->1
1009: nop /* XXX spitfire bug */
1010: 2:
1011: bz %xcc, L102 ! 0->2
1012: nop
1013: ba,pt %xcc, L103 ! 0->3
1014: nop /* XXX spitfire bug */
1015: 4:
1016: bnz %xcc, 2f
1017: btst 1, %o3
1018: bz %xcc, L104 ! 0->4
1019: nop
1020: ba,pt %xcc, L105 ! 0->5
1021: nop /* XXX spitfire bug */
1022: 2:
1023: bz %xcc, L106 ! 0->6
1024: nop
1025: ba,pt %xcc, L107 ! 0->7
1026: nop /* XXX spitfire bug */
1027: #else
1028:
1029: !!
1030: !! Isolate the word offset, which just happens to be
1031: !! the slot in our jump table.
1032: !!
1033: !! This is 6 insns, most of which cannot be paired,
1034: !! which is about the same as the above version.
1035: !!
1036: rd %pc, %o4
1037: 1:
1038: and %o0, 0x31, %o3
1039: add %o3, (Lbcopy_block_jmp - 1b), %o3
1040: jmpl %o4 + %o3, %g0
1041: nop
1042:
1043: !!
1044: !! Jump table
1045: !!
1046:
1047: Lbcopy_block_jmp:
1048: ba,a,pt %xcc, L100
1049: nop
1050: ba,a,pt %xcc, L101
1051: nop
1052: ba,a,pt %xcc, L102
1053: nop
1054: ba,a,pt %xcc, L103
1055: nop
1056: ba,a,pt %xcc, L104
1057: nop
1058: ba,a,pt %xcc, L105
1059: nop
1060: ba,a,pt %xcc, L106
1061: nop
1062: ba,a,pt %xcc, L107
1063: nop
1064: #endif
1065:
1066: !!
1067: !! Source is block aligned.
1068: !!
1069: !! Just load a block and go.
1070: !!
1071: L100:
1072: #ifdef RETURN_NAME
1073: sethi %hi(1f), %g1
1074: ba,pt %icc, 2f
1075: or %g1, %lo(1f), %g1
1076: 1:
1077: .asciz "L100"
1078: .align 8
1079: 2:
1080: #endif
1081: fmovd %f0 , %f62
1082: ldda [%o0] ASI_BLK_P, %f0
1083: inc BLOCK_SIZE, %o0
1084: cmp %o0, %o5
1085: bleu,a,pn %icc, 3f
1086: ldda [%o0] ASI_BLK_P, %f16
1087: ba,pt %icc, 3f
1088: membar #Sync
1089:
1090: .align 32 ! ICache align.
1091: 3:
1092: faligndata %f62, %f0, %f32
1093: inc BLOCK_SIZE, %o0
1094: faligndata %f0, %f2, %f34
1095: dec BLOCK_SIZE, %o2
1096: faligndata %f2, %f4, %f36
1097: cmp %o0, %o5
1098: faligndata %f4, %f6, %f38
1099: faligndata %f6, %f8, %f40
1100: faligndata %f8, %f10, %f42
1101: faligndata %f10, %f12, %f44
1102: brlez,pn %o2, Lbcopy_blockdone
1103: faligndata %f12, %f14, %f46
1104:
1105: bleu,a,pn %icc, 2f
1106: ldda [%o0] ASI_BLK_P, %f48
1107: membar #Sync
1108: 2:
1109: stda %f32, [%o1] ASI_STORE
1110: faligndata %f14, %f16, %f32
1111: inc BLOCK_SIZE, %o0
1112: faligndata %f16, %f18, %f34
1113: inc BLOCK_SIZE, %o1
1114: faligndata %f18, %f20, %f36
1115: dec BLOCK_SIZE, %o2
1116: faligndata %f20, %f22, %f38
1117: cmp %o0, %o5
1118: faligndata %f22, %f24, %f40
1119: faligndata %f24, %f26, %f42
1120: faligndata %f26, %f28, %f44
1121: brlez,pn %o2, Lbcopy_blockdone
1122: faligndata %f28, %f30, %f46
1123:
1124: bleu,a,pn %icc, 2f
1125: ldda [%o0] ASI_BLK_P, %f0
1126: membar #Sync
1127: 2:
1128: stda %f32, [%o1] ASI_STORE
1129: faligndata %f30, %f48, %f32
1130: inc BLOCK_SIZE, %o0
1131: faligndata %f48, %f50, %f34
1132: inc BLOCK_SIZE, %o1
1133: faligndata %f50, %f52, %f36
1134: dec BLOCK_SIZE, %o2
1135: faligndata %f52, %f54, %f38
1136: cmp %o0, %o5
1137: faligndata %f54, %f56, %f40
1138: faligndata %f56, %f58, %f42
1139: faligndata %f58, %f60, %f44
1140: brlez,pn %o2, Lbcopy_blockdone
1141: faligndata %f60, %f62, %f46
1142: bleu,a,pn %icc, 2f
1143: ldda [%o0] ASI_BLK_P, %f16 ! Increment is at top
1144: membar #Sync
1145: 2:
1146: stda %f32, [%o1] ASI_STORE
1147: ba 3b
1148: inc BLOCK_SIZE, %o1
1149:
1150: !!
1151: !! Source at BLOCK_ALIGN+8
1152: !!
1153: !! We need to load almost 1 complete block by hand.
1154: !!
1155: L101:
1156: #ifdef RETURN_NAME
1157: sethi %hi(1f), %g1
1158: ba,pt %icc, 2f
1159: or %g1, %lo(1f), %g1
1160: 1:
1161: .asciz "L101"
1162: .align 8
1163: 2:
1164: #endif
1165: ! fmovd %f0, %f0 ! Hoist fmovd
1166: ldd [%o0], %f2
1167: inc 8, %o0
1168: ldd [%o0], %f4
1169: inc 8, %o0
1170: ldd [%o0], %f6
1171: inc 8, %o0
1172: ldd [%o0], %f8
1173: inc 8, %o0
1174: ldd [%o0], %f10
1175: inc 8, %o0
1176: ldd [%o0], %f12
1177: inc 8, %o0
1178: ldd [%o0], %f14
1179: inc 8, %o0
1180:
1181: cmp %o0, %o5
1182: bleu,a,pn %icc, 3f
1183: ldda [%o0] ASI_BLK_P, %f16
1184: membar #Sync
1185: 3:
1186: faligndata %f0, %f2, %f32
1187: inc BLOCK_SIZE, %o0
1188: faligndata %f2, %f4, %f34
1189: cmp %o0, %o5
1190: faligndata %f4, %f6, %f36
1191: dec BLOCK_SIZE, %o2
1192: faligndata %f6, %f8, %f38
1193: faligndata %f8, %f10, %f40
1194: faligndata %f10, %f12, %f42
1195: faligndata %f12, %f14, %f44
1196: bleu,a,pn %icc, 2f
1197: ldda [%o0] ASI_BLK_P, %f48
1198: membar #Sync
1199: 2:
1200: brlez,pn %o2, Lbcopy_blockdone
1201: faligndata %f14, %f16, %f46
1202:
1203: stda %f32, [%o1] ASI_STORE
1204:
1205: faligndata %f16, %f18, %f32
1206: inc BLOCK_SIZE, %o0
1207: faligndata %f18, %f20, %f34
1208: inc BLOCK_SIZE, %o1
1209: faligndata %f20, %f22, %f36
1210: cmp %o0, %o5
1211: faligndata %f22, %f24, %f38
1212: dec BLOCK_SIZE, %o2
1213: faligndata %f24, %f26, %f40
1214: faligndata %f26, %f28, %f42
1215: faligndata %f28, %f30, %f44
1216: bleu,a,pn %icc, 2f
1217: ldda [%o0] ASI_BLK_P, %f0
1218: membar #Sync
1219: 2:
1220: brlez,pn %o2, Lbcopy_blockdone
1221: faligndata %f30, %f48, %f46
1222:
1223: stda %f32, [%o1] ASI_STORE
1224:
1225: faligndata %f48, %f50, %f32
1226: inc BLOCK_SIZE, %o0
1227: faligndata %f50, %f52, %f34
1228: inc BLOCK_SIZE, %o1
1229: faligndata %f52, %f54, %f36
1230: cmp %o0, %o5
1231: faligndata %f54, %f56, %f38
1232: dec BLOCK_SIZE, %o2
1233: faligndata %f56, %f58, %f40
1234: faligndata %f58, %f60, %f42
1235: faligndata %f60, %f62, %f44
1236: bleu,a,pn %icc, 2f
1237: ldda [%o0] ASI_BLK_P, %f16
1238: membar #Sync
1239: 2:
1240: brlez,pn %o2, Lbcopy_blockdone
1241: faligndata %f62, %f0, %f46
1242:
1243: stda %f32, [%o1] ASI_STORE
1244: ba 3b
1245: inc BLOCK_SIZE, %o1
1246:
1247: !!
1248: !! Source at BLOCK_ALIGN+16
1249: !!
1250: !! We need to load 6 doubles by hand.
1251: !!
1252: L102:
1253: #ifdef RETURN_NAME
1254: sethi %hi(1f), %g1
1255: ba,pt %icc, 2f
1256: or %g1, %lo(1f), %g1
1257: 1:
1258: .asciz "L102"
1259: .align 8
1260: 2:
1261: #endif
1262: ldd [%o0], %f4
1263: inc 8, %o0
1264: fmovd %f0, %f2 ! Hoist fmovd
1265: ldd [%o0], %f6
1266: inc 8, %o0
1267:
1268: ldd [%o0], %f8
1269: inc 8, %o0
1270: ldd [%o0], %f10
1271: inc 8, %o0
1272: ldd [%o0], %f12
1273: inc 8, %o0
1274: ldd [%o0], %f14
1275: inc 8, %o0
1276:
1277: cmp %o0, %o5
1278: bleu,a,pn %icc, 3f
1279: ldda [%o0] ASI_BLK_P, %f16
1280: membar #Sync
1281: 3:
1282: faligndata %f2, %f4, %f32
1283: inc BLOCK_SIZE, %o0
1284: faligndata %f4, %f6, %f34
1285: cmp %o0, %o5
1286: faligndata %f6, %f8, %f36
1287: dec BLOCK_SIZE, %o2
1288: faligndata %f8, %f10, %f38
1289: faligndata %f10, %f12, %f40
1290: faligndata %f12, %f14, %f42
1291: bleu,a,pn %icc, 2f
1292: ldda [%o0] ASI_BLK_P, %f48
1293: membar #Sync
1294: 2:
1295: faligndata %f14, %f16, %f44
1296:
1297: brlez,pn %o2, Lbcopy_blockdone
1298: faligndata %f16, %f18, %f46
1299:
1300: stda %f32, [%o1] ASI_STORE
1301:
1302: faligndata %f18, %f20, %f32
1303: inc BLOCK_SIZE, %o0
1304: faligndata %f20, %f22, %f34
1305: inc BLOCK_SIZE, %o1
1306: faligndata %f22, %f24, %f36
1307: cmp %o0, %o5
1308: faligndata %f24, %f26, %f38
1309: dec BLOCK_SIZE, %o2
1310: faligndata %f26, %f28, %f40
1311: faligndata %f28, %f30, %f42
1312: bleu,a,pn %icc, 2f
1313: ldda [%o0] ASI_BLK_P, %f0
1314: membar #Sync
1315: 2:
1316: faligndata %f30, %f48, %f44
1317: brlez,pn %o2, Lbcopy_blockdone
1318: faligndata %f48, %f50, %f46
1319:
1320: stda %f32, [%o1] ASI_STORE
1321:
1322: faligndata %f50, %f52, %f32
1323: inc BLOCK_SIZE, %o0
1324: faligndata %f52, %f54, %f34
1325: inc BLOCK_SIZE, %o1
1326: faligndata %f54, %f56, %f36
1327: cmp %o0, %o5
1328: faligndata %f56, %f58, %f38
1329: dec BLOCK_SIZE, %o2
1330: faligndata %f58, %f60, %f40
1331: faligndata %f60, %f62, %f42
1332: bleu,a,pn %icc, 2f
1333: ldda [%o0] ASI_BLK_P, %f16
1334: membar #Sync
1335: 2:
1336: faligndata %f62, %f0, %f44
1337: brlez,pn %o2, Lbcopy_blockdone
1338: faligndata %f0, %f2, %f46
1339:
1340: stda %f32, [%o1] ASI_STORE
1341: ba 3b
1342: inc BLOCK_SIZE, %o1
1343: 
1344: !!
1345: !! Source at BLOCK_ALIGN+24
1346: !!
1347: !! We need to load 5 doubles by hand.
1348: !!
!! Registers (from surrounding code): %o0 = src, %o1 = dst,
!! %o2 = bytes remaining (overcommitted by BLOCK_SIZE),
!! %o5 = limit for speculative ASI_BLK_P loads.
1349: L103:
1350: #ifdef RETURN_NAME
1351: sethi %hi(1f), %g1
1352: ba,pt %icc, 2f
1353: or %g1, %lo(1f), %g1
1354: 1:
1355: .asciz "L103"
1356: .align 8
1357: 2:
1358: #endif
1359: fmovd %f0, %f4
1360: ldd [%o0], %f6
1361: inc 8, %o0
1362: ldd [%o0], %f8
1363: inc 8, %o0
1364: ldd [%o0], %f10
1365: inc 8, %o0
1366: ldd [%o0], %f12
1367: inc 8, %o0
1368: ldd [%o0], %f14
1369: inc 8, %o0
1370: 
1371: cmp %o0, %o5
1372: bleu,a,pn %icc, 2f
1373: ldda [%o0] ASI_BLK_P, %f16
1374: membar #Sync
1375: 2:
1376: inc BLOCK_SIZE, %o0
!! Pipeline: align 64B into %f32..%f46, block-store, speculatively
!! block-load the next 64B only while %o0 <= %o5 (annulled slot).
1377: 3:
1378: faligndata %f4, %f6, %f32
1379: cmp %o0, %o5
1380: faligndata %f6, %f8, %f34
1381: dec BLOCK_SIZE, %o2
1382: faligndata %f8, %f10, %f36
1383: faligndata %f10, %f12, %f38
1384: faligndata %f12, %f14, %f40
1385: bleu,a,pn %icc, 2f
1386: ldda [%o0] ASI_BLK_P, %f48
1387: membar #Sync
1388: 2:
1389: faligndata %f14, %f16, %f42
1390: inc BLOCK_SIZE, %o0
1391: faligndata %f16, %f18, %f44
1392: brlez,pn %o2, Lbcopy_blockdone
1393: faligndata %f18, %f20, %f46
1394: 
1395: stda %f32, [%o1] ASI_STORE
1396: 
1397: faligndata %f20, %f22, %f32
1398: cmp %o0, %o5
1399: faligndata %f22, %f24, %f34
1400: dec BLOCK_SIZE, %o2
1401: faligndata %f24, %f26, %f36
1402: inc BLOCK_SIZE, %o1
1403: faligndata %f26, %f28, %f38
1404: faligndata %f28, %f30, %f40
!! FIX(review): was "ble,a,pn" (signed).  %o0/%o5 are pointers and every
!! other %o0-vs-%o5 compare in this routine branches with bleu
!! (unsigned); a signed branch mispredicates the speculative load when
!! the two addresses differ in the sign bit of the compare.
1405: bleu,a,pn %icc, 2f
1406: ldda [%o0] ASI_BLK_P, %f0
1407: membar #Sync
1408: 2:
1409: faligndata %f30, %f48, %f42
1410: inc BLOCK_SIZE, %o0
1411: faligndata %f48, %f50, %f44
1412: brlez,pn %o2, Lbcopy_blockdone
1413: faligndata %f50, %f52, %f46
1414: 
1415: stda %f32, [%o1] ASI_STORE
1416: 
1417: faligndata %f52, %f54, %f32
1418: cmp %o0, %o5
1419: faligndata %f54, %f56, %f34
1420: dec BLOCK_SIZE, %o2
1421: faligndata %f56, %f58, %f36
1422: faligndata %f58, %f60, %f38
1423: inc BLOCK_SIZE, %o1
1424: faligndata %f60, %f62, %f40
1425: bleu,a,pn %icc, 2f
1426: ldda [%o0] ASI_BLK_P, %f16
1427: membar #Sync
1428: 2:
1429: faligndata %f62, %f0, %f42
1430: inc BLOCK_SIZE, %o0
1431: faligndata %f0, %f2, %f44
1432: brlez,pn %o2, Lbcopy_blockdone
1433: faligndata %f2, %f4, %f46
1434: 
1435: stda %f32, [%o1] ASI_STORE
1436: ba 3b
1437: inc BLOCK_SIZE, %o1
1438: 
1438: 
1439: !!
1440: !! Source at BLOCK_ALIGN+32
1441: !!
1442: !! We need to load 4 doubles by hand.
1443: !!
!! Same three-stage pipeline as the other LNNN cases: align 64B into
!! %f32..%f46 with faligndata, block-store at [%o1], speculatively
!! block-load the next 64B only while %o0 <= %o5 (annulled delay slot),
!! exit via brlez %o2 to Lbcopy_blockdone.
1444: L104:
1445: #ifdef RETURN_NAME
1446: sethi %hi(1f), %g1
1447: ba,pt %icc, 2f
1448: or %g1, %lo(1f), %g1
1449: 1:
1450: .asciz "L104"
1451: .align 8
1452: 2:
1453: #endif
1454: fmovd %f0, %f6
1455: ldd [%o0], %f8
1456: inc 8, %o0
1457: ldd [%o0], %f10
1458: inc 8, %o0
1459: ldd [%o0], %f12
1460: inc 8, %o0
1461: ldd [%o0], %f14
1462: inc 8, %o0
1463: 
1464: cmp %o0, %o5
1465: bleu,a,pn %icc, 2f
1466: ldda [%o0] ASI_BLK_P, %f16
1467: membar #Sync
1468: 2:
1469: inc BLOCK_SIZE, %o0
1470: 3:
1471: faligndata %f6, %f8, %f32
1472: cmp %o0, %o5
1473: faligndata %f8, %f10, %f34
1474: dec BLOCK_SIZE, %o2
1475: faligndata %f10, %f12, %f36
1476: faligndata %f12, %f14, %f38
1477: bleu,a,pn %icc, 2f
1478: ldda [%o0] ASI_BLK_P, %f48
1479: membar #Sync
1480: 2:
1481: faligndata %f14, %f16, %f40
1482: faligndata %f16, %f18, %f42
1483: inc BLOCK_SIZE, %o0
1484: faligndata %f18, %f20, %f44
1485: brlez,pn %o2, Lbcopy_blockdone
1486: faligndata %f20, %f22, %f46
1487: 
1488: stda %f32, [%o1] ASI_STORE
1489: 
1490: faligndata %f22, %f24, %f32
1491: cmp %o0, %o5
1492: faligndata %f24, %f26, %f34
1493: faligndata %f26, %f28, %f36
1494: inc BLOCK_SIZE, %o1
1495: faligndata %f28, %f30, %f38
1496: bleu,a,pn %icc, 2f
1497: ldda [%o0] ASI_BLK_P, %f0
1498: membar #Sync
1499: 2:
1500: faligndata %f30, %f48, %f40
1501: dec BLOCK_SIZE, %o2
1502: faligndata %f48, %f50, %f42
1503: inc BLOCK_SIZE, %o0
1504: faligndata %f50, %f52, %f44
1505: brlez,pn %o2, Lbcopy_blockdone
1506: faligndata %f52, %f54, %f46
1507: 
1508: stda %f32, [%o1] ASI_STORE
1509: 
1510: faligndata %f54, %f56, %f32
1511: cmp %o0, %o5
1512: faligndata %f56, %f58, %f34
1513: faligndata %f58, %f60, %f36
1514: inc BLOCK_SIZE, %o1
1515: faligndata %f60, %f62, %f38
1516: bleu,a,pn %icc, 2f
1517: ldda [%o0] ASI_BLK_P, %f16
1518: membar #Sync
1519: 2:
1520: faligndata %f62, %f0, %f40
1521: dec BLOCK_SIZE, %o2
1522: faligndata %f0, %f2, %f42
1523: inc BLOCK_SIZE, %o0
1524: faligndata %f2, %f4, %f44
1525: brlez,pn %o2, Lbcopy_blockdone
1526: faligndata %f4, %f6, %f46
1527: 
1528: stda %f32, [%o1] ASI_STORE
1529: ba 3b
1530: inc BLOCK_SIZE, %o1
1531: 
1531: 
1532: !!
1533: !! Source at BLOCK_ALIGN+40
1534: !!
1535: !! We need to load 3 doubles by hand.
1536: !!
!! Same pipeline shape as the other LNNN alignment cases; only the
!! starting FP register of the faligndata chain differs (%f8 here).
1537: L105:
1538: #ifdef RETURN_NAME
1539: sethi %hi(1f), %g1
1540: ba,pt %icc, 2f
1541: or %g1, %lo(1f), %g1
1542: 1:
1543: .asciz "L105"
1544: .align 8
1545: 2:
1546: #endif
1547: fmovd %f0, %f8
1548: ldd [%o0], %f10
1549: inc 8, %o0
1550: ldd [%o0], %f12
1551: inc 8, %o0
1552: ldd [%o0], %f14
1553: inc 8, %o0
1554: 
1555: cmp %o0, %o5
1556: bleu,a,pn %icc, 2f
1557: ldda [%o0] ASI_BLK_P, %f16
1558: membar #Sync
1559: 2:
1560: inc BLOCK_SIZE, %o0
1561: 3:
1562: faligndata %f8, %f10, %f32
1563: cmp %o0, %o5
1564: faligndata %f10, %f12, %f34
1565: faligndata %f12, %f14, %f36
1566: bleu,a,pn %icc, 2f
1567: ldda [%o0] ASI_BLK_P, %f48
1568: membar #Sync
1569: 2:
1570: faligndata %f14, %f16, %f38
1571: dec BLOCK_SIZE, %o2
1572: faligndata %f16, %f18, %f40
1573: inc BLOCK_SIZE, %o0
1574: faligndata %f18, %f20, %f42
1575: faligndata %f20, %f22, %f44
1576: brlez,pn %o2, Lbcopy_blockdone
1577: faligndata %f22, %f24, %f46
1578: 
1579: stda %f32, [%o1] ASI_STORE
1580: 
1581: faligndata %f24, %f26, %f32
1582: cmp %o0, %o5
1583: faligndata %f26, %f28, %f34
1584: dec BLOCK_SIZE, %o2
1585: faligndata %f28, %f30, %f36
1586: bleu,a,pn %icc, 2f
1587: ldda [%o0] ASI_BLK_P, %f0
1588: membar #Sync
1589: 2:
1590: faligndata %f30, %f48, %f38
1591: inc BLOCK_SIZE, %o1
1592: faligndata %f48, %f50, %f40
1593: inc BLOCK_SIZE, %o0
1594: faligndata %f50, %f52, %f42
1595: faligndata %f52, %f54, %f44
1596: brlez,pn %o2, Lbcopy_blockdone
1597: faligndata %f54, %f56, %f46
1598: 
1599: stda %f32, [%o1] ASI_STORE
1600: 
1601: faligndata %f56, %f58, %f32
1602: cmp %o0, %o5
1603: faligndata %f58, %f60, %f34
1604: dec BLOCK_SIZE, %o2
1605: faligndata %f60, %f62, %f36
1606: bleu,a,pn %icc, 2f
1607: ldda [%o0] ASI_BLK_P, %f16
1608: membar #Sync
1609: 2:
1610: faligndata %f62, %f0, %f38
1611: inc BLOCK_SIZE, %o1
1612: faligndata %f0, %f2, %f40
1613: inc BLOCK_SIZE, %o0
1614: faligndata %f2, %f4, %f42
1615: faligndata %f4, %f6, %f44
1616: brlez,pn %o2, Lbcopy_blockdone
1617: faligndata %f6, %f8, %f46
1618: 
1619: stda %f32, [%o1] ASI_STORE
1620: ba 3b
1621: inc BLOCK_SIZE, %o1
1622: 
1622: 
1623: 
1624: !!
1625: !! Source at BLOCK_ALIGN+48
1626: !!
1627: !! We need to load 2 doubles by hand.
1628: !!
!! Same pipeline shape as the other LNNN alignment cases; the
!! faligndata chain starts at %f10 here.
1629: L106:
1630: #ifdef RETURN_NAME
1631: sethi %hi(1f), %g1
1632: ba,pt %icc, 2f
1633: or %g1, %lo(1f), %g1
1634: 1:
1635: .asciz "L106"
1636: .align 8
1637: 2:
1638: #endif
1639: fmovd %f0, %f10
1640: ldd [%o0], %f12
1641: inc 8, %o0
1642: ldd [%o0], %f14
1643: inc 8, %o0
1644: 
1645: cmp %o0, %o5
1646: bleu,a,pn %icc, 2f
1647: ldda [%o0] ASI_BLK_P, %f16
1648: membar #Sync
1649: 2:
1650: inc BLOCK_SIZE, %o0
1651: 3:
1652: faligndata %f10, %f12, %f32
1653: cmp %o0, %o5
1654: faligndata %f12, %f14, %f34
1655: bleu,a,pn %icc, 2f
1656: ldda [%o0] ASI_BLK_P, %f48
1657: membar #Sync
1658: 2:
1659: faligndata %f14, %f16, %f36
1660: dec BLOCK_SIZE, %o2
1661: faligndata %f16, %f18, %f38
1662: inc BLOCK_SIZE, %o0
1663: faligndata %f18, %f20, %f40
1664: faligndata %f20, %f22, %f42
1665: faligndata %f22, %f24, %f44
1666: brlez,pn %o2, Lbcopy_blockdone
1667: faligndata %f24, %f26, %f46
1668: 
1669: stda %f32, [%o1] ASI_STORE
1670: 
1671: faligndata %f26, %f28, %f32
1672: cmp %o0, %o5
1673: faligndata %f28, %f30, %f34
1674: bleu,a,pn %icc, 2f
1675: ldda [%o0] ASI_BLK_P, %f0
1676: membar #Sync
1677: 2:
1678: faligndata %f30, %f48, %f36
1679: dec BLOCK_SIZE, %o2
1680: faligndata %f48, %f50, %f38
1681: inc BLOCK_SIZE, %o1
1682: faligndata %f50, %f52, %f40
1683: faligndata %f52, %f54, %f42
1684: inc BLOCK_SIZE, %o0
1685: faligndata %f54, %f56, %f44
1686: brlez,pn %o2, Lbcopy_blockdone
1687: faligndata %f56, %f58, %f46
1688: 
1689: stda %f32, [%o1] ASI_STORE
1690: 
1691: faligndata %f58, %f60, %f32
1692: cmp %o0, %o5
1693: faligndata %f60, %f62, %f34
1694: bleu,a,pn %icc, 2f
1695: ldda [%o0] ASI_BLK_P, %f16
1696: membar #Sync
1697: 2:
1698: faligndata %f62, %f0, %f36
1699: dec BLOCK_SIZE, %o2
1700: faligndata %f0, %f2, %f38
1701: inc BLOCK_SIZE, %o1
1702: faligndata %f2, %f4, %f40
1703: faligndata %f4, %f6, %f42
1704: inc BLOCK_SIZE, %o0
1705: faligndata %f6, %f8, %f44
1706: brlez,pn %o2, Lbcopy_blockdone
1707: faligndata %f8, %f10, %f46
1708: 
1709: stda %f32, [%o1] ASI_STORE
1710: ba 3b
1711: inc BLOCK_SIZE, %o1
1712: 
1713: 
1714: !!
1715: !! Source at BLOCK_ALIGN+56
1716: !!
1717: !! We need to load 1 double by hand.
1718: !!
!! Last alignment case; the faligndata chain starts at %f12.
1719: L107:
1720: #ifdef RETURN_NAME
1721: sethi %hi(1f), %g1
1722: ba,pt %icc, 2f
1723: or %g1, %lo(1f), %g1
1724: 1:
1725: .asciz "L107"
1726: .align 8
1727: 2:
1728: #endif
1729: fmovd %f0, %f12
1730: ldd [%o0], %f14
1731: inc 8, %o0
1732: 
1733: cmp %o0, %o5
1734: bleu,a,pn %icc, 2f
1735: ldda [%o0] ASI_BLK_P, %f16
1736: membar #Sync
1737: 2:
1738: inc BLOCK_SIZE, %o0
1739: 3:
1740: faligndata %f12, %f14, %f32
1741: cmp %o0, %o5
1742: bleu,a,pn %icc, 2f
1743: ldda [%o0] ASI_BLK_P, %f48
1744: membar #Sync
1745: 2:
1746: faligndata %f14, %f16, %f34
1747: dec BLOCK_SIZE, %o2
1748: faligndata %f16, %f18, %f36
1749: inc BLOCK_SIZE, %o0
1750: faligndata %f18, %f20, %f38
1751: faligndata %f20, %f22, %f40
1752: faligndata %f22, %f24, %f42
1753: faligndata %f24, %f26, %f44
1754: brlez,pn %o2, Lbcopy_blockdone
1755: faligndata %f26, %f28, %f46
1756: 
1757: stda %f32, [%o1] ASI_STORE
1758: 
1759: faligndata %f28, %f30, %f32
1760: cmp %o0, %o5
1761: bleu,a,pn %icc, 2f
1762: ldda [%o0] ASI_BLK_P, %f0
1763: membar #Sync
1764: 2:
1765: faligndata %f30, %f48, %f34
1766: dec BLOCK_SIZE, %o2
1767: faligndata %f48, %f50, %f36
1768: inc BLOCK_SIZE, %o1
1769: faligndata %f50, %f52, %f38
1770: faligndata %f52, %f54, %f40
1771: inc BLOCK_SIZE, %o0
1772: faligndata %f54, %f56, %f42
1773: faligndata %f56, %f58, %f44
1774: brlez,pn %o2, Lbcopy_blockdone
1775: faligndata %f58, %f60, %f46
1776: 
1777: stda %f32, [%o1] ASI_STORE
1778: 
1779: faligndata %f60, %f62, %f32
1780: cmp %o0, %o5
1781: bleu,a,pn %icc, 2f
1782: ldda [%o0] ASI_BLK_P, %f16
1783: membar #Sync
1784: 2:
1785: faligndata %f62, %f0, %f34
1786: dec BLOCK_SIZE, %o2
1787: faligndata %f0, %f2, %f36
1788: inc BLOCK_SIZE, %o1
1789: faligndata %f2, %f4, %f38
1790: faligndata %f4, %f6, %f40
1791: inc BLOCK_SIZE, %o0
1792: faligndata %f6, %f8, %f42
1793: faligndata %f8, %f10, %f44
1794: 
1795: brlez,pn %o2, Lbcopy_blockdone
1796: faligndata %f10, %f12, %f46
1797: 
1798: stda %f32, [%o1] ASI_STORE
1799: ba 3b
1800: inc BLOCK_SIZE, %o1
1801: 
!!
!! All block iterations done.  %o2 was overcommitted by one BLOCK_SIZE
!! in the loops above; re-add it, then drain outstanding loads before
!! touching the FP registers they target.
!!
1802: Lbcopy_blockdone:
1803: inc BLOCK_SIZE, %o2 ! Fixup our overcommit
1804: membar #Sync ! Finish any pending loads
!! Store the aligned doubles still sitting in %f32..%f46 eight bytes at
!! a time.  deccc/bl,a exits to Lbcopy_blockfinish as soon as fewer than
!! 8 bytes remain, parking the current double in %f48 via the annulled
!! fmovd in the delay slot so blockfinish can store its sub-word tail.
1805: #define FINISH_REG(f) \
1806: deccc 8, %o2; \
1807: bl,a Lbcopy_blockfinish; \
1808: fmovd f, %f48; \
1809: std f, [%o1]; \
1810: inc 8, %o1
1811: 
1812: FINISH_REG(%f32)
1813: FINISH_REG(%f34)
1814: FINISH_REG(%f36)
1815: FINISH_REG(%f38)
1816: FINISH_REG(%f40)
1817: FINISH_REG(%f42)
1818: FINISH_REG(%f44)
1819: FINISH_REG(%f46)
1820: FINISH_REG(%f48)
1821: #undef FINISH_REG
1822: !!
1823: !! The low 3 bits have the sub-word bits needed to be
1824: !! stored [because (x-8)&0x7 == x].
1825: !!
!! Flush the remaining 0-8 bytes of the final double (kept in %f48,
!! copied to %f4): a full double with std, then optionally a word (st),
!! a halfword (ASI_FL16_P partial store) and a byte (ASI_FL8_P partial
!! store), using alignaddr/faligndata to steer the wanted bytes into
!! the low end of %f8 first.
1826: Lbcopy_blockfinish:
1827: brz,pn %o2, 2f ! 100% complete?
1828: fmovd %f48, %f4
1829: cmp %o2, 8 ! Exactly 8 bytes?
1830: bz,a,pn %xcc, 2f
1831: std %f4, [%o1]
1832: 
1833: btst 4, %o2 ! Word store?
1834: bz %xcc, 1f
1835: nop
1836: st %f4, [%o1]
1837: inc 4, %o1
1838: 1:
1839: btst 2, %o2
1840: fzero %f0
1841: bz 1f
1842: 
1843: mov -6, %o4
1844: alignaddr %o1, %o4, %g0
1845: 
1846: faligndata %f0, %f4, %f8
1847: 
1848: stda %f8, [%o1] ASI_FL16_P ! Store short
1849: inc 2, %o1
1850: 1:
1851: btst 1, %o2 ! Byte aligned?
1852: bz 2f
1853: 
1854: mov -7, %o0 ! Calculate dest - 7
1855: alignaddr %o1, %o0, %g0 ! Calculate shift mask and dest.
1856: 
1857: faligndata %f0, %f4, %f8 ! Move 1st byte to low part of f8
1858: 
1859: stda %f8, [%o1] ASI_FL8_P ! Store 1st byte
1860: inc 1, %o1 ! Update address
1861: 2:
1862: membar #Sync
!! Compiled-out debugging aid: byte-compares src (%i0) and dst (%i1)
!! over the full length (%i2) and reports any mismatch via prom_printf,
!! also setting block_disable to turn the block path off.
1863: #if 0
1864: !!
1865: !! verify copy success.
1866: !!
1867: 
1868: mov %i0, %o2
1869: mov %i1, %o4
1870: mov %i2, %l4
1871: 0:
1872: ldub [%o2], %o1
1873: inc %o2
1874: ldub [%o4], %o3
1875: inc %o4
1876: cmp %o3, %o1
1877: bnz 1f
1878: dec %l4
1879: brnz %l4, 0b
1880: nop
1881: ba 2f
1882: nop
1883: 
1884: 1:
1885: set block_disable, %o0
1886: stx %o0, [%o0]
1887: 
1888: set 0f, %o0
1889: call prom_printf
1890: sub %i2, %l4, %o5
1891: set 1f, %o0
1892: mov %i0, %o1
1893: mov %i1, %o2
1894: call prom_printf
1895: mov %i2, %o3
1896: ta 1
1897: .data
1898: _ALIGN
1899: block_disable: .xword 0
1900: 0: .asciz "bcopy failed: %x@%p != %x@%p byte %d\r\n"
1901: 1: .asciz "bcopy(%p, %p, %lx)\r\n"
1902: _ALIGN
1903: .text
1904: 2:
1905: #endif
!! Kernel-only diagnostic epilogue: traces entry/exit of the block-copy
!! path via printf, restores FP state, and returns via ret/restore with
!! %g1 (the saved destination) in %o0.  Userland takes the retl path.
1906: #ifdef _KERNEL
1907: 
1908: set 1f, %o0
1909: mov %i0, %o1
1910: mov %i1, %o2
1911: call printf
1912: mov %i2, %o3
1913: 
1914: .data
1915: _ALIGN
1916: 1: .asciz "block exit (%p, %p, %d)\n"
1917: _ALIGN
1918: .text
1919: /*
1920: * We've saved our possible fpstate, now disable the fpu
1921: * and continue with life.
1922: */
1923: #if 1
1924: RESTORE_FPU
1925: #else
!! Dead (#else of #if 1) manual FPPROC/P_FPSTATE restore path, kept for
!! reference; RESTORE_FPU above is the live implementation.
1926: #ifdef DEBUG
1927: LDPTR [%l1 + %lo(FPPROC)], %l7
1928: cmp %l7, %l5
1929: ! tnz 1 ! fpproc has changed!
1930: LDPTR [%l5 + P_FPSTATE], %l7
1931: cmp %l7, %l0
1932: tnz 1 ! fpstate has changed!
1933: #endif
1934: andcc %l2, %l3, %g0 ! If (fpproc && fpstate)
1935: STPTR %l2, [%l1 + %lo(FPPROC)] ! Restore old fproc
1936: bz,pt %xcc, 1f ! Skip if no fpstate
1937: STPTR %l6, [%l5 + P_FPSTATE] ! Restore old fpstate
1938: 
1939: call _C_LABEL(loadfpstate) ! Re-load orig fpstate
1940: mov %l3, %o0
1941: 1:
1942: #endif
1943: set 1f, %o0
1944: mov %i0, %o1
1945: mov %i1, %o2
1946: call printf
1947: mov %i2, %o3
1948: 
1949: .data
1950: _ALIGN
1951: 1: .asciz "block done (%p, %p, %d)\n"
1952: _ALIGN
1953: .text
1954: 
1955: 
1956: ret
1957: restore %g1, 0, %o0 ! Return DEST for memcpy
1958: #endif
!! Userland return: leaf return with the saved DEST (%g1) as the result.
1959: retl
1960: mov %g1, %o0
1961: #endif
1962:
1963:
CVSweb <webmaster@jp.NetBSD.org>