; newlib/libc/machine/h8300/memcpy.S - native_client/nacl-newlib - Git at Google

 #include "setarch.h"

 #include "defines.h"

 #ifdef __H8300SX__

 	; memcpy for H8SX (this half of the file is assembled only when
 	; __H8300SX__ is defined).
 	;
 	; Register use, as read from the code below:
 	;   er0      destination pointer on entry.  Nothing below writes it,
 	;            so it still holds the destination on return.
 	;   er1      source pointer on entry.
 	;   LEN(r2)  byte count.  LEN() is not defined in this file; presumably
 	;            it comes from defines.h and selects r2 or er2 -- confirm.
 	;   er5/er6  running source/destination pointers used by movmd.
 	;   (e)r4    transfer count used by movmd.
 	; er4-er6 are pushed on entry and restored by every "rts/l" exit.
 	.global _memcpy
 _memcpy:
 	stm.l	er4-er6,@-er7

 	; Set up source and destination pointers for movmd.
 	mov.l	er0,er6		; er6 = destination
 	mov.l	er1,er5		; er5 = source

 	; See whether the copy is long enough to use the movmd.l code.
 	; Although the code can handle anything longer than 6 bytes,
 	; it can be more expensive than movmd.b for small moves.
 	; It's better to use a higher threshold to account for this.
 	;
 	; Note that the exact overhead of the movmd.l checks depends on
 	; the alignments of the length and pointers.  They are faster when
 	; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values
 	; are 0.  This threshold is a compromise between the various cases.
 	cmp	#16,LEN(r2)
 	blo	simple		; fewer than 16 bytes: bytewise copy

 	; movmd.l only works for even addresses.  If one of the addresses
 	; is odd and the other is not, fall back on a simple move.
 	bld	#0,r5l		; C = bit 0 of the source pointer
 	bxor	#0,r6l		; C ^= bit 0 of the destination pointer
 	bcs	simple		; parities differ: bytewise copy

 	; Make the addresses even.  Both pointers have the same parity at
 	; this point, so testing the source alone is enough.
 	bld	#0,r5l
 	bcc	word_aligned
 	mov.b	@er5+,@er6+	; copy one byte; both pointers become even
 	sub	#1,LEN(r2)

 word_aligned:
 	; See if copying one word would make the first operand longword
 	; aligned.  Although this is only really worthwhile if it aligns
 	; the second operand as well, it's no worse if it doesn't, so it
 	; hardly seems worth the overhead of a "band" check.
 	bld	#1,r6l		; C = bit 1 of the destination pointer
 	bcc	fast_copy
 	mov.w	@er5+,@er6+	; copy one word
 	sub	#2,LEN(r2)

 fast_copy:
 	; Set (e)r4 to the number of longwords to copy.
 	mov	LEN(r2),LEN(r4)
 	shlr	#2,LEN(r4)	; remaining bytes / 4

 #ifdef __NORMAL_MODE__
 	; 16-bit pointers and size_ts: one movmd.l is enough.  This code
 	; is never reached with r4 == 0.
 	movmd.l
 	and.w	#3,r2		; r2 = bytes left over after the longwords
 simple:
 	; Bytewise copy of the r2 bytes still to do.  Reached by falling
 	; through from the longword copy above and by the two branches to
 	; "simple" taken for short or mutually misaligned copies.
 	mov.w	r2,r4		; also sets Z when there is nothing to copy
 	beq	quit
 	movmd.b
 quit:
 	rts/l	er4-er6
 #else
 	; Skip the first iteration if the number of longwords is divisible
 	; by 0x10000.
 	mov.w	r4,r4		; test the low 16 bits of the longword count
 	beq	fast_loop_next

 	; This loop copies r4 (!= 0) longwords the first time round and 65536
 	; longwords on each iteration after that.
 fast_loop:
 	movmd.l
 fast_loop_next:
 	; e4 (upper half of er4) counts the 65536-longword chunks still
 	; to do; a borrow here means there were none left.
 	sub.w	#1,e4
 	bhs	fast_loop

 	; Mop up any left-over bytes.  We could just fall through to the
 	; simple code after the "and" but the version below is quicker
 	; and only takes 10 more bytes.
 	and.w	#3,r2		; r2 = bytes left over after the longwords
 	beq	quit
 	mov.w	r2,r4
 	movmd.b
 quit:
 	rts/l	er4-er6

 simple:
 	; Simple bytewise copy.  We need to handle all lengths, including zero.
 	; The first pass moves the low 16 bits of the length (skipped when
 	; they are zero); each later pass moves another 65536 bytes, with
 	; e2 (upper half of er2) counting those passes.
 	mov.w	r2,r4
 	beq	simple_loop_next
 simple_loop:
 	movmd.b
 simple_loop_next:
 	sub.w	#1,e2
 	bhs	simple_loop	; no borrow: another 65536-byte pass is due
 	rts/l	er4-er6
 #endif

 #else

 	; memcpy for non-H8SX builds (the #else half of the __H8300SX__
 	; test).  The copy runs backwards, from the end of the buffers
 	; towards the start, a word at a time when possible and a byte at
 	; a time otherwise.
 	;
 	; A0P-A3P, A0L-A3L, A2 and the MOVP/ADDP/CMPP macros are not defined
 	; in this file; presumably they come from defines.h -- confirm there.
 	;   A0P  dst on entry; moved to the end of dst, then stepped back
 	;        down to dst, which is the value left in it on return
 	;   A1P  src on entry; moved to the end of src, then stepped down
 	;   A2P  len on entry; later reused as scratch for the data moved
 	;   A3P  copy of the original dst, used as the loop end marker
 	.global _memcpy
 _memcpy:
 	; The three loads below are commented out: the code takes dst, src
 	; and len straight from A0P, A1P and A2P instead of loading them
 	; through r7.
 ;	MOVP	@(2/4,r7),A0P	; dst
 ;	MOVP	@(4/8,r7),A1P	; src
 ;	MOVP	@(6/12,r7),A2P	; len

 	MOVP	A0P,A3P	; keep copy of final dst
 	ADDP	A2P,A0P	; point to end of dst
 	CMPP	A0P,A3P	; see if anything to do
 	beq	quit	; end == start, i.e. len is zero

 	ADDP	A2P,A1P	; point to end of src

 	; lets see if we can do this in words: OR the low bytes of the
 	; length and of every address involved into A2L, then test bit 0.
 	or	A0L,A2L	; length | end of dst
 	or	A3L,A2L	; ... | start of dst
 	or	A1L,A2L	; ... | end of src
 	btst	#0,A2L	; see if the lsb is zero
 	bne	byteloop	; something is odd: copy bytes

 wordloop:
 	; Step src back by one word by hand; dst is stepped back by the
 	; pre-decrement store below.
 #ifdef __NORMAL_MODE__
 	sub	#2,A1P
 #else
 	subs	#2,A1P		; point to word
 #endif
 	mov.w	@A1P,A2		; get word
 	mov.w	A2,@-A0P	; save word
 	CMPP	A0P,A3P		; at the front again ?
 	bne 	wordloop
 	rts			; A0P is back at dst here

 byteloop:
 	; Same scheme as wordloop, one byte per pass.
 #ifdef __NORMAL_MODE__
 	sub	#1,A1P
 #else
 	subs	#1,A1P		; point to byte
 #endif
 	mov.b	@A1P,A2L	; get byte
 	mov.b	A2L,@-A0P	; save byte
 	CMPP	A0P,A3P 	; at the front again ?
 	bne 	byteloop

 	; return with A0 pointing to dst
 quit:	rts

 #endif
	#include "setarch.h"

	#include "defines.h"

	#ifdef __H8300SX__

	; memcpy for H8SX (assembled only when __H8300SX__ is defined).
	;
	; NOTE(review): the same _memcpy, under the same __H8300SX__ test,
	; already appears earlier in this file.  Two definitions of one
	; label would be expected to clash at assembly time -- confirm
	; whether this second copy is extraction residue.
	;
	; Register use, as read from the code below:
	;   er0      destination pointer on entry. Nothing below writes it,
	;            so it still holds the destination on return.
	;   er1      source pointer on entry.
	;   LEN(r2)  byte count. LEN() is not defined in this file; presumably
	;            it comes from defines.h and selects r2 or er2 -- confirm.
	;   er5/er6  running source/destination pointers used by movmd.
	;   (e)r4    transfer count used by movmd.
	; er4-er6 are pushed on entry and restored by every "rts/l" exit.
	.global _memcpy
	_memcpy:
	stm.l er4-er6,@-er7

	; Set up source and destination pointers for movmd.
	mov.l er0,er6 ; er6 = destination
	mov.l er1,er5 ; er5 = source

	; See whether the copy is long enough to use the movmd.l code.
	; Although the code can handle anything longer than 6 bytes,
	; it can be more expensive than movmd.b for small moves.
	; It's better to use a higher threshold to account for this.
	;
	; Note that the exact overhead of the movmd.l checks depends on
	; the alignments of the length and pointers. They are faster when
	; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values
	; are 0. This threshold is a compromise between the various cases.
	cmp #16,LEN(r2)
	blo simple ; fewer than 16 bytes: bytewise copy

	; movmd.l only works for even addresses. If one of the addresses
	; is odd and the other is not, fall back on a simple move.
	bld #0,r5l ; C = bit 0 of the source pointer
	bxor #0,r6l ; C ^= bit 0 of the destination pointer
	bcs simple ; parities differ: bytewise copy

	; Make the addresses even. Both pointers have the same parity at
	; this point, so testing the source alone is enough.
	bld #0,r5l
	bcc word_aligned
	mov.b @er5+,@er6+ ; copy one byte; both pointers become even
	sub #1,LEN(r2)

	word_aligned:
	; See if copying one word would make the first operand longword
	; aligned. Although this is only really worthwhile if it aligns
	; the second operand as well, it's no worse if it doesn't, so it
	; hardly seems worth the overhead of a "band" check.
	bld #1,r6l ; C = bit 1 of the destination pointer
	bcc fast_copy
	mov.w @er5+,@er6+ ; copy one word
	sub #2,LEN(r2)

	fast_copy:
	; Set (e)r4 to the number of longwords to copy.
	mov LEN(r2),LEN(r4)
	shlr #2,LEN(r4) ; remaining bytes / 4

	#ifdef __NORMAL_MODE__
	; 16-bit pointers and size_ts: one movmd.l is enough. This code
	; is never reached with r4 == 0.
	movmd.l
	and.w #3,r2 ; r2 = bytes left over after the longwords
	simple:
	; Bytewise copy of the r2 bytes still to do. Reached by falling
	; through from the longword copy above and by the two branches to
	; "simple" taken for short or mutually misaligned copies.
	mov.w r2,r4 ; also sets Z when there is nothing to copy
	beq quit
	movmd.b
	quit:
	rts/l er4-er6
	#else
	; Skip the first iteration if the number of longwords is divisible
	; by 0x10000.
	mov.w r4,r4 ; test the low 16 bits of the longword count
	beq fast_loop_next

	; This loop copies r4 (!= 0) longwords the first time round and 65536
	; longwords on each iteration after that.
	fast_loop:
	movmd.l
	fast_loop_next:
	; e4 (upper half of er4) counts the 65536-longword chunks still
	; to do; a borrow here means there were none left.
	sub.w #1,e4
	bhs fast_loop

	; Mop up any left-over bytes. We could just fall through to the
	; simple code after the "and" but the version below is quicker
	; and only takes 10 more bytes.
	and.w #3,r2 ; r2 = bytes left over after the longwords
	beq quit
	mov.w r2,r4
	movmd.b
	quit:
	rts/l er4-er6

	simple:
	; Simple bytewise copy. We need to handle all lengths, including zero.
	; The first pass moves the low 16 bits of the length (skipped when
	; they are zero); each later pass moves another 65536 bytes, with
	; e2 (upper half of er2) counting those passes.
	mov.w r2,r4
	beq simple_loop_next
	simple_loop:
	movmd.b
	simple_loop_next:
	sub.w #1,e2
	bhs simple_loop ; no borrow: another 65536-byte pass is due
	rts/l er4-er6
	#endif
	#endif

	#else

	; memcpy for non-H8SX builds (the #else half of the __H8300SX__
	; test). The copy runs backwards, from the end of the buffers
	; towards the start, a word at a time when possible and a byte at
	; a time otherwise.
	;
	; NOTE(review): the same _memcpy, under the same preprocessor
	; condition, already appears earlier in this file. Two definitions
	; of one label would be expected to clash at assembly time --
	; confirm whether this second copy is extraction residue.
	;
	; A0P-A3P, A0L-A3L, A2 and the MOVP/ADDP/CMPP macros are not defined
	; in this file; presumably they come from defines.h -- confirm there.
	;   A0P  dst on entry; moved to the end of dst, then stepped back
	;        down to dst, which is the value left in it on return
	;   A1P  src on entry; moved to the end of src, then stepped down
	;   A2P  len on entry; later reused as scratch for the data moved
	;   A3P  copy of the original dst, used as the loop end marker
	.global _memcpy
	_memcpy:
	; The three loads below are commented out: the code takes dst, src
	; and len straight from A0P, A1P and A2P instead of loading them
	; through r7.
	; MOVP @(2/4,r7),A0P ; dst
	; MOVP @(4/8,r7),A1P ; src
	; MOVP @(6/12,r7),A2P ; len

	MOVP A0P,A3P ; keep copy of final dst
	ADDP A2P,A0P ; point to end of dst
	CMPP A0P,A3P ; see if anything to do
	beq quit ; end == start, i.e. len is zero

	ADDP A2P,A1P ; point to end of src

	; lets see if we can do this in words: OR the low bytes of the
	; length and of every address involved into A2L, then test bit 0.
	or A0L,A2L ; length | end of dst
	or A3L,A2L ; ... | start of dst
	or A1L,A2L ; ... | end of src
	btst #0,A2L ; see if the lsb is zero
	bne byteloop ; something is odd: copy bytes

	wordloop:
	; Step src back by one word by hand; dst is stepped back by the
	; pre-decrement store below.
	#ifdef __NORMAL_MODE__
	sub #2,A1P
	#else
	subs #2,A1P ; point to word
	#endif
	mov.w @A1P,A2 ; get word
	mov.w A2,@-A0P ; save word
	CMPP A0P,A3P ; at the front again ?
	bne wordloop
	rts ; A0P is back at dst here

	byteloop:
	; Same scheme as wordloop, one byte per pass.
	#ifdef __NORMAL_MODE__
	sub #1,A1P
	#else
	subs #1,A1P ; point to byte
	#endif
	mov.b @A1P,A2L ; get byte
	mov.b A2L,@-A0P ; save byte
	CMPP A0P,A3P ; at the front again ?
	bne byteloop

	; return with A0 pointing to dst
	quit: rts

	#endif