// src/arch/xtensa/up/hal/memcopy.S - chromiumos/third_party/sound-open-firmware - Git at Google

 /*
  *  Core HAL library functions xthal_memcpy and xthal_bcopy
  */

 /*
  * Copyright (c) 2003, 2006, 2010 Tensilica Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
  * without limitation the rights to use, copy, modify, merge, publish,
  * distribute, sublicense, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */

 #include <xtensa/coreasm.h>


 /*
  * BL(be, le) selects its first argument when assembling for a big-endian
  * core (__XTENSA_EB__ defined) and its second argument otherwise.  The
  * macros below use it so the copy code can be written once in terms of
  * "early" (lower-addressed) and "late" (higher-addressed) bytes.
  */
 #ifdef __XTENSA_EB__
 # define BL(be,le)	be
 #else
 # define BL(be,le)	le
 #endif

 	// srcel dst, first, second
 	//   Combine the early word \first with the late word \second and
 	//   shift the pair into \dst (uses the current shift amount).
 	.macro	srcel	dst, first, second
 	src	\dst, BL(\first,\second), BL(\second,\first)
 	.endm

 	// ssa8f reg
 	//   Set the shift amount for a shift *from* the 2-bit alignment in \reg.
 	.macro	ssa8f	reg
 	BL(ssa8b,ssa8l)	\reg
 	.endm

 	// ssa8t reg
 	//   Set the shift amount for a shift *to* the 2-bit alignment in \reg
 	//   (the reverse of ssa8f).
 	.macro	ssa8t	reg
 	BL(ssa8l,ssa8b)	\reg
 	.endm

 	// s2ll res, val
 	//   Shift-to-later logical: move \val away from the zero-addressed
 	//   byte, result in \res.
 	.macro	s2ll	res, val
 	BL(srl,sll)	\res, \val
 	.endm

 	// s2el res, val
 	//   Shift-to-early logical: move \val towards the zero-addressed
 	//   byte, result in \res.
 	.macro	s2el	res, val
 	BL(sll,srl)	\res, \val
 	.endm

 /*
  * void *xthal_memcpy(void *dst, const void *src, size_t len);
  * void *xthal_bcopy(const void *src, void *dst, size_t len);
  *
  * This function is intended to do the same thing as the standard
  * library function memcpy() (or bcopy()) for most cases.
  * However, it uses strictly 32-bit load and store instructions
  * to copy data.  This ensures this function will work
  * where the source and/or destination references an
  * instruction RAM or ROM, which can only be accessed
  * using l32i (IRAM+IROM) and s32i (IRAM).
  *
  * The bcopy version is provided here to avoid the overhead
  * of an extra call, for callers that require this convention.
  *
  * The (general case) algorithm is as follows:
  *   If destination is unaligned, align it by copying 1 to 3 bytes.
  *   If source is aligned,
  *     do 16 bytes with a loop, and then finish up with
  *     8, 4, and 0-3 byte copies conditional on the length;
  *   else (if source is unaligned),
  *     do the same, but use SRC to align the source data.
  *   This code tries to use fall-through branches for the common
  *     case of aligned source and destination and multiple
  *     of 4 length.
  *
  * Register use:
  *	a0/ return address
  *	a1/ stack pointer
  *	a2/ return value
  *	a3/ src
  *	a4/ length
  *	a5/ dst
  *	a6/ tmp
  *	a7/ tmp
  *	a8/ tmp
  *	a9/ tmp
  *	a10/ tmp
  *	a11/ tmp
  *	a12/ tmp
  */

 /* xthal_bcopy and xthal_memcpy need to allocate the same stack size
  * on entry since they share the same function-return code.  Also,
  * there is more than one return point. */

 #define SAVE_A0  0
 #define SAVE_A3  4
 #define SAVE_A4  8
 #define SAVE_A5  12
 #define SAVE_A12 16
 #define STKSIZE  32


 	.text
 	.align	4
 	.global	xthal_bcopy
 	.type	xthal_bcopy,@function
 xthal_bcopy:
 	// bcopy() argument order on entry: a2 = src, a3 = dst, a4 = len.
 	// The frame set up here must match xthal_memcpy's, because both
 	// entry points leave through the same return sequences.
 #ifndef __XTENSA_CALL0_ABI__
 	entry	sp, STKSIZE	// windowed ABI; allow for call8 made later
 #else
 	addi    sp, sp, -STKSIZE
 	s32i    a12, a1, SAVE_A12	// a12 is used as scratch by the shared code
 #endif
 	// Rearrange into the memcpy() layout expected at .Lcommon:
 	// a2 = dst (return value), a3 = src, a5 = dst (working pointer).
 	mov	a5, a3
 	mov	a3, a2
 	mov	a2, a5
 	j	.Lcommon	// continue in the code shared with xthal_memcpy

 	.size	xthal_bcopy, . - xthal_bcopy


 /*
  * Destination is unaligned
  */

 	.align	4
 xthal_memcpy.prefixcode:	// purely for purpose of .size
 .Ldstunaligned:
 	/*
 	 * Reached from .Lcommon when dst is not word aligned.
 	 * On entry: a2 = a5 = dst, a3 = src, a4 = len (>= 4),
 	 *           a6 = dst & 3 (non-zero).
 	 * Copies the 1 to 3 leading bytes with xthal_copy123 so that dst
 	 * becomes word aligned, then advances a5/a3, shrinks a4 and
 	 * rejoins the main path at .Ldstaligned.
 	 */
 	mov	a10, a5		// windowed ABI outgoing arg1 = dst
 	mov	a11, a3		// windowed ABI outgoing arg2 = src
 	movi	a12, 4
 	sub	a6, a12, a6	// number of bytes to copy for dst alignment
 	mov	a12, a6		// windowed ABI arg3 = count; call0: keeps count across the call
 #ifdef __XTENSA_CALL0_ABI__
 	s32i	a0, a1, SAVE_A0	// preserve live registers
 	s32i	a3, a1, SAVE_A3
 	s32i	a4, a1, SAVE_A4
 	s32i	a5, a1, SAVE_A5
 	// call0 ABI passes arguments in a2/a3/a4.  a2 (dst) and a3 (src) are
 	// already in place; a4 still holds the total length, so replace it
 	// with the 1..3 byte alignment count that xthal_copy123 is specified
 	// for.  The real length is reloaded from SAVE_A4 below.
 	mov	a4, a6
 	call0	xthal_copy123
 	l32i	a0, a1, SAVE_A0	// restore live registers
 	l32i	a3, a1, SAVE_A3
 	l32i	a4, a1, SAVE_A4
 	l32i	a5, a1, SAVE_A5
 	mov	a6, a12		// restore a6 from callee-saved register
 #else
 	call8	xthal_copy123
 #endif
 	add	a5, a5, a6	// dst is now word aligned
 	add	a3, a3, a6	// src advanced by the same amount
 	sub	a4, a4, a6	// bytes still to copy
 	j	.Ldstaligned

 	//  Not sure how else to count code that precedes a function, in .size:
 	.size	xthal_memcpy.prefixcode, . - xthal_memcpy.prefixcode


 	.align	4
 	.global	xthal_memcpy
 	.type	xthal_memcpy,@function
 xthal_memcpy:
 #ifdef __XTENSA_CALL0_ABI__
 	addi    sp, sp, -STKSIZE
 	s32i    a12, a1, SAVE_A12
 #else
 	entry	sp, 32		// allow for call8 below
 #endif
 	// a2=dst, a3=src, a4=len
 	mov	a5, a2			// copy dst so that a2 is return value
 .Lcommon:
 	// Shared by xthal_memcpy and xthal_bcopy.
 	// Here: a2 = a5 = dst, a3 = src, a4 = len.
 	// Copies shorter than 4 bytes are handled entirely by xthal_copy123.
 #ifdef __XTENSA_CALL0_ABI__
 	/*
 	 * xthal_copy123 is called rather than jumped to, because this
 	 * frame still has to be released (and a12 reloaded) before returning.
 	 */
 	_bgeui	a4, 4, 1f
 	mov	a12, a0		// preserve return address
 	call0	xthal_copy123
 	mov	a0, a12		// restore return address
 	l32i    a12, a1, SAVE_A12
 	addi    sp, sp, STKSIZE
 	ret
 1:
 #else
 	// Windowed ABI: continue inside xthal_copy123 (after its entry
 	// sequence) and return from there.
 	bltui	a4, 4, xthal_copy123_pastentry	// NOTE: sometimes relaxes
 #endif

 	extui	a6, a2, 0, 2		// destination unalignment offset
 	bnez	a6, .Ldstunaligned	// align the destination
 .Ldstaligned:				// return here once dst is aligned
 	srli	a7, a4, 4		// number of loop iterations of 16-bytes each
 	extui	a11, a3, 0, 2		// source unalignment offset
 	_bnez	a11, .Lsrcunaligned	// if source not aligned, use shifting copy
 	/*
 	 * Destination and source are 32-bit aligned, use 32-bit copy.
 	 */
 #if XCHAL_HAVE_LOOPS
 	loopnez	a7, .Loop1done
 #else /* !XCHAL_HAVE_LOOPS */
 	beqz	a7, .Loop1done
 	slli	a8, a7, 4
 	add	a8, a8, a3		// a8 = end of last 16B source chunk
 #endif /* !XCHAL_HAVE_LOOPS */
 .Loop1:
 	// 16 bytes per iteration; loads and stores are interleaved.
 	l32i	a6, a3,  0
 	l32i	a7, a3,  4
 	s32i	a6, a5,  0
 	l32i	a6, a3,  8
 	s32i	a7, a5,  4
 	l32i	a7, a3, 12
 	s32i	a6, a5,  8
 	addi	a3, a3, 16
 	s32i	a7, a5, 12
 	addi	a5, a5, 16
 #if !XCHAL_HAVE_LOOPS
 	// a3 advances in steps of 16 and a8 = start + 16*n, so the loop ends
 	// exactly when they are equal.  An equality test is used instead of
 	// a signed "blt": addresses are unsigned, and a signed compare would
 	// leave the loop early for a buffer that straddles 0x80000000.
 	bne	a3, a8, .Loop1
 #endif /* !XCHAL_HAVE_LOOPS */
 .Loop1done:
 	// Bits 3 and 2 of the length select the 8-byte and 4-byte remainders.
 	bbci.l	a4, 3, .L2
 	// copy 8 bytes
 	l32i	a6, a3,  0
 	l32i	a7, a3,  4
 	addi	a3, a3,  8
 	s32i	a6, a5,  0
 	s32i	a7, a5,  4
 	addi	a5, a5,  8
 .L2:
 	bbci.l	a4, 2, .L3
 	// copy 4 bytes
 	l32i	a6, a3,  0
 	addi	a3, a3,  4
 	s32i	a6, a5,  0
 	addi	a5, a5,  4
 .L3:
 	//  Copy last 0 to 3 bytes using 32-bit accesses (aligned source and destination):
 	extui	a4, a4, 0, 2	// any bytes to copy?
 	beqz	a4, 1f		// if not, skip this to avoid extraneous loads/stores
 	l32i	a6, a3, 0	// get source word
 	l32i	a7, a5, 0	// get destination word
 	ssa8f	a4		// shift from length (end of source)
 	s2ll	a6, a6		// align source to last byte
 	s2el	a7, a7		// align parts of a7 following modified bytes, to early byte
 	ssa8t	a4		// shift to end of modified destination (length)
 	srcel	a7, a6, a7	// combine source with late-dst to form last word
 	s32i	a7, a5, 0	// update last word
 1:

 	// Return; a2 still holds the original dst.
 #ifdef __XTENSA_CALL0_ABI__
 	l32i    a12, a1, SAVE_A12
 	addi    sp, sp, STKSIZE
 	ret
 #else
 	retw
 #endif

 	.size	xthal_memcpy, . - xthal_memcpy


 	//  void xthal_copy123(dst, src, len);
 	//
 	//  Copy from 0 to 3 bytes, using only 32-bit loads and stores,
 	//  with arbitrarily aligned source and destination.
 	//
 	// arg1 = a2 = dst
 	// arg2 = a3 = src
 	// arg3 = a4 = len
 	//
 	//  a2 and a4 are not modified.  a3, a5, a6, a7, a8 and a10 are
 	//  overwritten, as is the shift-amount register.
 	//  When len is non-zero, the aligned source word and the word after
 	//  it are both loaded, whether or not the source spans both.

 	.global	xthal_copy123
 	.type	xthal_copy123,@function
 	.align	4
 xthal_copy123:
 	abi_entry

 xthal_copy123_pastentry:
 	_beqz	a4, cdone	// don't load or store if zero bytes
 	//  First get the bytes:
 	movi	a5, ~3
 	and	a5, a3, a5	// align src address
 	l32i	a6, a5, 0
 	l32i	a7, a5, 4
 	ssa8f	a3
 	srcel	a3, a6, a7
 	// a3 now contains source bytes, aligned to 1st byte (memory order)
 	// (source address is no longer needed at this point)

 	//  Does destination span two words?:
 	extui	a10, a2, 0, 2	// destination alignment
 	sub	a5, a2, a10	// align destination address
 	l32i	a8, a5, 0	// get first destination word regardless
 	add	a6, a10, a4	// dst_align + len
 	ssa8f	a2		// shift from dst_align (to 1st or last byte)
 	s2ll	a10, a8		// a10 = first part of destination, aligned to last byte
 	bltui	a6, 4, oneword	// branch if destination contained in single word

 	//  Two-word destination case:
 	l32i	a8, a5, 4	// get second word
 	ssa8t	a2		// shift to dst_align
 	srcel	a10, a10, a3	// with a10 in early bytes, a3 in later bytes
 	s32i	a10, a5, 0	// update first word
 	addi	a5, a5, 4	// advance to last word for common code below
 	//movi	a10, 0		// not needed, gets dropped

 oneword:
 	//  One-word (and two-word) destination case:
 	//	a8 =  contents of last destination word
 	//	a10 = early part of a8 preceding modified bytes, shifted towards last byte
 	//
 	ssa8f	a4		// shift from length (end of source)
 	srcel	a3, a10, a3	// combine early-destination with source, aligned to last byte

 	ssa8f	a6		// shift from end of modified destination (dst_align+len)
 	s2el	a8, a8		// align parts of a8 following modified bytes, to early byte
 	ssa8t	a6		// shift to end of modified destination (dst_align+len)
 	srcel	a8, a3, a8	// combine early-dst+source with late-dst to form last word
 	s32i	a8, a5, 0	// update last word
 cdone:	abi_return		// return dst

 /*
  * Destination is aligned, Source is unaligned
  *
  * Continuation of xthal_memcpy/xthal_bcopy (entered from .Ldstaligned),
  * placed here after xthal_copy123.
  * On entry: a3 = src (not word aligned), a5 = dst (word aligned),
  *           a4 = bytes to copy, a7 = a4 / 16.
  * a6 always carries the most recently loaded source word that has not
  * been fully consumed yet.
  */

 	.align	4
 .Lsrcunaligned:
 	// Copy 16 bytes per iteration for word-aligned dst and unaligned src
 	ssa8f	a3		// set shift amount from byte offset
 #define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS (simulator) with the
 					   lint or ferret client, or 0 to save a few cycles */
 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
 	extui	a11, a3, 0, 2	// save unalignment offset for below
 	sub	a3, a3, a11	// align a3
 #endif
 	l32i	a6, a3, 0	// load first word
 #if XCHAL_HAVE_LOOPS
 	loopnez	a7, .Loop2done
 #else /* !XCHAL_HAVE_LOOPS */
 	beqz	a7, .Loop2done
 	slli	a10, a7, 4
 	add	a10, a10, a3	// a10 = end of last 16B source chunk
 #endif /* !XCHAL_HAVE_LOOPS */
 .Loop2:
 	l32i	a7, a3,  4
 	l32i	a8, a3,  8
 	srcel	a6, a6, a7
 	s32i	a6, a5,  0
 	l32i	a9, a3, 12
 	srcel	a7, a7, a8
 	s32i	a7, a5,  4
 	l32i	a6, a3, 16
 	srcel	a8, a8, a9
 	s32i	a8, a5,  8
 	addi	a3, a3, 16
 	srcel	a9, a9, a6
 	s32i	a9, a5, 12
 	addi	a5, a5, 16
 #if !XCHAL_HAVE_LOOPS
 	// a3 advances in steps of 16 and a10 = start + 16*n, so test for
 	// equality.  A signed "blt" on addresses would leave the loop early
 	// for a buffer that straddles 0x80000000.
 	bne	a3, a10, .Loop2
 #endif /* !XCHAL_HAVE_LOOPS */
 .Loop2done:
 	bbci.l	a4, 3, .L12
 	// copy 8 bytes
 	l32i	a7, a3,  4
 	l32i	a8, a3,  8
 	srcel	a6, a6, a7
 	s32i	a6, a5,  0
 	addi	a3, a3,  8
 	srcel	a7, a7, a8
 	s32i	a7, a5,  4
 	addi	a5, a5,  8
 	mov	a6, a8		// carry the partly consumed word forward
 .L12:
 	bbci.l	a4, 2, .L13
 	// copy 4 bytes
 	l32i	a7, a3,  4
 	addi	a3, a3,  4
 	srcel	a6, a6, a7
 	s32i	a6, a5,  0
 	addi	a5, a5,  4
 	mov	a6, a7		// carry the partly consumed word forward
 .L13:
 	//  Copy last 0 to 3 bytes using 32-bit accesses (shifting source, aligned destination):
 	extui	a4, a4, 0, 2	// any bytes to copy?
 	beqz	a4, .Ldone	// if not, skip this to avoid extraneous loads/stores
 				// (same guard as the aligned-source path)
 	l32i	a7, a3, 4	// get source word
 	l32i	a3, a5, 0	// get destination word
 	srcel	a6, a6, a7	// source bytes, aligned to early (1st) byte
 	ssa8f	a4		// shift from length (end of source)
 	s2ll	a6, a6		// align source to last byte
 	s2el	a3, a3		// align parts of a3 following modified bytes, to early byte
 	ssa8t	a4		// shift to end of modified destination (length)
 	srcel	a3, a6, a3	// combine source with late-dst to form last word
 	s32i	a3, a5, 0	// update last word
 .Ldone:
 	// Return through the frame set up by xthal_memcpy/xthal_bcopy.
 #ifdef __XTENSA_CALL0_ABI__
 	l32i    a12, a1, SAVE_A12
 	addi    sp, sp, STKSIZE
 	ret
 #else
 	retw
 #endif

 	.size	xthal_copy123, . - xthal_copy123
	/*
	* Core HAL library functions xthal_memcpy and xthal_bcopy
	*/

	/*
	* Copyright (c) 2003, 2006, 2010 Tensilica Inc.
	*
	* Permission is hereby granted, free of charge, to any person obtaining
	* a copy of this software and associated documentation files (the
	* "Software"), to deal in the Software without restriction, including
	* without limitation the rights to use, copy, modify, merge, publish,
	* distribute, sublicense, and/or sell copies of the Software, and to
	* permit persons to whom the Software is furnished to do so, subject to
	* the following conditions:
	*
	* The above copyright notice and this permission notice shall be included
	* in all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
	* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
	* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
	* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	*/

	#include <xtensa/coreasm.h>


	/*
	* BL(be, le) selects its first argument when assembling for a big-endian
	* core (__XTENSA_EB__ defined) and its second argument otherwise. The
	* macros below use it so the copy code can be written once in terms of
	* "early" (lower-addressed) and "late" (higher-addressed) bytes.
	*/
	#ifdef __XTENSA_EB__
	# define BL(be,le) be
	#else
	# define BL(be,le) le
	#endif

	// srcel dst, first, second
	//   Combine the early word \first with the late word \second and
	//   shift the pair into \dst (uses the current shift amount).
	.macro srcel dst, first, second
	src \dst, BL(\first,\second), BL(\second,\first)
	.endm

	// ssa8f reg
	//   Set the shift amount for a shift from the 2-bit alignment in \reg.
	.macro ssa8f reg
	BL(ssa8b,ssa8l) \reg
	.endm

	// ssa8t reg
	//   Set the shift amount for a shift to the 2-bit alignment in \reg
	//   (the reverse of ssa8f).
	.macro ssa8t reg
	BL(ssa8l,ssa8b) \reg
	.endm

	// s2ll res, val
	//   Shift-to-later logical: move \val away from the zero-addressed
	//   byte, result in \res.
	.macro s2ll res, val
	BL(srl,sll) \res, \val
	.endm

	// s2el res, val
	//   Shift-to-early logical: move \val towards the zero-addressed
	//   byte, result in \res.
	.macro s2el res, val
	BL(sll,srl) \res, \val
	.endm

	/*
	* void *xthal_memcpy(void *dst, const void *src, size_t len);
	* void *xthal_bcopy(const void *src, void *dst, size_t len);
	*
	* This function is intended to do the same thing as the standard
	* library function memcpy() (or bcopy()) for most cases.
	* However, it uses strictly 32-bit load and store instructions
	* to copy data. This ensures this function will work
	* where the source and/or destination references an
	* instruction RAM or ROM, which can only be accessed
	* using l32i (IRAM+IROM) and s32i (IRAM).
	*
	* The bcopy version is provided here to avoid the overhead
	* of an extra call, for callers that require this convention.
	*
	* The (general case) algorithm is as follows:
	* If destination is unaligned, align it by copying 1 to 3 bytes.
	* If source is aligned,
	* do 16 bytes with a loop, and then finish up with
	* 8, 4, and 0-3 byte copies conditional on the length;
	* else (if source is unaligned),
	* do the same, but use SRC to align the source data.
	* This code tries to use fall-through branches for the common
	* case of aligned source and destination and multiple
	* of 4 length.
	*
	* Register use:
	* a0/ return address
	* a1/ stack pointer
	* a2/ return value
	* a3/ src
	* a4/ length
	* a5/ dst
	* a6/ tmp
	* a7/ tmp
	* a8/ tmp
	* a9/ tmp
	* a10/ tmp
	* a11/ tmp
	* a12/ tmp
	*/

	/* xthal_bcopy and xthal_memcpy need to allocate the same stack size
	* on entry since they share the same function-return code. Also,
	* there is more than one return point. */

	#define SAVE_A0 0
	#define SAVE_A3 4
	#define SAVE_A4 8
	#define SAVE_A5 12
	#define SAVE_A12 16
	#define STKSIZE 32


	.text
	.align 4
	.global xthal_bcopy
	.type xthal_bcopy,@function
	xthal_bcopy:
	// bcopy() argument order on entry: a2 = src, a3 = dst, a4 = len.
	// The frame set up here must match xthal_memcpy's, because both
	// entry points leave through the same return sequences.
	#ifndef __XTENSA_CALL0_ABI__
	entry sp, STKSIZE // windowed ABI; allow for call8 made later
	#else
	addi sp, sp, -STKSIZE
	s32i a12, a1, SAVE_A12 // a12 is used as scratch by the shared code
	#endif
	// Rearrange into the memcpy() layout expected at .Lcommon:
	// a2 = dst (return value), a3 = src, a5 = dst (working pointer).
	mov a5, a3
	mov a3, a2
	mov a2, a5
	j .Lcommon // continue in the code shared with xthal_memcpy

	.size xthal_bcopy, . - xthal_bcopy



	/*
	* Destination is unaligned
	*/

	.align 4
	xthal_memcpy.prefixcode: // purely for purpose of .size
	.Ldstunaligned:
	/*
	* Reached from .Lcommon when dst is not word aligned.
	* On entry: a2 = a5 = dst, a3 = src, a4 = len (>= 4),
	*           a6 = dst & 3 (non-zero).
	* Copies the 1 to 3 leading bytes with xthal_copy123 so that dst
	* becomes word aligned, then advances a5/a3, shrinks a4 and
	* rejoins the main path at .Ldstaligned.
	*/
	mov a10, a5 // windowed ABI outgoing arg1 = dst
	mov a11, a3 // windowed ABI outgoing arg2 = src
	movi a12, 4
	sub a6, a12, a6 // number of bytes to copy for dst alignment
	mov a12, a6 // windowed ABI arg3 = count; call0: keeps count across the call
	#ifdef __XTENSA_CALL0_ABI__
	s32i a0, a1, SAVE_A0 // preserve live registers
	s32i a3, a1, SAVE_A3
	s32i a4, a1, SAVE_A4
	s32i a5, a1, SAVE_A5
	// call0 ABI passes arguments in a2/a3/a4. a2 (dst) and a3 (src) are
	// already in place; a4 still holds the total length, so replace it
	// with the 1..3 byte alignment count that xthal_copy123 is specified
	// for. The real length is reloaded from SAVE_A4 below.
	mov a4, a6
	call0 xthal_copy123
	l32i a0, a1, SAVE_A0 // restore live registers
	l32i a3, a1, SAVE_A3
	l32i a4, a1, SAVE_A4
	l32i a5, a1, SAVE_A5
	mov a6, a12 // restore a6 from callee-saved register
	#else
	call8 xthal_copy123
	#endif
	add a5, a5, a6 // dst is now word aligned
	add a3, a3, a6 // src advanced by the same amount
	sub a4, a4, a6 // bytes still to copy
	j .Ldstaligned

	// Not sure how else to count code that precedes a function, in .size:
	.size xthal_memcpy.prefixcode, . - xthal_memcpy.prefixcode


	.align 4
	.global xthal_memcpy
	.type xthal_memcpy,@function
	xthal_memcpy:
	#ifdef __XTENSA_CALL0_ABI__
	addi sp, sp, -STKSIZE
	s32i a12, a1, SAVE_A12
	#else
	entry sp, 32 // allow for call8 below
	#endif
	// a2=dst, a3=src, a4=len
	mov a5, a2 // copy dst so that a2 is return value
	.Lcommon:
	// Shared by xthal_memcpy and xthal_bcopy.
	// Here: a2 = a5 = dst, a3 = src, a4 = len.
	// Copies shorter than 4 bytes are handled entirely by xthal_copy123.
	#ifdef __XTENSA_CALL0_ABI__
	/*
	* xthal_copy123 is called rather than jumped to, because this
	* frame still has to be released (and a12 reloaded) before returning.
	*/
	_bgeui a4, 4, 1f
	mov a12, a0 // preserve return address
	call0 xthal_copy123
	mov a0, a12 // restore return address
	l32i a12, a1, SAVE_A12
	addi sp, sp, STKSIZE
	ret
	1:
	#else
	// Windowed ABI: continue inside xthal_copy123 (after its entry
	// sequence) and return from there.
	bltui a4, 4, xthal_copy123_pastentry // NOTE: sometimes relaxes
	#endif

	extui a6, a2, 0, 2 // destination unalignment offset
	bnez a6, .Ldstunaligned // align the destination
	.Ldstaligned: // return here once dst is aligned
	srli a7, a4, 4 // number of loop iterations of 16-bytes each
	extui a11, a3, 0, 2 // source unalignment offset
	_bnez a11, .Lsrcunaligned // if source not aligned, use shifting copy
	/*
	* Destination and source are 32-bit aligned, use 32-bit copy.
	*/
	#if XCHAL_HAVE_LOOPS
	loopnez a7, .Loop1done
	#else /* !XCHAL_HAVE_LOOPS */
	beqz a7, .Loop1done
	slli a8, a7, 4
	add a8, a8, a3 // a8 = end of last 16B source chunk
	#endif /* !XCHAL_HAVE_LOOPS */
	.Loop1:
	// 16 bytes per iteration; loads and stores are interleaved.
	l32i a6, a3, 0
	l32i a7, a3, 4
	s32i a6, a5, 0
	l32i a6, a3, 8
	s32i a7, a5, 4
	l32i a7, a3, 12
	s32i a6, a5, 8
	addi a3, a3, 16
	s32i a7, a5, 12
	addi a5, a5, 16
	#if !XCHAL_HAVE_LOOPS
	// a3 advances in steps of 16 and a8 = start + 16*n, so the loop ends
	// exactly when they are equal. An equality test is used instead of
	// a signed "blt": addresses are unsigned, and a signed compare would
	// leave the loop early for a buffer that straddles 0x80000000.
	bne a3, a8, .Loop1
	#endif /* !XCHAL_HAVE_LOOPS */
	.Loop1done:
	// Bits 3 and 2 of the length select the 8-byte and 4-byte remainders.
	bbci.l a4, 3, .L2
	// copy 8 bytes
	l32i a6, a3, 0
	l32i a7, a3, 4
	addi a3, a3, 8
	s32i a6, a5, 0
	s32i a7, a5, 4
	addi a5, a5, 8
	.L2:
	bbci.l a4, 2, .L3
	// copy 4 bytes
	l32i a6, a3, 0
	addi a3, a3, 4
	s32i a6, a5, 0
	addi a5, a5, 4
	.L3:
	// Copy last 0 to 3 bytes using 32-bit accesses (aligned source and destination):
	extui a4, a4, 0, 2 // any bytes to copy?
	beqz a4, 1f // if not, skip this to avoid extraneous loads/stores
	l32i a6, a3, 0 // get source word
	l32i a7, a5, 0 // get destination word
	ssa8f a4 // shift from length (end of source)
	s2ll a6, a6 // align source to last byte
	s2el a7, a7 // align parts of a7 following modified bytes, to early byte
	ssa8t a4 // shift to end of modified destination (length)
	srcel a7, a6, a7 // combine source with late-dst to form last word
	s32i a7, a5, 0 // update last word
	1:

	// Return; a2 still holds the original dst.
	#ifdef __XTENSA_CALL0_ABI__
	l32i a12, a1, SAVE_A12
	addi sp, sp, STKSIZE
	ret
	#else
	retw
	#endif

	.size xthal_memcpy, . - xthal_memcpy


	// void xthal_copy123(dst, src, len);
	//
	// Copy from 0 to 3 bytes, using only 32-bit loads and stores,
	// with arbitrarily aligned source and destination.
	//
	// arg1 = a2 = dst
	// arg2 = a3 = src
	// arg3 = a4 = len
	//
	// a2 and a4 are not modified. a3, a5, a6, a7, a8 and a10 are
	// overwritten, as is the shift-amount register.
	// When len is non-zero, the aligned source word and the word after
	// it are both loaded, whether or not the source spans both.

	.global xthal_copy123
	.type xthal_copy123,@function
	.align 4
	xthal_copy123:
	abi_entry

	xthal_copy123_pastentry:
	_beqz a4, cdone // don't load or store if zero bytes
	// First get the bytes:
	movi a5, ~3
	and a5, a3, a5 // align src address
	l32i a6, a5, 0
	l32i a7, a5, 4
	ssa8f a3
	srcel a3, a6, a7
	// a3 now contains source bytes, aligned to 1st byte (memory order)
	// (source address is no longer needed at this point)

	// Does destination span two words?:
	extui a10, a2, 0, 2 // destination alignment
	sub a5, a2, a10 // align destination address
	l32i a8, a5, 0 // get first destination word regardless
	add a6, a10, a4 // dst_align + len
	ssa8f a2 // shift from dst_align (to 1st or last byte)
	s2ll a10, a8 // a10 = first part of destination, aligned to last byte
	bltui a6, 4, oneword // branch if destination contained in single word

	// Two-word destination case:
	l32i a8, a5, 4 // get second word
	ssa8t a2 // shift to dst_align
	srcel a10, a10, a3 // with a10 in early bytes, a3 in later bytes
	s32i a10, a5, 0 // update first word
	addi a5, a5, 4 // advance to last word for common code below
	//movi a10, 0 // not needed, gets dropped

	oneword:
	// One-word (and two-word) destination case:
	// a8 = contents of last destination word
	// a10 = early part of a8 preceding modified bytes, shifted towards last byte
	//
	ssa8f a4 // shift from length (end of source)
	srcel a3, a10, a3 // combine early-destination with source, aligned to last byte

	ssa8f a6 // shift from end of modified destination (dst_align+len)
	s2el a8, a8 // align parts of a8 following modified bytes, to early byte
	ssa8t a6 // shift to end of modified destination (dst_align+len)
	srcel a8, a3, a8 // combine early-dst+source with late-dst to form last word
	s32i a8, a5, 0 // update last word
	cdone: abi_return // return dst

	/*
	* Destination is aligned, Source is unaligned
	*
	* Continuation of xthal_memcpy/xthal_bcopy (entered from .Ldstaligned),
	* placed here after xthal_copy123.
	* On entry: a3 = src (not word aligned), a5 = dst (word aligned),
	*           a4 = bytes to copy, a7 = a4 / 16.
	* a6 always carries the most recently loaded source word that has not
	* been fully consumed yet.
	*/

	.align 4
	.Lsrcunaligned:
	// Copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8f a3 // set shift amount from byte offset
	#define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS (simulator) with the
	lint or ferret client, or 0 to save a few cycles */
	#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	extui a11, a3, 0, 2 // save unalignment offset for below
	sub a3, a3, a11 // align a3
	#endif
	l32i a6, a3, 0 // load first word
	#if XCHAL_HAVE_LOOPS
	loopnez a7, .Loop2done
	#else /* !XCHAL_HAVE_LOOPS */
	beqz a7, .Loop2done
	slli a10, a7, 4
	add a10, a10, a3 // a10 = end of last 16B source chunk
	#endif /* !XCHAL_HAVE_LOOPS */
	.Loop2:
	l32i a7, a3, 4
	l32i a8, a3, 8
	srcel a6, a6, a7
	s32i a6, a5, 0
	l32i a9, a3, 12
	srcel a7, a7, a8
	s32i a7, a5, 4
	l32i a6, a3, 16
	srcel a8, a8, a9
	s32i a8, a5, 8
	addi a3, a3, 16
	srcel a9, a9, a6
	s32i a9, a5, 12
	addi a5, a5, 16
	#if !XCHAL_HAVE_LOOPS
	// a3 advances in steps of 16 and a10 = start + 16*n, so test for
	// equality. A signed "blt" on addresses would leave the loop early
	// for a buffer that straddles 0x80000000.
	bne a3, a10, .Loop2
	#endif /* !XCHAL_HAVE_LOOPS */
	.Loop2done:
	bbci.l a4, 3, .L12
	// copy 8 bytes
	l32i a7, a3, 4
	l32i a8, a3, 8
	srcel a6, a6, a7
	s32i a6, a5, 0
	addi a3, a3, 8
	srcel a7, a7, a8
	s32i a7, a5, 4
	addi a5, a5, 8
	mov a6, a8 // carry the partly consumed word forward
	.L12:
	bbci.l a4, 2, .L13
	// copy 4 bytes
	l32i a7, a3, 4
	addi a3, a3, 4
	srcel a6, a6, a7
	s32i a6, a5, 0
	addi a5, a5, 4
	mov a6, a7 // carry the partly consumed word forward
	.L13:
	// Copy last 0 to 3 bytes using 32-bit accesses (shifting source, aligned destination):
	extui a4, a4, 0, 2 // any bytes to copy?
	beqz a4, .Ldone // if not, skip this to avoid extraneous loads/stores
	// (same guard as the aligned-source path)
	l32i a7, a3, 4 // get source word
	l32i a3, a5, 0 // get destination word
	srcel a6, a6, a7 // source bytes, aligned to early (1st) byte
	ssa8f a4 // shift from length (end of source)
	s2ll a6, a6 // align source to last byte
	s2el a3, a3 // align parts of a3 following modified bytes, to early byte
	ssa8t a4 // shift to end of modified destination (length)
	srcel a3, a6, a3 // combine source with late-dst to form last word
	s32i a3, a5, 0 // update last word
	.Ldone:
	// Return through the frame set up by xthal_memcpy/xthal_bcopy.
	#ifdef __XTENSA_CALL0_ABI__
	l32i a12, a1, SAVE_A12
	addi sp, sp, STKSIZE
	ret
	#else
	retw
	#endif

	.size xthal_copy123, . - xthal_copy123