[picodrive.git] / platform / gp2x / code940 / memcpy.s

/* $NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $ */

/*-
* Copyright (c) 1997 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Neil A. Carson and Mark Brinicombe
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
*    must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
*    contributors may be used to endorse or promote products derived
*    from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS\'\' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

/* #include <machine/asm.h>*/

/*
* memcpy(dest, src, len) - public entry point.
* Stacks the caller's dest pointer (r0) together with the return address,
* runs the shared _memcpy routine, then pops the pair back into r0 and pc,
* so dest is returned no matter what _memcpy leaves in r0.
*/
.globl memcpy
.globl _memcpy
memcpy:
stmdb sp!, {r0, lr}  /* push dest + return address */
bl _memcpy
ldmia sp!, {r0, pc}  /* r0 = dest, and return */


/*
* memmove(dest, src, len) - public entry point.
* Identical wrapper to memcpy: _memcpy chooses the copy direction from the
* relative order of src and dest, so the same core routine serves both.
*/
.globl memmove
memmove:
stmdb sp!, {r0, lr}  /* push dest + return address */
bl _memcpy
ldmia sp!, {r0, pc}  /* r0 = dest, and return */


/*
* This is one fun bit of code ...
* Some easy listening music is suggested while trying to understand this
* code e.g. Iron Maiden
*
* For anyone attempting to understand it :
*
* The core code is implemented in _memcpy, with simple stubs for memcpy()
* and memmove() (this file provides no bcopy() stub).
*
* All local labels are prefixed with Lmemcpy_
* Following the prefix a label starting f is used in the forward copy code
* while a label using b is used in the backwards copy code
* The source and destination addresses determine whether a forward or
* backward copy is performed.
* Separate bits of code are used to deal with the following situations
* for both the forward and backwards copy.
* unaligned source address
* unaligned destination address
* Separate copy routines are used to produce an optimised result for each
* of these cases.
* The copy code will use LDM/STM instructions to copy up to 32 bytes at
* a time where possible.
*
* Note: r12 (aka ip) can be trashed during the function along with
* r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
* Additional registers are preserved prior to use i.e. r4, r5, r7-r10 & lr
*
* Apologies for the state of the comments;-)
*/


/*
* _memcpy - core copy routine behind the memcpy() and memmove() stubs.
*
* In:  r0 = destination, r1 = source, r2 = byte count
* Out: forward path: r0 = original destination (popped from the stack)
*      source == destination: r0 = 0 (the stubs reload the real value)
*
* This part holds the entry point and the forward copy for word aligned
* source and destination.  Throughout the forward path r2 is kept biased
* below the real remaining count so that one signed test selects the next
* stage; the bias in force is noted beside each adjustment.
*
* The bulk loop moves 64 bytes per pass, so it is only entered once at
* least 64 bytes remain; a leftover of 32..63 bytes is handled by a single
* conditional 32 byte transfer before the 16/12/8/4 byte tail stages.
*/
_memcpy:

/* Determine copy direction */
cmp r1, r0
bcc Lmemcpy_backwards  /* src below dest: copy from the top down */

/* Flags are still those of the cmp: EQ means src == dest, nothing to do */
moveq r0, #0
moveq pc, lr

stmdb sp!, {r0, lr}  /* memcpy() returns dest addr */
subs r2, r2, #4  /* r2 = remaining - 4 */
blt Lmemcpy_fl4  /* less than 4 bytes */
ands r12, r0, #3
bne Lmemcpy_fdestul  /* oh unaligned destination addr */
ands r12, r1, #3
bne Lmemcpy_fsrcul  /* oh unaligned source addr */

Lmemcpy_ft8:
/* We have aligned source and destination */
subs r2, r2, #8  /* r2 = remaining - 12 */
blt Lmemcpy_fl12  /* less than 12 bytes (4 from above) */
subs r2, r2, #0x14  /* r2 = remaining - 32 */
blt Lmemcpy_fl32  /* less than 32 bytes (12 from above) */
stmdb sp!, {r4, r7, r8, r9, r10}  /* borrow r4, r7-r10 */
subs r2, r2, #0x20  /* r2 = remaining - 64 */
blt Lmemcpy_fl64  /* 32..63 bytes: too short for the 64 byte loop */

/* blat 64 bytes at a time */
/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop64:
ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
subs r2, r2, #0x40
bge Lmemcpy_floop64

Lmemcpy_fl64:
/* r2 = remaining - 64, in the range -64..-1 */
adds r2, r2, #0x20  /* r2 = remaining - 32 */
ldmgeia r1!, {r3, r4, r7, r8, r9, r10, r12, lr} /* blat a remaining 32 bytes */
stmgeia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
subge r2, r2, #0x20  /* r2 = remaining - 32 again, now -32..-1 */

cmn r2, #0x10  /* GE when at least 16 bytes remain */
ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
stmgeia r0!, {r3, r4, r12, lr}
subge r2, r2, #0x10
ldmia sp!, {r4, r7, r8, r9, r10}  /* return r4, r7-r10 */

Lmemcpy_fl32:
adds r2, r2, #0x14  /* r2 = remaining - 12 */

/* blat 12 bytes at a time */
Lmemcpy_floop12:
ldmgeia r1!, {r3, r12, lr}
stmgeia r0!, {r3, r12, lr}
subges r2, r2, #0x0c
bge Lmemcpy_floop12

Lmemcpy_fl12:
adds r2, r2, #8  /* r2 = remaining - 4 */
blt Lmemcpy_fl4

subs r2, r2, #4  /* LT: 4..7 bytes left, GE: 8..11 bytes left */
ldrlt r3, [r1], #4
strlt r3, [r0], #4
ldmgeia r1!, {r3, r12}
stmgeia r0!, {r3, r12}
subge r2, r2, #4  /* keep r2 = remaining - 4 for the stage below */

Lmemcpy_fl4:
/* less than 4 bytes to go */
adds r2, r2, #4  /* r2 = remaining (0..3) */
ldmeqia sp!, {r0, pc}  /* done */

/* copy the crud byte at a time */
cmp r2, #2
ldrb r3, [r1], #1  /* always: at least one byte is left */
strb r3, [r0], #1
ldrgeb r3, [r1], #1  /* second byte when 2 or 3 are left */
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1  /* third byte when 3 are left */
strgtb r3, [r0], #1
ldmia sp!, {r0, pc}

/*
* Forward copy, unaligned destination.
* Reached from the forward entry code with r12 = r0 & 3 (non-zero) and
* r2 = remaining - 4 (non-negative, so at least 4 bytes are left).
* Copies the 1-3 bytes needed to word-align r0, then re-checks the source.
*/
Lmemcpy_fdestul:
rsb r12, r12, #4  /* r12 = bytes up to the next word boundary (1..3) */
cmp r12, #2  /* flags select how many of the byte copies below run */

/* align destination with byte copies */
ldrb r3, [r1], #1  /* always */
strb r3, [r0], #1
ldrgeb r3, [r1], #1  /* second byte when r12 >= 2 */
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1  /* third byte when r12 == 3 */
strgtb r3, [r0], #1
subs r2, r2, r12  /* account for the bytes just copied */
blt Lmemcpy_fl4  /* less than 4 bytes */

ands r12, r1, #3  /* r12 = source misalignment, used below */
beq Lmemcpy_ft8  /* we have an aligned source */

/*
* Forward copy, destination word aligned but source not.
* On entry r12 = r1 & 3 (1..3) and r2 = remaining - 4.
* r1 is rounded down to a word boundary and the source is then read one
* whole word at a time.  lr always carries the most recently loaded word;
* its not-yet-stored bytes are merged with the next word by a shift pair
* (lsr on the old word, lsl on the new one).  The shift directions assume
* little-endian byte order.  There is one copy of the code per source
* offset (1, 2, 3), differing only in the shift amounts.
* lr is usable as data here because the forward entry code stacked it.
*/
Lmemcpy_fsrcul:
bic r1, r1, #3  /* round source down to a word boundary */
ldr lr, [r1], #4  /* prime lr with the first source word */
cmp r12, #2
bgt Lmemcpy_fsrcul3  /* source offset 3 */
beq Lmemcpy_fsrcul2  /* source offset 2 */
/* fall through: source offset 1 */
cmp r2, #0x0c
blt Lmemcpy_fsrcul1loop4  /* fewer than 16 bytes left: word loop only */
sub r2, r2, #0x0c  /* r2 = remaining - 16 */
stmdb sp!, {r4, r5}

/* offset 1: 16 bytes per pass, each output word = old >> 8 | new << 24 */
Lmemcpy_fsrcul1loop16:
mov r3, lr, lsr #8
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #24
mov r4, r4, lsr #8
orr r4, r4, r5, lsl #24
mov r5, r5, lsr #8
orr r5, r5, r12, lsl #24
mov r12, r12, lsr #8
orr r12, r12, lr, lsl #24
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge Lmemcpy_fsrcul1loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c  /* back to r2 = remaining - 4 */
blt Lmemcpy_fsrcul1l4

/* offset 1: one word per pass */
Lmemcpy_fsrcul1loop4:
mov r12, lr, lsr #8
ldr lr, [r1], #4
orr r12, r12, lr, lsl #24
str r12, [r0], #4
subs r2, r2, #4
bge Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
sub r1, r1, #3  /* 3 bytes of the look-ahead word are unused: rewind to them */
b Lmemcpy_fl4

/* Source offset 2: same scheme, half-word shifts */
Lmemcpy_fsrcul2:
cmp r2, #0x0c
blt Lmemcpy_fsrcul2loop4  /* fewer than 16 bytes left: word loop only */
sub r2, r2, #0x0c  /* r2 = remaining - 16 */
stmdb sp!, {r4, r5}

/* offset 2: 16 bytes per pass, each output word = old >> 16 | new << 16 */
Lmemcpy_fsrcul2loop16:
mov r3, lr, lsr #16
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #16
mov r4, r4, lsr #16
orr r4, r4, r5, lsl #16
mov r5, r5, lsr #16
orr r5, r5, r12, lsl #16
mov r12, r12, lsr #16
orr r12, r12, lr, lsl #16
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge Lmemcpy_fsrcul2loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c  /* back to r2 = remaining - 4 */
blt Lmemcpy_fsrcul2l4

/* offset 2: one word per pass */
Lmemcpy_fsrcul2loop4:
mov r12, lr, lsr #16
ldr lr, [r1], #4
orr r12, r12, lr, lsl #16
str r12, [r0], #4
subs r2, r2, #4
bge Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
sub r1, r1, #2  /* 2 bytes of the look-ahead word are unused: rewind to them */
b Lmemcpy_fl4

/* Source offset 3: same scheme, 24/8 bit shifts */
Lmemcpy_fsrcul3:
cmp r2, #0x0c
blt Lmemcpy_fsrcul3loop4  /* fewer than 16 bytes left: word loop only */
sub r2, r2, #0x0c  /* r2 = remaining - 16 */
stmdb sp!, {r4, r5}

/* offset 3: 16 bytes per pass, each output word = old >> 24 | new << 8 */
Lmemcpy_fsrcul3loop16:
mov r3, lr, lsr #24
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #8
mov r4, r4, lsr #24
orr r4, r4, r5, lsl #8
mov r5, r5, lsr #24
orr r5, r5, r12, lsl #8
mov r12, r12, lsr #24
orr r12, r12, lr, lsl #8
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge Lmemcpy_fsrcul3loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c  /* back to r2 = remaining - 4 */
blt Lmemcpy_fsrcul3l4

/* offset 3: one word per pass */
Lmemcpy_fsrcul3loop4:
mov r12, lr, lsr #24
ldr lr, [r1], #4
orr r12, r12, lr, lsl #8
str r12, [r0], #4
subs r2, r2, #4
bge Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
sub r1, r1, #1  /* 1 byte of the look-ahead word is unused: rewind to it */
b Lmemcpy_fl4

/*
* Backward copy, taken when src is below dest (unsigned compare at entry),
* so that an overlapping destination is filled from the top down.
*
* r0 and r1 are first advanced to one past the end of each buffer; every
* transfer then pre-decrements.  As in the forward path, r2 is kept biased
* below the real remaining count; the bias is noted beside each adjustment.
* lr is not stacked on entry to this path: it is saved only around the
* stretch that uses it as a data register, and the path returns with
* mov pc, lr.
*
* The bulk loop moves 64 bytes per pass, so it is only entered once at
* least 64 bytes remain; a leftover of 32..63 bytes is handled by a single
* conditional 32 byte transfer before the 16/12/8/4 byte tail stages.
*/
Lmemcpy_backwards:
add r1, r1, r2  /* r1 = one past the last source byte */
add r0, r0, r2  /* r0 = one past the last destination byte */
subs r2, r2, #4  /* r2 = remaining - 4 */
blt Lmemcpy_bl4  /* less than 4 bytes */
ands r12, r0, #3
bne Lmemcpy_bdestul  /* oh unaligned destination addr */
ands r12, r1, #3
bne Lmemcpy_bsrcul  /* oh unaligned source addr */

Lmemcpy_bt8:
/* We have aligned source and destination */
subs r2, r2, #8  /* r2 = remaining - 12 */
blt Lmemcpy_bl12  /* less than 12 bytes (4 from above) */
stmdb sp!, {r4, r7, r8, r9, r10, lr}  /* borrow r4, r7-r10 and lr */
subs r2, r2, #0x14  /* r2 = remaining - 32 */
blt Lmemcpy_bl32  /* less than 32 bytes (12 from above) */
subs r2, r2, #0x20  /* r2 = remaining - 64 */
blt Lmemcpy_bl64  /* 32..63 bytes: too short for the 64 byte loop */

/* blat 64 bytes at a time */
/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop64:
ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
subs r2, r2, #0x40
bge Lmemcpy_bloop64

Lmemcpy_bl64:
/* r2 = remaining - 64, in the range -64..-1 */
adds r2, r2, #0x20  /* r2 = remaining - 32 */
ldmgedb r1!, {r3, r4, r7, r8, r9, r10, r12, lr} /* blat a remaining 32 bytes */
stmgedb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
subge r2, r2, #0x20  /* r2 = remaining - 32 again, now -32..-1 */

Lmemcpy_bl32:
cmn r2, #0x10  /* GE when at least 16 bytes remain */
ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
stmgedb r0!, {r3, r4, r12, lr}
subge r2, r2, #0x10
adds r2, r2, #0x14  /* r2 = remaining - 12 */
ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
stmgedb r0!, {r3, r12, lr}
subge r2, r2, #0x0c
ldmia sp!, {r4, r7, r8, r9, r10, lr}  /* return r4, r7-r10 and lr */

Lmemcpy_bl12:
adds r2, r2, #8  /* r2 = remaining - 4 */
blt Lmemcpy_bl4
subs r2, r2, #4  /* LT: 4..7 bytes left, GE: 8..11 bytes left */
ldrlt r3, [r1, #-4]!
strlt r3, [r0, #-4]!
ldmgedb r1!, {r3, r12}
stmgedb r0!, {r3, r12}
subge r2, r2, #4  /* keep r2 = remaining - 4 for the stage below */

Lmemcpy_bl4:
/* less than 4 bytes to go */
adds r2, r2, #4  /* r2 = remaining (0..3) */
moveq pc, lr   /* done */

/* copy the crud byte at a time */
cmp r2, #2
ldrb r3, [r1, #-1]!  /* always: at least one byte is left */
strb r3, [r0, #-1]!
ldrgeb r3, [r1, #-1]!  /* second byte when 2 or 3 are left */
strgeb r3, [r0, #-1]!
ldrgtb r3, [r1, #-1]!  /* third byte when 3 are left */
strgtb r3, [r0, #-1]!
mov pc, lr

/*
* Backward copy, unaligned destination.
* Reached from Lmemcpy_backwards with r12 = r0 & 3 (non-zero), where r0 is
* one past the last destination byte, and r2 = remaining - 4
* (non-negative, so at least 4 bytes are left).
* Copies r12 bytes (1..3) downwards so that r0 becomes word aligned, then
* re-checks the source.
*/
Lmemcpy_bdestul:
cmp r12, #2  /* flags select how many of the byte copies below run */

/* align destination with byte copies */
ldrb r3, [r1, #-1]!  /* always */
strb r3, [r0, #-1]!
ldrgeb r3, [r1, #-1]!  /* second byte when r12 >= 2 */
strgeb r3, [r0, #-1]!
ldrgtb r3, [r1, #-1]!  /* third byte when r12 == 3 */
strgtb r3, [r0, #-1]!
subs r2, r2, r12  /* account for the bytes just copied */
blt Lmemcpy_bl4  /* less than 4 bytes to go */
ands r12, r1, #3  /* r12 = source misalignment, used below */
beq Lmemcpy_bt8  /* we have an aligned source */

/*
* Backward copy, destination word aligned but source not.
* On entry r12 = r1 & 3 (1..3), r1 is one past the last uncopied source
* byte and r2 = remaining - 4.
* r1 is rounded down to a word boundary and the source is then read one
* whole word at a time, moving downwards.  r3 always carries the most
* recently loaded word; its not-yet-stored bytes are merged with the next
* lower word by a shift pair (lsl on the old word, lsr on the new one).
* The shift directions assume little-endian byte order.  There is one copy
* of the code per source offset (3, 2, 1), differing only in the shift
* amounts.
* lr is stacked around each 16 byte loop because it is used as a data
* register there and this path returns through lr.
*/
Lmemcpy_bsrcul:
bic r1, r1, #3  /* round source down to a word boundary */
ldr r3, [r1, #0]  /* prime r3 with the word holding the top source bytes */
cmp r12, #2
blt Lmemcpy_bsrcul1  /* source offset 1 */
beq Lmemcpy_bsrcul2  /* source offset 2 */
/* fall through: source offset 3 */
cmp r2, #0x0c
blt Lmemcpy_bsrcul3loop4  /* fewer than 16 bytes left: word loop only */
sub r2, r2, #0x0c  /* r2 = remaining - 16 */
stmdb sp!, {r4, r5, lr}

/* offset 3: 16 bytes per pass, each output word = old << 8 | new >> 24 */
Lmemcpy_bsrcul3loop16:
mov lr, r3, lsl #8
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsr #24
mov r12, r12, lsl #8
orr r12, r12, r5, lsr #24
mov r5, r5, lsl #8
orr r5, r5, r4, lsr #24
mov r4, r4, lsl #8
orr r4, r4, r3, lsr #24
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge Lmemcpy_bsrcul3loop16
ldmia sp!, {r4, r5, lr}
adds r2, r2, #0x0c  /* back to r2 = remaining - 4 */
blt Lmemcpy_bsrcul3l4

/* offset 3: one word per pass */
Lmemcpy_bsrcul3loop4:
mov r12, r3, lsl #8
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsr #24
str r12, [r0, #-4]!
subs r2, r2, #4
bge Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
add r1, r1, #3  /* 3 bytes of the look-ahead word are unused: step back over them */
b Lmemcpy_bl4

/* Source offset 2: same scheme, half-word shifts */
Lmemcpy_bsrcul2:
cmp r2, #0x0c
blt Lmemcpy_bsrcul2loop4  /* fewer than 16 bytes left: word loop only */
sub r2, r2, #0x0c  /* r2 = remaining - 16 */
stmdb sp!, {r4, r5, lr}

/* offset 2: 16 bytes per pass, each output word = old << 16 | new >> 16 */
Lmemcpy_bsrcul2loop16:
mov lr, r3, lsl #16
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsr #16
mov r12, r12, lsl #16
orr r12, r12, r5, lsr #16
mov r5, r5, lsl #16
orr r5, r5, r4, lsr #16
mov r4, r4, lsl #16
orr r4, r4, r3, lsr #16
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge Lmemcpy_bsrcul2loop16
ldmia sp!, {r4, r5, lr}
adds r2, r2, #0x0c  /* back to r2 = remaining - 4 */
blt Lmemcpy_bsrcul2l4

/* offset 2: one word per pass */
Lmemcpy_bsrcul2loop4:
mov r12, r3, lsl #16
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsr #16
str r12, [r0, #-4]!
subs r2, r2, #4
bge Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
add r1, r1, #2  /* 2 bytes of the look-ahead word are unused: step back over them */
b Lmemcpy_bl4

/* Source offset 1: same scheme, 24/8 bit shifts */
Lmemcpy_bsrcul1:
cmp r2, #0x0c
blt Lmemcpy_bsrcul1loop4  /* fewer than 16 bytes left: word loop only */
sub r2, r2, #0x0c  /* r2 = remaining - 16 */
stmdb sp!, {r4, r5, lr}

/* offset 1: despite the label name this moves 16 bytes per pass, */
/* each output word = old << 24 | new >> 8 */
Lmemcpy_bsrcul1loop32:
mov lr, r3, lsl #24
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsr #8
mov r12, r12, lsl #24
orr r12, r12, r5, lsr #8
mov r5, r5, lsl #24
orr r5, r5, r4, lsr #8
mov r4, r4, lsl #24
orr r4, r4, r3, lsr #8
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge Lmemcpy_bsrcul1loop32
ldmia sp!, {r4, r5, lr}
adds r2, r2, #0x0c  /* back to r2 = remaining - 4 */
blt Lmemcpy_bsrcul1l4

/* offset 1: one word per pass */
Lmemcpy_bsrcul1loop4:
mov r12, r3, lsl #24
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsr #8
str r12, [r0, #-4]!
subs r2, r2, #4
bge Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
add r1, r1, #1  /* 1 byte of the look-ahead word is unused: step back over it */
b Lmemcpy_bl4
Commit	Line	Data
	1	/* $NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $ */
	2
	3	/*-
	4	* Copyright (c) 1997 The NetBSD Foundation, Inc.
	5	* All rights reserved.
	6	*
	7	* This code is derived from software contributed to The NetBSD Foundation
	8	* by Neil A. Carson and Mark Brinicombe
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	* 1. Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* 2. Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* 3. All advertising materials mentioning features or use of this software
	19	* must display the following acknowledgement:
	20	* This product includes software developed by the NetBSD
	21	* Foundation, Inc. and its contributors.
	22	* 4. Neither the name of The NetBSD Foundation nor the names of its
	23	* contributors may be used to endorse or promote products derived
	24	* from this software without specific prior written permission.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
	27	* ``AS IS\'\' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
	28	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	29	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
	30	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	31	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	32	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	33	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	34	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	35	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	36	* POSSIBILITY OF SUCH DAMAGE.
	37	*/
	38
	39	/* #include <machine/asm.h>*/
	40
	41	.globl memcpy
	42	.globl _memcpy
	43	memcpy:
	44
	45	stmfd sp!, {r0, lr}
	46	bl _memcpy
	47	ldmfd sp!, {r0, pc}
	48
	49
	50	.globl memmove
	51	memmove:
	52
	53	stmfd sp!, {r0, lr}
	54	bl _memcpy
	55	ldmfd sp!, {r0, pc}
	56
	57
	58
	59	/*
	60	* This is one fun bit of code ...
	61	* Some easy listening music is suggested while trying to understand this
	62	* code e.g. Iron Maiden
	63	*
	64	* For anyone attempting to understand it :
	65	*
	66	* The core code is implemented here with simple stubs for memcpy()
	67	* memmove() and bcopy().
	68	*
	69	* All local labels are prefixed with Lmemcpy_
	70	* Following the prefix a label starting f is used in the forward copy code
	71	* while a label using b is used in the backwards copy code
	72	* The source and destination addresses determine whether a forward or
	73	* backward copy is performed.
	74	* Separate bits of code are used to deal with the following situations
	75	* for both the forward and backwards copy.
	76	* unaligned source address
	77	* unaligned destination address
	78	* Separate copy routines are used to produce an optimised result for each
	79	* of these cases.
	80	* The copy code will use LDM/STM instructions to copy up to 32 bytes at
	81	* a time where possible.
	82	*
	83	* Note: r12 (aka ip) can be trashed during the function along with
	84	* r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
	85	* Additional registers are preserved prior to use i.e. r4, r5 & lr
	86	*
	87	* Apologies for the state of the comments;-)
	88	*/
	89
	90
	91
	92	_memcpy:
	93
	94	/* Determine copy direction */
	95	cmp r1, r0
	96	bcc Lmemcpy_backwards
	97
	98	moveq r0, #0 /* Quick abort for len=0 */
	99	moveq pc, lr
	100
	101	stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
	102	subs r2, r2, #4
	103	blt Lmemcpy_fl4 /* less than 4 bytes */
	104	ands r12, r0, #3
	105	bne Lmemcpy_fdestul /* oh unaligned destination addr */
	106	ands r12, r1, #3
	107	bne Lmemcpy_fsrcul /* oh unaligned source addr */
	108
	109	Lmemcpy_ft8:
	110	/* We have aligned source and destination */
	111	subs r2, r2, #8
	112	blt Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
	113	subs r2, r2, #0x14
	114	blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
	115	stmdb sp!, {r4, r7, r8, r9, r10} /* borrow r4 */
	116
	117	/* blat 64 bytes at a time */
	118	/* XXX for really big copies perhaps we should use more registers */
	119	Lmemcpy_floop32:
	120	ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	121	stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	122	ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	123	stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	124	subs r2, r2, #0x40
	125	bge Lmemcpy_floop32
	126
	127	cmn r2, #0x10
	128	ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
	129	stmgeia r0!, {r3, r4, r12, lr}
	130	subge r2, r2, #0x10
	131	ldmia sp!, {r4, r7, r8, r9, r10} /* return r4 */
	132
	133	Lmemcpy_fl32:
	134	adds r2, r2, #0x14
	135
	136	/* blat 12 bytes at a time */
	137	Lmemcpy_floop12:
	138	ldmgeia r1!, {r3, r12, lr}
	139	stmgeia r0!, {r3, r12, lr}
	140	subges r2, r2, #0x0c
	141	bge Lmemcpy_floop12
	142
	143	Lmemcpy_fl12:
	144	adds r2, r2, #8
	145	blt Lmemcpy_fl4
	146
	147	subs r2, r2, #4
	148	ldrlt r3, [r1], #4
	149	strlt r3, [r0], #4
	150	ldmgeia r1!, {r3, r12}
	151	stmgeia r0!, {r3, r12}
	152	subge r2, r2, #4
	153
	154	Lmemcpy_fl4:
	155	/* less than 4 bytes to go */
	156	adds r2, r2, #4
	157	ldmeqia sp!, {r0, pc} /* done */
	158
	159	/* copy the crud byte at a time */
	160	cmp r2, #2
	161	ldrb r3, [r1], #1
	162	strb r3, [r0], #1
	163	ldrgeb r3, [r1], #1
	164	strgeb r3, [r0], #1
	165	ldrgtb r3, [r1], #1
	166	strgtb r3, [r0], #1
	167	ldmia sp!, {r0, pc}
	168
	169	/* erg - unaligned destination */
	170	Lmemcpy_fdestul:
	171	rsb r12, r12, #4
	172	cmp r12, #2
	173
	174	/* align destination with byte copies */
	175	ldrb r3, [r1], #1
	176	strb r3, [r0], #1
	177	ldrgeb r3, [r1], #1
	178	strgeb r3, [r0], #1
	179	ldrgtb r3, [r1], #1
	180	strgtb r3, [r0], #1
	181	subs r2, r2, r12
	182	blt Lmemcpy_fl4 /* less the 4 bytes */
	183
	184	ands r12, r1, #3
	185	beq Lmemcpy_ft8 /* we have an aligned source */
	186
	187	/* erg - unaligned source */
	188	/* This is where it gets nasty ... */
	189	Lmemcpy_fsrcul:
	190	bic r1, r1, #3
	191	ldr lr, [r1], #4
	192	cmp r12, #2
	193	bgt Lmemcpy_fsrcul3
	194	beq Lmemcpy_fsrcul2
	195	cmp r2, #0x0c
	196	blt Lmemcpy_fsrcul1loop4
	197	sub r2, r2, #0x0c
	198	stmdb sp!, {r4, r5}
	199
	200	Lmemcpy_fsrcul1loop16:
	201	mov r3, lr, lsr #8
	202	ldmia r1!, {r4, r5, r12, lr}
	203	orr r3, r3, r4, lsl #24
	204	mov r4, r4, lsr #8
	205	orr r4, r4, r5, lsl #24
	206	mov r5, r5, lsr #8
	207	orr r5, r5, r12, lsl #24
	208	mov r12, r12, lsr #8
	209	orr r12, r12, lr, lsl #24
	210	stmia r0!, {r3-r5, r12}
	211	subs r2, r2, #0x10
	212	bge Lmemcpy_fsrcul1loop16
	213	ldmia sp!, {r4, r5}
	214	adds r2, r2, #0x0c
	215	blt Lmemcpy_fsrcul1l4
	216
	217	Lmemcpy_fsrcul1loop4:
	218	mov r12, lr, lsr #8
	219	ldr lr, [r1], #4
	220	orr r12, r12, lr, lsl #24
	221	str r12, [r0], #4
	222	subs r2, r2, #4
	223	bge Lmemcpy_fsrcul1loop4
	224
	225	Lmemcpy_fsrcul1l4:
	226	sub r1, r1, #3
	227	b Lmemcpy_fl4
	228
	229	Lmemcpy_fsrcul2:
	230	cmp r2, #0x0c
	231	blt Lmemcpy_fsrcul2loop4
	232	sub r2, r2, #0x0c
	233	stmdb sp!, {r4, r5}
	234
	235	Lmemcpy_fsrcul2loop16:
	236	mov r3, lr, lsr #16
	237	ldmia r1!, {r4, r5, r12, lr}
	238	orr r3, r3, r4, lsl #16
	239	mov r4, r4, lsr #16
	240	orr r4, r4, r5, lsl #16
	241	mov r5, r5, lsr #16
	242	orr r5, r5, r12, lsl #16
	243	mov r12, r12, lsr #16
	244	orr r12, r12, lr, lsl #16
	245	stmia r0!, {r3-r5, r12}
	246	subs r2, r2, #0x10
	247	bge Lmemcpy_fsrcul2loop16
	248	ldmia sp!, {r4, r5}
	249	adds r2, r2, #0x0c
	250	blt Lmemcpy_fsrcul2l4
	251
	252	Lmemcpy_fsrcul2loop4:
	253	mov r12, lr, lsr #16
	254	ldr lr, [r1], #4
	255	orr r12, r12, lr, lsl #16
	256	str r12, [r0], #4
	257	subs r2, r2, #4
	258	bge Lmemcpy_fsrcul2loop4
	259
	260	Lmemcpy_fsrcul2l4:
	261	sub r1, r1, #2
	262	b Lmemcpy_fl4
	263
	264	Lmemcpy_fsrcul3:
	265	cmp r2, #0x0c
	266	blt Lmemcpy_fsrcul3loop4
	267	sub r2, r2, #0x0c
	268	stmdb sp!, {r4, r5}
	269
	270	Lmemcpy_fsrcul3loop16:
	271	mov r3, lr, lsr #24
	272	ldmia r1!, {r4, r5, r12, lr}
	273	orr r3, r3, r4, lsl #8
	274	mov r4, r4, lsr #24
	275	orr r4, r4, r5, lsl #8
	276	mov r5, r5, lsr #24
	277	orr r5, r5, r12, lsl #8
	278	mov r12, r12, lsr #24
	279	orr r12, r12, lr, lsl #8
	280	stmia r0!, {r3-r5, r12}
	281	subs r2, r2, #0x10
	282	bge Lmemcpy_fsrcul3loop16
	283	ldmia sp!, {r4, r5}
	284	adds r2, r2, #0x0c
	285	blt Lmemcpy_fsrcul3l4
	286
	287	Lmemcpy_fsrcul3loop4:
	288	mov r12, lr, lsr #24
	289	ldr lr, [r1], #4
	290	orr r12, r12, lr, lsl #8
	291	str r12, [r0], #4
	292	subs r2, r2, #4
	293	bge Lmemcpy_fsrcul3loop4
	294
	295	Lmemcpy_fsrcul3l4:
	296	sub r1, r1, #1
	297	b Lmemcpy_fl4
	298
	299	Lmemcpy_backwards:
	300	add r1, r1, r2
	301	add r0, r0, r2
	302	subs r2, r2, #4
	303	blt Lmemcpy_bl4 /* less than 4 bytes */
	304	ands r12, r0, #3
	305	bne Lmemcpy_bdestul /* oh unaligned destination addr */
	306	ands r12, r1, #3
	307	bne Lmemcpy_bsrcul /* oh unaligned source addr */
	308
	309	Lmemcpy_bt8:
	310	/* We have aligned source and destination */
	311	subs r2, r2, #8
	312	blt Lmemcpy_bl12 /* less than 12 bytes (4 from above) */
	313	stmdb sp!, {r4, r7, r8, r9, r10, lr}
	314	subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
	315	blt Lmemcpy_bl32
	316
	317	/* blat 64 bytes at a time */
	318	/* XXX for really big copies perhaps we should use more registers */
	319	Lmemcpy_bloop32:
	320	ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	321	stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	322	ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	323	stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	324	subs r2, r2, #0x40
	325	bge Lmemcpy_bloop32
	326
	327	Lmemcpy_bl32:
	328	cmn r2, #0x10
	329	ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
	330	stmgedb r0!, {r3, r4, r12, lr}
	331	subge r2, r2, #0x10
	332	adds r2, r2, #0x14
	333	ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
	334	stmgedb r0!, {r3, r12, lr}
	335	subge r2, r2, #0x0c
	336	ldmia sp!, {r4, r7, r8, r9, r10, lr}
	337
	338	Lmemcpy_bl12:
	339	adds r2, r2, #8
	340	blt Lmemcpy_bl4
	341	subs r2, r2, #4
	342	ldrlt r3, [r1, #-4]!
	343	strlt r3, [r0, #-4]!
	344	ldmgedb r1!, {r3, r12}
	345	stmgedb r0!, {r3, r12}
	346	subge r2, r2, #4
	347
	348	Lmemcpy_bl4:
	349	/* less than 4 bytes to go */
	350	adds r2, r2, #4
	351	moveq pc, lr /* done */
	352
	353	/* copy the crud byte at a time */
	354	cmp r2, #2
	355	ldrb r3, [r1, #-1]!
	356	strb r3, [r0, #-1]!
	357	ldrgeb r3, [r1, #-1]!
	358	strgeb r3, [r0, #-1]!
	359	ldrgtb r3, [r1, #-1]!
	360	strgtb r3, [r0, #-1]!
	361	mov pc, lr
	362
	363	/* erg - unaligned destination */
	364	Lmemcpy_bdestul:
	365	cmp r12, #2
	366
	367	/* align destination with byte copies */
	368	ldrb r3, [r1, #-1]!
	369	strb r3, [r0, #-1]!
	370	ldrgeb r3, [r1, #-1]!
	371	strgeb r3, [r0, #-1]!
	372	ldrgtb r3, [r1, #-1]!
	373	strgtb r3, [r0, #-1]!
	374	subs r2, r2, r12
	375	blt Lmemcpy_bl4 /* less than 4 bytes to go */
	376	ands r12, r1, #3
	377	beq Lmemcpy_bt8 /* we have an aligned source */
	378
	379	/* erg - unaligned source */
	380	/* This is where it gets nasty ... */
	381	Lmemcpy_bsrcul:
	382	bic r1, r1, #3
	383	ldr r3, [r1, #0]
	384	cmp r12, #2
	385	blt Lmemcpy_bsrcul1
	386	beq Lmemcpy_bsrcul2
	387	cmp r2, #0x0c
	388	blt Lmemcpy_bsrcul3loop4
	389	sub r2, r2, #0x0c
	390	stmdb sp!, {r4, r5, lr}
	391
	392	Lmemcpy_bsrcul3loop16:
	393	mov lr, r3, lsl #8
	394	ldmdb r1!, {r3-r5, r12}
	395	orr lr, lr, r12, lsr #24
	396	mov r12, r12, lsl #8
	397	orr r12, r12, r5, lsr #24
	398	mov r5, r5, lsl #8
	399	orr r5, r5, r4, lsr #24
	400	mov r4, r4, lsl #8
	401	orr r4, r4, r3, lsr #24
	402	stmdb r0!, {r4, r5, r12, lr}
	403	subs r2, r2, #0x10
	404	bge Lmemcpy_bsrcul3loop16
	405	ldmia sp!, {r4, r5, lr}
	406	adds r2, r2, #0x0c
	407	blt Lmemcpy_bsrcul3l4
	408
	409	Lmemcpy_bsrcul3loop4:
	410	mov r12, r3, lsl #8
	411	ldr r3, [r1, #-4]!
	412	orr r12, r12, r3, lsr #24
	413	str r12, [r0, #-4]!
	414	subs r2, r2, #4
	415	bge Lmemcpy_bsrcul3loop4
	416
	417	Lmemcpy_bsrcul3l4:
	418	add r1, r1, #3
	419	b Lmemcpy_bl4
	420
	421	Lmemcpy_bsrcul2:
	422	cmp r2, #0x0c
	423	blt Lmemcpy_bsrcul2loop4
	424	sub r2, r2, #0x0c
	425	stmdb sp!, {r4, r5, lr}
	426
	427	Lmemcpy_bsrcul2loop16:
	428	mov lr, r3, lsl #16
	429	ldmdb r1!, {r3-r5, r12}
	430	orr lr, lr, r12, lsr #16
	431	mov r12, r12, lsl #16
	432	orr r12, r12, r5, lsr #16
	433	mov r5, r5, lsl #16
	434	orr r5, r5, r4, lsr #16
	435	mov r4, r4, lsl #16
	436	orr r4, r4, r3, lsr #16
	437	stmdb r0!, {r4, r5, r12, lr}
	438	subs r2, r2, #0x10
	439	bge Lmemcpy_bsrcul2loop16
	440	ldmia sp!, {r4, r5, lr}
	441	adds r2, r2, #0x0c
	442	blt Lmemcpy_bsrcul2l4
	443
	444	Lmemcpy_bsrcul2loop4:
	445	mov r12, r3, lsl #16
	446	ldr r3, [r1, #-4]!
	447	orr r12, r12, r3, lsr #16
	448	str r12, [r0, #-4]!
	449	subs r2, r2, #4
	450	bge Lmemcpy_bsrcul2loop4
	451
	452	Lmemcpy_bsrcul2l4:
	453	add r1, r1, #2
	454	b Lmemcpy_bl4
	455
	456	Lmemcpy_bsrcul1:
	457	cmp r2, #0x0c
	458	blt Lmemcpy_bsrcul1loop4
	459	sub r2, r2, #0x0c
	460	stmdb sp!, {r4, r5, lr}
	461
	462	Lmemcpy_bsrcul1loop32:
	463	mov lr, r3, lsl #24
	464	ldmdb r1!, {r3-r5, r12}
	465	orr lr, lr, r12, lsr #8
	466	mov r12, r12, lsl #24
	467	orr r12, r12, r5, lsr #8
	468	mov r5, r5, lsl #24
	469	orr r5, r5, r4, lsr #8
	470	mov r4, r4, lsl #24
	471	orr r4, r4, r3, lsr #8
	472	stmdb r0!, {r4, r5, r12, lr}
	473	subs r2, r2, #0x10
	474	bge Lmemcpy_bsrcul1loop32
	475	ldmia sp!, {r4, r5, lr}
	476	adds r2, r2, #0x0c
	477	blt Lmemcpy_bsrcul1l4
	478
	479	Lmemcpy_bsrcul1loop4:
	480	mov r12, r3, lsl #24
	481	ldr r3, [r1, #-4]!
	482	orr r12, r12, r3, lsr #8
	483	str r12, [r0, #-4]!
	484	subs r2, r2, #4
	485	bge Lmemcpy_bsrcul1loop4
	486
	487	Lmemcpy_bsrcul1l4:
	488	add r1, r1, #1
	489	b Lmemcpy_bl4