/*	$NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* #include <machine/asm.h> */

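/*
 * memcpy() and memmove() are thin wrappers around the core routine
 * _memcpy below.  Each stacks the original destination pointer (r0)
 * together with lr, calls _memcpy (which is free to trash r0-r3 and
 * ip) and then reloads r0 on return, so both entry points hand back
 * the destination address as the C library interface requires.
 */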
	.globl	memcpy
	.globl	_memcpy
memcpy:
	stmfd	sp!, {r0, lr}
	bl	_memcpy
	ldmfd	sp!, {r0, pc}


	.globl	memmove
memmove:
	stmfd	sp!, {r0, lr}
	bl	_memcpy
	ldmfd	sp!, {r0, pc}


/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy()
 * and memmove().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix, labels starting with 'f' are used in the forward
 * copy code while labels starting with 'b' are used in the backwards
 * copy code.
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 *	unaligned source address
 *	unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3, although r0-r2 have defined uses, i.e. dest, src and len,
 * throughout.
 * Additional registers are preserved prior to use, i.e. r4, r5, r7-r10
 * & lr.
 *
 * Apologies for the state of the comments ;-)
 */
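
/*
 * For orientation only, the overall strategy roughly corresponds to the
 * C sketch below.  This is an illustration, not part of the build; the
 * function name is invented, and the 32-byte LDM/STM bursts as well as
 * the shift/OR word reassembly used for unaligned sources are reduced
 * to plain word and byte copies.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void *sketch_memcpy(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if ((uintptr_t)s >= (uintptr_t)d) {	// forward copy
 *			while (len && ((uintptr_t)d & 3)) {	// align dest
 *				*d++ = *s++;
 *				len--;
 *			}
 *			if (((uintptr_t)s & 3) == 0)	// src aligned too
 *				for (; len >= 4; len -= 4, d += 4, s += 4)
 *					*(uint32_t *)(void *)d =
 *					    *(const uint32_t *)(const void *)s;
 *			while (len--)			// trailing bytes
 *				*d++ = *s++;
 *		} else {				// backward copy
 *			d += len;			// copy downwards so
 *			s += len;			// overlap is safe
 *			while (len--)
 *				*--d = *--s;
 *		}
 *		return dst;
 *	}
 */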


_memcpy:
	/* Determine copy direction */
	cmp	r1, r0
	bcc	Lmemcpy_backwards

	moveq	r0, #0			/* Quick abort for src == dst */
	moveq	pc, lr

	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
	subs	r2, r2, #4
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4, r7, r8, r9, r10}	/* borrow r4 and r7-r10 */

	/* blat 64 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmia	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	ldmia	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmia	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	subs	r2, r2, #0x40
	bge	Lmemcpy_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4, r7, r8, r9, r10}	/* restore r4 and r7-r10 */

Lmemcpy_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8
	blt	Lmemcpy_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
Lmemcpy_fdestul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
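	/*
	 * All loads below stay word aligned: the source pointer is rounded
	 * down to a word boundary and one word is pre-loaded into lr.  Each
	 * output word is then built by shifting the previously loaded word
	 * right by 8, 16 or 24 bits (depending on the misalignment in r12)
	 * and OR-ing in the next word shifted left by the complementary
	 * amount.
	 */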
Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	Lmemcpy_fsrcul3
	beq	Lmemcpy_fsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3
	b	Lmemcpy_fl4

Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2
	b	Lmemcpy_fl4

Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1
	b	Lmemcpy_fl4

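/*
 * Backwards copy: taken when the destination lies above the source, so
 * an overlapping region is still copied correctly.  Both pointers are
 * advanced to the end of their buffers and the same alignment stages as
 * the forward path are applied, working downwards with DB addressing.
 */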
Lmemcpy_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, r7, r8, r9, r10, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	Lmemcpy_bl32

	/* blat 64 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmdb	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	ldmdb	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmdb	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	subs	r2, r2, #0x40
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, r7, r8, r9, r10, lr}

Lmemcpy_bl12:
	adds	r2, r2, #8
	blt	Lmemcpy_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	moveq	pc, lr			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	mov	pc, lr

	/* erg - unaligned destination */
Lmemcpy_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
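	/*
	 * Mirror image of the forward unaligned-source code: the source is
	 * rounded down to a word boundary, the word at the new r1 is
	 * pre-loaded into r3, and each output word is assembled by shifting
	 * the previously loaded word left and OR-ing in the next
	 * (lower-addressed) word shifted right, again by 8, 16 or 24 bits
	 * according to the misalignment in r12.
	 */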
Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	Lmemcpy_bsrcul1
	beq	Lmemcpy_bsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	add	r1, r1, #3
	b	Lmemcpy_bl4

Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	add	r1, r1, #2
	b	Lmemcpy_bl4

Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

Lmemcpy_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	add	r1, r1, #1
	b	Lmemcpy_bl4