/* picodrive.git: platform/gp2x/code940/memcpy.s */

/* $NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $ */

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* This was modified by Jay Monkman <jmonkman@smoothsmoothie.com> to
 * save and restore r12. This is necessary for RTEMS.
 */
/* #include <machine/asm.h> */

#define ENTRY(_LABEL) \
        .global _LABEL; _LABEL:

.globl memcpy
memcpy:

@ ENTRY(gp2x_memcpy)
        stmfd   sp!, {r0, r12, lr}
@       bl      _gp2x_memcpy
        bl      _memcpy
        ldmfd   sp!, {r0, r12, pc}



.globl memmove
memmove:

@ ENTRY(gp2x_memmove)
        stmfd   sp!, {r0, r12, lr}
@       bl      _gp2x_memcpy
        bl      _memcpy
        ldmfd   sp!, {r0, r12, pc}



/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code, e.g. Iron Maiden.
 *
 * For anyone attempting to understand it:
 *
 * The core code is implemented here, with simple stubs for memcpy()
 * and memmove().
 *
 * All local labels are prefixed with Lmemcpy_.
 * After the prefix, labels starting with f are used in the forward copy
 * code, while labels starting with b are used in the backwards copy code.
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code deal with the following situations, for both the
 * forward and the backwards copy:
 *   unaligned source address
 *   unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code uses LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function, along with
 * r0-r3, although r0-r2 have defined uses (dest, src, len) throughout.
 * Additional registers (r4, r5, r7-r10 and lr) are preserved prior to use.
 *
 * Apologies for the state of the comments ;-)
 */
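
/*
 * Illustrative usage sketch (not part of the original file): both entry
 * points follow the usual ARM calling convention, r0 = dest, r1 = src,
 * r2 = length in bytes, and return the original dest pointer in r0.
 * The buffer labels below are hypothetical.
 *
 *         ldr     r0, =dst_buf    @ destination address
 *         ldr     r1, =src_buf    @ source address
 *         mov     r2, #64         @ number of bytes to copy
 *         bl      memcpy          @ r0 == dst_buf again on return
 */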



_memcpy:

@ ENTRY(_gp2x_memcpy)
        /* Determine copy direction */
        cmp     r1, r0
        bcc     Lmemcpy_backwards

        moveq   r0, #0                  /* Quick abort for src == dest */
        moveq   pc, lr

        stmdb   sp!, {r0, lr}           /* memcpy() returns dest addr */
        subs    r2, r2, #4
        blt     Lmemcpy_fl4             /* less than 4 bytes */
        ands    r12, r0, #3
        bne     Lmemcpy_fdestul         /* oh unaligned destination addr */
        ands    r12, r1, #3
        bne     Lmemcpy_fsrcul          /* oh unaligned source addr */

Lmemcpy_ft8:
        /* We have aligned source and destination */
        subs    r2, r2, #8
        blt     Lmemcpy_fl12            /* less than 12 bytes (4 from above) */
        subs    r2, r2, #0x14
        blt     Lmemcpy_fl32            /* less than 32 bytes (12 from above) */
        stmdb   sp!, {r4, r7, r8, r9, r10}      /* borrow r4 and r7-r10 */

        /* blat 64 bytes at a time */
        /* XXX for really big copies perhaps we should use more registers */
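        /* Each LDM/STM below moves eight registers (32 bytes), so one
         * iteration of the loop copies 64 bytes. */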
Lmemcpy_floop32:
        ldmia   r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
        stmia   r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
        ldmia   r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
        stmia   r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
        subs    r2, r2, #0x40
        bge     Lmemcpy_floop32

        cmn     r2, #0x10
        ldmgeia r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
        stmgeia r0!, {r3, r4, r12, lr}
        subge   r2, r2, #0x10
        ldmia   sp!, {r4, r7, r8, r9, r10}      /* return r4 and r7-r10 */

Lmemcpy_fl32:
        adds    r2, r2, #0x14

        /* blat 12 bytes at a time */
Lmemcpy_floop12:
        ldmgeia r1!, {r3, r12, lr}
        stmgeia r0!, {r3, r12, lr}
        subges  r2, r2, #0x0c
        bge     Lmemcpy_floop12

Lmemcpy_fl12:
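        /* Fewer than 12 bytes remain: move one or two whole words if
         * possible, then fall through to the byte-at-a-time tail. */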
        adds    r2, r2, #8
        blt     Lmemcpy_fl4

        subs    r2, r2, #4
        ldrlt   r3, [r1], #4
        strlt   r3, [r0], #4
        ldmgeia r1!, {r3, r12}
        stmgeia r0!, {r3, r12}
        subge   r2, r2, #4

Lmemcpy_fl4:
        /* less than 4 bytes to go */
        adds    r2, r2, #4
        ldmeqia sp!, {r0, pc}           /* done */

        /* copy the crud byte at a time */
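        /* r2 is 1, 2 or 3 here: the first byte is always copied, the ge
         * and gt conditions copy the second and third bytes as needed. */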
        cmp     r2, #2
        ldrb    r3, [r1], #1
        strb    r3, [r0], #1
        ldrgeb  r3, [r1], #1
        strgeb  r3, [r0], #1
        ldrgtb  r3, [r1], #1
        strgtb  r3, [r0], #1
        ldmia   sp!, {r0, pc}

/* erg - unaligned destination */
Lmemcpy_fdestul:
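        /* On entry r12 = dest & 3; the rsb below turns it into the 1-3
         * bytes needed to word-align the destination, which are then
         * copied one at a time. */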
        rsb     r12, r12, #4
        cmp     r12, #2

        /* align destination with byte copies */
        ldrb    r3, [r1], #1
        strb    r3, [r0], #1
        ldrgeb  r3, [r1], #1
        strgeb  r3, [r0], #1
        ldrgtb  r3, [r1], #1
        strgtb  r3, [r0], #1
        subs    r2, r2, r12
        blt     Lmemcpy_fl4             /* less than 4 bytes */

        ands    r12, r1, #3
        beq     Lmemcpy_ft8             /* we have an aligned source */

/* erg - unaligned source */
/* This is where it gets nasty ... */
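/*
 * The source is misaligned by r12 = 1, 2 or 3 bytes.  r1 is rounded down
 * to a word boundary and only aligned word loads are performed; each output
 * word is then assembled from two neighbouring source words with LSR/LSL
 * shifts and ORR, assuming the little-endian byte order used on this
 * platform.  One of the three variants below is picked according to r12.
 */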
Lmemcpy_fsrcul:
        bic     r1, r1, #3
        ldr     lr, [r1], #4
        cmp     r12, #2
        bgt     Lmemcpy_fsrcul3
        beq     Lmemcpy_fsrcul2
        cmp     r2, #0x0c
        blt     Lmemcpy_fsrcul1loop4
        sub     r2, r2, #0x0c
        stmdb   sp!, {r4, r5}

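        /* Source offset 1: keep the top three bytes of the previous word
         * (lsr #8) and take one byte from the next word (lsl #24). */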
Lmemcpy_fsrcul1loop16:
        mov     r3, lr, lsr #8
        ldmia   r1!, {r4, r5, r12, lr}
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, r12, lsl #24
        mov     r12, r12, lsr #8
        orr     r12, r12, lr, lsl #24
        stmia   r0!, {r3-r5, r12}
        subs    r2, r2, #0x10
        bge     Lmemcpy_fsrcul1loop16
        ldmia   sp!, {r4, r5}
        adds    r2, r2, #0x0c
        blt     Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
        mov     r12, lr, lsr #8
        ldr     lr, [r1], #4
        orr     r12, r12, lr, lsl #24
        str     r12, [r0], #4
        subs    r2, r2, #4
        bge     Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
        sub     r1, r1, #3
        b       Lmemcpy_fl4

Lmemcpy_fsrcul2:
        cmp     r2, #0x0c
        blt     Lmemcpy_fsrcul2loop4
        sub     r2, r2, #0x0c
        stmdb   sp!, {r4, r5}

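        /* Source offset 2: merge a halfword from each of two neighbouring
         * words (lsr #16 / lsl #16). */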
Lmemcpy_fsrcul2loop16:
        mov     r3, lr, lsr #16
        ldmia   r1!, {r4, r5, r12, lr}
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r12, lsl #16
        mov     r12, r12, lsr #16
        orr     r12, r12, lr, lsl #16
        stmia   r0!, {r3-r5, r12}
        subs    r2, r2, #0x10
        bge     Lmemcpy_fsrcul2loop16
        ldmia   sp!, {r4, r5}
        adds    r2, r2, #0x0c
        blt     Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
        mov     r12, lr, lsr #16
        ldr     lr, [r1], #4
        orr     r12, r12, lr, lsl #16
        str     r12, [r0], #4
        subs    r2, r2, #4
        bge     Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
        sub     r1, r1, #2
        b       Lmemcpy_fl4

Lmemcpy_fsrcul3:
        cmp     r2, #0x0c
        blt     Lmemcpy_fsrcul3loop4
        sub     r2, r2, #0x0c
        stmdb   sp!, {r4, r5}

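        /* Source offset 3: keep one byte of the previous word (lsr #24)
         * and take three bytes from the next word (lsl #8). */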
Lmemcpy_fsrcul3loop16:
        mov     r3, lr, lsr #24
        ldmia   r1!, {r4, r5, r12, lr}
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, r12, lsl #8
        mov     r12, r12, lsr #24
        orr     r12, r12, lr, lsl #8
        stmia   r0!, {r3-r5, r12}
        subs    r2, r2, #0x10
        bge     Lmemcpy_fsrcul3loop16
        ldmia   sp!, {r4, r5}
        adds    r2, r2, #0x0c
        blt     Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
        mov     r12, lr, lsr #24
        ldr     lr, [r1], #4
        orr     r12, r12, lr, lsl #8
        str     r12, [r0], #4
        subs    r2, r2, #4
        bge     Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
        sub     r1, r1, #1
        b       Lmemcpy_fl4

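/*
 * Backwards (descending) copy, used when the destination lies above the
 * source so that overlapping regions are still copied correctly: both
 * pointers are moved one byte past the end and the copy runs downwards.
 */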
Lmemcpy_backwards:
        add     r1, r1, r2
        add     r0, r0, r2
        subs    r2, r2, #4
        blt     Lmemcpy_bl4             /* less than 4 bytes */
        ands    r12, r0, #3
        bne     Lmemcpy_bdestul         /* oh unaligned destination addr */
        ands    r12, r1, #3
        bne     Lmemcpy_bsrcul          /* oh unaligned source addr */

Lmemcpy_bt8:
        /* We have aligned source and destination */
        subs    r2, r2, #8
        blt     Lmemcpy_bl12            /* less than 12 bytes (4 from above) */
        stmdb   sp!, {r4, r7, r8, r9, r10, lr}
        subs    r2, r2, #0x14           /* less than 32 bytes (12 from above) */
        blt     Lmemcpy_bl32

        /* blat 64 bytes at a time */
        /* XXX for really big copies perhaps we should use more registers */
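        /* Descending counterpart of Lmemcpy_floop32: each ldmdb/stmdb pair
         * moves 32 bytes, so one iteration copies 64 bytes. */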
Lmemcpy_bloop32:
        ldmdb   r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
        stmdb   r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
        ldmdb   r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
        stmdb   r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
        subs    r2, r2, #0x40
        bge     Lmemcpy_bloop32

Lmemcpy_bl32:
        cmn     r2, #0x10
        ldmgedb r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
        stmgedb r0!, {r3, r4, r12, lr}
        subge   r2, r2, #0x10
        adds    r2, r2, #0x14
        ldmgedb r1!, {r3, r12, lr}      /* blat a remaining 12 bytes */
        stmgedb r0!, {r3, r12, lr}
        subge   r2, r2, #0x0c
        ldmia   sp!, {r4, r7, r8, r9, r10, lr}

Lmemcpy_bl12:
        adds    r2, r2, #8
        blt     Lmemcpy_bl4
        subs    r2, r2, #4
        ldrlt   r3, [r1, #-4]!
        strlt   r3, [r0, #-4]!
        ldmgedb r1!, {r3, r12}
        stmgedb r0!, {r3, r12}
        subge   r2, r2, #4

Lmemcpy_bl4:
        /* less than 4 bytes to go */
        adds    r2, r2, #4
        moveq   pc, lr                  /* done */

        /* copy the crud byte at a time */
        cmp     r2, #2
        ldrb    r3, [r1, #-1]!
        strb    r3, [r0, #-1]!
        ldrgeb  r3, [r1, #-1]!
        strgeb  r3, [r0, #-1]!
        ldrgtb  r3, [r1, #-1]!
        strgtb  r3, [r0, #-1]!
        mov     pc, lr

/* erg - unaligned destination */
Lmemcpy_bdestul:
        cmp     r12, #2

        /* align destination with byte copies */
        ldrb    r3, [r1, #-1]!
        strb    r3, [r0, #-1]!
        ldrgeb  r3, [r1, #-1]!
        strgeb  r3, [r0, #-1]!
        ldrgtb  r3, [r1, #-1]!
        strgtb  r3, [r0, #-1]!
        subs    r2, r2, r12
        blt     Lmemcpy_bl4             /* less than 4 bytes to go */
        ands    r12, r1, #3
        beq     Lmemcpy_bt8             /* we have an aligned source */

/* erg - unaligned source */
/* This is where it gets nasty ... */
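/*
 * Backwards variant of the unaligned-source code: r1 is rounded down to a
 * word boundary and output words are assembled from neighbouring aligned
 * words with LSL/LSR shifts and ORR, mirroring the three forward cases.
 */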
Lmemcpy_bsrcul:
        bic     r1, r1, #3
        ldr     r3, [r1, #0]
        cmp     r12, #2
        blt     Lmemcpy_bsrcul1
        beq     Lmemcpy_bsrcul2
        cmp     r2, #0x0c
        blt     Lmemcpy_bsrcul3loop4
        sub     r2, r2, #0x0c
        stmdb   sp!, {r4, r5, lr}

Lmemcpy_bsrcul3loop16:
        mov     lr, r3, lsl #8
        ldmdb   r1!, {r3-r5, r12}
        orr     lr, lr, r12, lsr #24
        mov     r12, r12, lsl #8
        orr     r12, r12, r5, lsr #24
        mov     r5, r5, lsl #8
        orr     r5, r5, r4, lsr #24
        mov     r4, r4, lsl #8
        orr     r4, r4, r3, lsr #24
        stmdb   r0!, {r4, r5, r12, lr}
        subs    r2, r2, #0x10
        bge     Lmemcpy_bsrcul3loop16
        ldmia   sp!, {r4, r5, lr}
        adds    r2, r2, #0x0c
        blt     Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
        mov     r12, r3, lsl #8
        ldr     r3, [r1, #-4]!
        orr     r12, r12, r3, lsr #24
        str     r12, [r0, #-4]!
        subs    r2, r2, #4
        bge     Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
        add     r1, r1, #3
        b       Lmemcpy_bl4

Lmemcpy_bsrcul2:
        cmp     r2, #0x0c
        blt     Lmemcpy_bsrcul2loop4
        sub     r2, r2, #0x0c
        stmdb   sp!, {r4, r5, lr}

Lmemcpy_bsrcul2loop16:
        mov     lr, r3, lsl #16
        ldmdb   r1!, {r3-r5, r12}
        orr     lr, lr, r12, lsr #16
        mov     r12, r12, lsl #16
        orr     r12, r12, r5, lsr #16
        mov     r5, r5, lsl #16
        orr     r5, r5, r4, lsr #16
        mov     r4, r4, lsl #16
        orr     r4, r4, r3, lsr #16
        stmdb   r0!, {r4, r5, r12, lr}
        subs    r2, r2, #0x10
        bge     Lmemcpy_bsrcul2loop16
        ldmia   sp!, {r4, r5, lr}
        adds    r2, r2, #0x0c
        blt     Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
        mov     r12, r3, lsl #16
        ldr     r3, [r1, #-4]!
        orr     r12, r12, r3, lsr #16
        str     r12, [r0, #-4]!
        subs    r2, r2, #4
        bge     Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
        add     r1, r1, #2
        b       Lmemcpy_bl4

Lmemcpy_bsrcul1:
        cmp     r2, #0x0c
        blt     Lmemcpy_bsrcul1loop4
        sub     r2, r2, #0x0c
        stmdb   sp!, {r4, r5, lr}

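        /* Note: despite its name, Lmemcpy_bsrcul1loop32 moves 16 bytes per
         * iteration, just like the other loop16 variants above. */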
Lmemcpy_bsrcul1loop32:
        mov     lr, r3, lsl #24
        ldmdb   r1!, {r3-r5, r12}
        orr     lr, lr, r12, lsr #8
        mov     r12, r12, lsl #24
        orr     r12, r12, r5, lsr #8
        mov     r5, r5, lsl #24
        orr     r5, r5, r4, lsr #8
        mov     r4, r4, lsl #24
        orr     r4, r4, r3, lsr #8
        stmdb   r0!, {r4, r5, r12, lr}
        subs    r2, r2, #0x10
        bge     Lmemcpy_bsrcul1loop32
        ldmia   sp!, {r4, r5, lr}
        adds    r2, r2, #0x0c
        blt     Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
        mov     r12, r3, lsl #24
        ldr     r3, [r1, #-4]!
        orr     r12, r12, r3, lsr #8
        str     r12, [r0, #-4]!
        subs    r2, r2, #4
        bge     Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
        add     r1, r1, #1
        b       Lmemcpy_bl4