full ym2612 save/load for 940
[picodrive.git] / platform / gp2x / code940 / memcpy.s
CommitLineData
42c7b147 1/* $NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $ */
2
3/*-
4* Copyright (c) 1997 The NetBSD Foundation, Inc.
5* All rights reserved.
6*
7* This code is derived from software contributed to The NetBSD Foundation
8* by Neil A. Carson and Mark Brinicombe
9*
10* Redistribution and use in source and binary forms, with or without
11* modification, are permitted provided that the following conditions
12* are met:
13* 1. Redistributions of source code must retain the above copyright
14* notice, this list of conditions and the following disclaimer.
15* 2. Redistributions in binary form must reproduce the above copyright
16* notice, this list of conditions and the following disclaimer in the
17* documentation and/or other materials provided with the distribution.
18* 3. All advertising materials mentioning features or use of this software
19* must display the following acknowledgement:
20* This product includes software developed by the NetBSD
21* Foundation, Inc. and its contributors.
22* 4. Neither the name of The NetBSD Foundation nor the names of its
23* contributors may be used to endorse or promote products derived
24* from this software without specific prior written permission.
25*
26* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27* ``AS IS\'\' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36* POSSIBILITY OF SUCH DAMAGE.
37*/
38
42c7b147 39/* #include <machine/asm.h>*/
40
42c7b147 41.globl memcpy
e4fb433c 42.globl _memcpy
42c7b147 43memcpy:
44
e4fb433c 45stmfd sp!, {r0, lr}
42c7b147 46bl _memcpy
e4fb433c 47ldmfd sp!, {r0, pc}
42c7b147 48
49
50.globl memmove
51memmove:
52
e4fb433c 53stmfd sp!, {r0, lr}
42c7b147 54bl _memcpy
e4fb433c 55ldmfd sp!, {r0, pc}
42c7b147 56
57
58
59/*
60* This is one fun bit of code ...
61* Some easy listening music is suggested while trying to understand this
62* code e.g. Iron Maiden
63*
64* For anyone attempting to understand it :
65*
 * The core code is implemented here (_memcpy) with simple stubs for
 * memcpy() and memmove(); this copy of the file carries no bcopy() stub.
68*
69* All local labels are prefixed with Lmemcpy_
70* Following the prefix a label starting f is used in the forward copy code
71* while a label using b is used in the backwards copy code
72* The source and destination addresses determine whether a forward or
73* backward copy is performed.
74* Separate bits of code are used to deal with the following situations
75* for both the forward and backwards copy.
76* unaligned source address
77* unaligned destination address
78* Separate copy routines are used to produce an optimised result for each
79* of these cases.
80* The copy code will use LDM/STM instructions to copy up to 32 bytes at
81* a time where possible.
82*
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3, although r0-r2 have defined uses throughout, i.e. dest, src, len.
 * Additional registers are preserved prior to use, i.e. r4, r5, r7-r10 & lr
86*
87* Apologies for the state of the comments;-)
88*/
89
90
91
92_memcpy:
93
42c7b147 94/* Determine copy direction */
95cmp r1, r0
96bcc Lmemcpy_backwards
97
98moveq r0, #0 /* Quick abort for len=0 */
99moveq pc, lr
100
101stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
102subs r2, r2, #4
103blt Lmemcpy_fl4 /* less than 4 bytes */
104ands r12, r0, #3
105bne Lmemcpy_fdestul /* oh unaligned destination addr */
106ands r12, r1, #3
107bne Lmemcpy_fsrcul /* oh unaligned source addr */
108
109Lmemcpy_ft8:
110/* We have aligned source and destination */
111subs r2, r2, #8
112blt Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
113subs r2, r2, #0x14
114blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
115stmdb sp!, {r4, r7, r8, r9, r10} /* borrow r4 */
116
117/* blat 64 bytes at a time */
118/* XXX for really big copies perhaps we should use more registers */
119Lmemcpy_floop32:
120ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
121stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
122ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
123stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
124subs r2, r2, #0x40
125bge Lmemcpy_floop32
126
127cmn r2, #0x10
128ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
129stmgeia r0!, {r3, r4, r12, lr}
130subge r2, r2, #0x10
131ldmia sp!, {r4, r7, r8, r9, r10} /* return r4 */
132
133Lmemcpy_fl32:
134adds r2, r2, #0x14
135
136/* blat 12 bytes at a time */
137Lmemcpy_floop12:
138ldmgeia r1!, {r3, r12, lr}
139stmgeia r0!, {r3, r12, lr}
140subges r2, r2, #0x0c
141bge Lmemcpy_floop12
142
143Lmemcpy_fl12:
144adds r2, r2, #8
145blt Lmemcpy_fl4
146
147subs r2, r2, #4
148ldrlt r3, [r1], #4
149strlt r3, [r0], #4
150ldmgeia r1!, {r3, r12}
151stmgeia r0!, {r3, r12}
152subge r2, r2, #4
153
154Lmemcpy_fl4:
155/* less than 4 bytes to go */
156adds r2, r2, #4
157ldmeqia sp!, {r0, pc} /* done */
158
159/* copy the crud byte at a time */
160cmp r2, #2
161ldrb r3, [r1], #1
162strb r3, [r0], #1
163ldrgeb r3, [r1], #1
164strgeb r3, [r0], #1
165ldrgtb r3, [r1], #1
166strgtb r3, [r0], #1
167ldmia sp!, {r0, pc}
168
169/* erg - unaligned destination */
170Lmemcpy_fdestul:
171rsb r12, r12, #4
172cmp r12, #2
173
174/* align destination with byte copies */
175ldrb r3, [r1], #1
176strb r3, [r0], #1
177ldrgeb r3, [r1], #1
178strgeb r3, [r0], #1
179ldrgtb r3, [r1], #1
180strgtb r3, [r0], #1
181subs r2, r2, r12
182blt Lmemcpy_fl4 /* less the 4 bytes */
183
184ands r12, r1, #3
185beq Lmemcpy_ft8 /* we have an aligned source */
186
187/* erg - unaligned source */
188/* This is where it gets nasty ... */
189Lmemcpy_fsrcul:
190bic r1, r1, #3
191ldr lr, [r1], #4
192cmp r12, #2
193bgt Lmemcpy_fsrcul3
194beq Lmemcpy_fsrcul2
195cmp r2, #0x0c
196blt Lmemcpy_fsrcul1loop4
197sub r2, r2, #0x0c
198stmdb sp!, {r4, r5}
199
200Lmemcpy_fsrcul1loop16:
201mov r3, lr, lsr #8
202ldmia r1!, {r4, r5, r12, lr}
203orr r3, r3, r4, lsl #24
204mov r4, r4, lsr #8
205orr r4, r4, r5, lsl #24
206mov r5, r5, lsr #8
207orr r5, r5, r12, lsl #24
208mov r12, r12, lsr #8
209orr r12, r12, lr, lsl #24
210stmia r0!, {r3-r5, r12}
211subs r2, r2, #0x10
212bge Lmemcpy_fsrcul1loop16
213ldmia sp!, {r4, r5}
214adds r2, r2, #0x0c
215blt Lmemcpy_fsrcul1l4
216
217Lmemcpy_fsrcul1loop4:
218mov r12, lr, lsr #8
219ldr lr, [r1], #4
220orr r12, r12, lr, lsl #24
221str r12, [r0], #4
222subs r2, r2, #4
223bge Lmemcpy_fsrcul1loop4
224
225Lmemcpy_fsrcul1l4:
226sub r1, r1, #3
227b Lmemcpy_fl4
228
229Lmemcpy_fsrcul2:
230cmp r2, #0x0c
231blt Lmemcpy_fsrcul2loop4
232sub r2, r2, #0x0c
233stmdb sp!, {r4, r5}
234
235Lmemcpy_fsrcul2loop16:
236mov r3, lr, lsr #16
237ldmia r1!, {r4, r5, r12, lr}
238orr r3, r3, r4, lsl #16
239mov r4, r4, lsr #16
240orr r4, r4, r5, lsl #16
241mov r5, r5, lsr #16
242orr r5, r5, r12, lsl #16
243mov r12, r12, lsr #16
244orr r12, r12, lr, lsl #16
245stmia r0!, {r3-r5, r12}
246subs r2, r2, #0x10
247bge Lmemcpy_fsrcul2loop16
248ldmia sp!, {r4, r5}
249adds r2, r2, #0x0c
250blt Lmemcpy_fsrcul2l4
251
252Lmemcpy_fsrcul2loop4:
253mov r12, lr, lsr #16
254ldr lr, [r1], #4
255orr r12, r12, lr, lsl #16
256str r12, [r0], #4
257subs r2, r2, #4
258bge Lmemcpy_fsrcul2loop4
259
260Lmemcpy_fsrcul2l4:
261sub r1, r1, #2
262b Lmemcpy_fl4
263
264Lmemcpy_fsrcul3:
265cmp r2, #0x0c
266blt Lmemcpy_fsrcul3loop4
267sub r2, r2, #0x0c
268stmdb sp!, {r4, r5}
269
270Lmemcpy_fsrcul3loop16:
271mov r3, lr, lsr #24
272ldmia r1!, {r4, r5, r12, lr}
273orr r3, r3, r4, lsl #8
274mov r4, r4, lsr #24
275orr r4, r4, r5, lsl #8
276mov r5, r5, lsr #24
277orr r5, r5, r12, lsl #8
278mov r12, r12, lsr #24
279orr r12, r12, lr, lsl #8
280stmia r0!, {r3-r5, r12}
281subs r2, r2, #0x10
282bge Lmemcpy_fsrcul3loop16
283ldmia sp!, {r4, r5}
284adds r2, r2, #0x0c
285blt Lmemcpy_fsrcul3l4
286
287Lmemcpy_fsrcul3loop4:
288mov r12, lr, lsr #24
289ldr lr, [r1], #4
290orr r12, r12, lr, lsl #8
291str r12, [r0], #4
292subs r2, r2, #4
293bge Lmemcpy_fsrcul3loop4
294
295Lmemcpy_fsrcul3l4:
296sub r1, r1, #1
297b Lmemcpy_fl4
298
299Lmemcpy_backwards:
300add r1, r1, r2
301add r0, r0, r2
302subs r2, r2, #4
303blt Lmemcpy_bl4 /* less than 4 bytes */
304ands r12, r0, #3
305bne Lmemcpy_bdestul /* oh unaligned destination addr */
306ands r12, r1, #3
307bne Lmemcpy_bsrcul /* oh unaligned source addr */
308
309Lmemcpy_bt8:
310/* We have aligned source and destination */
311subs r2, r2, #8
312blt Lmemcpy_bl12 /* less than 12 bytes (4 from above) */
313stmdb sp!, {r4, r7, r8, r9, r10, lr}
314subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
315blt Lmemcpy_bl32
316
317/* blat 64 bytes at a time */
318/* XXX for really big copies perhaps we should use more registers */
319Lmemcpy_bloop32:
320ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
321stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
322ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
323stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
324subs r2, r2, #0x40
325bge Lmemcpy_bloop32
326
327Lmemcpy_bl32:
328cmn r2, #0x10
329ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
330stmgedb r0!, {r3, r4, r12, lr}
331subge r2, r2, #0x10
332adds r2, r2, #0x14
333ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
334stmgedb r0!, {r3, r12, lr}
335subge r2, r2, #0x0c
336ldmia sp!, {r4, r7, r8, r9, r10, lr}
337
338Lmemcpy_bl12:
339adds r2, r2, #8
340blt Lmemcpy_bl4
341subs r2, r2, #4
342ldrlt r3, [r1, #-4]!
343strlt r3, [r0, #-4]!
344ldmgedb r1!, {r3, r12}
345stmgedb r0!, {r3, r12}
346subge r2, r2, #4
347
348Lmemcpy_bl4:
349/* less than 4 bytes to go */
350adds r2, r2, #4
351moveq pc, lr /* done */
352
353/* copy the crud byte at a time */
354cmp r2, #2
355ldrb r3, [r1, #-1]!
356strb r3, [r0, #-1]!
357ldrgeb r3, [r1, #-1]!
358strgeb r3, [r0, #-1]!
359ldrgtb r3, [r1, #-1]!
360strgtb r3, [r0, #-1]!
361mov pc, lr
362
363/* erg - unaligned destination */
364Lmemcpy_bdestul:
365cmp r12, #2
366
367/* align destination with byte copies */
368ldrb r3, [r1, #-1]!
369strb r3, [r0, #-1]!
370ldrgeb r3, [r1, #-1]!
371strgeb r3, [r0, #-1]!
372ldrgtb r3, [r1, #-1]!
373strgtb r3, [r0, #-1]!
374subs r2, r2, r12
375blt Lmemcpy_bl4 /* less than 4 bytes to go */
376ands r12, r1, #3
377beq Lmemcpy_bt8 /* we have an aligned source */
378
379/* erg - unaligned source */
380/* This is where it gets nasty ... */
381Lmemcpy_bsrcul:
382bic r1, r1, #3
383ldr r3, [r1, #0]
384cmp r12, #2
385blt Lmemcpy_bsrcul1
386beq Lmemcpy_bsrcul2
387cmp r2, #0x0c
388blt Lmemcpy_bsrcul3loop4
389sub r2, r2, #0x0c
390stmdb sp!, {r4, r5, lr}
391
392Lmemcpy_bsrcul3loop16:
393mov lr, r3, lsl #8
394ldmdb r1!, {r3-r5, r12}
395orr lr, lr, r12, lsr #24
396mov r12, r12, lsl #8
397orr r12, r12, r5, lsr #24
398mov r5, r5, lsl #8
399orr r5, r5, r4, lsr #24
400mov r4, r4, lsl #8
401orr r4, r4, r3, lsr #24
402stmdb r0!, {r4, r5, r12, lr}
403subs r2, r2, #0x10
404bge Lmemcpy_bsrcul3loop16
405ldmia sp!, {r4, r5, lr}
406adds r2, r2, #0x0c
407blt Lmemcpy_bsrcul3l4
408
409Lmemcpy_bsrcul3loop4:
410mov r12, r3, lsl #8
411ldr r3, [r1, #-4]!
412orr r12, r12, r3, lsr #24
413str r12, [r0, #-4]!
414subs r2, r2, #4
415bge Lmemcpy_bsrcul3loop4
416
417Lmemcpy_bsrcul3l4:
418add r1, r1, #3
419b Lmemcpy_bl4
420
421Lmemcpy_bsrcul2:
422cmp r2, #0x0c
423blt Lmemcpy_bsrcul2loop4
424sub r2, r2, #0x0c
425stmdb sp!, {r4, r5, lr}
426
427Lmemcpy_bsrcul2loop16:
428mov lr, r3, lsl #16
429ldmdb r1!, {r3-r5, r12}
430orr lr, lr, r12, lsr #16
431mov r12, r12, lsl #16
432orr r12, r12, r5, lsr #16
433mov r5, r5, lsl #16
434orr r5, r5, r4, lsr #16
435mov r4, r4, lsl #16
436orr r4, r4, r3, lsr #16
437stmdb r0!, {r4, r5, r12, lr}
438subs r2, r2, #0x10
439bge Lmemcpy_bsrcul2loop16
440ldmia sp!, {r4, r5, lr}
441adds r2, r2, #0x0c
442blt Lmemcpy_bsrcul2l4
443
444Lmemcpy_bsrcul2loop4:
445mov r12, r3, lsl #16
446ldr r3, [r1, #-4]!
447orr r12, r12, r3, lsr #16
448str r12, [r0, #-4]!
449subs r2, r2, #4
450bge Lmemcpy_bsrcul2loop4
451
452Lmemcpy_bsrcul2l4:
453add r1, r1, #2
454b Lmemcpy_bl4
455
456Lmemcpy_bsrcul1:
457cmp r2, #0x0c
458blt Lmemcpy_bsrcul1loop4
459sub r2, r2, #0x0c
460stmdb sp!, {r4, r5, lr}
461
462Lmemcpy_bsrcul1loop32:
463mov lr, r3, lsl #24
464ldmdb r1!, {r3-r5, r12}
465orr lr, lr, r12, lsr #8
466mov r12, r12, lsl #24
467orr r12, r12, r5, lsr #8
468mov r5, r5, lsl #24
469orr r5, r5, r4, lsr #8
470mov r4, r4, lsl #24
471orr r4, r4, r3, lsr #8
472stmdb r0!, {r4, r5, r12, lr}
473subs r2, r2, #0x10
474bge Lmemcpy_bsrcul1loop32
475ldmia sp!, {r4, r5, lr}
476adds r2, r2, #0x0c
477blt Lmemcpy_bsrcul1l4
478
479Lmemcpy_bsrcul1loop4:
480mov r12, r3, lsl #24
481ldr r3, [r1, #-4]!
482orr r12, r12, r3, lsr #8
483str r12, [r0, #-4]!
484subs r2, r2, #4
485bge Lmemcpy_bsrcul1loop4
486
487Lmemcpy_bsrcul1l4:
488add r1, r1, #1
489b Lmemcpy_bl4