/*	$NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $	*/
/*
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/* This was modified by Jay Monkman <jmonkman@smoothsmoothie.com> to
 * save and restore r12. This is necessary for RTEMS.
 */

/* #include <machine/asm.h> */
#define ENTRY(_LABEL) \
	.global _LABEL; _LABEL:
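/*
 * ENTRY() would normally come from <machine/asm.h>; the minimal local
 * definition above just exports and defines the label, so that e.g.
 * ENTRY(memcpy) expands to ".global memcpy; memcpy:".
 */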
	stmfd	sp!, {r0, r12, lr}
	ldmfd	sp!, {r0, r12, pc}

	stmfd	sp!, {r0, r12, lr}
	ldmfd	sp!, {r0, r12, pc}
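/*
 * Each of the two stubs above saves {r0, r12, lr} around the copy core
 * and restores them on the way out: r0 so the caller gets the
 * destination address back as the return value, r12 because RTEMS
 * requires it preserved (see the note above), and lr popped straight
 * into pc to return.
 */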
/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy(),
 * memmove() and bcopy().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix, labels starting with 'f' are used in the forward
 * copy code while labels starting with 'b' are used in the backwards
 * copy code.
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy:
 *	unaligned source address
 *	unaligned destination address
 * Separate copy routines are used to produce an optimised result for
 * each case.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3, although r0-r2 have defined uses, i.e. dest, src and len,
 * throughout.
 * Additional registers are preserved prior to use, i.e. r4, r5, r7-r10
 * & lr.
 *
 * Apologies for the state of the comments ;-)
 */
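/*
 * Purely illustrative sketch (kept in this comment, not assembled): the
 * behaviour implemented below, expressed as C.  The function name is a
 * placeholder for the copy core; the real code additionally byte-aligns
 * the destination, moves large blocks with LDM/STM and has dedicated
 * paths for each source misalignment.
 *
 *	void *_memcpy(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if (s < d) {			// copy backwards so that
 *			d += len;		// overlapping regions with
 *			s += len;		// dst above src are safe
 *			while (len--)
 *				*--d = *--s;
 *		} else {			// copy forwards
 *			while (len--)
 *				*d++ = *s++;
 *		}
 *		return dst;			// memcpy()/memmove() return dest
 *	}
 */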
@ ENTRY(_gp2x_memcpy)
	/* Determine copy direction */
	bcc	Lmemcpy_backwards

	moveq	r0, #0			/* Quick abort for len=0 */

	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

	/* We have aligned source and destination */
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4, r7, r8, r9, r10}	/* borrow r4 and r7-r10 */

	/* blat 64 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
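	/*
	 * Each ldmia/stmia pair below moves eight registers, i.e. 32
	 * bytes, so one trip round the loop copies 64 bytes; r4 and
	 * r7-r10 were saved above so that, together with r3, r12 and lr,
	 * eight registers are free for data.
	 */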
	ldmia	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmia	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	ldmia	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmia	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	ldmia	sp!, {r4, r7, r8, r9, r10}	/* return r4 and r7-r10 */
	/* blat 12 bytes at a time */
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}

	/* less than 4 bytes to go */
	ldmeqia	sp!, {r0, pc}		/* done - pop dest into r0 and return */

	/* copy the crud byte at a time */

	/* erg - unaligned destination */

	/* align destination with byte copies */
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
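	/*
	 * Misaligned source with an aligned destination: only whole,
	 * word-aligned loads are issued, and each output word is built
	 * from the tail of one input word and the head of the next.  On
	 * this little-endian core, with the source one byte past a word
	 * boundary, that is
	 *
	 *	out = (prev >> 8) | (next << 24)
	 *
	 * e.g. prev = 0x44332211 and next = 0x88776655 give
	 * out = 0x00443322 | 0x55000000 = 0x55443322, i.e. the four
	 * bytes 22 33 44 55 that start one byte into prev.
	 */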
	blt	Lmemcpy_fsrcul1loop4
Lmemcpy_fsrcul1loop16:
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	orr	r4, r4, r5, lsl #24
	orr	r5, r5, r12, lsl #24
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	bge	Lmemcpy_fsrcul1loop16
	blt	Lmemcpy_fsrcul1l4
Lmemcpy_fsrcul1loop4:
	orr	r12, r12, lr, lsl #24
	bge	Lmemcpy_fsrcul1loop4
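	/* Source two bytes past a word boundary: same scheme, recombining
	 * half-words with lsr #16 / lsl #16. */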
	blt	Lmemcpy_fsrcul2loop4
Lmemcpy_fsrcul2loop16:
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	orr	r4, r4, r5, lsl #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	bge	Lmemcpy_fsrcul2loop16
	blt	Lmemcpy_fsrcul2l4
Lmemcpy_fsrcul2loop4:
	orr	r12, r12, lr, lsl #16
	bge	Lmemcpy_fsrcul2loop4
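	/* Source three bytes past a word boundary: same scheme again,
	 * this time with lsr #24 / lsl #8. */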
	blt	Lmemcpy_fsrcul3loop4
Lmemcpy_fsrcul3loop16:
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	orr	r4, r4, r5, lsl #8
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	bge	Lmemcpy_fsrcul3loop16
	blt	Lmemcpy_fsrcul3l4
Lmemcpy_fsrcul3loop4:
	orr	r12, r12, lr, lsl #8
	bge	Lmemcpy_fsrcul3loop4
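/*
 * Backwards copy: taken when the destination lies above the source, so
 * overlapping regions are still copied correctly.  It mirrors the
 * forward code but works down from the ends of the buffers with
 * ldmdb/stmdb and pre-decremented byte accesses; labels use 'b' in
 * place of 'f'.
 */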
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

	/* We have aligned source and destination */
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, r7, r8, r9, r10, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */

	/* blat 64 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
	ldmdb	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmdb	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	ldmdb	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmdb	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	ldmia	sp!, {r4, r7, r8, r9, r10, lr}
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}

	/* less than 4 bytes to go */
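	/*
	 * Nothing was pushed on entry to the backwards path: r0 has been
	 * stepped back down from dst+len as the copy proceeded, so once
	 * the count hits zero it again holds the original dest and the
	 * routine can return directly.
	 */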
	moveq	pc, lr			/* done */
	/* copy the crud byte at a time */
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!

	/* erg - unaligned destination */

	/* align destination with byte copies */
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
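	/*
	 * Misaligned source on the way down: words are loaded from below
	 * the current position and each output word is built from the
	 * head of one input word and the tail of the word below it, so
	 * the shift pairs are the forward ones with lsl and lsr swapped
	 * (e.g. lsl #8 / lsr #24 when the source is three bytes past a
	 * word boundary).
	 */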
	blt	Lmemcpy_bsrcul3loop4
	stmdb	sp!, {r4, r5, lr}
Lmemcpy_bsrcul3loop16:
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	orr	r12, r12, r5, lsr #24
	orr	r5, r5, r4, lsr #24
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	blt	Lmemcpy_bsrcul3l4
Lmemcpy_bsrcul3loop4:
	orr	r12, r12, r3, lsr #24
	bge	Lmemcpy_bsrcul3loop4
	blt	Lmemcpy_bsrcul2loop4
	stmdb	sp!, {r4, r5, lr}
Lmemcpy_bsrcul2loop16:
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	orr	r5, r5, r4, lsr #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	blt	Lmemcpy_bsrcul2l4
Lmemcpy_bsrcul2loop4:
	orr	r12, r12, r3, lsr #16
	bge	Lmemcpy_bsrcul2loop4
	blt	Lmemcpy_bsrcul1loop4
	stmdb	sp!, {r4, r5, lr}
Lmemcpy_bsrcul1loop32:
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	orr	r5, r5, r4, lsr #8
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	bge	Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	blt	Lmemcpy_bsrcul1l4
Lmemcpy_bsrcul1loop4:
	orr	r12, r12, r3, lsr #8
	bge	Lmemcpy_bsrcul1loop4