1 /* $NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $ */
4 * Copyright (c) 1997 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Neil A. Carson and Mark Brinicombe
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
39 /* #include <machine/asm.h>*/
60 * This is one fun bit of code ...
61 * Some easy listening music is suggested while trying to understand this
62 * code e.g. Iron Maiden
64 * For anyone attempting to understand it :
66 * The core code is implemented here with simple stubs for memcpy(),
67 * memmove() and bcopy().
69 * All local labels are prefixed with Lmemcpy_
70 * Following the prefix a label starting f is used in the forward copy code
71 * while a label using b is used in the backwards copy code
72 * The source and destination addresses determine whether a forward or
73 * backward copy is performed.
74 * Separate bits of code are used to deal with the following situations
75 * for both the forward and backwards copy.
76 * unaligned source address
77 * unaligned destination address
78 * Separate copy routines are used to produce an optimised result for each of these cases.
80 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
81 * a time where possible.
83 * Note: r12 (aka ip) can be trashed during the function along with
84 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
85 * Additional registers are preserved prior to use i.e. r4, r5, r7-r10 & lr
87 * Apologies for the state of the comments;-)
/*
 * Forward copy: entry checks and the word-aligned fast path.
 *
 * As used by the visible instructions, r1 is the address read from and
 * r0 the address written to; both advance through load/store-multiple
 * write-back.
 * NOTE(review): the compares/subtractions that set the condition flags
 * for the conditional instructions below are not visible in this excerpt;
 * confirm them against the complete file.
 */
94 /* Determine copy direction */
/*
 * NOTE(review): this zeroes r0 when Z is set by an instruction that is not
 * shown here; confirm which condition it really tests and that a zero
 * result is what callers expect.
 */
98 moveq r0, #0 /* Quick abort for len=0 */
/* Saving lr here frees it for use as a data register in the copy loops. */
101 stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
103 blt Lmemcpy_fl4 /* less than 4 bytes */
105 bne Lmemcpy_fdestul /* oh unaligned destination addr */
107 bne Lmemcpy_fsrcul /* oh unaligned source addr */
110 /* We have aligned source and destination */
112 blt Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
114 blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
115 stmdb sp!, {r4, r7, r8, r9, r10} /* borrow r4 and r7-r10 */
117 /* blat 32 bytes at a time */
118 /* XXX for really big copies perhaps we should use more registers */
/* Eight registers per transfer = 32 bytes; r1 and r0 are post-incremented. */
120 ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
121 stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
126 ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
127 stmgeia r0!, {r3, r4, r12, lr}
129 ldmia sp!, {r4, r7, r8, r9, r10} /* restore r4 and r7-r10 */
134 /* blat 12 bytes at a time */
136 ldmgeia r1!, {r3, r12, lr}
137 stmgeia r0!, {r3, r12, lr}
/* Two-register transfer: moves 8 bytes when the GE condition holds. */
148 ldmgeia r1!, {r3, r12}
149 stmgeia r0!, {r3, r12}
153 /* less than 4 bytes to go */
/* Pops the r0/lr pair pushed at entry, loading the saved lr into pc. */
155 ldmeqia sp!, {r0, pc} /* done */
157 /* copy the crud byte at a time */
/*
 * Forward copy with a destination that is not word aligned.
 * NOTE(review): the byte load/store instructions that perform the
 * alignment, and the flag-setting instructions for the two branches
 * below, are not visible in this excerpt.
 */
167 /* erg - unaligned destination */
172 /* align destination with byte copies */
180 blt Lmemcpy_fl4 /* less than 4 bytes */
/* Lmemcpy_ft8 is not visible here; presumably the aligned-copy path. */
183 beq Lmemcpy_ft8 /* we have an aligned source */
/*
 * Forward copy, unaligned source: variant that merges words using an
 * lsl #24 shift.
 * NOTE(review): judging by the label name this handles a source one byte
 * past a word boundary.  The set-up code, the matching right shifts, the
 * length updates and the save/restore of r4 and r5 are not visible in
 * this excerpt; confirm against the complete file.
 */
185 /* erg - unaligned source */
186 /* This is where it gets nasty ... */
194 blt Lmemcpy_fsrcul1loop4
198 Lmemcpy_fsrcul1loop16:
/* Load four words (16 bytes); r1 is post-incremented. */
200 ldmia r1!, {r4, r5, r12, lr}
/* Each output register gets the following word, shifted left 24 bits, OR-ed in. */
201 orr r3, r3, r4, lsl #24
203 orr r4, r4, r5, lsl #24
205 orr r5, r5, r12, lsl #24
207 orr r12, r12, lr, lsl #24
/* Store the four assembled words; r0 is post-incremented. */
208 stmia r0!, {r3-r5, r12}
210 bge Lmemcpy_fsrcul1loop16
213 blt Lmemcpy_fsrcul1l4
215 Lmemcpy_fsrcul1loop4:
/* Same merge as above, applied to a single register pair. */
218 orr r12, r12, lr, lsl #24
221 bge Lmemcpy_fsrcul1loop4
/*
 * Forward copy, unaligned source: variant that merges words using
 * 16-bit shifts.
 * NOTE(review): judging by the label name this handles a source two bytes
 * past a word boundary.  Most of the right shifts, the length updates and
 * the save/restore of r4 and r5 are not visible in this excerpt; confirm
 * against the complete file.
 */
229 blt Lmemcpy_fsrcul2loop4
233 Lmemcpy_fsrcul2loop16:
/* Load four words (16 bytes); r1 is post-incremented. */
235 ldmia r1!, {r4, r5, r12, lr}
/* Each output register gets the following word, shifted left 16 bits, OR-ed in. */
236 orr r3, r3, r4, lsl #16
238 orr r4, r4, r5, lsl #16
240 orr r5, r5, r12, lsl #16
/* Move the upper half of r12 down before lr's lower half is merged above it. */
241 mov r12, r12, lsr #16
242 orr r12, r12, lr, lsl #16
/* Store the four assembled words; r0 is post-incremented. */
243 stmia r0!, {r3-r5, r12}
245 bge Lmemcpy_fsrcul2loop16
248 blt Lmemcpy_fsrcul2l4
250 Lmemcpy_fsrcul2loop4:
/* Same merge as above, applied to a single register pair. */
253 orr r12, r12, lr, lsl #16
256 bge Lmemcpy_fsrcul2loop4
/*
 * Forward copy, unaligned source: variant that merges words using an
 * lsr #24 / lsl #8 shift pair.
 * NOTE(review): judging by the label name this handles a source three
 * bytes past a word boundary.  Most of the right shifts, the length
 * updates and the save/restore of r4 and r5 are not visible in this
 * excerpt; confirm against the complete file.
 */
264 blt Lmemcpy_fsrcul3loop4
268 Lmemcpy_fsrcul3loop16:
/* Load four words (16 bytes); r1 is post-incremented. */
270 ldmia r1!, {r4, r5, r12, lr}
/* Each output register gets the following word, shifted left 8 bits, OR-ed in. */
271 orr r3, r3, r4, lsl #8
273 orr r4, r4, r5, lsl #8
275 orr r5, r5, r12, lsl #8
/* Keep only the top byte of r12, moved to the bottom, before merging lr. */
276 mov r12, r12, lsr #24
277 orr r12, r12, lr, lsl #8
/* Store the four assembled words; r0 is post-incremented. */
278 stmia r0!, {r3-r5, r12}
280 bge Lmemcpy_fsrcul3loop16
283 blt Lmemcpy_fsrcul3l4
285 Lmemcpy_fsrcul3loop4:
/* Same merge as above, applied to a single register pair. */
288 orr r12, r12, lr, lsl #8
291 bge Lmemcpy_fsrcul3loop4
/*
 * Backward copy: entry checks and the word-aligned path.
 *
 * The visible transfers use decrement-before addressing with write-back,
 * so r1 (read address) and r0 (write address) move downwards.
 * NOTE(review): the instructions that position r0/r1 for the backward
 * copy and most of the flag-setting instructions are not visible in this
 * excerpt; confirm against the complete file.
 */
301 blt Lmemcpy_bl4 /* less than 4 bytes */
303 bne Lmemcpy_bdestul /* oh unaligned destination addr */
305 bne Lmemcpy_bsrcul /* oh unaligned source addr */
308 /* We have aligned source and destination */
310 blt Lmemcpy_bl12 /* less than 12 bytes (4 from above) */
/* lr is saved with the borrowed registers because it is used as data below. */
311 stmdb sp!, {r4, r7, r8, r9, r10, lr}
312 subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
315 /* blat 32 bytes at a time */
316 /* XXX for really big copies perhaps we should use more registers */
/* Eight registers per transfer = 32 bytes; r1 and r0 are pre-decremented. */
318 ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
319 stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
325 ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
326 stmgedb r0!, {r3, r4, r12, lr}
329 ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
330 stmgedb r0!, {r3, r12, lr}
/* Restore the borrowed registers and the return address. */
332 ldmia sp!, {r4, r7, r8, r9, r10, lr}
/* Two-register transfer: moves 8 bytes when the GE condition holds. */
340 ldmgedb r1!, {r3, r12}
341 stmgedb r0!, {r3, r12}
345 /* less than 4 bytes to go */
/* Returns directly through lr rather than popping a saved pc. */
347 moveq pc, lr /* done */
349 /* copy the crud byte at a time */
/* Pre-decrement byte moves, conditional on a compare that is not shown here. */
353 ldrgeb r3, [r1, #-1]!
354 strgeb r3, [r0, #-1]!
355 ldrgtb r3, [r1, #-1]!
356 strgtb r3, [r0, #-1]!
/*
 * Backward copy with a destination that is not word aligned.
 * NOTE(review): the flag-setting instructions for the conditional moves
 * and branches below are not visible in this excerpt.
 */
359 /* erg - unaligned destination */
363 /* align destination with byte copies */
/* Pre-decrement byte moves: r1 and r0 each step down by one per byte. */
366 ldrgeb r3, [r1, #-1]!
367 strgeb r3, [r0, #-1]!
368 ldrgtb r3, [r1, #-1]!
369 strgtb r3, [r0, #-1]!
371 blt Lmemcpy_bl4 /* less than 4 bytes to go */
/* Lmemcpy_bt8 is not visible here; presumably the aligned-copy path. */
373 beq Lmemcpy_bt8 /* we have an aligned source */
/*
 * Backward copy, unaligned source: variant that merges words using an
 * lsr #24 shift.
 * NOTE(review): judging by the label name this handles a source three
 * bytes past a word boundary.  The set-up code, the matching left shifts
 * and the length updates are not visible in this excerpt; confirm against
 * the complete file.
 */
375 /* erg - unaligned source */
376 /* This is where it gets nasty ... */
384 blt Lmemcpy_bsrcul3loop4
/* r4, r5 and lr are used as data registers by the 16-byte loop. */
386 stmdb sp!, {r4, r5, lr}
388 Lmemcpy_bsrcul3loop16:
/* Load four words (16 bytes) from below r1; r1 is pre-decremented. */
390 ldmdb r1!, {r3-r5, r12}
/* Each output register gets the next-lower word, shifted right 24 bits, OR-ed in. */
391 orr lr, lr, r12, lsr #24
393 orr r12, r12, r5, lsr #24
395 orr r5, r5, r4, lsr #24
397 orr r4, r4, r3, lsr #24
/* Store the four assembled words below r0; r0 is pre-decremented. */
398 stmdb r0!, {r4, r5, r12, lr}
400 bge Lmemcpy_bsrcul3loop16
401 ldmia sp!, {r4, r5, lr}
403 blt Lmemcpy_bsrcul3l4
405 Lmemcpy_bsrcul3loop4:
/* Same merge as above, applied to a single register pair. */
408 orr r12, r12, r3, lsr #24
411 bge Lmemcpy_bsrcul3loop4
/*
 * Backward copy, unaligned source: variant that merges words using
 * 16-bit shifts.
 * NOTE(review): judging by the label name this handles a source two bytes
 * past a word boundary.  Most of the left shifts and the length updates
 * are not visible in this excerpt; confirm against the complete file.
 */
419 blt Lmemcpy_bsrcul2loop4
/* r4, r5 and lr are used as data registers by the 16-byte loop. */
421 stmdb sp!, {r4, r5, lr}
423 Lmemcpy_bsrcul2loop16:
/* Load four words (16 bytes) from below r1; r1 is pre-decremented. */
425 ldmdb r1!, {r3-r5, r12}
/* Each output register gets the next-lower word, shifted right 16 bits, OR-ed in. */
426 orr lr, lr, r12, lsr #16
/* Move the lower half of r12 up before r5's upper half is merged below it. */
427 mov r12, r12, lsl #16
428 orr r12, r12, r5, lsr #16
430 orr r5, r5, r4, lsr #16
432 orr r4, r4, r3, lsr #16
/* Store the four assembled words below r0; r0 is pre-decremented. */
433 stmdb r0!, {r4, r5, r12, lr}
435 bge Lmemcpy_bsrcul2loop16
436 ldmia sp!, {r4, r5, lr}
438 blt Lmemcpy_bsrcul2l4
440 Lmemcpy_bsrcul2loop4:
/* Same merge as above, applied to a single register pair. */
443 orr r12, r12, r3, lsr #16
446 bge Lmemcpy_bsrcul2loop4
/*
 * Backward copy, unaligned source: variant that merges words using an
 * lsl #24 / lsr #8 shift pair.
 * NOTE(review): judging by the label name this handles a source one byte
 * past a word boundary.  Most of the left shifts and the length updates
 * are not visible in this excerpt; confirm against the complete file.
 * NOTE(review): the loop label says loop32, but the body transfers four
 * words (16 bytes) per iteration, like the loop16 labels of the other
 * backward variants.
 */
454 blt Lmemcpy_bsrcul1loop4
/* r4, r5 and lr are used as data registers by the loop. */
456 stmdb sp!, {r4, r5, lr}
458 Lmemcpy_bsrcul1loop32:
/* Load four words (16 bytes) from below r1; r1 is pre-decremented. */
460 ldmdb r1!, {r3-r5, r12}
/* Each output register gets the next-lower word, shifted right 8 bits, OR-ed in. */
461 orr lr, lr, r12, lsr #8
/* Keep only the bottom byte of r12, moved to the top, before merging r5. */
462 mov r12, r12, lsl #24
463 orr r12, r12, r5, lsr #8
465 orr r5, r5, r4, lsr #8
467 orr r4, r4, r3, lsr #8
/* Store the four assembled words below r0; r0 is pre-decremented. */
468 stmdb r0!, {r4, r5, r12, lr}
470 bge Lmemcpy_bsrcul1loop32
471 ldmia sp!, {r4, r5, lr}
473 blt Lmemcpy_bsrcul1l4
475 Lmemcpy_bsrcul1loop4:
/* Same merge as above, applied to a single register pair. */
478 orr r12, r12, r3, lsr #8
481 bge Lmemcpy_bsrcul1loop4