1 /* $NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $ */
4 * Copyright (c) 1997 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Neil A. Carson and Mark Brinicombe
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
39 /* #include <machine/asm.h>*/
60 * This is one fun bit of code ...
61 * Some easy listening music is suggested while trying to understand this
62 * code e.g. Iron Maiden
64 * For anyone attempting to understand it :
66 * The core code is implemented here with simple stubs for memcpy()
67 * memmove() and bcopy().
69 * All local labels are prefixed with Lmemcpy_
70 * Following the prefix a label starting f is used in the forward copy code
71 * while a label using b is used in the backwards copy code
72 * The source and destination addresses determine whether a forward or
73 * backward copy is performed.
74 * Separate bits of code are used to deal with the following situations
75 * for both the forward and backwards copy.
76 * unaligned source address
77 * unaligned destination address
78 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
80 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
81 * a time where possible.
83 * Note: r12 (aka ip) can be trashed during the function along with
84 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
85 * Additional registers are preserved prior to use i.e. r4, r5, r7-r10 & lr
87 * Apologies for the state of the comments;-)
/*
 * Entry checks and forward-copy dispatch.
 * NOTE(review): the compare/test instructions that set the condition
 * flags consumed by the conditional instructions below are not visible
 * in this excerpt.
 */
94 /* Determine copy direction */
98 moveq r0, #0 /* Quick abort for len=0 */
/* Push r0 and lr as a pair; the forward exit pops them as {r0, pc}. */
101 stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
103 blt Lmemcpy_fl4 /* less than 4 bytes */
105 bne Lmemcpy_fdestul /* oh unaligned destination addr */
107 bne Lmemcpy_fsrcul /* oh unaligned source addr */
110 /* We have aligned source and destination */
112 blt Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
114 blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
/* r4 and r7-r10 are saved here and restored after the bulk copy. */
115 stmdb sp!, {r4, r7, r8, r9, r10} /* borrow r4 and r7-r10 */
117 /* blat 64 bytes at a time */
118 /* XXX for really big copies perhaps we should use more registers */
/*
 * Each ldmia/stmia pair moves eight words (32 bytes) from [r1] to [r0],
 * advancing both pointers by writeback; the two pairs make 64 bytes.
 * lr is used as a plain data register here.
 * NOTE(review): the loop label, the length update and the loop branch
 * are not visible in this excerpt.
 */
120 ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
121 stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
122 ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
123 stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
/* Conditional (GE) four-word transfer for a leftover 16 bytes. */
128 ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
129 stmgeia r0!, {r3, r4, r12, lr}
131 ldmia sp!, {r4, r7, r8, r9, r10} /* restore r4 and r7-r10 */
/*
 * Forward tail: conditional 12-byte and 8-byte transfers, then exit.
 * NOTE(review): the length adjustments that set the flags for the
 * GE/EQ conditions are not visible in this excerpt.
 */
136 /* blat 12 bytes at a time */
/* Three words (12 bytes) from [r1] to [r0], only when GE. */
138 ldmgeia r1!, {r3, r12, lr}
139 stmgeia r0!, {r3, r12, lr}
/* Two words (8 bytes) from [r1] to [r0], only when GE. */
150 ldmgeia r1!, {r3, r12}
151 stmgeia r0!, {r3, r12}
155 /* less than 4 bytes to go */
/* When EQ: pop the saved r0 and load the saved lr into pc (return). */
157 ldmeqia sp!, {r0, pc} /* done */
159 /* copy the crud byte at a time */
/*
 * Forward copy, unaligned destination handling.
 * NOTE(review): the byte-copy instructions and the compares that set
 * the flags for the branches below are not visible in this excerpt.
 */
169 /* erg - unaligned destination */
174 /* align destination with byte copies */
182 blt Lmemcpy_fl4 /* less than 4 bytes */
185 beq Lmemcpy_ft8 /* we have an aligned source */
187 /* erg - unaligned source */
188 /* This is where it gets nasty ... */
/*
 * Forward copy, unaligned-source variant 1 (merges use lsl #24).
 * NOTE(review): the right-shift halves of each merge, the length
 * updates that set the flags for bge/blt, and the register save and
 * restore around the 16-byte loop are not visible in this excerpt.
 */
/* Skip the 16-byte loop and go straight to the 4-byte loop when LT. */
196 blt Lmemcpy_fsrcul1loop4
200 Lmemcpy_fsrcul1loop16:
/* Load four source words, post-incrementing r1 by 16. */
202 ldmia r1!, {r4, r5, r12, lr}
/* OR the following word, shifted left 24 bits, into each output word. */
203 orr r3, r3, r4, lsl #24
205 orr r4, r4, r5, lsl #24
207 orr r5, r5, r12, lsl #24
209 orr r12, r12, lr, lsl #24
/* Store the four merged words, post-incrementing r0 by 16. */
210 stmia r0!, {r3-r5, r12}
212 bge Lmemcpy_fsrcul1loop16
215 blt Lmemcpy_fsrcul1l4
217 Lmemcpy_fsrcul1loop4:
/* Same merge as above, one word per iteration. */
220 orr r12, r12, lr, lsl #24
223 bge Lmemcpy_fsrcul1loop4
/*
 * Forward copy, unaligned-source variant 2 (merges use lsl #16 paired
 * with lsr #16).
 * NOTE(review): most right-shift instructions, the length updates that
 * set the flags for bge/blt, and the register save and restore around
 * the 16-byte loop are not visible in this excerpt.
 */
/* Skip the 16-byte loop and go straight to the 4-byte loop when LT. */
231 blt Lmemcpy_fsrcul2loop4
235 Lmemcpy_fsrcul2loop16:
/* Load four source words, post-incrementing r1 by 16. */
237 ldmia r1!, {r4, r5, r12, lr}
/* OR the following word, shifted left 16 bits, into each output word. */
238 orr r3, r3, r4, lsl #16
240 orr r4, r4, r5, lsl #16
242 orr r5, r5, r12, lsl #16
/* Keep the upper half of r12 in its low half before merging with lr. */
243 mov r12, r12, lsr #16
244 orr r12, r12, lr, lsl #16
/* Store the four merged words, post-incrementing r0 by 16. */
245 stmia r0!, {r3-r5, r12}
247 bge Lmemcpy_fsrcul2loop16
250 blt Lmemcpy_fsrcul2l4
252 Lmemcpy_fsrcul2loop4:
/* Same merge as above, one word per iteration. */
255 orr r12, r12, lr, lsl #16
258 bge Lmemcpy_fsrcul2loop4
/*
 * Forward copy, unaligned-source variant 3 (merges use lsl #8 paired
 * with lsr #24).
 * NOTE(review): most right-shift instructions, the length updates that
 * set the flags for bge/blt, and the register save and restore around
 * the 16-byte loop are not visible in this excerpt.
 */
/* Skip the 16-byte loop and go straight to the 4-byte loop when LT. */
266 blt Lmemcpy_fsrcul3loop4
270 Lmemcpy_fsrcul3loop16:
/* Load four source words, post-incrementing r1 by 16. */
272 ldmia r1!, {r4, r5, r12, lr}
/* OR the following word, shifted left 8 bits, into each output word. */
273 orr r3, r3, r4, lsl #8
275 orr r4, r4, r5, lsl #8
277 orr r5, r5, r12, lsl #8
/* Keep the top byte of r12 in its low byte before merging with lr. */
278 mov r12, r12, lsr #24
279 orr r12, r12, lr, lsl #8
/* Store the four merged words, post-incrementing r0 by 16. */
280 stmia r0!, {r3-r5, r12}
282 bge Lmemcpy_fsrcul3loop16
285 blt Lmemcpy_fsrcul3l4
287 Lmemcpy_fsrcul3loop4:
/* Same merge as above, one word per iteration. */
290 orr r12, r12, lr, lsl #8
293 bge Lmemcpy_fsrcul3loop4
/*
 * Backward-copy dispatch and aligned bulk copy. All transfers here use
 * decrement-before addressing, so r0 and r1 walk downwards.
 * NOTE(review): the compares that set the flags for the branches, the
 * loop label and the loop branch are not visible in this excerpt.
 */
303 blt Lmemcpy_bl4 /* less than 4 bytes */
305 bne Lmemcpy_bdestul /* oh unaligned destination addr */
307 bne Lmemcpy_bsrcul /* oh unaligned source addr */
310 /* We have aligned source and destination */
312 blt Lmemcpy_bl12 /* less than 12 bytes (4 from above) */
/* lr is saved with r4 and r7-r10 because it is used as a data register. */
313 stmdb sp!, {r4, r7, r8, r9, r10, lr}
314 subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
317 /* blat 64 bytes at a time */
318 /* XXX for really big copies perhaps we should use more registers */
/* Each ldmdb/stmdb pair moves eight words (32 bytes); two pairs = 64. */
320 ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
321 stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
322 ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
323 stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
/* Conditional (GE) four-word transfer. */
329 ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
330 stmgedb r0!, {r3, r4, r12, lr}
/* Conditional (GE) three-word transfer. */
333 ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
334 stmgedb r0!, {r3, r12, lr}
/* Restore the borrowed registers and lr. */
336 ldmia sp!, {r4, r7, r8, r9, r10, lr}
/*
 * Backward tail: conditional 8-byte transfer, early return, then
 * conditional single-byte copies.
 * NOTE(review): the instructions that set the flags for the GE/EQ/GT
 * conditions are not visible in this excerpt.
 */
/* Two words (8 bytes), pre-decrementing r1 and r0, only when GE. */
344 ldmgedb r1!, {r3, r12}
345 stmgedb r0!, {r3, r12}
349 /* less than 4 bytes to go */
/* Return directly through lr when EQ; nothing is popped on this path. */
351 moveq pc, lr /* done */
353 /* copy the crud byte at a time */
/* Pre-indexed byte copies with writeback: step back one byte, then move it. */
357 ldrgeb r3, [r1, #-1]!
358 strgeb r3, [r0, #-1]!
359 ldrgtb r3, [r1, #-1]!
360 strgtb r3, [r0, #-1]!
/*
 * Backward copy, unaligned destination handling.
 * NOTE(review): the compares that set the flags for the conditional
 * byte copies and branches below are not visible in this excerpt.
 */
363 /* erg - unaligned destination */
367 /* align destination with byte copies */
/* Pre-indexed byte copies with writeback, executed when GE / GT. */
370 ldrgeb r3, [r1, #-1]!
371 strgeb r3, [r0, #-1]!
372 ldrgtb r3, [r1, #-1]!
373 strgtb r3, [r0, #-1]!
375 blt Lmemcpy_bl4 /* less than 4 bytes to go */
377 beq Lmemcpy_bt8 /* we have an aligned source */
379 /* erg - unaligned source */
380 /* This is where it gets nasty ... */
/*
 * Backward copy, unaligned-source variant 3 (merges use lsr #24).
 * NOTE(review): the left-shift halves of each merge and the length
 * updates that set the flags for bge/blt are not visible in this
 * excerpt.
 */
/* Skip the 16-byte loop and go straight to the 4-byte loop when LT. */
388 blt Lmemcpy_bsrcul3loop4
/* Save r4, r5 and lr; all three are used as data registers in the loop. */
390 stmdb sp!, {r4, r5, lr}
392 Lmemcpy_bsrcul3loop16:
/* Load four source words, pre-decrementing r1 by 16. */
394 ldmdb r1!, {r3-r5, r12}
/* OR the preceding word, shifted right 24 bits, into each output word. */
395 orr lr, lr, r12, lsr #24
397 orr r12, r12, r5, lsr #24
399 orr r5, r5, r4, lsr #24
401 orr r4, r4, r3, lsr #24
/* Store the four merged words, pre-decrementing r0 by 16. */
402 stmdb r0!, {r4, r5, r12, lr}
404 bge Lmemcpy_bsrcul3loop16
/* Restore r4, r5 and lr saved before the loop. */
405 ldmia sp!, {r4, r5, lr}
407 blt Lmemcpy_bsrcul3l4
409 Lmemcpy_bsrcul3loop4:
/* Same merge as above, one word per iteration. */
412 orr r12, r12, r3, lsr #24
415 bge Lmemcpy_bsrcul3loop4
/*
 * Backward copy, unaligned-source variant 2 (merges use lsr #16 paired
 * with lsl #16).
 * NOTE(review): most left-shift instructions and the length updates
 * that set the flags for bge/blt are not visible in this excerpt.
 */
/* Skip the 16-byte loop and go straight to the 4-byte loop when LT. */
423 blt Lmemcpy_bsrcul2loop4
/* Save r4, r5 and lr; all three are used as data registers in the loop. */
425 stmdb sp!, {r4, r5, lr}
427 Lmemcpy_bsrcul2loop16:
/* Load four source words, pre-decrementing r1 by 16. */
429 ldmdb r1!, {r3-r5, r12}
/* OR the preceding word, shifted right 16 bits, into each output word. */
430 orr lr, lr, r12, lsr #16
/* Move the low half of r12 into its high half before merging with r5. */
431 mov r12, r12, lsl #16
432 orr r12, r12, r5, lsr #16
434 orr r5, r5, r4, lsr #16
436 orr r4, r4, r3, lsr #16
/* Store the four merged words, pre-decrementing r0 by 16. */
437 stmdb r0!, {r4, r5, r12, lr}
439 bge Lmemcpy_bsrcul2loop16
/* Restore r4, r5 and lr saved before the loop. */
440 ldmia sp!, {r4, r5, lr}
442 blt Lmemcpy_bsrcul2l4
444 Lmemcpy_bsrcul2loop4:
/* Same merge as above, one word per iteration. */
447 orr r12, r12, r3, lsr #16
450 bge Lmemcpy_bsrcul2loop4
/*
 * Backward copy, unaligned-source variant 1 (merges use lsr #8 paired
 * with lsl #24).
 * NOTE(review): most left-shift instructions and the length updates
 * that set the flags for bge/blt are not visible in this excerpt.
 */
/* Skip the bulk loop and go straight to the 4-byte loop when LT. */
458 blt Lmemcpy_bsrcul1loop4
/* Save r4, r5 and lr; all three are used as data registers in the loop. */
460 stmdb sp!, {r4, r5, lr}
/*
 * Despite the "loop32" name, each pass visibly transfers four words
 * (16 bytes), the same as the loop16 labels of the other variants.
 */
462 Lmemcpy_bsrcul1loop32:
/* Load four source words, pre-decrementing r1 by 16. */
464 ldmdb r1!, {r3-r5, r12}
/* OR the preceding word, shifted right 8 bits, into each output word. */
465 orr lr, lr, r12, lsr #8
/* Move the low byte of r12 into its top byte before merging with r5. */
466 mov r12, r12, lsl #24
467 orr r12, r12, r5, lsr #8
469 orr r5, r5, r4, lsr #8
471 orr r4, r4, r3, lsr #8
/* Store the four merged words, pre-decrementing r0 by 16. */
472 stmdb r0!, {r4, r5, r12, lr}
474 bge Lmemcpy_bsrcul1loop32
/* Restore r4, r5 and lr saved before the loop. */
475 ldmia sp!, {r4, r5, lr}
477 blt Lmemcpy_bsrcul1l4
479 Lmemcpy_bsrcul1loop4:
/* Same merge as above, one word per iteration. */
482 orr r12, r12, r3, lsr #8
485 bge Lmemcpy_bsrcul1loop4