/*	$NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* #include <machine/asm.h> */

42c7b147 |
41 | .globl memcpy |
e4fb433c |
42 | .globl _memcpy |
42c7b147 |
43 | memcpy: |
44 | |
e4fb433c |
45 | stmfd sp!, {r0, lr} |
42c7b147 |
46 | bl _memcpy |
e4fb433c |
47 | ldmfd sp!, {r0, pc} |
42c7b147 |
48 | |
49 | |
50 | .globl memmove |
51 | memmove: |
52 | |
e4fb433c |
53 | stmfd sp!, {r0, lr} |
42c7b147 |
54 | bl _memcpy |
e4fb433c |
55 | ldmfd sp!, {r0, pc} |


59 | /* |
60 | * This is one fun bit of code ... |
61 | * Some easy listening music is suggested while trying to understand this |
62 | * code e.g. Iron Maiden |
63 | * |
64 | * For anyone attempting to understand it : |
65 | * |
66 | * The core code is implemented here with simple stubs for memcpy() |
67 | * memmove() and bcopy(). |
68 | * |
69 | * All local labels are prefixed with Lmemcpy_ |
70 | * Following the prefix a label starting f is used in the forward copy code |
71 | * while a label using b is used in the backwards copy code |
72 | * The source and destination addresses determine whether a forward or |
73 | * backward copy is performed. |
74 | * Separate bits of code are used to deal with the following situations |
75 | * for both the forward and backwards copy. |
76 | * unaligned source address |
77 | * unaligned destination address |
78 | * Separate copy routines are used to produce an optimised result for each |
79 | * of these cases. |
80 | * The copy code will use LDM/STM instructions to copy up to 32 bytes at |
81 | * a time where possible. |
82 | * |
83 | * Note: r12 (aka ip) can be trashed during the function along with |
84 | * r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out. |
85 | * Additional registers are preserved prior to use i.e. r4, r5 & lr |
86 | * |
87 | * Apologies for the state of the comments;-) |
88 | */ |
89 | |
90 | |
91 | |
92 | _memcpy: |
93 | |
42c7b147 |
94 | /* Determine copy direction */ |
95 | cmp r1, r0 |
96 | bcc Lmemcpy_backwards |
97 | |
98 | moveq r0, #0 /* Quick abort for len=0 */ |
99 | moveq pc, lr |
100 | |
101 | stmdb sp!, {r0, lr} /* memcpy() returns dest addr */ |
102 | subs r2, r2, #4 |
103 | blt Lmemcpy_fl4 /* less than 4 bytes */ |
104 | ands r12, r0, #3 |
105 | bne Lmemcpy_fdestul /* oh unaligned destination addr */ |
106 | ands r12, r1, #3 |
107 | bne Lmemcpy_fsrcul /* oh unaligned source addr */ |
108 | |
109 | Lmemcpy_ft8: |
110 | /* We have aligned source and destination */ |
111 | subs r2, r2, #8 |
112 | blt Lmemcpy_fl12 /* less than 12 bytes (4 from above) */ |
113 | subs r2, r2, #0x14 |
114 | blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */ |
115 | stmdb sp!, {r4, r7, r8, r9, r10} /* borrow r4 */ |
116 | |
d40a5af4 |
117 | /* blat 32 bytes at a time */ |
42c7b147 |
118 | /* XXX for really big copies perhaps we should use more registers */ |
119 | Lmemcpy_floop32: |
120 | ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr} |
121 | stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr} |
d40a5af4 |
122 | subs r2, r2, #0x20 |
42c7b147 |
123 | bge Lmemcpy_floop32 |
124 | |
125 | cmn r2, #0x10 |
126 | ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */ |
127 | stmgeia r0!, {r3, r4, r12, lr} |
128 | subge r2, r2, #0x10 |
129 | ldmia sp!, {r4, r7, r8, r9, r10} /* return r4 */ |
130 | |
131 | Lmemcpy_fl32: |
132 | adds r2, r2, #0x14 |
133 | |
134 | /* blat 12 bytes at a time */ |
135 | Lmemcpy_floop12: |
136 | ldmgeia r1!, {r3, r12, lr} |
137 | stmgeia r0!, {r3, r12, lr} |
138 | subges r2, r2, #0x0c |
139 | bge Lmemcpy_floop12 |
140 | |
141 | Lmemcpy_fl12: |
142 | adds r2, r2, #8 |
143 | blt Lmemcpy_fl4 |
144 | |
145 | subs r2, r2, #4 |
146 | ldrlt r3, [r1], #4 |
147 | strlt r3, [r0], #4 |
148 | ldmgeia r1!, {r3, r12} |
149 | stmgeia r0!, {r3, r12} |
150 | subge r2, r2, #4 |
151 | |
152 | Lmemcpy_fl4: |
153 | /* less than 4 bytes to go */ |
154 | adds r2, r2, #4 |
155 | ldmeqia sp!, {r0, pc} /* done */ |
156 | |
157 | /* copy the crud byte at a time */ |
158 | cmp r2, #2 |
159 | ldrb r3, [r1], #1 |
160 | strb r3, [r0], #1 |
161 | ldrgeb r3, [r1], #1 |
162 | strgeb r3, [r0], #1 |
163 | ldrgtb r3, [r1], #1 |
164 | strgtb r3, [r0], #1 |
165 | ldmia sp!, {r0, pc} |
166 | |
167 | /* erg - unaligned destination */ |
168 | Lmemcpy_fdestul: |
169 | rsb r12, r12, #4 |
170 | cmp r12, #2 |
171 | |
172 | /* align destination with byte copies */ |
173 | ldrb r3, [r1], #1 |
174 | strb r3, [r0], #1 |
175 | ldrgeb r3, [r1], #1 |
176 | strgeb r3, [r0], #1 |
177 | ldrgtb r3, [r1], #1 |
178 | strgtb r3, [r0], #1 |
179 | subs r2, r2, r12 |
180 | blt Lmemcpy_fl4 /* less the 4 bytes */ |
181 | |
182 | ands r12, r1, #3 |
183 | beq Lmemcpy_ft8 /* we have an aligned source */ |
184 | |
185 | /* erg - unaligned source */ |
186 | /* This is where it gets nasty ... */ |
187 | Lmemcpy_fsrcul: |
188 | bic r1, r1, #3 |
189 | ldr lr, [r1], #4 |
190 | cmp r12, #2 |
191 | bgt Lmemcpy_fsrcul3 |
192 | beq Lmemcpy_fsrcul2 |
193 | cmp r2, #0x0c |
194 | blt Lmemcpy_fsrcul1loop4 |
195 | sub r2, r2, #0x0c |
196 | stmdb sp!, {r4, r5} |
197 | |
198 | Lmemcpy_fsrcul1loop16: |
199 | mov r3, lr, lsr #8 |
200 | ldmia r1!, {r4, r5, r12, lr} |
201 | orr r3, r3, r4, lsl #24 |
202 | mov r4, r4, lsr #8 |
203 | orr r4, r4, r5, lsl #24 |
204 | mov r5, r5, lsr #8 |
205 | orr r5, r5, r12, lsl #24 |
206 | mov r12, r12, lsr #8 |
207 | orr r12, r12, lr, lsl #24 |
208 | stmia r0!, {r3-r5, r12} |
209 | subs r2, r2, #0x10 |
210 | bge Lmemcpy_fsrcul1loop16 |
211 | ldmia sp!, {r4, r5} |
212 | adds r2, r2, #0x0c |
213 | blt Lmemcpy_fsrcul1l4 |
214 | |
215 | Lmemcpy_fsrcul1loop4: |
216 | mov r12, lr, lsr #8 |
217 | ldr lr, [r1], #4 |
218 | orr r12, r12, lr, lsl #24 |
219 | str r12, [r0], #4 |
220 | subs r2, r2, #4 |
221 | bge Lmemcpy_fsrcul1loop4 |
222 | |
223 | Lmemcpy_fsrcul1l4: |
224 | sub r1, r1, #3 |
225 | b Lmemcpy_fl4 |
226 | |
227 | Lmemcpy_fsrcul2: |
228 | cmp r2, #0x0c |
229 | blt Lmemcpy_fsrcul2loop4 |
230 | sub r2, r2, #0x0c |
231 | stmdb sp!, {r4, r5} |
232 | |
233 | Lmemcpy_fsrcul2loop16: |
234 | mov r3, lr, lsr #16 |
235 | ldmia r1!, {r4, r5, r12, lr} |
236 | orr r3, r3, r4, lsl #16 |
237 | mov r4, r4, lsr #16 |
238 | orr r4, r4, r5, lsl #16 |
239 | mov r5, r5, lsr #16 |
240 | orr r5, r5, r12, lsl #16 |
241 | mov r12, r12, lsr #16 |
242 | orr r12, r12, lr, lsl #16 |
243 | stmia r0!, {r3-r5, r12} |
244 | subs r2, r2, #0x10 |
245 | bge Lmemcpy_fsrcul2loop16 |
246 | ldmia sp!, {r4, r5} |
247 | adds r2, r2, #0x0c |
248 | blt Lmemcpy_fsrcul2l4 |
249 | |
250 | Lmemcpy_fsrcul2loop4: |
251 | mov r12, lr, lsr #16 |
252 | ldr lr, [r1], #4 |
253 | orr r12, r12, lr, lsl #16 |
254 | str r12, [r0], #4 |
255 | subs r2, r2, #4 |
256 | bge Lmemcpy_fsrcul2loop4 |
257 | |
258 | Lmemcpy_fsrcul2l4: |
259 | sub r1, r1, #2 |
260 | b Lmemcpy_fl4 |
261 | |
262 | Lmemcpy_fsrcul3: |
263 | cmp r2, #0x0c |
264 | blt Lmemcpy_fsrcul3loop4 |
265 | sub r2, r2, #0x0c |
266 | stmdb sp!, {r4, r5} |
267 | |
268 | Lmemcpy_fsrcul3loop16: |
269 | mov r3, lr, lsr #24 |
270 | ldmia r1!, {r4, r5, r12, lr} |
271 | orr r3, r3, r4, lsl #8 |
272 | mov r4, r4, lsr #24 |
273 | orr r4, r4, r5, lsl #8 |
274 | mov r5, r5, lsr #24 |
275 | orr r5, r5, r12, lsl #8 |
276 | mov r12, r12, lsr #24 |
277 | orr r12, r12, lr, lsl #8 |
278 | stmia r0!, {r3-r5, r12} |
279 | subs r2, r2, #0x10 |
280 | bge Lmemcpy_fsrcul3loop16 |
281 | ldmia sp!, {r4, r5} |
282 | adds r2, r2, #0x0c |
283 | blt Lmemcpy_fsrcul3l4 |
284 | |
285 | Lmemcpy_fsrcul3loop4: |
286 | mov r12, lr, lsr #24 |
287 | ldr lr, [r1], #4 |
288 | orr r12, r12, lr, lsl #8 |
289 | str r12, [r0], #4 |
290 | subs r2, r2, #4 |
291 | bge Lmemcpy_fsrcul3loop4 |
292 | |
293 | Lmemcpy_fsrcul3l4: |
294 | sub r1, r1, #1 |
295 | b Lmemcpy_fl4 |
296 | |
297 | Lmemcpy_backwards: |
298 | add r1, r1, r2 |
299 | add r0, r0, r2 |
300 | subs r2, r2, #4 |
301 | blt Lmemcpy_bl4 /* less than 4 bytes */ |
302 | ands r12, r0, #3 |
303 | bne Lmemcpy_bdestul /* oh unaligned destination addr */ |
304 | ands r12, r1, #3 |
305 | bne Lmemcpy_bsrcul /* oh unaligned source addr */ |
306 | |
307 | Lmemcpy_bt8: |
308 | /* We have aligned source and destination */ |
309 | subs r2, r2, #8 |
310 | blt Lmemcpy_bl12 /* less than 12 bytes (4 from above) */ |
311 | stmdb sp!, {r4, r7, r8, r9, r10, lr} |
312 | subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */ |
313 | blt Lmemcpy_bl32 |
314 | |
d40a5af4 |
315 | /* blat 32 bytes at a time */ |
42c7b147 |
316 | /* XXX for really big copies perhaps we should use more registers */ |
317 | Lmemcpy_bloop32: |
318 | ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr} |
319 | stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr} |
d40a5af4 |
320 | subs r2, r2, #0x20 |
42c7b147 |
321 | bge Lmemcpy_bloop32 |
322 | |
323 | Lmemcpy_bl32: |
324 | cmn r2, #0x10 |
325 | ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */ |
326 | stmgedb r0!, {r3, r4, r12, lr} |
327 | subge r2, r2, #0x10 |
328 | adds r2, r2, #0x14 |
329 | ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */ |
330 | stmgedb r0!, {r3, r12, lr} |
331 | subge r2, r2, #0x0c |
332 | ldmia sp!, {r4, r7, r8, r9, r10, lr} |
333 | |
334 | Lmemcpy_bl12: |
335 | adds r2, r2, #8 |
336 | blt Lmemcpy_bl4 |
337 | subs r2, r2, #4 |
338 | ldrlt r3, [r1, #-4]! |
339 | strlt r3, [r0, #-4]! |
340 | ldmgedb r1!, {r3, r12} |
341 | stmgedb r0!, {r3, r12} |
342 | subge r2, r2, #4 |
343 | |
344 | Lmemcpy_bl4: |
345 | /* less than 4 bytes to go */ |
346 | adds r2, r2, #4 |
347 | moveq pc, lr /* done */ |
348 | |
349 | /* copy the crud byte at a time */ |
350 | cmp r2, #2 |
351 | ldrb r3, [r1, #-1]! |
352 | strb r3, [r0, #-1]! |
353 | ldrgeb r3, [r1, #-1]! |
354 | strgeb r3, [r0, #-1]! |
355 | ldrgtb r3, [r1, #-1]! |
356 | strgtb r3, [r0, #-1]! |
357 | mov pc, lr |
358 | |
359 | /* erg - unaligned destination */ |
360 | Lmemcpy_bdestul: |
361 | cmp r12, #2 |
362 | |
363 | /* align destination with byte copies */ |
364 | ldrb r3, [r1, #-1]! |
365 | strb r3, [r0, #-1]! |
366 | ldrgeb r3, [r1, #-1]! |
367 | strgeb r3, [r0, #-1]! |
368 | ldrgtb r3, [r1, #-1]! |
369 | strgtb r3, [r0, #-1]! |
370 | subs r2, r2, r12 |
371 | blt Lmemcpy_bl4 /* less than 4 bytes to go */ |
372 | ands r12, r1, #3 |
373 | beq Lmemcpy_bt8 /* we have an aligned source */ |
374 | |
375 | /* erg - unaligned source */ |
376 | /* This is where it gets nasty ... */ |
377 | Lmemcpy_bsrcul: |
378 | bic r1, r1, #3 |
379 | ldr r3, [r1, #0] |
380 | cmp r12, #2 |
381 | blt Lmemcpy_bsrcul1 |
382 | beq Lmemcpy_bsrcul2 |
383 | cmp r2, #0x0c |
384 | blt Lmemcpy_bsrcul3loop4 |
385 | sub r2, r2, #0x0c |
386 | stmdb sp!, {r4, r5, lr} |
387 | |
388 | Lmemcpy_bsrcul3loop16: |
389 | mov lr, r3, lsl #8 |
390 | ldmdb r1!, {r3-r5, r12} |
391 | orr lr, lr, r12, lsr #24 |
392 | mov r12, r12, lsl #8 |
393 | orr r12, r12, r5, lsr #24 |
394 | mov r5, r5, lsl #8 |
395 | orr r5, r5, r4, lsr #24 |
396 | mov r4, r4, lsl #8 |
397 | orr r4, r4, r3, lsr #24 |
398 | stmdb r0!, {r4, r5, r12, lr} |
399 | subs r2, r2, #0x10 |
400 | bge Lmemcpy_bsrcul3loop16 |
401 | ldmia sp!, {r4, r5, lr} |
402 | adds r2, r2, #0x0c |
403 | blt Lmemcpy_bsrcul3l4 |
404 | |
405 | Lmemcpy_bsrcul3loop4: |
406 | mov r12, r3, lsl #8 |
407 | ldr r3, [r1, #-4]! |
408 | orr r12, r12, r3, lsr #24 |
409 | str r12, [r0, #-4]! |
410 | subs r2, r2, #4 |
411 | bge Lmemcpy_bsrcul3loop4 |
412 | |
413 | Lmemcpy_bsrcul3l4: |
414 | add r1, r1, #3 |
415 | b Lmemcpy_bl4 |
416 | |
417 | Lmemcpy_bsrcul2: |
418 | cmp r2, #0x0c |
419 | blt Lmemcpy_bsrcul2loop4 |
420 | sub r2, r2, #0x0c |
421 | stmdb sp!, {r4, r5, lr} |
422 | |
423 | Lmemcpy_bsrcul2loop16: |
424 | mov lr, r3, lsl #16 |
425 | ldmdb r1!, {r3-r5, r12} |
426 | orr lr, lr, r12, lsr #16 |
427 | mov r12, r12, lsl #16 |
428 | orr r12, r12, r5, lsr #16 |
429 | mov r5, r5, lsl #16 |
430 | orr r5, r5, r4, lsr #16 |
431 | mov r4, r4, lsl #16 |
432 | orr r4, r4, r3, lsr #16 |
433 | stmdb r0!, {r4, r5, r12, lr} |
434 | subs r2, r2, #0x10 |
435 | bge Lmemcpy_bsrcul2loop16 |
436 | ldmia sp!, {r4, r5, lr} |
437 | adds r2, r2, #0x0c |
438 | blt Lmemcpy_bsrcul2l4 |
439 | |
440 | Lmemcpy_bsrcul2loop4: |
441 | mov r12, r3, lsl #16 |
442 | ldr r3, [r1, #-4]! |
443 | orr r12, r12, r3, lsr #16 |
444 | str r12, [r0, #-4]! |
445 | subs r2, r2, #4 |
446 | bge Lmemcpy_bsrcul2loop4 |
447 | |
448 | Lmemcpy_bsrcul2l4: |
449 | add r1, r1, #2 |
450 | b Lmemcpy_bl4 |
451 | |
452 | Lmemcpy_bsrcul1: |
453 | cmp r2, #0x0c |
454 | blt Lmemcpy_bsrcul1loop4 |
455 | sub r2, r2, #0x0c |
456 | stmdb sp!, {r4, r5, lr} |
457 | |
458 | Lmemcpy_bsrcul1loop32: |
459 | mov lr, r3, lsl #24 |
460 | ldmdb r1!, {r3-r5, r12} |
461 | orr lr, lr, r12, lsr #8 |
462 | mov r12, r12, lsl #24 |
463 | orr r12, r12, r5, lsr #8 |
464 | mov r5, r5, lsl #24 |
465 | orr r5, r5, r4, lsr #8 |
466 | mov r4, r4, lsl #24 |
467 | orr r4, r4, r3, lsr #8 |
468 | stmdb r0!, {r4, r5, r12, lr} |
469 | subs r2, r2, #0x10 |
470 | bge Lmemcpy_bsrcul1loop32 |
471 | ldmia sp!, {r4, r5, lr} |
472 | adds r2, r2, #0x0c |
473 | blt Lmemcpy_bsrcul1l4 |
474 | |
475 | Lmemcpy_bsrcul1loop4: |
476 | mov r12, r3, lsl #24 |
477 | ldr r3, [r1, #-4]! |
478 | orr r12, r12, r3, lsr #8 |
479 | str r12, [r0, #-4]! |
480 | subs r2, r2, #4 |
481 | bge Lmemcpy_bsrcul1loop4 |
482 | |
483 | Lmemcpy_bsrcul1l4: |
484 | add r1, r1, #1 |
485 | b Lmemcpy_bl4 |