/*	$NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* #include <machine/asm.h> */

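/*
 * memcpy() and memmove() are thin wrappers around the core routine
 * _memcpy below.  Each stacks the original destination pointer (r0)
 * together with lr, calls _memcpy (which is free to trash r0-r3 and
 * ip) and then reloads r0 on return, so both entry points hand back
 * the destination address as the C library interface requires.
 */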
	.globl	memcpy
	.globl	_memcpy
memcpy:
	stmfd	sp!, {r0, lr}
	bl	_memcpy
	ldmfd	sp!, {r0, pc}


	.globl	memmove
memmove:
	stmfd	sp!, {r0, lr}
	bl	_memcpy
	ldmfd	sp!, {r0, pc}


/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy()
 * and memmove().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix, labels starting with 'f' are used in the forward
 * copy code while labels starting with 'b' are used in the backwards
 * copy code.
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 *	unaligned source address
 *	unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3, although r0-r2 have defined uses, i.e. dest, src and len,
 * throughout.
 * Additional registers are preserved prior to use, i.e. r4, r5, r7-r10
 * & lr.
 *
 * Apologies for the state of the comments ;-)
 */
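
/*
 * For orientation only, the overall strategy roughly corresponds to the
 * C sketch below.  This is an illustration, not part of the build; the
 * function name is invented, and the 32-byte LDM/STM bursts as well as
 * the shift/OR word reassembly used for unaligned sources are reduced
 * to plain word and byte copies.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void *sketch_memcpy(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if ((uintptr_t)s >= (uintptr_t)d) {	// forward copy
 *			while (len && ((uintptr_t)d & 3)) {	// align dest
 *				*d++ = *s++;
 *				len--;
 *			}
 *			if (((uintptr_t)s & 3) == 0)	// src aligned too
 *				for (; len >= 4; len -= 4, d += 4, s += 4)
 *					*(uint32_t *)(void *)d =
 *					    *(const uint32_t *)(const void *)s;
 *			while (len--)			// trailing bytes
 *				*d++ = *s++;
 *		} else {				// backward copy
 *			d += len;			// copy downwards so
 *			s += len;			// overlap is safe
 *			while (len--)
 *				*--d = *--s;
 *		}
 *		return dst;
 *	}
 */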


_memcpy:
	/* Determine copy direction */
	cmp	r1, r0
	bcc	Lmemcpy_backwards

	moveq	r0, #0			/* Quick abort for src == dst */
	moveq	pc, lr

	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
	subs	r2, r2, #4
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4, r7, r8, r9, r10}	/* borrow r4 and r7-r10 */

	/* blat 64 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmia	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	ldmia	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmia	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	subs	r2, r2, #0x40
	bge	Lmemcpy_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4, r7, r8, r9, r10}	/* restore r4 and r7-r10 */

Lmemcpy_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8
	blt	Lmemcpy_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
Lmemcpy_fdestul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
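	/*
	 * All loads below stay word aligned: the source pointer is rounded
	 * down to a word boundary and one word is pre-loaded into lr.  Each
	 * output word is then built by shifting the previously loaded word
	 * right by 8, 16 or 24 bits (depending on the misalignment in r12)
	 * and OR-ing in the next word shifted left by the complementary
	 * amount.
	 */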
Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	Lmemcpy_fsrcul3
	beq	Lmemcpy_fsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3
	b	Lmemcpy_fl4

Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2
	b	Lmemcpy_fl4

Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1
	b	Lmemcpy_fl4

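/*
 * Backwards copy: taken when the destination lies above the source, so
 * an overlapping region is still copied correctly.  Both pointers are
 * advanced to the end of their buffers and the same alignment stages as
 * the forward path are applied, working downwards with DB addressing.
 */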
Lmemcpy_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, r7, r8, r9, r10, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	Lmemcpy_bl32

	/* blat 64 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmdb	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	ldmdb	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmdb	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	subs	r2, r2, #0x40
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, r7, r8, r9, r10, lr}

Lmemcpy_bl12:
	adds	r2, r2, #8
	blt	Lmemcpy_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	moveq	pc, lr			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	mov	pc, lr

	/* erg - unaligned destination */
Lmemcpy_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
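	/*
	 * Mirror image of the forward unaligned-source code: the source is
	 * rounded down to a word boundary, the word at the new r1 is
	 * pre-loaded into r3, and each output word is assembled by shifting
	 * the previously loaded word left and OR-ing in the next
	 * (lower-addressed) word shifted right, again by 8, 16 or 24 bits
	 * according to the misalignment in r12.
	 */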
Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	Lmemcpy_bsrcul1
	beq	Lmemcpy_bsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	add	r1, r1, #3
	b	Lmemcpy_bl4

Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	add	r1, r1, #2
	b	Lmemcpy_bl4

Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

Lmemcpy_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	add	r1, r1, #1
	b	Lmemcpy_bl4