/*	$NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This was modified by Jay Monkman <jmonkman@smoothsmoothie.com> to
 * save and restore r12.  This is necessary for RTEMS.
 */
/* #include <machine/asm.h> */

/*
 * ENTRY(label): declare a global entry-point symbol and emit its label.
 * NOTE(review): not used by the active code below -- memcpy/memmove are
 * declared directly with .globl; this macro is only referenced by the
 * commented-out "@ ENTRY(...)" lines.
 */
#define ENTRY(_LABEL) \
	.global _LABEL; _LABEL:

/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * APCS: r0 = dst, r1 = src, r2 = len; returns the original dst in r0.
 * Thin wrapper over _memcpy below: r0 is saved so the destination
 * pointer can be returned, and r12 is saved/restored because RTEMS
 * requires it preserved (see the note at the top of the file).
 */
	.globl	memcpy
memcpy:

@ ENTRY(gp2x_memcpy)
	stmfd	sp!, {r0, r12, lr}	/* keep dst, r12 and return address */
@	bl	_gp2x_memcpy
	bl	_memcpy
	ldmfd	sp!, {r0, r12, pc}	/* r0 = original dst; return */


/*
 * void *memmove(void *dst, const void *src, size_t len)
 *
 * APCS: r0 = dst, r1 = src, r2 = len; returns the original dst in r0.
 * Shares _memcpy with memcpy above: _memcpy compares src against dst and
 * copies backwards when src < dst, so overlapping regions are handled
 * correctly, which is exactly the memmove contract.  r12 is saved and
 * restored as RTEMS requires (see the note at the top of the file).
 */
	.globl	memmove
memmove:

@ ENTRY(gp2x_memmove)
	stmfd	sp!, {r0, r12, lr}	/* keep dst, r12 and return address */
@	bl	_gp2x_memcpy
	bl	_memcpy
	ldmfd	sp!, {r0, r12, pc}	/* r0 = original dst; return */


/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy() and
 * memmove() (the original NetBSD source also provided a bcopy() stub).
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix, a label starting with f is used in the forward copy
 * code, while a label starting with b is used in the backwards copy code.
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy:
 *	unaligned source address
 *	unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function, along with
 * r0-r3, although r0-r2 have defined uses, i.e. src, dest, len throughout.
 * Additional registers are preserved prior to use, i.e. r4, r5 & lr.
 *
 * Apologies for the state of the comments ;-)
 */



/*
 * _memcpy: core copy engine shared by memcpy() and memmove() above.
 *
 * In:  r0 = dst, r1 = src, r2 = len (bytes)
 * Out: bytes copied; r0-r3 and r12 may be trashed (the wrappers above
 *      save what their callers need).
 *
 * If src < dst the copy is performed backwards (from the high end) so
 * that overlapping regions are copied correctly.
 * NOTE(review): the word-reassembly shifts in the unaligned-source paths
 * (lsr/lsl by 8/16/24) assume a little-endian ARM -- confirm for the
 * target before reusing on a big-endian configuration.
 */
_memcpy:

@ ENTRY(_gp2x_memcpy)
	/* Determine copy direction */
	cmp	r1, r0
	bcc	Lmemcpy_backwards	/* src < dst: copy from the end */

	moveq	r0, #0			/* quick abort for src == dst (nothing to do) */
	moveq	pc, lr

	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
	subs	r2, r2, #4
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4, r7, r8, r9, r10}	/* borrow r4 */

	/* blat 64 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmia	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	ldmia	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmia	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	subs	r2, r2, #0x40
	bge	Lmemcpy_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4, r7, r8, r9, r10}	/* return r4 */

Lmemcpy_fl32:
	adds	r2, r2, #0x14		/* undo the 0x14 bias; GE set iff >= 12 left */

	/* blat 12 bytes at a time */
Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8		/* undo the 8 bias from Lmemcpy_ft8 */
	blt	Lmemcpy_fl4

	subs	r2, r2, #4		/* GE: >= 8 left (copy 8); LT: 4..7 left (copy 4) */
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2			/* 1, 2 or 3 trailing bytes */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
Lmemcpy_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes needed to align dst */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	Lmemcpy_fl4		/* less than 4 bytes left */

	ands	r12, r1, #3
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
Lmemcpy_fsrcul:
	bic	r1, r1, #3		/* word-align src; r12 = misalignment (1..3) */
	ldr	lr, [r1], #4		/* prime lr with the first partial word */
	cmp	r12, #2
	bgt	Lmemcpy_fsrcul3
	beq	Lmemcpy_fsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* src is 1 byte past a word boundary: shift/merge 16 bytes per pass */
Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3		/* restore src to its true (unaligned) position */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* src is 2 bytes past a word boundary: shift/merge 16 bytes per pass */
Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2		/* restore src to its true (unaligned) position */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* src is 3 bytes past a word boundary: shift/merge 16 bytes per pass */
Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1		/* restore src to its true (unaligned) position */
	b	Lmemcpy_fl4

Lmemcpy_backwards:
	/* src < dst: copy high-to-low so overlapping regions are safe */
	add	r1, r1, r2		/* point src/dst one past the end */
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, r7, r8, r9, r10, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	Lmemcpy_bl32

	/* blat 64 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmdb	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	ldmdb	r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
	stmdb	r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
	subs	r2, r2, #0x40
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, r7, r8, r9, r10, lr}

Lmemcpy_bl12:
	adds	r2, r2, #8
	blt	Lmemcpy_bl4
	subs	r2, r2, #4		/* GE: >= 8 left (copy 8); LT: 4..7 left (copy 4) */
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	moveq	pc, lr			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2			/* 1, 2 or 3 trailing bytes */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	mov	pc, lr

	/* erg - unaligned destination */
Lmemcpy_bdestul:
	cmp	r12, #2			/* r12 = dst misalignment = bytes to peel */

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
Lmemcpy_bsrcul:
	bic	r1, r1, #3		/* word-align src; r12 = misalignment (1..3) */
	ldr	r3, [r1, #0]		/* prime r3 with the trailing partial word */
	cmp	r12, #2
	blt	Lmemcpy_bsrcul1
	beq	Lmemcpy_bsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

	/* src is 3 bytes past a word boundary: shift/merge 16 bytes per pass */
Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	add	r1, r1, #3		/* restore src to its true (unaligned) position */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

	/* src is 2 bytes past a word boundary: shift/merge 16 bytes per pass */
Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	add	r1, r1, #2		/* restore src to its true (unaligned) position */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

	/* src is 1 byte past a word boundary: shift/merge 16 bytes per pass */
Lmemcpy_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	add	r1, r1, #1		/* restore src to its true (unaligned) position */
	b	Lmemcpy_bl4