platform/gp2x/code940/memcpy.s

   1 /* $NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $ */
   2
   3 /*-
   4 * Copyright (c) 1997 The NetBSD Foundation, Inc.
   5 * All rights reserved.
   6 *
   7 * This code is derived from software contributed to The NetBSD Foundation
   8 * by Neil A. Carson and Mark Brinicombe
   9 *
  10 * Redistribution and use in source and binary forms, with or without
  11 * modification, are permitted provided that the following conditions
  12 * are met:
  13 * 1. Redistributions of source code must retain the above copyright
  14 *    notice, this list of conditions and the following disclaimer.
  15 * 2. Redistributions in binary form must reproduce the above copyright
  16 *    notice, this list of conditions and the following disclaimer in the
  17 *    documentation and/or other materials provided with the distribution.
  18 * 3. All advertising materials mentioning features or use of this software
  19 *    must display the following acknowledgement:
  20 * This product includes software developed by the NetBSD
  21 * Foundation, Inc. and its contributors.
  22 * 4. Neither the name of The NetBSD Foundation nor the names of its
  23 *    contributors may be used to endorse or promote products derived
  24 *    from this software without specific prior written permission.
  25 *
  26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  27 * ``AS IS\'\' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  29 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 * POSSIBILITY OF SUCH DAMAGE.
  37 */
  38
  39 /* This was modified by Jay Monkman <jmonkman@smoothsmoothie.com> to
  40 *   save and restore r12. This is necessary for RTEMS.
  41 */
  42 /* #include <machine/asm.h>*/
  43
  44 #define ENTRY(_LABEL) \
  45  .global _LABEL; _LABEL:
  46
  /*
   * memcpy stub.
   * Stacks r0 (the dest pointer handed to _memcpy), r12 and the return
   * address, runs the shared copy core, then reloads r0 and r12 and returns
   * by popping the saved lr straight into pc.  r12 is kept intact for the
   * caller (see the RTEMS note at the top of the file).
   */
        .global memcpy
memcpy:
        @ ENTRY(gp2x_memcpy)
        stmdb   sp!, {r0, ip, lr}       @ save dest, r12, return address
        @ bl _gp2x_memcpy
        bl      _memcpy
        ldmia   sp!, {r0, ip, pc}       @ r0 = dest again, r12 restored, return
  55
  56
  57
  /*
   * memmove stub.
   * Same wrapper as memcpy: _memcpy itself picks the copy direction from the
   * two addresses, so both entry points share it.  r0 and r12 are saved
   * around the call and reloaded before returning.
   */
        .global memmove
memmove:
        @ ENTRY(gp2x_memmove)
        stmdb   sp!, {r0, ip, lr}       @ save dest, r12, return address
        @ bl _gp2x_memcpy
        bl      _memcpy
        ldmia   sp!, {r0, ip, pc}       @ r0 = dest again, r12 restored, return
  66
  67
  68
  69 /*
  70 * This is one fun bit of code ...
  71 * Some easy listening music is suggested while trying to understand this
  72 * code e.g. Iron Maiden
  73 *
  74 * For anyone attempting to understand it :
  75 *
  76 * The core code is implemented here with simple stubs for memcpy()
  77 * memmove() and bcopy().
  78 *
  79 * All local labels are prefixed with Lmemcpy_
  80 * Following the prefix a label starting f is used in the forward copy code
  81 * while a label using b is used in the backwards copy code
  82 * The source and destination addresses determine whether a forward or
  83 * backward copy is performed.
  84 * Separate bits of code are used to deal with the following situations
  85 * for both the forward and backwards copy.
  86 * unaligned source address
  87 * unaligned destination address
  88 * Separate copy routines are used to produce an optimised result for each
  89 * of these cases.
  90 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
  91 * a time where possible.
  92 *
  93 * Note: r12 (aka ip) can be trashed during the function along with
  94 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
  95 * Additional registers are preserved prior to use i.e. r4, r5, r7-r10 & lr
  96 *
  97 * Apologies for the state of the comments;-)
  98 */
  99
 100
 101
 /*
  * _memcpy - copy core shared by the memcpy and memmove stubs.
  *
  * In:      r0 = dest, r1 = src, r2 = len
  * Out:     forward path returns with r0 = original dest
  *          (r0 = 0 when src == dest, where nothing is copied)
  * Scratch: r3, r12.  lr, r4 and r7-r10 are stacked before being reused.
  *
  * src >= dest is copied forwards by the code below; src < dest branches to
  * Lmemcpy_backwards, which copies from the end of the buffers downwards.
  *
  * r2 is kept biased while sizes are tested: every "subs" asks whether
  * another chunk of that size is still available, and the matching "adds"
  * takes the bias off again.
  *
  * Fixed: the bulk loop moves 64 bytes per pass but used to be entered, and
  * repeated, with as little as 32 bytes left.  That wrote beyond the end of
  * the destination for aligned lengths such as 32 or 96.  The loop now only
  * runs while at least 64 bytes remain, and one conditional 32-byte block
  * picks up what is left before the existing 16/12/8/4-byte tails.
  */
_memcpy:
        @ ENTRY(_gp2x_memcpy)
        /* Determine copy direction */
        cmp     r1, r0
        bcc     Lmemcpy_backwards       /* src < dest (unsigned) */

        moveq   r0, #0                  /* src == dest: nothing to copy */
        moveq   pc, lr

        stmdb   sp!, {r0, lr}           /* memcpy() returns dest addr */
        subs    r2, r2, #4
        blt     Lmemcpy_fl4             /* less than 4 bytes */
        ands    r12, r0, #3
        bne     Lmemcpy_fdestul         /* oh unaligned destination addr */
        ands    r12, r1, #3
        bne     Lmemcpy_fsrcul          /* oh unaligned source addr */

Lmemcpy_ft8:
        /* We have aligned source and destination */
        subs    r2, r2, #8
        blt     Lmemcpy_fl12            /* less than 12 bytes (4 from above) */
        subs    r2, r2, #0x14
        blt     Lmemcpy_fl32            /* less than 32 bytes (12 from above) */
        stmdb   sp!, {r4, r7, r8, r9, r10}      /* borrow r4, r7-r10 */
        subs    r2, r2, #0x20
        blt     Lmemcpy_fl64            /* less than 64 bytes (32 from above) */

        /* blat 64 bytes at a time */
        /* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
        ldmia   r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
        stmia   r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
        ldmia   r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
        stmia   r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
        subs    r2, r2, #0x40
        bge     Lmemcpy_floop32

Lmemcpy_fl64:
        adds    r2, r2, #0x20           /* r2 = bytes left - 32 again */
        ldmgeia r1!, {r3, r4, r7, r8, r9, r10, r12, lr} /* blat a remaining 32 bytes */
        stmgeia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
        subge   r2, r2, #0x20

        cmn     r2, #0x10
        ldmgeia r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
        stmgeia r0!, {r3, r4, r12, lr}
        subge   r2, r2, #0x10
        ldmia   sp!, {r4, r7, r8, r9, r10}      /* return r4, r7-r10 */

Lmemcpy_fl32:
        adds    r2, r2, #0x14

        /* blat 12 bytes at a time */
Lmemcpy_floop12:
        ldmgeia r1!, {r3, r12, lr}
        stmgeia r0!, {r3, r12, lr}
        subges  r2, r2, #0x0c
        bge     Lmemcpy_floop12

Lmemcpy_fl12:
        adds    r2, r2, #8
        blt     Lmemcpy_fl4

        subs    r2, r2, #4
        ldrlt   r3, [r1], #4            /* 4..7 left: one word */
        strlt   r3, [r0], #4
        ldmgeia r1!, {r3, r12}          /* 8..11 left: two words */
        stmgeia r0!, {r3, r12}
        subge   r2, r2, #4

Lmemcpy_fl4:
        /* less than 4 bytes to go */
        adds    r2, r2, #4
        ldmeqia sp!, {r0, pc}           /* done */

        /* copy the crud byte at a time */
        cmp     r2, #2
        ldrb    r3, [r1], #1            /* 1 byte left at least */
        strb    r3, [r0], #1
        ldrgeb  r3, [r1], #1            /* 2 or 3 left */
        strgeb  r3, [r0], #1
        ldrgtb  r3, [r1], #1            /* 3 left */
        strgtb  r3, [r0], #1
        ldmia   sp!, {r0, pc}
 179
 /*
  * Forward copy, destination not word aligned.
  * On entry: ip = dest & 3 (non-zero), r2 = bytes left - 4 (not negative).
  * Copies 1 to 3 single bytes so that dest reaches a word boundary, then
  * carries on with the aligned copy or, if src is still unaligned, falls
  * through into Lmemcpy_fsrcul with ip = src & 3.
  */
Lmemcpy_fdestul:
        rsb     ip, ip, #4              @ ip = bytes up to the next word boundary
        cmp     ip, #2

        /* align destination with byte copies */
        ldrb    r3, [r1], #1            @ first byte: always
        strb    r3, [r0], #1
        ldrgeb  r3, [r1], #1            @ second byte: ip >= 2
        strgeb  r3, [r0], #1
        ldrgtb  r3, [r1], #1            @ third byte: ip == 3
        strgtb  r3, [r0], #1
        subs    r2, r2, ip
        blt     Lmemcpy_fl4             @ fewer than 4 bytes left

        ands    ip, r1, #3
        beq     Lmemcpy_ft8             @ source is word aligned as well
        @ otherwise fall through to Lmemcpy_fsrcul
 197
 /*
  * Forward copy, destination word aligned, source not.
  * On entry: ip = src & 3 (1, 2 or 3), r2 = bytes left - 4 (not negative).
  *
  * src is rounded down to a word boundary and read a whole word at a time.
  * Each output word is assembled from the upper part of the previous source
  * word (kept in lr) and the lower part of the next one; the shift pair is
  * 8/24, 16/16 or 24/8 depending on the misalignment.  Each variant has a
  * 16-byte loop (borrowing r4 and r5) and a single-word loop, and finally
  * moves r1 back to the true byte position before handing the last 0..3
  * bytes to Lmemcpy_fl4.
  */
Lmemcpy_fsrcul:
        bic     r1, r1, #3              @ word containing the first source byte
        ldr     lr, [r1], #4
        cmp     ip, #2
        bgt     Lmemcpy_fsrcul3         @ src & 3 == 3
        beq     Lmemcpy_fsrcul2         @ src & 3 == 2

        @ ---- src & 3 == 1 ----
        cmp     r2, #12
        blt     Lmemcpy_fsrcul1loop4    @ fewer than 16 bytes left
        sub     r2, r2, #12             @ r2 = bytes left - 16
        stmfd   sp!, {r4, r5}

Lmemcpy_fsrcul1loop16:
        mov     r3, lr, lsr #8
        ldmia   r1!, {r4, r5, ip, lr}
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, ip, lsl #24
        mov     ip, ip, lsr #8
        orr     ip, ip, lr, lsl #24
        stmia   r0!, {r3, r4, r5, ip}
        subs    r2, r2, #16
        bge     Lmemcpy_fsrcul1loop16
        ldmfd   sp!, {r4, r5}
        adds    r2, r2, #12             @ back to bytes left - 4
        blt     Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
        mov     ip, lr, lsr #8
        ldr     lr, [r1], #4
        orr     ip, ip, lr, lsl #24
        str     ip, [r0], #4
        subs    r2, r2, #4
        bge     Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
        sub     r1, r1, #3              @ r1 = next unread source byte
        b       Lmemcpy_fl4

        @ ---- src & 3 == 2 ----
Lmemcpy_fsrcul2:
        cmp     r2, #12
        blt     Lmemcpy_fsrcul2loop4
        sub     r2, r2, #12
        stmfd   sp!, {r4, r5}

Lmemcpy_fsrcul2loop16:
        mov     r3, lr, lsr #16
        ldmia   r1!, {r4, r5, ip, lr}
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, ip, lsl #16
        mov     ip, ip, lsr #16
        orr     ip, ip, lr, lsl #16
        stmia   r0!, {r3, r4, r5, ip}
        subs    r2, r2, #16
        bge     Lmemcpy_fsrcul2loop16
        ldmfd   sp!, {r4, r5}
        adds    r2, r2, #12
        blt     Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
        mov     ip, lr, lsr #16
        ldr     lr, [r1], #4
        orr     ip, ip, lr, lsl #16
        str     ip, [r0], #4
        subs    r2, r2, #4
        bge     Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
        sub     r1, r1, #2
        b       Lmemcpy_fl4

        @ ---- src & 3 == 3 ----
Lmemcpy_fsrcul3:
        cmp     r2, #12
        blt     Lmemcpy_fsrcul3loop4
        sub     r2, r2, #12
        stmfd   sp!, {r4, r5}

Lmemcpy_fsrcul3loop16:
        mov     r3, lr, lsr #24
        ldmia   r1!, {r4, r5, ip, lr}
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, ip, lsl #8
        mov     ip, ip, lsr #24
        orr     ip, ip, lr, lsl #8
        stmia   r0!, {r3, r4, r5, ip}
        subs    r2, r2, #16
        bge     Lmemcpy_fsrcul3loop16
        ldmfd   sp!, {r4, r5}
        adds    r2, r2, #12
        blt     Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
        mov     ip, lr, lsr #24
        ldr     lr, [r1], #4
        orr     ip, ip, lr, lsl #8
        str     ip, [r0], #4
        subs    r2, r2, #4
        bge     Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
        sub     r1, r1, #1
        b       Lmemcpy_fl4
 309
 /*
  * Backwards copy, reached from _memcpy when src < dest.
  * On entry: r0 = dest, r1 = src, r2 = len, lr = return address.
  * Both pointers are first moved to one past the end of their buffers and
  * the data is then copied downwards, so r0 is back at dest on return.
  *
  * r2 carries the same kind of bias as in the forward path.
  *
  * Fixed: the bulk loop moves 64 bytes per pass but used to run with as
  * little as 32 bytes left, writing below the start of the destination for
  * aligned lengths such as 32 or 96.  It is now gated on 64 bytes remaining
  * and followed by one conditional 32-byte block.
  */
Lmemcpy_backwards:
        add     r1, r1, r2              /* r1 = src + len */
        add     r0, r0, r2              /* r0 = dest + len */
        subs    r2, r2, #4
        blt     Lmemcpy_bl4             /* less than 4 bytes */
        ands    r12, r0, #3
        bne     Lmemcpy_bdestul         /* oh unaligned destination addr */
        ands    r12, r1, #3
        bne     Lmemcpy_bsrcul          /* oh unaligned source addr */

Lmemcpy_bt8:
        /* We have aligned source and destination */
        subs    r2, r2, #8
        blt     Lmemcpy_bl12            /* less than 12 bytes (4 from above) */
        stmdb   sp!, {r4, r7, r8, r9, r10, lr}
        subs    r2, r2, #0x14           /* less than 32 bytes (12 from above) */
        blt     Lmemcpy_bl32
        subs    r2, r2, #0x20           /* less than 64 bytes (32 from above) */
        blt     Lmemcpy_bl64

        /* blat 64 bytes at a time */
        /* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
        ldmdb   r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
        stmdb   r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
        ldmdb   r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
        stmdb   r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
        subs    r2, r2, #0x40
        bge     Lmemcpy_bloop32

Lmemcpy_bl64:
        adds    r2, r2, #0x20           /* r2 = bytes left - 32 again */
        ldmgedb r1!, {r3, r4, r7, r8, r9, r10, r12, lr} /* blat a remaining 32 bytes */
        stmgedb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
        subge   r2, r2, #0x20

Lmemcpy_bl32:
        cmn     r2, #0x10
        ldmgedb r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
        stmgedb r0!, {r3, r4, r12, lr}
        subge   r2, r2, #0x10
        adds    r2, r2, #0x14
        ldmgedb r1!, {r3, r12, lr}      /* blat a remaining 12 bytes */
        stmgedb r0!, {r3, r12, lr}
        subge   r2, r2, #0x0c
        ldmia   sp!, {r4, r7, r8, r9, r10, lr}

Lmemcpy_bl12:
        adds    r2, r2, #8
        blt     Lmemcpy_bl4
        subs    r2, r2, #4
        ldrlt   r3, [r1, #-4]!          /* 4..7 left: one word */
        strlt   r3, [r0, #-4]!
        ldmgedb r1!, {r3, r12}          /* 8..11 left: two words */
        stmgedb r0!, {r3, r12}
        subge   r2, r2, #4

Lmemcpy_bl4:
        /* less than 4 bytes to go */
        adds    r2, r2, #4
        moveq   pc, lr                  /* done */

        /* copy the crud byte at a time */
        cmp     r2, #2
        ldrb    r3, [r1, #-1]!          /* 1 byte left at least */
        strb    r3, [r0, #-1]!
        ldrgeb  r3, [r1, #-1]!          /* 2 or 3 left */
        strgeb  r3, [r0, #-1]!
        ldrgtb  r3, [r1, #-1]!          /* 3 left */
        strgtb  r3, [r0, #-1]!
        mov     pc, lr
 373
 /*
  * Backwards copy, destination end not word aligned.
  * On entry: ip = (dest + len) & 3 (non-zero), r2 = bytes left - 4
  * (not negative).  Going downwards, exactly ip single bytes bring dest onto
  * a word boundary.  Afterwards the aligned copy is resumed or, if src is
  * still unaligned, execution falls through into Lmemcpy_bsrcul with
  * ip = src & 3.
  */
Lmemcpy_bdestul:
        cmp     ip, #2

        /* align destination with byte copies */
        ldrb    r3, [r1, #-1]!          @ first byte: always
        strb    r3, [r0, #-1]!
        ldrgeb  r3, [r1, #-1]!          @ second byte: ip >= 2
        strgeb  r3, [r0, #-1]!
        ldrgtb  r3, [r1, #-1]!          @ third byte: ip == 3
        strgtb  r3, [r0, #-1]!
        subs    r2, r2, ip
        blt     Lmemcpy_bl4             @ fewer than 4 bytes left
        ands    ip, r1, #3
        beq     Lmemcpy_bt8             @ source is word aligned as well
        @ otherwise fall through to Lmemcpy_bsrcul
 389
 /*
  * Backwards copy, destination word aligned, source not.
  * On entry: ip = src & 3 (1, 2 or 3) for the current, one-past-the-end
  * source pointer, r2 = bytes left - 4 (not negative).
  *
  * src is rounded down to a word boundary and read a whole word at a time,
  * walking downwards.  Each output word combines the lower part of the
  * previously read word (kept in r3) with the upper part of the next lower
  * one; the shift pair is 8/24, 16/16 or 24/8 depending on the misalignment.
  * Each variant has a 16-byte loop (borrowing r4, r5 and lr) and a
  * single-word loop, and finally moves r1 forward to the true byte position
  * before handing the last 0..3 bytes to Lmemcpy_bl4.
  */
Lmemcpy_bsrcul:
        bic     r1, r1, #3              @ word containing the last source bytes
        ldr     r3, [r1]
        cmp     ip, #2
        blt     Lmemcpy_bsrcul1         @ src & 3 == 1
        beq     Lmemcpy_bsrcul2         @ src & 3 == 2

        @ ---- src & 3 == 3 ----
        cmp     r2, #12
        blt     Lmemcpy_bsrcul3loop4    @ fewer than 16 bytes left
        sub     r2, r2, #12             @ r2 = bytes left - 16
        stmfd   sp!, {r4, r5, lr}

Lmemcpy_bsrcul3loop16:
        mov     lr, r3, lsl #8
        ldmdb   r1!, {r3, r4, r5, ip}
        orr     lr, lr, ip, lsr #24
        mov     ip, ip, lsl #8
        orr     ip, ip, r5, lsr #24
        mov     r5, r5, lsl #8
        orr     r5, r5, r4, lsr #24
        mov     r4, r4, lsl #8
        orr     r4, r4, r3, lsr #24
        stmdb   r0!, {r4, r5, ip, lr}
        subs    r2, r2, #16
        bge     Lmemcpy_bsrcul3loop16
        ldmfd   sp!, {r4, r5, lr}
        adds    r2, r2, #12             @ back to bytes left - 4
        blt     Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
        mov     ip, r3, lsl #8
        ldr     r3, [r1, #-4]!
        orr     ip, ip, r3, lsr #24
        str     ip, [r0, #-4]!
        subs    r2, r2, #4
        bge     Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
        add     r1, r1, #3              @ r1 = one past the next unread byte
        b       Lmemcpy_bl4

        @ ---- src & 3 == 2 ----
Lmemcpy_bsrcul2:
        cmp     r2, #12
        blt     Lmemcpy_bsrcul2loop4
        sub     r2, r2, #12
        stmfd   sp!, {r4, r5, lr}

Lmemcpy_bsrcul2loop16:
        mov     lr, r3, lsl #16
        ldmdb   r1!, {r3, r4, r5, ip}
        orr     lr, lr, ip, lsr #16
        mov     ip, ip, lsl #16
        orr     ip, ip, r5, lsr #16
        mov     r5, r5, lsl #16
        orr     r5, r5, r4, lsr #16
        mov     r4, r4, lsl #16
        orr     r4, r4, r3, lsr #16
        stmdb   r0!, {r4, r5, ip, lr}
        subs    r2, r2, #16
        bge     Lmemcpy_bsrcul2loop16
        ldmfd   sp!, {r4, r5, lr}
        adds    r2, r2, #12
        blt     Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
        mov     ip, r3, lsl #16
        ldr     r3, [r1, #-4]!
        orr     ip, ip, r3, lsr #16
        str     ip, [r0, #-4]!
        subs    r2, r2, #4
        bge     Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
        add     r1, r1, #2
        b       Lmemcpy_bl4

        @ ---- src & 3 == 1 ----
Lmemcpy_bsrcul1:
        cmp     r2, #12
        blt     Lmemcpy_bsrcul1loop4
        sub     r2, r2, #12
        stmfd   sp!, {r4, r5, lr}

Lmemcpy_bsrcul1loop16:
        mov     lr, r3, lsl #24
        ldmdb   r1!, {r3, r4, r5, ip}
        orr     lr, lr, ip, lsr #8
        mov     ip, ip, lsl #24
        orr     ip, ip, r5, lsr #8
        mov     r5, r5, lsl #24
        orr     r5, r5, r4, lsr #8
        mov     r4, r4, lsl #24
        orr     r4, r4, r3, lsr #8
        stmdb   r0!, {r4, r5, ip, lr}
        subs    r2, r2, #16
        bge     Lmemcpy_bsrcul1loop16
        ldmfd   sp!, {r4, r5, lr}
        adds    r2, r2, #12
        blt     Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
        mov     ip, r3, lsl #24
        ldr     r3, [r1, #-4]!
        orr     ip, ip, r3, lsr #8
        str     ip, [r0, #-4]!
        subs    r2, r2, #4
        bge     Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
        add     r1, r1, #1
        b       Lmemcpy_bl4