platform/gp2x/code940/memcpy.s

   1 /* $NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $ */
   2
   3 /*-
   4 * Copyright (c) 1997 The NetBSD Foundation, Inc.
   5 * All rights reserved.
   6 *
   7 * This code is derived from software contributed to The NetBSD Foundation
   8 * by Neil A. Carson and Mark Brinicombe
   9 *
  10 * Redistribution and use in source and binary forms, with or without
  11 * modification, are permitted provided that the following conditions
  12 * are met:
  13 * 1. Redistributions of source code must retain the above copyright
  14 *    notice, this list of conditions and the following disclaimer.
  15 * 2. Redistributions in binary form must reproduce the above copyright
  16 *    notice, this list of conditions and the following disclaimer in the
  17 *    documentation and/or other materials provided with the distribution.
  18 * 3. All advertising materials mentioning features or use of this software
  19 *    must display the following acknowledgement:
  20 * This product includes software developed by the NetBSD
  21 * Foundation, Inc. and its contributors.
  22 * 4. Neither the name of The NetBSD Foundation nor the names of its
  23 *    contributors may be used to endorse or promote products derived
  24 *    from this software without specific prior written permission.
  25 *
  26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  29 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 * POSSIBILITY OF SUCH DAMAGE.
  37 */
  38
  39 /* #include <machine/asm.h>*/
  40
  41 .global memcpy
  42 .global _memcpy
  43 memcpy:
  44 /* Public memcpy entry: r0 = dest, r1 = src, r2 = len. Delegates the copy to _memcpy and returns the original dest in r0. */
  45 stmdb sp!, {r0, lr}  /* keep the caller's dest and the return address across the call */
  46 bl _memcpy  /* do the actual copy */
  47 ldmia sp!, {r0, pc}  /* reload dest into r0 and return in one go */
  48
  49
  50 .global memmove
  51 memmove:
  52 /* Public memmove entry: r0 = dest, r1 = src, r2 = len. Same wrapper as memcpy; _memcpy picks the copy direction itself. */
  53 stmdb sp!, {r0, lr}  /* keep the caller's dest and the return address across the call */
  54 bl _memcpy  /* do the actual copy */
  55 ldmia sp!, {r0, pc}  /* reload dest into r0 and return in one go */
  56
  57
  58
  59 /*
  60 * This is one fun bit of code ...
  61 * Some easy listening music is suggested while trying to understand this
  62 * code e.g. Iron Maiden
  63 *
  64 * For anyone attempting to understand it :
  65 *
  66 * The core code is implemented here with simple stubs for memcpy()
  67 * memmove() and bcopy().
  68 *
  69 * All local labels are prefixed with Lmemcpy_
  70 * Following the prefix a label starting f is used in the forward copy code
  71 * while a label using b is used in the backwards copy code
  72 * The source and destination addresses determine whether a forward or
  73 * backward copy is performed.
  74 * Separate bits of code are used to deal with the following situations
  75 * for both the forward and backwards copy.
  76 * unaligned source address
  77 * unaligned destination address
  78 * Separate copy routines are used to produce an optimised result for each
  79 * of these cases.
  80 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
  81 * a time where possible.
  82 *
  83 * Note: r12 (aka ip) can be trashed during the function along with
  84 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
  85 * Additional registers are preserved prior to use i.e. r4, r5, r7-r10 & lr
  86 *
  87 * Apologies for the state of the comments;-)
  88 */
  89
  90
  91
  92 _memcpy:
  93 /* Core copy routine: r0 = dest, r1 = src, r2 = len in bytes. Copies forwards when src >= dest, backwards when src < dest. */
  94 /* Determine copy direction */
  95 cmp r1, r0
  96 bcc Lmemcpy_backwards  /* src < dest (unsigned): copy from the end downwards */
  97 /* Flags are still those of cmp r1, r0, so "eq" below means src == dest, not len == 0 */
  98 moveq r0, #0   /* Quick abort for src == dest: nothing to copy. Note r0 is returned as 0 on this path */
  99 moveq pc, lr
 100 /* Forward copy (src > dest) starts here */
 101 stmdb sp!, {r0, lr}  /* save dest (popped into r0 on return) and lr, which is used as a scratch register below */
 102 subs r2, r2, #4  /* r2 = len - 4 */
 103 blt Lmemcpy_fl4  /* less than 4 bytes */
 104 ands r12, r0, #3  /* r12 = dest & 3 */
 105 bne Lmemcpy_fdestul  /* oh unaligned destination addr */
 106 ands r12, r1, #3  /* r12 = src & 3 */
 107 bne Lmemcpy_fsrcul  /* oh unaligned source addr */
 108
 109 Lmemcpy_ft8:
 110 /* We have aligned source and destination; r2 = bytes remaining - 4 on entry */
 111 subs r2, r2, #8  /* r2 = remaining - 12 */
 112 blt Lmemcpy_fl12  /* less than 12 bytes (4 from above) */
 113 subs r2, r2, #0x14  /* r2 = remaining - 32 */
 114 blt Lmemcpy_fl32  /* less than 32 bytes (12 from above) */
 115 stmdb sp!, {r4, r7, r8, r9, r10}  /* borrow r4 and r7-r10 for the 32-byte loop */
 116
 117 /* blat 32 bytes at a time */
 118 /* XXX for really big copies perhaps we should use more registers */
 119 Lmemcpy_floop32:
 120 ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}  /* load 8 words, post-incrementing src */
 121 stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}  /* store 8 words, post-incrementing dest */
 122 subs r2, r2, #0x20
 123 bge Lmemcpy_floop32  /* loop while at least 32 more bytes remain */
 124
 125 cmn r2, #0x10  /* tests r2 + 16: ge means at least 16 bytes are left */
 126 ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
 127 stmgeia r0!, {r3, r4, r12, lr}
 128 subge r2, r2, #0x10
 129 ldmia sp!, {r4, r7, r8, r9, r10}  /* return r4 and r7-r10 */
 130
 131 Lmemcpy_fl32:
 132 adds r2, r2, #0x14  /* r2 = remaining - 12; the flags set here gate the first pass of the loop below */
 133
 134 /* blat 12 bytes at a time */
 135 Lmemcpy_floop12:
 136 ldmgeia r1!, {r3, r12, lr}
 137 stmgeia r0!, {r3, r12, lr}
 138 subges r2, r2, #0x0c  /* only updates r2 and the flags when a 12-byte chunk was copied */
 139 bge Lmemcpy_floop12
 140
 141 Lmemcpy_fl12:
 142 adds r2, r2, #8  /* r2 = remaining - 4 */
 143 blt Lmemcpy_fl4  /* fewer than 4 bytes left */
 144
 145 subs r2, r2, #4  /* r2 = remaining - 8 */
 146 ldrlt r3, [r1], #4  /* 4..7 bytes left: copy one word */
 147 strlt r3, [r0], #4
 148 ldmgeia r1!, {r3, r12}  /* 8..11 bytes left: copy two words */
 149 stmgeia r0!, {r3, r12}
 150 subge r2, r2, #4  /* either way r2 ends up as remaining - 4 */
 151
 152 Lmemcpy_fl4:
 153 /* less than 4 bytes to go; r2 = remaining - 4 on entry */
 154 adds r2, r2, #4  /* r2 = remaining (0..3) */
 155 ldmeqia sp!, {r0, pc}  /* done */
 156
 157 /* copy the crud byte at a time */
 158 cmp r2, #2  /* r2 is 1..3 here: lt = 1 byte, eq = 2 bytes, gt = 3 bytes */
 159 ldrb r3, [r1], #1  /* first byte is always copied */
 160 strb r3, [r0], #1
 161 ldrgeb r3, [r1], #1  /* second byte when r2 >= 2 */
 162 strgeb r3, [r0], #1
 163 ldrgtb r3, [r1], #1  /* third byte when r2 == 3 */
 164 strgtb r3, [r0], #1
 165 ldmia sp!, {r0, pc}  /* pop the saved dest into r0 and return */
 166
 167 /* erg - unaligned destination (forward copy); entered with r12 = dest & 3 (non-zero) and r2 = len - 4 >= 0 */
 168 Lmemcpy_fdestul:
 169 rsb r12, r12, #4  /* r12 = 4 - (dest & 3) = bytes needed to word-align dest (1..3) */
 170 cmp r12, #2  /* lt = 1 byte, eq = 2 bytes, gt = 3 bytes; the byte loads/stores below leave the flags alone */
 171
 172 /* align destination with byte copies */
 173 ldrb r3, [r1], #1  /* first byte is always copied */
 174 strb r3, [r0], #1
 175 ldrgeb r3, [r1], #1  /* second byte when r12 >= 2 */
 176 strgeb r3, [r0], #1
 177 ldrgtb r3, [r1], #1  /* third byte when r12 == 3 */
 178 strgtb r3, [r0], #1
 179 subs r2, r2, r12  /* account for the bytes just copied; r2 = remaining - 4 */
 180 blt Lmemcpy_fl4  /* less than 4 bytes */
 181
 182 ands r12, r1, #3  /* r12 = src & 3 now that dest is aligned */
 183 beq Lmemcpy_ft8  /* we have an aligned source; otherwise fall through into Lmemcpy_fsrcul */
 184
 185 /* erg - unaligned source (forward copy, dest already word aligned); r12 = src & 3 (1..3), r2 = remaining - 4 */
 186 /* This is where it gets nasty ... words are read from aligned addresses and re-assembled with shifts; the shift directions used are those for little-endian byte order */
 187 Lmemcpy_fsrcul:
 188 bic r1, r1, #3  /* round src down to a word boundary */
 189 ldr lr, [r1], #4  /* lr = word holding the first source bytes; r1 now points at the next word */
 190 cmp r12, #2
 191 bgt Lmemcpy_fsrcul3  /* src offset 3 */
 192 beq Lmemcpy_fsrcul2  /* src offset 2 */
 193 cmp r2, #0x0c  /* src offset 1 handled below; fewer than 16 bytes left? */
 194 blt Lmemcpy_fsrcul1loop4
 195 sub r2, r2, #0x0c  /* r2 = remaining - 16 */
 196 stmdb sp!, {r4, r5}  /* borrow r4 and r5 for the 16-byte loop */
 197
 198 Lmemcpy_fsrcul1loop16:
 199 mov r3, lr, lsr #8  /* upper 3 bytes of the previous word become the low 3 bytes of the output */
 200 ldmia r1!, {r4, r5, r12, lr}  /* fetch the next 4 aligned source words */
 201 orr r3, r3, r4, lsl #24  /* top the output word up with 1 byte from the following word */
 202 mov r4, r4, lsr #8
 203 orr r4, r4, r5, lsl #24
 204 mov r5, r5, lsr #8
 205 orr r5, r5, r12, lsl #24
 206 mov r12, r12, lsr #8
 207 orr r12, r12, lr, lsl #24
 208 stmia r0!, {r3-r5, r12}  /* store 4 re-assembled words; lr carries over into the next pass */
 209 subs r2, r2, #0x10
 210 bge Lmemcpy_fsrcul1loop16  /* loop while at least 16 more bytes remain */
 211 ldmia sp!, {r4, r5}  /* return r4 and r5 */
 212 adds r2, r2, #0x0c  /* r2 = remaining - 4 */
 213 blt Lmemcpy_fsrcul1l4  /* fewer than 4 bytes left */
 214
 215 Lmemcpy_fsrcul1loop4:
 216 mov r12, lr, lsr #8  /* same re-assembly, one word per pass */
 217 ldr lr, [r1], #4
 218 orr r12, r12, lr, lsl #24
 219 str r12, [r0], #4
 220 subs r2, r2, #4
 221 bge Lmemcpy_fsrcul1loop4
 222
 223 Lmemcpy_fsrcul1l4:
 224 sub r1, r1, #3  /* r1 is one word past the partly used word; step back to the first uncopied byte (offset 1 in that word) */
 225 b Lmemcpy_fl4  /* finish with the byte-at-a-time tail */
 226
 227 Lmemcpy_fsrcul2:  /* forward copy, src offset 2 within its word; lr = current partly used source word, r2 = remaining - 4 */
 228 cmp r2, #0x0c  /* fewer than 16 bytes left? */
 229 blt Lmemcpy_fsrcul2loop4
 230 sub r2, r2, #0x0c  /* r2 = remaining - 16 */
 231 stmdb sp!, {r4, r5}  /* borrow r4 and r5 for the 16-byte loop */
 232
 233 Lmemcpy_fsrcul2loop16:
 234 mov r3, lr, lsr #16  /* upper halfword of the previous word becomes the low half of the output */
 235 ldmia r1!, {r4, r5, r12, lr}  /* fetch the next 4 aligned source words */
 236 orr r3, r3, r4, lsl #16  /* low halfword of the following word becomes the high half */
 237 mov r4, r4, lsr #16
 238 orr r4, r4, r5, lsl #16
 239 mov r5, r5, lsr #16
 240 orr r5, r5, r12, lsl #16
 241 mov r12, r12, lsr #16
 242 orr r12, r12, lr, lsl #16
 243 stmia r0!, {r3-r5, r12}  /* store 4 re-assembled words; lr carries over into the next pass */
 244 subs r2, r2, #0x10
 245 bge Lmemcpy_fsrcul2loop16  /* loop while at least 16 more bytes remain */
 246 ldmia sp!, {r4, r5}  /* return r4 and r5 */
 247 adds r2, r2, #0x0c  /* r2 = remaining - 4 */
 248 blt Lmemcpy_fsrcul2l4  /* fewer than 4 bytes left */
 249
 250 Lmemcpy_fsrcul2loop4:
 251 mov r12, lr, lsr #16  /* same re-assembly, one word per pass */
 252 ldr lr, [r1], #4
 253 orr r12, r12, lr, lsl #16
 254 str r12, [r0], #4
 255 subs r2, r2, #4
 256 bge Lmemcpy_fsrcul2loop4
 257
 258 Lmemcpy_fsrcul2l4:
 259 sub r1, r1, #2  /* r1 is one word past the partly used word; step back to the first uncopied byte (offset 2 in that word) */
 260 b Lmemcpy_fl4  /* finish with the byte-at-a-time tail */
 261
 262 Lmemcpy_fsrcul3:  /* forward copy, src offset 3 within its word; lr = current partly used source word, r2 = remaining - 4 */
 263 cmp r2, #0x0c  /* fewer than 16 bytes left? */
 264 blt Lmemcpy_fsrcul3loop4
 265 sub r2, r2, #0x0c  /* r2 = remaining - 16 */
 266 stmdb sp!, {r4, r5}  /* borrow r4 and r5 for the 16-byte loop */
 267
 268 Lmemcpy_fsrcul3loop16:
 269 mov r3, lr, lsr #24  /* top byte of the previous word becomes the low byte of the output */
 270 ldmia r1!, {r4, r5, r12, lr}  /* fetch the next 4 aligned source words */
 271 orr r3, r3, r4, lsl #8  /* low 3 bytes of the following word fill the rest */
 272 mov r4, r4, lsr #24
 273 orr r4, r4, r5, lsl #8
 274 mov r5, r5, lsr #24
 275 orr r5, r5, r12, lsl #8
 276 mov r12, r12, lsr #24
 277 orr r12, r12, lr, lsl #8
 278 stmia r0!, {r3-r5, r12}  /* store 4 re-assembled words; lr carries over into the next pass */
 279 subs r2, r2, #0x10
 280 bge Lmemcpy_fsrcul3loop16  /* loop while at least 16 more bytes remain */
 281 ldmia sp!, {r4, r5}  /* return r4 and r5 */
 282 adds r2, r2, #0x0c  /* r2 = remaining - 4 */
 283 blt Lmemcpy_fsrcul3l4  /* fewer than 4 bytes left */
 284
 285 Lmemcpy_fsrcul3loop4:
 286 mov r12, lr, lsr #24  /* same re-assembly, one word per pass */
 287 ldr lr, [r1], #4
 288 orr r12, r12, lr, lsl #8
 289 str r12, [r0], #4
 290 subs r2, r2, #4
 291 bge Lmemcpy_fsrcul3loop4
 292
 293 Lmemcpy_fsrcul3l4:
 294 sub r1, r1, #1  /* r1 is one word past the partly used word; step back to the first uncopied byte (offset 3 in that word) */
 295 b Lmemcpy_fl4  /* finish with the byte-at-a-time tail */
 296
 297 Lmemcpy_backwards:  /* src < dest: copy from the end downwards. Nothing is pushed on entry; this path returns with mov pc, lr */
 298 add r1, r1, r2  /* r1 = one past the last source byte */
 299 add r0, r0, r2  /* r0 = one past the last dest byte; it is stepped back down as bytes are stored */
 300 subs r2, r2, #4  /* r2 = len - 4 */
 301 blt Lmemcpy_bl4  /* less than 4 bytes */
 302 ands r12, r0, #3  /* r12 = (end of dest) & 3 */
 303 bne Lmemcpy_bdestul  /* oh unaligned destination addr */
 304 ands r12, r1, #3  /* r12 = (end of src) & 3 */
 305 bne Lmemcpy_bsrcul  /* oh unaligned source addr */
 306
 307 Lmemcpy_bt8:
 308 /* We have aligned source and destination; r2 = bytes remaining - 4 on entry */
 309 subs r2, r2, #8  /* r2 = remaining - 12 */
 310 blt Lmemcpy_bl12  /* less than 12 bytes (4 from above) */
 311 stmdb sp!, {r4, r7, r8, r9, r10, lr}  /* saved before the branch below because Lmemcpy_bl32 always pops them */
 312 subs r2, r2, #0x14  /* less than 32 bytes (12 from above) */
 313 blt Lmemcpy_bl32
 314
 315 /* blat 32 bytes at a time */
 316 /* XXX for really big copies perhaps we should use more registers */
 317 Lmemcpy_bloop32:
 318 ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}  /* load 8 words, pre-decrementing src */
 319 stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}  /* store 8 words, pre-decrementing dest */
 320 subs r2, r2, #0x20
 321 bge Lmemcpy_bloop32  /* loop while at least 32 more bytes remain */
 322
 323 Lmemcpy_bl32:
 324 cmn r2, #0x10  /* tests r2 + 16: ge means at least 16 bytes are left */
 325 ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
 326 stmgedb r0!, {r3, r4, r12, lr}
 327 subge r2, r2, #0x10
 328 adds r2, r2, #0x14  /* r2 = remaining - 12 */
 329 ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
 330 stmgedb r0!, {r3, r12, lr}
 331 subge r2, r2, #0x0c
 332 ldmia sp!, {r4, r7, r8, r9, r10, lr}  /* return the borrowed registers and lr */
 333
 334 Lmemcpy_bl12:
 335 adds r2, r2, #8  /* r2 = remaining - 4 */
 336 blt Lmemcpy_bl4  /* fewer than 4 bytes left */
 337 subs r2, r2, #4  /* r2 = remaining - 8 */
 338 ldrlt r3, [r1, #-4]!  /* 4..7 bytes left: copy one word */
 339 strlt r3, [r0, #-4]!
 340 ldmgedb r1!, {r3, r12}  /* 8..11 bytes left: copy two words */
 341 stmgedb r0!, {r3, r12}
 342 subge r2, r2, #4  /* either way r2 ends up as remaining - 4 */
 343
 344 Lmemcpy_bl4:
 345 /* less than 4 bytes to go; r2 = remaining - 4 on entry */
 346 adds r2, r2, #4  /* r2 = remaining (0..3) */
 347 moveq pc, lr   /* done */
 348
 349 /* copy the crud byte at a time */
 350 cmp r2, #2  /* r2 is 1..3 here: lt = 1 byte, eq = 2 bytes, gt = 3 bytes */
 351 ldrb r3, [r1, #-1]!  /* first byte is always copied */
 352 strb r3, [r0, #-1]!
 353 ldrgeb r3, [r1, #-1]!  /* second byte when r2 >= 2 */
 354 strgeb r3, [r0, #-1]!
 355 ldrgtb r3, [r1, #-1]!  /* third byte when r2 == 3 */
 356 strgtb r3, [r0, #-1]!
 357 mov pc, lr  /* return */
 358
 359 /* erg - unaligned destination (backward copy); entered with r12 = (end of dest) & 3 (non-zero) and r2 = len - 4 >= 0 */
 360 Lmemcpy_bdestul:
 361 cmp r12, #2  /* r12 is also the byte count needed to reach a word boundary going down: lt = 1, eq = 2, gt = 3 */
 362
 363 /* align destination with byte copies */
 364 ldrb r3, [r1, #-1]!  /* first byte is always copied */
 365 strb r3, [r0, #-1]!
 366 ldrgeb r3, [r1, #-1]!  /* second byte when r12 >= 2 */
 367 strgeb r3, [r0, #-1]!
 368 ldrgtb r3, [r1, #-1]!  /* third byte when r12 == 3 */
 369 strgtb r3, [r0, #-1]!
 370 subs r2, r2, r12  /* account for the bytes just copied; r2 = remaining - 4 */
 371 blt Lmemcpy_bl4  /* less than 4 bytes to go */
 372 ands r12, r1, #3  /* r12 = (end of src) & 3 now that dest is aligned */
 373 beq Lmemcpy_bt8  /* we have an aligned source; otherwise fall through into Lmemcpy_bsrcul */
 374
 375 /* erg - unaligned source (backward copy, dest already word aligned); r12 = (end of src) & 3 (1..3), r2 = remaining - 4 */
 376 /* This is where it gets nasty ... words are read from aligned addresses and re-assembled with shifts; the shift directions used are those for little-endian byte order */
 377 Lmemcpy_bsrcul:
 378 bic r1, r1, #3  /* round the end-of-src pointer down to a word boundary */
 379 ldr r3, [r1, #0]  /* r3 = word holding the last r12 source bytes; r1 is not moved */
 380 cmp r12, #2
 381 blt Lmemcpy_bsrcul1  /* offset 1 */
 382 beq Lmemcpy_bsrcul2  /* offset 2 */
 383 cmp r2, #0x0c  /* offset 3 handled below; fewer than 16 bytes left? */
 384 blt Lmemcpy_bsrcul3loop4
 385 sub r2, r2, #0x0c  /* r2 = remaining - 16 */
 386 stmdb sp!, {r4, r5, lr}  /* borrow r4 and r5, and save lr because it is used as a data register in the loop */
 387
 388 Lmemcpy_bsrcul3loop16:
 389 mov lr, r3, lsl #8  /* low 3 bytes of the previous word become the top 3 bytes of the highest output word */
 390 ldmdb r1!, {r3-r5, r12}  /* fetch the 4 aligned source words below */
 391 orr lr, lr, r12, lsr #24  /* fill the low byte from the top byte of the next word down */
 392 mov r12, r12, lsl #8
 393 orr r12, r12, r5, lsr #24
 394 mov r5, r5, lsl #8
 395 orr r5, r5, r4, lsr #24
 396 mov r4, r4, lsl #8
 397 orr r4, r4, r3, lsr #24
 398 stmdb r0!, {r4, r5, r12, lr}  /* store 4 re-assembled words; r3 carries over into the next pass */
 399 subs r2, r2, #0x10
 400 bge Lmemcpy_bsrcul3loop16  /* loop while at least 16 more bytes remain */
 401 ldmia sp!, {r4, r5, lr}  /* return r4, r5 and lr */
 402 adds r2, r2, #0x0c  /* r2 = remaining - 4 */
 403 blt Lmemcpy_bsrcul3l4  /* fewer than 4 bytes left */
 404
 405 Lmemcpy_bsrcul3loop4:
 406 mov r12, r3, lsl #8  /* same re-assembly, one word per pass */
 407 ldr r3, [r1, #-4]!
 408 orr r12, r12, r3, lsr #24
 409 str r12, [r0, #-4]!
 410 subs r2, r2, #4
 411 bge Lmemcpy_bsrcul3loop4
 412
 413 Lmemcpy_bsrcul3l4:
 414 add r1, r1, #3  /* r1 sits at the start of the partly used word; move up past its 3 still-uncopied low bytes */
 415 b Lmemcpy_bl4  /* finish with the byte-at-a-time tail */
 416
 417 Lmemcpy_bsrcul2:  /* backward copy, end of src at offset 2 within its word; r3 = current partly used source word, r2 = remaining - 4 */
 418 cmp r2, #0x0c  /* fewer than 16 bytes left? */
 419 blt Lmemcpy_bsrcul2loop4
 420 sub r2, r2, #0x0c  /* r2 = remaining - 16 */
 421 stmdb sp!, {r4, r5, lr}  /* borrow r4 and r5, and save lr because it is used as a data register in the loop */
 422
 423 Lmemcpy_bsrcul2loop16:
 424 mov lr, r3, lsl #16  /* low halfword of the previous word becomes the high half of the highest output word */
 425 ldmdb r1!, {r3-r5, r12}  /* fetch the 4 aligned source words below */
 426 orr lr, lr, r12, lsr #16  /* fill the low half from the high halfword of the next word down */
 427 mov r12, r12, lsl #16
 428 orr r12, r12, r5, lsr #16
 429 mov r5, r5, lsl #16
 430 orr r5, r5, r4, lsr #16
 431 mov r4, r4, lsl #16
 432 orr r4, r4, r3, lsr #16
 433 stmdb r0!, {r4, r5, r12, lr}  /* store 4 re-assembled words; r3 carries over into the next pass */
 434 subs r2, r2, #0x10
 435 bge Lmemcpy_bsrcul2loop16  /* loop while at least 16 more bytes remain */
 436 ldmia sp!, {r4, r5, lr}  /* return r4, r5 and lr */
 437 adds r2, r2, #0x0c  /* r2 = remaining - 4 */
 438 blt Lmemcpy_bsrcul2l4  /* fewer than 4 bytes left */
 439
 440 Lmemcpy_bsrcul2loop4:
 441 mov r12, r3, lsl #16  /* same re-assembly, one word per pass */
 442 ldr r3, [r1, #-4]!
 443 orr r12, r12, r3, lsr #16
 444 str r12, [r0, #-4]!
 445 subs r2, r2, #4
 446 bge Lmemcpy_bsrcul2loop4
 447
 448 Lmemcpy_bsrcul2l4:
 449 add r1, r1, #2  /* r1 sits at the start of the partly used word; move up past its 2 still-uncopied low bytes */
 450 b Lmemcpy_bl4  /* finish with the byte-at-a-time tail */
 451
 452 Lmemcpy_bsrcul1:  /* backward copy, end of src at offset 1 within its word; r3 = current partly used source word, r2 = remaining - 4 */
 453 cmp r2, #0x0c  /* fewer than 16 bytes left? */
 454 blt Lmemcpy_bsrcul1loop4
 455 sub r2, r2, #0x0c  /* r2 = remaining - 16 */
 456 stmdb sp!, {r4, r5, lr}  /* borrow r4 and r5, and save lr because it is used as a data register in the loop */
 457
 458 Lmemcpy_bsrcul1loop32:  /* despite the "32" in the label, each pass moves 16 bytes (4 words) */
 459 mov lr, r3, lsl #24  /* low byte of the previous word becomes the top byte of the highest output word */
 460 ldmdb r1!, {r3-r5, r12}  /* fetch the 4 aligned source words below */
 461 orr lr, lr, r12, lsr #8  /* fill the low 3 bytes from the top 3 bytes of the next word down */
 462 mov r12, r12, lsl #24
 463 orr r12, r12, r5, lsr #8
 464 mov r5, r5, lsl #24
 465 orr r5, r5, r4, lsr #8
 466 mov r4, r4, lsl #24
 467 orr r4, r4, r3, lsr #8
 468 stmdb r0!, {r4, r5, r12, lr}  /* store 4 re-assembled words; r3 carries over into the next pass */
 469 subs r2, r2, #0x10
 470 bge Lmemcpy_bsrcul1loop32  /* loop while at least 16 more bytes remain */
 471 ldmia sp!, {r4, r5, lr}  /* return r4, r5 and lr */
 472 adds r2, r2, #0x0c  /* r2 = remaining - 4 */
 473 blt Lmemcpy_bsrcul1l4  /* fewer than 4 bytes left */
 474
 475 Lmemcpy_bsrcul1loop4:
 476 mov r12, r3, lsl #24  /* same re-assembly, one word per pass */
 477 ldr r3, [r1, #-4]!
 478 orr r12, r12, r3, lsr #8
 479 str r12, [r0, #-4]!
 480 subs r2, r2, #4
 481 bge Lmemcpy_bsrcul1loop4
 482
 483 Lmemcpy_bsrcul1l4:
 484 add r1, r1, #1  /* r1 sits at the start of the partly used word; move up past its 1 still-uncopied low byte */
 485 b Lmemcpy_bl4  /* finish with the byte-at-a-time tail */
 485 b Lmemcpy_bl4