platform/gp2x/code940/memcpy.s

   1 /* $NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $ */
   2
   3 /*-
   4 * Copyright (c) 1997 The NetBSD Foundation, Inc.
   5 * All rights reserved.
   6 *
   7 * This code is derived from software contributed to The NetBSD Foundation
   8 * by Neil A. Carson and Mark Brinicombe
   9 *
  10 * Redistribution and use in source and binary forms, with or without
  11 * modification, are permitted provided that the following conditions
  12 * are met:
  13 * 1. Redistributions of source code must retain the above copyright
  14 *    notice, this list of conditions and the following disclaimer.
  15 * 2. Redistributions in binary form must reproduce the above copyright
  16 *    notice, this list of conditions and the following disclaimer in the
  17 *    documentation and/or other materials provided with the distribution.
  18 * 3. All advertising materials mentioning features or use of this software
  19 *    must display the following acknowledgement:
  20 * This product includes software developed by the NetBSD
  21 * Foundation, Inc. and its contributors.
  22 * 4. Neither the name of The NetBSD Foundation nor the names of its
  23 *    contributors may be used to endorse or promote products derived
  24 *    from this software without specific prior written permission.
  25 *
  26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  27 * ``AS IS\'\' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  29 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 * POSSIBILITY OF SUCH DAMAGE.
  37 */
  38
  39 /* #include <machine/asm.h>*/
  40
  41 .global memcpy
  42 .global _memcpy
  43 memcpy:
  44
     /*
      * memcpy stub: keep the incoming r0 and the return address on the
      * stack, let _memcpy do the copy, then hand the saved r0 back to the
      * caller while returning (saved lr is popped straight into pc).
      */
  45 stmdb sp!, {r0, lr}
  46 bl _memcpy
  47 ldmia sp!, {r0, pc}
  48
  49
  50 .global memmove
  51 memmove:
  52
     /*
      * memmove stub: identical wrapper to memcpy. _memcpy itself picks the
      * copy direction from the two addresses. The saved r0 is restored as
      * the return value and the saved lr is popped into pc.
      */
  53 stmdb sp!, {r0, lr}
  54 bl _memcpy
  55 ldmia sp!, {r0, pc}
  56
  57
  58
  59 /*
  60 * This is one fun bit of code ...
  61 * Some easy listening music is suggested while trying to understand this
  62 * code e.g. Iron Maiden
  63 *
  64 * For anyone attempting to understand it :
  65 *
  66 * The core code is implemented here with simple stubs for memcpy()
  67 * memmove() and bcopy().
  68 *
  69 * All local labels are prefixed with Lmemcpy_
  70 * Following the prefix a label starting f is used in the forward copy code
  71 * while a label using b is used in the backwards copy code
  72 * The source and destination addresses determine whether a forward or
  73 * backward copy is performed.
  74 * Separate bits of code are used to deal with the following situations
  75 * for both the forward and backwards copy.
  76 * unaligned source address
  77 * unaligned destination address
  78 * Separate copy routines are used to produce an optimised result for each
  79 * of these cases.
  80 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
  81 * a time where possible.
  82 *
  83 * Note: r12 (aka ip) can be trashed during the function along with
  84 * r0-r3, although r0-r2 have defined uses, i.e. dest, src, len, throughout.
  85 * Additional registers are preserved prior to use, i.e. r4, r5, r7-r10 & lr
  86 *
  87 * Apologies for the state of the comments;-)
  88 */
  89
  90
  91
  92 _memcpy:
     /*
      * Core copy routine behind the memcpy/memmove stubs.
      * In:  r0 = destination, r1 = source, r2 = byte count.
      * The forward path (this block onwards) pushes {r0, lr} and returns by
      * popping {r0, pc}; the backward path returns with "mov pc, lr".
      * r3 and r12 are used as scratch; flags are not preserved.
      */
  93
  94 /* Determine copy direction */
  95 cmp r1, r0
  96 bcc Lmemcpy_backwards  /* src below dest (unsigned): copy from the top down */
  97
     /* Flags still come from the cmp above: eq means src == dest */
  98 moveq r0, #0   /* src == dest: nothing to copy, return with r0 = 0 */
  99 moveq pc, lr
 100
 101 stmdb sp!, {r0, lr}  /* memcpy() returns dest addr */
 102 subs r2, r2, #4  /* from here on r2 = bytes remaining - 4 */
 103 blt Lmemcpy_fl4  /* less than 4 bytes */
 104 ands r12, r0, #3  /* r12 = dest misalignment (0..3) */
 105 bne Lmemcpy_fdestul  /* oh unaligned destination addr */
 106 ands r12, r1, #3  /* r12 = source misalignment (0..3) */
 107 bne Lmemcpy_fsrcul  /* oh unaligned source addr */
 108
 109 Lmemcpy_ft8:
 110 /* We have aligned source and destination */
     /*
      * Forward bulk copy, both pointers word aligned.
      * On entry r2 = bytes remaining - 4.
      *
      * The 64-byte loop is only entered when at least 64 bytes remain.
      * Previously it was entered with just 32 bytes guaranteed, so a
      * remainder of 32..63 bytes made it copy a full 64 bytes and run past
      * the end of both buffers. A remainder of 32..63 bytes is now handled
      * by a single conditional 32-byte transfer after the loop.
      *
      * r4 and r7-r10 are borrowed for the wide LDM/STM and restored before
      * falling through to Lmemcpy_fl32; lr was saved on entry to _memcpy.
      */
 111 subs r2, r2, #8
 112 blt Lmemcpy_fl12  /* less than 12 bytes (4 from above) */
 113 subs r2, r2, #0x14  /* r2 = bytes remaining - 32 */
 114 blt Lmemcpy_fl32  /* less than 32 bytes (12 from above) */
 115 stmdb sp!, {r4, r7, r8, r9, r10}  /* borrow r4, r7-r10 */
     subs r2, r2, #0x20  /* r2 = bytes remaining - 64 */
     blt Lmemcpy_fl64  /* only 32..63 bytes: skip the 64-byte loop */
 116
 117 /* blat 64 bytes at a time */
 118 /* XXX for really big copies perhaps we should use more registers */
 119 Lmemcpy_floop32:
 120 ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
 121 stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
 122 ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
 123 stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
 124 subs r2, r2, #0x40
 125 bge Lmemcpy_floop32  /* loop while another 64 bytes remain */
 126
     Lmemcpy_fl64:
     adds r2, r2, #0x20  /* r2 = bytes remaining - 32 */
     ldmgeia r1!, {r3, r4, r7, r8, r9, r10, r12, lr} /* blat a remaining 32 bytes */
     stmgeia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
     subge r2, r2, #0x20  /* keep r2 = bytes remaining - 32 */

 127 cmn r2, #0x10  /* ge when at least 16 bytes remain */
 128 ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
 129 stmgeia r0!, {r3, r4, r12, lr}
 130 subge r2, r2, #0x10
 131 ldmia sp!, {r4, r7, r8, r9, r10}  /* return r4, r7-r10 */
 132
 133 Lmemcpy_fl32:
     /*
      * Forward tail for aligned pointers. Undo the 0x14 subtracted at
      * Lmemcpy_ft8 so that r2 = bytes remaining - 12.
      */
 134 adds r2, r2, #0x14
 135
 136 /* blat 12 bytes at a time */
 137 Lmemcpy_floop12:
     /* All three transfers are conditional on r2 >= 0 (flags from adds/subges) */
 138 ldmgeia r1!, {r3, r12, lr}
 139 stmgeia r0!, {r3, r12, lr}
 140 subges r2, r2, #0x0c
 141 bge Lmemcpy_floop12
 142
 143 Lmemcpy_fl12:
 144 adds r2, r2, #8  /* back to r2 = bytes remaining - 4 */
 145 blt Lmemcpy_fl4  /* fewer than 4 bytes left */
 146
 147 subs r2, r2, #4  /* lt: one word left to copy; ge: two words */
 148 ldrlt r3, [r1], #4
 149 strlt r3, [r0], #4
 150 ldmgeia r1!, {r3, r12}
 151 stmgeia r0!, {r3, r12}
 152 subge r2, r2, #4  /* account for the second word */
 153
 154 Lmemcpy_fl4:
 155 /* less than 4 bytes to go */
 156 adds r2, r2, #4  /* remove the -4 bias: r2 = bytes still to copy */
 157 ldmeqia sp!, {r0, pc}  /* done: restore saved r0 and return */
 158
 159 /* copy the crud byte at a time */
     /* One byte always, a second if r2 >= 2, a third if r2 > 2 */
 160 cmp r2, #2
 161 ldrb r3, [r1], #1
 162 strb r3, [r0], #1
 163 ldrgeb r3, [r1], #1
 164 strgeb r3, [r0], #1
 165 ldrgtb r3, [r1], #1
 166 strgtb r3, [r0], #1
 167 ldmia sp!, {r0, pc}  /* restore saved r0 and return */
 168
 169 /* erg - unaligned destination */
     /*
      * Forward copy, destination not word aligned.
      * On entry r12 = dest & 3 (1..3) and r2 = bytes remaining - 4.
      */
 170 Lmemcpy_fdestul:
 171 rsb r12, r12, #4  /* r12 = bytes needed to reach word alignment (1..3) */
 172 cmp r12, #2
 173
 174 /* align destination with byte copies */
     /* One byte always, a second if r12 >= 2, a third if r12 > 2 */
 175 ldrb r3, [r1], #1
 176 strb r3, [r0], #1
 177 ldrgeb r3, [r1], #1
 178 strgeb r3, [r0], #1
 179 ldrgtb r3, [r1], #1
 180 strgtb r3, [r0], #1
 181 subs r2, r2, r12  /* account for the alignment bytes just copied */
 182 blt Lmemcpy_fl4  /* less than 4 bytes */
 183
 184 ands r12, r1, #3  /* r12 = source misalignment after the byte copies */
 185 beq Lmemcpy_ft8  /* we have an aligned source */
     /* otherwise fall through to the unaligned-source code that follows */
 186
 187 /* erg - unaligned source */
 188 /* This is where it gets nasty ... */
     /*
      * Forward copy, destination aligned, source not.
      * On entry r12 = src & 3 (1..3) and r2 = bytes remaining - 4.
      * The source is read as whole aligned words; each output word is built
      * by shifting the previous word right and OR-ing in the next word
      * shifted left. lr always holds the most recently loaded source word.
      * NOTE(review): the shift directions presumably assume a little-endian
      * memory layout - confirm for the target.
      */
 189 Lmemcpy_fsrcul:
 190 bic r1, r1, #3  /* round the source down to a word boundary */
 191 ldr lr, [r1], #4  /* prime lr with the first (partial) source word */
 192 cmp r12, #2
 193 bgt Lmemcpy_fsrcul3  /* source offset 3 */
 194 beq Lmemcpy_fsrcul2  /* source offset 2 */
     /* source offset 1: combine with shifts of 8 and 24 */
 195 cmp r2, #0x0c
 196 blt Lmemcpy_fsrcul1loop4  /* fewer than 16 bytes: word-at-a-time only */
 197 sub r2, r2, #0x0c  /* r2 = bytes remaining - 16 */
 198 stmdb sp!, {r4, r5}  /* borrow r4, r5 */
 199
 200 Lmemcpy_fsrcul1loop16:
     /* 16 bytes per iteration: four source words in, four merged words out */
 201 mov r3, lr, lsr #8
 202 ldmia r1!, {r4, r5, r12, lr}
 203 orr r3, r3, r4, lsl #24
 204 mov r4, r4, lsr #8
 205 orr r4, r4, r5, lsl #24
 206 mov r5, r5, lsr #8
 207 orr r5, r5, r12, lsl #24
 208 mov r12, r12, lsr #8
 209 orr r12, r12, lr, lsl #24
 210 stmia r0!, {r3-r5, r12}
 211 subs r2, r2, #0x10
 212 bge Lmemcpy_fsrcul1loop16
 213 ldmia sp!, {r4, r5}  /* return r4, r5 */
 214 adds r2, r2, #0x0c  /* back to r2 = bytes remaining - 4 */
 215 blt Lmemcpy_fsrcul1l4
 216
 217 Lmemcpy_fsrcul1loop4:
     /* one merged word per iteration */
 218 mov r12, lr, lsr #8
 219 ldr lr, [r1], #4
 220 orr r12, r12, lr, lsl #24
 221 str r12, [r0], #4
 222 subs r2, r2, #4
 223 bge Lmemcpy_fsrcul1loop4
 224
 225 Lmemcpy_fsrcul1l4:
 226 sub r1, r1, #3  /* step r1 back 3 before handing over to the byte-wise tail */
 227 b Lmemcpy_fl4
 228
 229 Lmemcpy_fsrcul2:
     /*
      * Forward copy, source offset 2 within its word: same scheme as the
      * offset-1 code, using shifts of 16 and 16.
      * On entry lr = first aligned source word, r1 points at the next word,
      * r2 = bytes remaining - 4.
      */
 230 cmp r2, #0x0c
 231 blt Lmemcpy_fsrcul2loop4  /* fewer than 16 bytes: word-at-a-time only */
 232 sub r2, r2, #0x0c  /* r2 = bytes remaining - 16 */
 233 stmdb sp!, {r4, r5}  /* borrow r4, r5 */
 234
 235 Lmemcpy_fsrcul2loop16:
     /* 16 bytes per iteration */
 236 mov r3, lr, lsr #16
 237 ldmia r1!, {r4, r5, r12, lr}
 238 orr r3, r3, r4, lsl #16
 239 mov r4, r4, lsr #16
 240 orr r4, r4, r5, lsl #16
 241 mov r5, r5, lsr #16
 242 orr r5, r5, r12, lsl #16
 243 mov r12, r12, lsr #16
 244 orr r12, r12, lr, lsl #16
 245 stmia r0!, {r3-r5, r12}
 246 subs r2, r2, #0x10
 247 bge Lmemcpy_fsrcul2loop16
 248 ldmia sp!, {r4, r5}  /* return r4, r5 */
 249 adds r2, r2, #0x0c  /* back to r2 = bytes remaining - 4 */
 250 blt Lmemcpy_fsrcul2l4
 251
 252 Lmemcpy_fsrcul2loop4:
     /* one merged word per iteration */
 253 mov r12, lr, lsr #16
 254 ldr lr, [r1], #4
 255 orr r12, r12, lr, lsl #16
 256 str r12, [r0], #4
 257 subs r2, r2, #4
 258 bge Lmemcpy_fsrcul2loop4
 259
 260 Lmemcpy_fsrcul2l4:
 261 sub r1, r1, #2  /* step r1 back 2 before handing over to the byte-wise tail */
 262 b Lmemcpy_fl4
 263
 264 Lmemcpy_fsrcul3:
     /*
      * Forward copy, source offset 3 within its word: same scheme as the
      * offset-1 code, using shifts of 24 and 8.
      * On entry lr = first aligned source word, r1 points at the next word,
      * r2 = bytes remaining - 4.
      */
 265 cmp r2, #0x0c
 266 blt Lmemcpy_fsrcul3loop4  /* fewer than 16 bytes: word-at-a-time only */
 267 sub r2, r2, #0x0c  /* r2 = bytes remaining - 16 */
 268 stmdb sp!, {r4, r5}  /* borrow r4, r5 */
 269
 270 Lmemcpy_fsrcul3loop16:
     /* 16 bytes per iteration */
 271 mov r3, lr, lsr #24
 272 ldmia r1!, {r4, r5, r12, lr}
 273 orr r3, r3, r4, lsl #8
 274 mov r4, r4, lsr #24
 275 orr r4, r4, r5, lsl #8
 276 mov r5, r5, lsr #24
 277 orr r5, r5, r12, lsl #8
 278 mov r12, r12, lsr #24
 279 orr r12, r12, lr, lsl #8
 280 stmia r0!, {r3-r5, r12}
 281 subs r2, r2, #0x10
 282 bge Lmemcpy_fsrcul3loop16
 283 ldmia sp!, {r4, r5}  /* return r4, r5 */
 284 adds r2, r2, #0x0c  /* back to r2 = bytes remaining - 4 */
 285 blt Lmemcpy_fsrcul3l4
 286
 287 Lmemcpy_fsrcul3loop4:
     /* one merged word per iteration */
 288 mov r12, lr, lsr #24
 289 ldr lr, [r1], #4
 290 orr r12, r12, lr, lsl #8
 291 str r12, [r0], #4
 292 subs r2, r2, #4
 293 bge Lmemcpy_fsrcul3loop4
 294
 295 Lmemcpy_fsrcul3l4:
 296 sub r1, r1, #1  /* step r1 back 1 before handing over to the byte-wise tail */
 297 b Lmemcpy_fl4
 298
 299 Lmemcpy_backwards:
     /*
      * Backward copy, taken when src is below dest (unsigned compare in
      * _memcpy). Both pointers are moved to one past the end of their
      * buffers and the copy proceeds downwards.
      * Unlike the forward path nothing is pushed here; the backward code
      * returns with "mov pc, lr" and saves lr itself wherever it needs lr
      * as a data register.
      */
 300 add r1, r1, r2  /* r1 = src + len */
 301 add r0, r0, r2  /* r0 = dest + len */
 302 subs r2, r2, #4  /* from here on r2 = bytes remaining - 4 */
 303 blt Lmemcpy_bl4  /* less than 4 bytes */
 304 ands r12, r0, #3  /* r12 = misalignment of the dest end address */
 305 bne Lmemcpy_bdestul  /* oh unaligned destination addr */
 306 ands r12, r1, #3  /* r12 = misalignment of the source end address */
 307 bne Lmemcpy_bsrcul  /* oh unaligned source addr */
 308
 309 Lmemcpy_bt8:
 310 /* We have aligned source and destination */
     /*
      * Backward bulk copy, both pointers word aligned.
      * On entry r2 = bytes remaining - 4; r0/r1 point one past the bytes
      * still to be copied.
      *
      * The 64-byte loop is only entered when at least 64 bytes remain.
      * Previously it was entered with just 32 bytes guaranteed, so a
      * remainder of 32..63 bytes made it copy a full 64 bytes and write
      * below the start of the destination. A remainder of 32..63 bytes is
      * now handled by a single conditional 32-byte transfer after the loop.
      *
      * r4, r7-r10 and lr are borrowed for the wide LDM/STM and restored
      * before falling through to Lmemcpy_bl12.
      */
 311 subs r2, r2, #8
 312 blt Lmemcpy_bl12  /* less than 12 bytes (4 from above) */
 313 stmdb sp!, {r4, r7, r8, r9, r10, lr}  /* borrow r4, r7-r10, lr */
 314 subs r2, r2, #0x14  /* less than 32 bytes (12 from above) */
 315 blt Lmemcpy_bl32
     subs r2, r2, #0x20  /* r2 = bytes remaining - 64 */
     blt Lmemcpy_bl64  /* only 32..63 bytes: skip the 64-byte loop */
 316
 317 /* blat 64 bytes at a time */
 318 /* XXX for really big copies perhaps we should use more registers */
 319 Lmemcpy_bloop32:
 320 ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
 321 stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
 322 ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
 323 stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
 324 subs r2, r2, #0x40
 325 bge Lmemcpy_bloop32  /* loop while another 64 bytes remain */
 326
     Lmemcpy_bl64:
     adds r2, r2, #0x20  /* r2 = bytes remaining - 32 */
     ldmgedb r1!, {r3, r4, r7, r8, r9, r10, r12, lr} /* blat a remaining 32 bytes */
     stmgedb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
     subge r2, r2, #0x20  /* keep r2 = bytes remaining - 32 */

 327 Lmemcpy_bl32:
 328 cmn r2, #0x10  /* ge when at least 16 bytes remain */
 329 ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
 330 stmgedb r0!, {r3, r4, r12, lr}
 331 subge r2, r2, #0x10
 332 adds r2, r2, #0x14  /* r2 = bytes remaining - 12 */
 333 ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
 334 stmgedb r0!, {r3, r12, lr}
 335 subge r2, r2, #0x0c
 336 ldmia sp!, {r4, r7, r8, r9, r10, lr}  /* return r4, r7-r10, lr */
 337
 338 Lmemcpy_bl12:
     /*
      * Backward tail for aligned pointers.
      * On entry r2 = bytes remaining - 12; lr holds the return address.
      */
 339 adds r2, r2, #8  /* r2 = bytes remaining - 4 */
 340 blt Lmemcpy_bl4  /* fewer than 4 bytes left */
 341 subs r2, r2, #4  /* lt: one word left to copy; ge: two words */
 342 ldrlt r3, [r1, #-4]!
 343 strlt r3, [r0, #-4]!
 344 ldmgedb r1!, {r3, r12}
 345 stmgedb r0!, {r3, r12}
 346 subge r2, r2, #4  /* account for the second word */
 347
 348 Lmemcpy_bl4:
 349 /* less than 4 bytes to go */
 350 adds r2, r2, #4  /* remove the -4 bias: r2 = bytes still to copy */
 351 moveq pc, lr   /* done */
 352
 353 /* copy the crud byte at a time */
     /* One byte always, a second if r2 >= 2, a third if r2 > 2 */
 354 cmp r2, #2
 355 ldrb r3, [r1, #-1]!
 356 strb r3, [r0, #-1]!
 357 ldrgeb r3, [r1, #-1]!
 358 strgeb r3, [r0, #-1]!
 359 ldrgtb r3, [r1, #-1]!
 360 strgtb r3, [r0, #-1]!
 361 mov pc, lr  /* return */
 362
 363 /* erg - unaligned destination */
     /*
      * Backward copy, destination end address not word aligned.
      * On entry r12 = r0 & 3 (1..3), which is exactly the number of bytes
      * to copy downwards to reach a word boundary; r2 = bytes remaining - 4.
      */
 364 Lmemcpy_bdestul:
 365 cmp r12, #2
 366
 367 /* align destination with byte copies */
     /* One byte always, a second if r12 >= 2, a third if r12 > 2 */
 368 ldrb r3, [r1, #-1]!
 369 strb r3, [r0, #-1]!
 370 ldrgeb r3, [r1, #-1]!
 371 strgeb r3, [r0, #-1]!
 372 ldrgtb r3, [r1, #-1]!
 373 strgtb r3, [r0, #-1]!
 374 subs r2, r2, r12  /* account for the alignment bytes just copied */
 375 blt Lmemcpy_bl4  /* less than 4 bytes to go */
 376 ands r12, r1, #3  /* r12 = source misalignment after the byte copies */
 377 beq Lmemcpy_bt8  /* we have an aligned source */
     /* otherwise fall through to the unaligned-source code that follows */
 378
 379 /* erg - unaligned source */
 380 /* This is where it gets nasty ... */
     /*
      * Backward copy, destination aligned, source not.
      * On entry r12 = r1 & 3 (1..3) and r2 = bytes remaining - 4.
      * The source is read as whole aligned words going downwards; each
      * output word is built by shifting the previous word left and OR-ing
      * in the next lower word shifted right. r3 always holds the most
      * recently loaded source word.
      * NOTE(review): the shift directions presumably assume a little-endian
      * memory layout - confirm for the target.
      */
 381 Lmemcpy_bsrcul:
 382 bic r1, r1, #3  /* round the source down to a word boundary */
 383 ldr r3, [r1, #0]  /* prime r3 with the topmost (partial) source word */
 384 cmp r12, #2
 385 blt Lmemcpy_bsrcul1  /* source offset 1 */
 386 beq Lmemcpy_bsrcul2  /* source offset 2 */
     /* source offset 3: combine with shifts of 8 and 24 */
 387 cmp r2, #0x0c
 388 blt Lmemcpy_bsrcul3loop4  /* fewer than 16 bytes: word-at-a-time only */
 389 sub r2, r2, #0x0c  /* r2 = bytes remaining - 16 */
 390 stmdb sp!, {r4, r5, lr}  /* borrow r4, r5 and lr */
 391
 392 Lmemcpy_bsrcul3loop16:
     /* 16 bytes per iteration: four source words in, four merged words out */
 393 mov lr, r3, lsl #8
 394 ldmdb r1!, {r3-r5, r12}
 395 orr lr, lr, r12, lsr #24
 396 mov r12, r12, lsl #8
 397 orr r12, r12, r5, lsr #24
 398 mov r5, r5, lsl #8
 399 orr r5, r5, r4, lsr #24
 400 mov r4, r4, lsl #8
 401 orr r4, r4, r3, lsr #24
 402 stmdb r0!, {r4, r5, r12, lr}
 403 subs r2, r2, #0x10
 404 bge Lmemcpy_bsrcul3loop16
 405 ldmia sp!, {r4, r5, lr}  /* return r4, r5 and lr */
 406 adds r2, r2, #0x0c  /* back to r2 = bytes remaining - 4 */
 407 blt Lmemcpy_bsrcul3l4
 408
 409 Lmemcpy_bsrcul3loop4:
     /* one merged word per iteration */
 410 mov r12, r3, lsl #8
 411 ldr r3, [r1, #-4]!
 412 orr r12, r12, r3, lsr #24
 413 str r12, [r0, #-4]!
 414 subs r2, r2, #4
 415 bge Lmemcpy_bsrcul3loop4
 416
 417 Lmemcpy_bsrcul3l4:
 418 add r1, r1, #3  /* step r1 forward 3 before handing over to the byte-wise tail */
 419 b Lmemcpy_bl4
 420
 421 Lmemcpy_bsrcul2:
     /*
      * Backward copy, source offset 2 within its word: same scheme as the
      * offset-3 code, using shifts of 16 and 16.
      * On entry r3 = topmost aligned source word, r1 points at that word,
      * r2 = bytes remaining - 4.
      */
 422 cmp r2, #0x0c
 423 blt Lmemcpy_bsrcul2loop4  /* fewer than 16 bytes: word-at-a-time only */
 424 sub r2, r2, #0x0c  /* r2 = bytes remaining - 16 */
 425 stmdb sp!, {r4, r5, lr}  /* borrow r4, r5 and lr */
 426
 427 Lmemcpy_bsrcul2loop16:
     /* 16 bytes per iteration */
 428 mov lr, r3, lsl #16
 429 ldmdb r1!, {r3-r5, r12}
 430 orr lr, lr, r12, lsr #16
 431 mov r12, r12, lsl #16
 432 orr r12, r12, r5, lsr #16
 433 mov r5, r5, lsl #16
 434 orr r5, r5, r4, lsr #16
 435 mov r4, r4, lsl #16
 436 orr r4, r4, r3, lsr #16
 437 stmdb r0!, {r4, r5, r12, lr}
 438 subs r2, r2, #0x10
 439 bge Lmemcpy_bsrcul2loop16
 440 ldmia sp!, {r4, r5, lr}  /* return r4, r5 and lr */
 441 adds r2, r2, #0x0c  /* back to r2 = bytes remaining - 4 */
 442 blt Lmemcpy_bsrcul2l4
 443
 444 Lmemcpy_bsrcul2loop4:
     /* one merged word per iteration */
 445 mov r12, r3, lsl #16
 446 ldr r3, [r1, #-4]!
 447 orr r12, r12, r3, lsr #16
 448 str r12, [r0, #-4]!
 449 subs r2, r2, #4
 450 bge Lmemcpy_bsrcul2loop4
 451
 452 Lmemcpy_bsrcul2l4:
 453 add r1, r1, #2  /* step r1 forward 2 before handing over to the byte-wise tail */
 454 b Lmemcpy_bl4
 455
 456 Lmemcpy_bsrcul1:
     /*
      * Backward copy, source offset 1 within its word: same scheme as the
      * offset-3 code, using shifts of 24 and 8.
      * On entry r3 = topmost aligned source word, r1 points at that word,
      * r2 = bytes remaining - 4.
      */
 457 cmp r2, #0x0c
 458 blt Lmemcpy_bsrcul1loop4  /* fewer than 16 bytes: word-at-a-time only */
 459 sub r2, r2, #0x0c  /* r2 = bytes remaining - 16 */
 460 stmdb sp!, {r4, r5, lr}  /* borrow r4, r5 and lr */
 461
 462 Lmemcpy_bsrcul1loop32:
     /* despite the label name, this moves 16 bytes per iteration */
 463 mov lr, r3, lsl #24
 464 ldmdb r1!, {r3-r5, r12}
 465 orr lr, lr, r12, lsr #8
 466 mov r12, r12, lsl #24
 467 orr r12, r12, r5, lsr #8
 468 mov r5, r5, lsl #24
 469 orr r5, r5, r4, lsr #8
 470 mov r4, r4, lsl #24
 471 orr r4, r4, r3, lsr #8
 472 stmdb r0!, {r4, r5, r12, lr}
 473 subs r2, r2, #0x10
 474 bge Lmemcpy_bsrcul1loop32
 475 ldmia sp!, {r4, r5, lr}  /* return r4, r5 and lr */
 476 adds r2, r2, #0x0c  /* back to r2 = bytes remaining - 4 */
 477 blt Lmemcpy_bsrcul1l4
 478
 479 Lmemcpy_bsrcul1loop4:
     /* one merged word per iteration */
 480 mov r12, r3, lsl #24
 481 ldr r3, [r1, #-4]!
 482 orr r12, r12, r3, lsr #8
 483 str r12, [r0, #-4]!
 484 subs r2, r2, #4
 485 bge Lmemcpy_bsrcul1loop4
 486
 487 Lmemcpy_bsrcul1l4:
 488 add r1, r1, #1  /* step r1 forward 1 before handing over to the byte-wise tail */
 489 b Lmemcpy_bl4