inline/parametrize rootcounter reads
diff --git a/libpcsxcore/new_dynarec/linkage_arm.s b/libpcsxcore/new_dynarec/linkage_arm.s
index f838fcb..19c9686 100644
--- a/libpcsxcore/new_dynarec/linkage_arm.s
+++ b/libpcsxcore/new_dynarec/linkage_arm.s
@@ -1,6 +1,7 @@
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
- *   Mupen64plus - linkage_arm.s                                           *
- *   Copyright (C) 2009-2010 Ari64                                         *
+ *   linkage_arm.s for PCSX                                                *
+ *   Copyright (C) 2009-2011 Ari64                                         *
+ *   Copyright (C) 2010-2011 Gražvydas "notaz" Ignotas                      *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
  *   Free Software Foundation, Inc.,                                       *
  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-       .cpu arm9tdmi
-       .fpu softvfp
-       .eabi_attribute 20, 1
-       .eabi_attribute 21, 1
-       .eabi_attribute 23, 3
-       .eabi_attribute 24, 1
-       .eabi_attribute 25, 1
-       .eabi_attribute 26, 2
-       .eabi_attribute 30, 6
-       .eabi_attribute 18, 4
-       .file   "linkage_arm.s"
+
+/* .equiv HAVE_ARMV7, 1 */
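+/* HAVE_ARMV7 selects the movw/movt paths in the macros below
+ * (define it here or from the build) */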
+
        .global rdram
 rdram = 0x80000000
        .global dynarec_local
        .global reg
        .global hi
        .global lo
-       .global reg_cop1_simple
-       .global reg_cop1_double
        .global reg_cop0
+       .global reg_cop2d
+       .global reg_cop2c
        .global FCR0
        .global FCR31
-       .global rounding_modes
        .global next_interupt
        .global cycle_count
        .global last_count
@@ -48,23 +40,25 @@ rdram = 0x80000000
        .global stop
        .global invc_ptr
        .global address
-       .global readmem_dword
-       .global dword
-       .global word
-       .global hword
-       .global byte
        .global branch_target
        .global PC
-       .global fake_pc
        .global mini_ht
        .global restore_candidate
-       .global memory_map
+       /* psx */
+       .global psxRegs
+       .global mem_rtab
+       .global mem_wtab
+       .global psxH_ptr
+       .global inv_code_start
+       .global inv_code_end
+       .global rcnts
+
        .bss
        .align  4
        .type   dynarec_local, %object
-       .size   dynarec_local, 64
+       .size   dynarec_local, dynarec_local_end-dynarec_local
 dynarec_local:
-       .space  64+16+16+8+8+8+8+256+8+8+128+128+128+16+8+132+4+256+512+4194304
+       .space  dynarec_local_end-dynarec_local
 next_interupt = dynarec_local + 64
        .type   next_interupt, %object
        .size   next_interupt, 4
@@ -77,10 +71,7 @@ last_count = cycle_count + 4
 pending_exception = last_count + 4
        .type   pending_exception, %object
        .size   pending_exception, 4
-pcaddr = pending_exception + 4
-       .type   pcaddr, %object
-       .size   pcaddr, 4
-stop = pcaddr + 4
+stop = pending_exception + 4
        .type   stop, %object
        .size   stop, 4
 invc_ptr = stop + 4
@@ -89,120 +80,169 @@ invc_ptr = stop + 4
 address = invc_ptr + 4
        .type   address, %object
        .size   address, 4
-readmem_dword = address + 4
-       .type   readmem_dword, %object
-       .size   readmem_dword, 8
-dword = readmem_dword + 8
-       .type   dword, %object
-       .size   dword, 8
-word = dword + 8
-       .type   word, %object
-       .size   word, 4
-hword = word + 4
-       .type   hword, %object
-       .size   hword, 2
-byte = hword + 2
-       .type   byte, %object
-       .size   byte, 1 /* 1 byte free */
-FCR0 = hword + 4
-       .type   FCR0, %object
-       .size   FCR0, 4
-FCR31 = FCR0 + 4
-       .type   FCR31, %object
-       .size   FCR31, 4
-reg = FCR31 + 4
+psxRegs = address + 4
+
+/* psxRegs */
+       .type   psxRegs, %object
+       .size   psxRegs, psxRegs_end-psxRegs
+reg = psxRegs
        .type   reg, %object
-       .size   reg, 256
-hi = reg + 256
-       .type   hi, %object
-       .size   hi, 8
-lo = hi + 8
+       .size   reg, 128
+lo = reg + 128
        .type   lo, %object
-       .size   lo, 8
-reg_cop0 = lo + 8
+       .size   lo, 4
+hi = lo + 4
+       .type   hi, %object
+       .size   hi, 4
+reg_cop0 = hi + 4
        .type   reg_cop0, %object
        .size   reg_cop0, 128
-reg_cop1_simple = reg_cop0 + 128
-       .type   reg_cop1_simple, %object
-       .size   reg_cop1_simple, 128
-reg_cop1_double = reg_cop1_simple + 128
-       .type   reg_cop1_double, %object
-       .size   reg_cop1_double, 128
-rounding_modes = reg_cop1_double + 128
-       .type   rounding_modes, %object
-       .size   rounding_modes, 16
-branch_target = rounding_modes + 16
-       .type   branch_target, %object
-       .size   branch_target, 4
-PC = branch_target + 4
+reg_cop2d = reg_cop0 + 128
+       .type   reg_cop2d, %object
+       .size   reg_cop2d, 128
+reg_cop2c = reg_cop2d + 128
+       .type   reg_cop2c, %object
+       .size   reg_cop2c, 128
+PC = reg_cop2c + 128
+pcaddr = PC
        .type   PC, %object
        .size   PC, 4
-fake_pc = PC + 4
-       .type   fake_pc, %object
-       .size   fake_pc, 132
-/* 4 bytes free */
-mini_ht = fake_pc + 136
+code = PC + 4
+       .type   code, %object
+       .size   code, 4
+cycle = code + 4
+       .type   cycle, %object
+       .size   cycle, 4
+interrupt = cycle + 4
+       .type   interrupt, %object
+       .size   interrupt, 4
+intCycle = interrupt + 4
+       .type   intCycle, %object
+       .size   intCycle, 256
+psxRegs_end = intCycle + 256
+
+rcnts = psxRegs_end
+       .type   rcnts, %object
+       .size   rcnts, 7*4*4
+rcnts_end = rcnts + 7*4*4
+
+mem_rtab = rcnts_end
+       .type   mem_rtab, %object
+       .size   mem_rtab, 4
+mem_wtab = mem_rtab + 4
+       .type   mem_wtab, %object
+       .size   mem_wtab, 4
+psxH_ptr = mem_wtab + 4
+       .type   psxH_ptr, %object
+       .size   psxH_ptr, 4
+inv_code_start = psxH_ptr + 4
+       .type   inv_code_start, %object
+       .size   inv_code_start, 4
+inv_code_end = inv_code_start + 4
+       .type   inv_code_end, %object
+       .size   inv_code_end, 4
+branch_target = inv_code_end + 4
+       .type   branch_target, %object
+       .size   branch_target, 4
+align0 = branch_target + 4 /* unused/alignment */
+       .type   align0, %object
+       .size   align0, 4
+mini_ht = align0 + 4
        .type   mini_ht, %object
        .size   mini_ht, 256
 restore_candidate = mini_ht + 256
        .type   restore_candidate, %object
        .size   restore_candidate, 512
-memory_map = restore_candidate + 512
-       .type   memory_map, %object
-       .size   memory_map, 4194304
+dynarec_local_end = restore_candidate + 512
 
-       .text
-       .align  2
-       .global dyna_linker
-       .type   dyna_linker, %function
-dyna_linker:
+/* unused */
+FCR0 = align0
+       .type   FCR0, %object
+       .size   FCR0, 4
+FCR31 = align0
+       .type   FCR31, %object
+       .size   FCR31, 4
+
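+/* constant-loading helpers: ARMv7 uses movw/movt, the fallback sticks to
+ * instructions available on older ARM cores */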
+.macro load_var_adr reg var
+.if HAVE_ARMV7
+       movw    \reg, #:lower16:\var
+       movt    \reg, #:upper16:\var
+.else
+       ldr     \reg, =\var
+.endif
+.endm
+
+.macro mov_16 reg imm
+.if HAVE_ARMV7
+       movw    \reg, #\imm
+.else
+       mov     \reg, #(\imm & 0x00ff)
+       orr     \reg, #(\imm & 0xff00)
+.endif
+.endm
+
+.macro mov_24 reg imm
+.if HAVE_ARMV7
+       movw    \reg, #(\imm & 0xffff)
+       movt    \reg, #(\imm >> 16)
+.else
+       mov     \reg, #(\imm & 0x0000ff)
+       orr     \reg, #(\imm & 0x00ff00)
+       orr     \reg, #(\imm & 0xff0000)
+.endif
+.endm
+
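+/* common body of dyna_linker and dyna_linker_ds: look the target address up
+ * in jump_in, then the hash table, then jump_dirty; fall through if not found */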
+.macro dyna_linker_main
        /* r0 = virtual target address */
        /* r1 = instruction to patch */
-       ldr     r4, .tlbptr
-       lsr     r5, r0, #12
-       mov     r12, r0
-       cmp     r0, #0xC0000000
-       mov     r6, #4096
-       ldrge   r12, [r4, r5, lsl #2]
-       mov     r2, #0x80000
        ldr     r3, .jiptr
-       tst     r12, r12
+       /* get_page */
+       lsr     r2, r0, #12
+       mov     r6, #4096
+       bic     r2, r2, #0xe0000
        sub     r6, r6, #1
-       moveq   r12, r0
+       cmp     r2, #0x1000
        ldr     r7, [r1]
-       eor     r2, r2, r12, lsr #12
-       and     r6, r6, r12, lsr #12
+       biclt   r2, #0x0e00
+       and     r6, r6, r2
        cmp     r2, #2048
        add     r12, r7, #2
        orrcs   r2, r6, #2048
        ldr     r5, [r3, r2, lsl #2]
        lsl     r12, r12, #8
+       add     r6, r1, r12, asr #6
+       mov     r8, #0
        /* jump_in lookup */
-.A1:
+1:
        movs    r4, r5
-       beq     .A3
+       beq     2f
        ldr     r3, [r5]
        ldr     r5, [r4, #12]
        teq     r3, r0
-       bne     .A1
+       bne     1b
        ldr     r3, [r4, #4]
        ldr     r4, [r4, #8]
        tst     r3, r3
-       bne     .A1
-.A2:
-       mov     r5, r1
-       add     r1, r1, r12, asr #6
-       teq     r1, r4
+       bne     1b
+       teq     r4, r6
        moveq   pc, r4 /* Stale i-cache */
+       mov     r8, r4
+       b       1b     /* jump_in may have dupes, continue search */
+2:
+       tst     r8, r8
+       beq     3f     /* r0 not in jump_in */
+
+       mov     r5, r1
+       mov     r1, r6
        bl      add_link
-       sub     r2, r4, r5
+       sub     r2, r8, r5
        and     r1, r7, #0xff000000
        lsl     r2, r2, #6
        sub     r1, r1, #2
        add     r1, r1, r2, lsr #8
        str     r1, [r5]
-       mov     pc, r4
-.A3:
+       mov     pc, r8
+3:
        /* hash_table lookup */
        cmp     r2, #2048
        ldr     r3, .jdptr
@@ -220,14 +260,14 @@ dyna_linker:
        teq     r7, r0
        ldreq   pc, [r6, #12]
        /* jump_dirty lookup */
-.A6:
+6:
        movs    r4, r5
-       beq     .A8
+       beq     8f
        ldr     r3, [r5]
        ldr     r5, [r4, #12]
        teq     r3, r0
-       bne     .A6
-.A7:
+       bne     6b
+7:
        ldr     r1, [r4, #8]
        /* hash_table insert */
        ldr     r2, [r6]
@@ -237,7 +277,18 @@ dyna_linker:
        str     r2, [r6, #8]
        str     r3, [r6, #12]
        mov     pc, r1
-.A8:
+8:
+.endm
+
+       .text
+       .align  2
+       .global dyna_linker
+       .type   dyna_linker, %function
+dyna_linker:
+       /* r0 = virtual target address */
+       /* r1 = instruction to patch */
+       dyna_linker_main
+
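+       /* nothing matched: compile the block at r0 */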
        mov     r4, r0
        mov     r5, r1
        bl      new_recompile_block
@@ -274,6 +325,7 @@ exec_pagefault:
        bl      get_addr_ht
        mov     pc, r0
        .size   exec_pagefault, .-exec_pagefault
+
 /* Special dynamic linker for the case where a page fault
    may occur in a branch delay slot */
        .global dyna_linker_ds
@@ -281,86 +333,8 @@ exec_pagefault:
 dyna_linker_ds:
        /* r0 = virtual target address */
        /* r1 = instruction to patch */
-       ldr     r4, .tlbptr
-       lsr     r5, r0, #12
-       mov     r12, r0
-       cmp     r0, #0xC0000000
-       mov     r6, #4096
-       ldrge   r12, [r4, r5, lsl #2]
-       mov     r2, #0x80000
-       ldr     r3, .jiptr
-       tst     r12, r12
-       sub     r6, r6, #1
-       moveq   r12, r0
-       ldr     r7, [r1]
-       eor     r2, r2, r12, lsr #12
-       and     r6, r6, r12, lsr #12
-       cmp     r2, #2048
-       add     r12, r7, #2
-       orrcs   r2, r6, #2048
-       ldr     r5, [r3, r2, lsl #2]
-       lsl     r12, r12, #8
-       /* jump_in lookup */
-.B1:
-       movs    r4, r5
-       beq     .B3
-       ldr     r3, [r5]
-       ldr     r5, [r4, #12]
-       teq     r3, r0
-       bne     .B1
-       ldr     r3, [r4, #4]
-       ldr     r4, [r4, #8]
-       tst     r3, r3
-       bne     .B1
-.B2:
-       mov     r5, r1
-       add     r1, r1, r12, asr #6
-       teq     r1, r4
-       moveq   pc, r4 /* Stale i-cache */
-       bl      add_link
-       sub     r2, r4, r5
-       and     r1, r7, #0xff000000
-       lsl     r2, r2, #6
-       sub     r1, r1, #2
-       add     r1, r1, r2, lsr #8
-       str     r1, [r5]
-       mov     pc, r4
-.B3:
-       /* hash_table lookup */
-       cmp     r2, #2048
-       ldr     r3, .jdptr
-       eor     r4, r0, r0, lsl #16
-       lslcc   r2, r0, #9
-       ldr     r6, .htptr
-       lsr     r4, r4, #12
-       lsrcc   r2, r2, #21
-       bic     r4, r4, #15
-       ldr     r5, [r3, r2, lsl #2]
-       ldr     r7, [r6, r4]!
-       teq     r7, r0
-       ldreq   pc, [r6, #4]
-       ldr     r7, [r6, #8]
-       teq     r7, r0
-       ldreq   pc, [r6, #12]
-       /* jump_dirty lookup */
-.B6:
-       movs    r4, r5
-       beq     .B8
-       ldr     r3, [r5]
-       ldr     r5, [r4, #12]
-       teq     r3, r0
-       bne     .B6
-.B7:
-       ldr     r1, [r4, #8]
-       /* hash_table insert */
-       ldr     r2, [r6]
-       ldr     r3, [r6, #4]
-       str     r0, [r6]
-       str     r1, [r6, #4]
-       str     r2, [r6, #8]
-       str     r3, [r6, #12]
-       mov     pc, r1
-.B8:
+       dyna_linker_main
+
        mov     r4, r0
        bic     r0, r0, #7
        mov     r5, r1
@@ -380,10 +354,9 @@ dyna_linker_ds:
        .word   jump_in
 .jdptr:
        .word   jump_dirty
-.tlbptr:
-       .word   tlb_LUT_r
 .htptr:
        .word   hash_table
+
        .align  2
        .global jump_vaddr_r0
        .type   jump_vaddr_r0, %function
@@ -484,6 +457,7 @@ jump_vaddr:
        ldr     r10, [fp, #cycle_count-dynarec_local]
        mov     pc, r0
        .size   jump_vaddr, .-jump_vaddr
+
        .align  2
        .global verify_code_ds
        .type   verify_code_ds, %function
@@ -493,30 +467,6 @@ verify_code_ds:
        .global verify_code_vm
        .type   verify_code_vm, %function
 verify_code_vm:
-       /* r0 = instruction pointer (virtual address) */
-       /* r1 = source (virtual address) */
-       /* r2 = target */
-       /* r3 = length */
-       cmp     r1, #0xC0000000
-       blt     verify_code
-       add     r12, fp, #memory_map-dynarec_local
-       lsr     r4, r1, #12
-       add     r5, r1, r3
-       sub     r5, #1
-       ldr     r6, [r12, r4, lsl #2]
-       lsr     r5, r5, #12
-       movs    r7, r6
-       bmi     .D5
-       add     r1, r1, r6, lsl #2
-       lsl     r6, r6, #2
-.D1:
-       add     r4, r4, #1
-       teq     r6, r7, lsl #2
-       bne     .D5
-       ldr     r7, [r12, r4, lsl #2]
-       cmp     r4, r5
-       bls     .D1
-       .size   verify_code_vm, .-verify_code_vm
        .global verify_code
        .type   verify_code, %function
 verify_code:
@@ -553,6 +503,8 @@ verify_code:
        bl      get_addr
        mov     pc, r0
        .size   verify_code, .-verify_code
+       .size   verify_code_vm, .-verify_code_vm
+
        .align  2
        .global cc_interrupt
        .type   cc_interrupt, %function
@@ -564,7 +516,8 @@ cc_interrupt:
        str     r1, [fp, #pending_exception-dynarec_local]
        and     r2, r2, r10, lsr #17
        add     r3, fp, #restore_candidate-dynarec_local
-       str     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
+       str     r10, [fp, #cycle-dynarec_local] /* PCSX cycles */
+@@     str     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
        ldr     r4, [r2, r3]
        mov     r10, lr
        tst     r4, r4
@@ -572,23 +525,20 @@ cc_interrupt:
 .E1:
        bl      gen_interupt
        mov     lr, r10
-       ldr     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
+       ldr     r10, [fp, #cycle-dynarec_local]
        ldr     r0, [fp, #next_interupt-dynarec_local]
        ldr     r1, [fp, #pending_exception-dynarec_local]
        ldr     r2, [fp, #stop-dynarec_local]
        str     r0, [fp, #last_count-dynarec_local]
        sub     r10, r10, r0
        tst     r2, r2
-       bne     .E3
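+       /* stop set: restore the registers pushed in new_dyna_start and return to its caller */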
+       ldmnefd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, pc}
        tst     r1, r1
        moveq   pc, lr
 .E2:
        ldr     r0, [fp, #pcaddr-dynarec_local]
        bl      get_addr_ht
        mov     pc, r0
-.E3:
-       add     r12, fp, #28
-       ldmia   r12, {r4, r5, r6, r7, r8, r9, sl, fp, pc}
 .E4:
        /* Move 'dirty' blocks to the 'clean' list */
        lsl     r5, r2, #3
@@ -601,21 +551,18 @@ cc_interrupt:
        tst     r5, #31
        bne     .E5
        b       .E1
-
        .size   cc_interrupt, .-cc_interrupt
+
        .align  2
        .global do_interrupt
        .type   do_interrupt, %function
 do_interrupt:
        ldr     r0, [fp, #pcaddr-dynarec_local]
        bl      get_addr_ht
-       ldr     r1, [fp, #next_interupt-dynarec_local]
-       ldr     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
-       str     r1, [fp, #last_count-dynarec_local]
-       sub     r10, r10, r1
        add     r10, r10, #2
        mov     pc, r0
        .size   do_interrupt, .-do_interrupt
+
        .align  2
        .global fp_exception
        .type   fp_exception, %function
@@ -629,7 +576,7 @@ fp_exception:
        add     r2, r2, #0x2c
        str     r1, [fp, #reg_cop0+48-dynarec_local] /* Status */
        str     r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */
-       add     r0, r3, #0x180
+       add     r0, r3, #0x80
        bl      get_addr_ht
        mov     pc, r0
        .size   fp_exception, .-fp_exception
@@ -640,6 +587,7 @@ fp_exception_ds:
        mov     r2, #0x90000000 /* Set high bit if delay slot */
        b       .E7
        .size   fp_exception_ds, .-fp_exception_ds
+
        .align  2
        .global jump_syscall
        .type   jump_syscall, %function
@@ -651,352 +599,402 @@ jump_syscall:
        mov     r2, #0x20
        str     r1, [fp, #reg_cop0+48-dynarec_local] /* Status */
        str     r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */
-       add     r0, r3, #0x180
+       add     r0, r3, #0x80
        bl      get_addr_ht
        mov     pc, r0
        .size   jump_syscall, .-jump_syscall
        .align  2
-       .global indirect_jump_indexed
-       .type   indirect_jump_indexed, %function
-indirect_jump_indexed:
-       ldr     r0, [r0, r1, lsl #2]
-       .size   indirect_jump_indexed, .-indirect_jump_indexed
-       .align  2
-       .global indirect_jump
-       .type   indirect_jump, %function
-indirect_jump:
-       ldr     r12, [fp, #last_count-dynarec_local]
-       add     r2, r2, r12 
-       str     r2, [fp, #reg_cop0+36-dynarec_local] /* Count */
-       mov     pc, r0
-       .size   indirect_jump, .-indirect_jump
+
        .align  2
-       .global jump_eret
-       .type   jump_eret, %function
-jump_eret:
-       ldr     r1, [fp, #reg_cop0+48-dynarec_local] /* Status */
-       ldr     r0, [fp, #last_count-dynarec_local]
-       bic     r1, r1, #2
-       add     r10, r0, r10
-       str     r1, [fp, #reg_cop0+48-dynarec_local] /* Status */
-       str     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
-       bl      check_interupt
+       .global jump_syscall_hle
+       .type   jump_syscall_hle, %function
+jump_syscall_hle:
+       str     r0, [fp, #pcaddr-dynarec_local] /* PC must be set to EPC for psxException */
+       ldr     r2, [fp, #last_count-dynarec_local]
+       mov     r1, #0    /* in delay slot */
+       add     r2, r2, r10
+       mov     r0, #0x20 /* cause */
+       str     r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter */
+       bl      psxException
+
+       /* note: psxException might make a recursive recompiler call from its HLE code,
+        * so be ready for this */
+pcsx_return:
        ldr     r1, [fp, #next_interupt-dynarec_local]
-       ldr     r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */
+       ldr     r10, [fp, #cycle-dynarec_local]
+       ldr     r0, [fp, #pcaddr-dynarec_local]
+       sub     r10, r10, r1
        str     r1, [fp, #last_count-dynarec_local]
-       subs    r10, r10, r1
-       bpl     .E11
-.E8:
-       add     r6, fp, #reg+256-dynarec_local
-       mov     r5, #248
-       mov     r1, #0
-.E9:
-       ldr     r2, [r6, #-8]!
-       ldr     r3, [r6, #4]
-       eor     r3, r3, r2, asr #31
-       subs    r3, r3, #1
-       adc     r1, r1, r1
-       subs    r5, r5, #8
-       bne     .E9
-       ldr     r2, [fp, #hi-dynarec_local]
-       ldr     r3, [fp, #hi+4-dynarec_local]
-       eors    r3, r3, r2, asr #31
-       ldr     r2, [fp, #lo-dynarec_local]
-       ldreq   r3, [fp, #lo+4-dynarec_local]
-       eoreq   r3, r3, r2, asr #31
-       subs    r3, r3, #1
-       adc     r1, r1, r1
-       bl      get_addr_32
+       bl      get_addr_ht
        mov     pc, r0
-.E11:
+       .size   jump_syscall_hle, .-jump_syscall_hle
+
+       .align  2
+       .global jump_hlecall
+       .type   jump_hlecall, %function
+jump_hlecall:
+       ldr     r2, [fp, #last_count-dynarec_local]
        str     r0, [fp, #pcaddr-dynarec_local]
-       bl      cc_interrupt
-       ldr     r0, [fp, #pcaddr-dynarec_local]
-       b       .E8
-       .size   jump_eret, .-jump_eret
+       add     r2, r2, r10
+       adr     lr, pcsx_return
+       str     r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter */
+       bx      r1
+       .size   jump_hlecall, .-jump_hlecall
+
        .align  2
-       .global new_dyna_start
-       .type   new_dyna_start, %function
-new_dyna_start:
-       ldr     r12, .dlptr
-       mov     r0, #0xa4000000
-       stmia   r12, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
-       sub     fp, r12, #28
-       add     r0, r0, #0x40
-       bl      new_recompile_block
-       ldr     r0, [fp, #next_interupt-dynarec_local]
-       ldr     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
-       str     r0, [fp, #last_count-dynarec_local]
-       sub     r10, r10, r0
-       mov     pc, #0x7000000
-.dlptr:
-       .word   dynarec_local+28
-       .size   new_dyna_start, .-new_dyna_start
+       .global jump_intcall
+       .type   jump_intcall, %function
+jump_intcall:
+       ldr     r2, [fp, #last_count-dynarec_local]
+       str     r0, [fp, #pcaddr-dynarec_local]
+       add     r2, r2, r10
+       adr     lr, pcsx_return
+       str     r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter */
+       b       execI
+       .size   jump_intcall, .-jump_intcall
+
+new_dyna_leave:
        .align  2
-       .global write_rdram_new
-       .type   write_rdram_new, %function
-write_rdram_new:
-       ldr     r2, [fp, #address-dynarec_local]
-       ldr     r0, [fp, #word-dynarec_local]
-       str     r0, [r2]
-       b       .E12
-       .size   write_rdram_new, .-write_rdram_new
+       .global new_dyna_leave
+       .type   new_dyna_leave, %function
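+       /* write the cycle counter back for PCSX and return to new_dyna_start's caller */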
+       ldr     r0, [fp, #last_count-dynarec_local]
+       add     r12, fp, #28
+       add     r10, r0, r10
+       str     r10, [fp, #cycle-dynarec_local]
+       ldmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, pc}
+       .size   new_dyna_leave, .-new_dyna_leave
+
        .align  2
-       .global write_rdramb_new
-       .type   write_rdramb_new, %function
-write_rdramb_new:
-       ldr     r2, [fp, #address-dynarec_local]
-       ldrb    r0, [fp, #byte-dynarec_local]
-       eor     r2, r2, #3
-       strb    r0, [r2]
-       b       .E12
-       .size   write_rdramb_new, .-write_rdramb_new
+       .global invalidate_addr_r0
+       .type   invalidate_addr_r0, %function
+invalidate_addr_r0:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       b       invalidate_addr_call
+       .size   invalidate_addr_r0, .-invalidate_addr_r0
        .align  2
-       .global write_rdramh_new
-       .type   write_rdramh_new, %function
-write_rdramh_new:
-       ldr     r2, [fp, #address-dynarec_local]
-       ldrh    r0, [fp, #hword-dynarec_local]
-       eor     r2, r2, #2
-       strh    r0, [r2]
-       b       .E12
-       .size   write_rdramh_new, .-write_rdramh_new
+       .global invalidate_addr_r1
+       .type   invalidate_addr_r1, %function
+invalidate_addr_r1:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       mov     r0, r1
+       b       invalidate_addr_call
+       .size   invalidate_addr_r1, .-invalidate_addr_r1
        .align  2
-       .global write_rdramd_new
-       .type   write_rdramd_new, %function
-write_rdramd_new:
-       ldr     r2, [fp, #address-dynarec_local]
-/*     ldrd    r0, [fp, #dword-dynarec_local]*/
-       ldr     r0, [fp, #dword-dynarec_local]
-       ldr     r1, [fp, #dword+4-dynarec_local]
-       str     r0, [r2, #4]
-       str     r1, [r2]
-       b       .E12
-       .size   write_rdramd_new, .-write_rdramd_new
+       .global invalidate_addr_r2
+       .type   invalidate_addr_r2, %function
+invalidate_addr_r2:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       mov     r0, r2
+       b       invalidate_addr_call
+       .size   invalidate_addr_r2, .-invalidate_addr_r2
        .align  2
-       .global do_invalidate
-       .type   do_invalidate, %function
-do_invalidate:
-       ldr     r2, [fp, #address-dynarec_local]
-.E12:
-       ldr     r1, [fp, #invc_ptr-dynarec_local]
-       lsr     r0, r2, #12
-       ldrb    r2, [r1, r0]
-       tst     r2, r2
-       beq     invalidate_block
-       mov     pc, lr
-       .size   do_invalidate, .-do_invalidate
+       .global invalidate_addr_r3
+       .type   invalidate_addr_r3, %function
+invalidate_addr_r3:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       mov     r0, r3
+       b       invalidate_addr_call
+       .size   invalidate_addr_r3, .-invalidate_addr_r3
+       .align  2
+       .global invalidate_addr_r4
+       .type   invalidate_addr_r4, %function
+invalidate_addr_r4:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       mov     r0, r4
+       b       invalidate_addr_call
+       .size   invalidate_addr_r4, .-invalidate_addr_r4
        .align  2
-       .global read_nomem_new
-       .type   read_nomem_new, %function
-read_nomem_new:
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       lsr     r0, r2, #12
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #8
-       tst     r12, r12
-       bmi     tlb_exception
-       ldr     r0, [r2, r12, lsl #2]
-       str     r0, [fp, #readmem_dword-dynarec_local]
-       mov     pc, lr
-       .size   read_nomem_new, .-read_nomem_new
+       .global invalidate_addr_r5
+       .type   invalidate_addr_r5, %function
+invalidate_addr_r5:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       mov     r0, r5
+       b       invalidate_addr_call
+       .size   invalidate_addr_r5, .-invalidate_addr_r5
        .align  2
-       .global read_nomemb_new
-       .type   read_nomemb_new, %function
-read_nomemb_new:
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       lsr     r0, r2, #12
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #8
-       tst     r12, r12
-       bmi     tlb_exception
-       eor     r2, r2, #3
-       ldrb    r0, [r2, r12, lsl #2]
-       str     r0, [fp, #readmem_dword-dynarec_local]
-       mov     pc, lr
-       .size   read_nomemb_new, .-read_nomemb_new
+       .global invalidate_addr_r6
+       .type   invalidate_addr_r6, %function
+invalidate_addr_r6:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       mov     r0, r6
+       b       invalidate_addr_call
+       .size   invalidate_addr_r6, .-invalidate_addr_r6
        .align  2
-       .global read_nomemh_new
-       .type   read_nomemh_new, %function
-read_nomemh_new:
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       lsr     r0, r2, #12
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #8
-       tst     r12, r12
-       bmi     tlb_exception
-       lsl     r12, r12, #2
-       eor     r2, r2, #2
-       ldrh    r0, [r2, r12]
-       str     r0, [fp, #readmem_dword-dynarec_local]
-       mov     pc, lr
-       .size   read_nomemh_new, .-read_nomemh_new
+       .global invalidate_addr_r7
+       .type   invalidate_addr_r7, %function
+invalidate_addr_r7:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       mov     r0, r7
+       b       invalidate_addr_call
+       .size   invalidate_addr_r7, .-invalidate_addr_r7
        .align  2
-       .global read_nomemd_new
-       .type   read_nomemd_new, %function
-read_nomemd_new:
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       lsr     r0, r2, #12
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #8
-       tst     r12, r12
-       bmi     tlb_exception
-       lsl     r12, r12, #2
-/*     ldrd    r0, [r2, r12]*/
-       add     r3, r2, #4
-       ldr     r0, [r2, r12]
-       ldr     r1, [r3, r12]
-       str     r0, [fp, #readmem_dword+4-dynarec_local]
-       str     r1, [fp, #readmem_dword-dynarec_local]
-       mov     pc, lr
-       .size   read_nomemd_new, .-read_nomemd_new
+       .global invalidate_addr_r8
+       .type   invalidate_addr_r8, %function
+invalidate_addr_r8:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       mov     r0, r8
+       b       invalidate_addr_call
+       .size   invalidate_addr_r8, .-invalidate_addr_r8
        .align  2
-       .global write_nomem_new
-       .type   write_nomem_new, %function
-write_nomem_new:
-       str     r3, [fp, #24]
-       str     lr, [fp, #28]
-       bl      do_invalidate
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       ldr     lr, [fp, #28]
-       lsr     r0, r2, #12
-       ldr     r3, [fp, #24]
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #0xc
-       tst     r12, #0x40000000
-       bne     tlb_exception
-       ldr     r0, [fp, #word-dynarec_local]
-       str     r0, [r2, r12, lsl #2]
-       mov     pc, lr
-       .size   write_nomem_new, .-write_nomem_new
+       .global invalidate_addr_r9
+       .type   invalidate_addr_r9, %function
+invalidate_addr_r9:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       mov     r0, r9
+       b       invalidate_addr_call
+       .size   invalidate_addr_r9, .-invalidate_addr_r9
        .align  2
-       .global write_nomemb_new
-       .type   write_nomemb_new, %function
-write_nomemb_new:
-       str     r3, [fp, #24]
-       str     lr, [fp, #28]
-       bl      do_invalidate
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       ldr     lr, [fp, #28]
-       lsr     r0, r2, #12
-       ldr     r3, [fp, #24]
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #0xc
-       tst     r12, #0x40000000
-       bne     tlb_exception
-       eor     r2, r2, #3
-       ldrb    r0, [fp, #byte-dynarec_local]
-       strb    r0, [r2, r12, lsl #2]
-       mov     pc, lr
-       .size   write_nomemb_new, .-write_nomemb_new
+       .global invalidate_addr_r10
+       .type   invalidate_addr_r10, %function
+invalidate_addr_r10:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       mov     r0, r10
+       b       invalidate_addr_call
+       .size   invalidate_addr_r10, .-invalidate_addr_r10
        .align  2
-       .global write_nomemh_new
-       .type   write_nomemh_new, %function
-write_nomemh_new:
-       str     r3, [fp, #24]
-       str     lr, [fp, #28]
-       bl      do_invalidate
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       ldr     lr, [fp, #28]
-       lsr     r0, r2, #12
-       ldr     r3, [fp, #24]
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #0xc
-       lsls    r12, #2
-       bcs     tlb_exception
-       eor     r2, r2, #2
-       ldrh    r0, [fp, #hword-dynarec_local]
-       strh    r0, [r2, r12]
-       mov     pc, lr
-       .size   write_nomemh_new, .-write_nomemh_new
+       .global invalidate_addr_r12
+       .type   invalidate_addr_r12, %function
+invalidate_addr_r12:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}
+       mov     r0, r12
+       .size   invalidate_addr_r12, .-invalidate_addr_r12
        .align  2
-       .global write_nomemd_new
-       .type   write_nomemd_new, %function
-write_nomemd_new:
-       str     r3, [fp, #24]
-       str     lr, [fp, #28]
-       bl      do_invalidate
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       ldr     lr, [fp, #28]
-       lsr     r0, r2, #12
-       ldr     r3, [fp, #24]
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #0xc
-       lsls    r12, #2
-       bcs     tlb_exception
-       add     r3, r2, #4
-       ldr     r0, [fp, #dword+4-dynarec_local]
-       ldr     r1, [fp, #dword-dynarec_local]
-/*     strd    r0, [r2, r12]*/
-       str     r0, [r2, r12]
-       str     r1, [r3, r12]
-       mov     pc, lr
-       .size   write_nomemd_new, .-write_nomemd_new
+       .global invalidate_addr_call
+       .type   invalidate_addr_call, %function
+invalidate_addr_call:
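+       /* r0 = address; skip the call when inv_code_start <= r0 <= inv_code_end */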
+       ldr     r12, [fp, #inv_code_start-dynarec_local]
+       ldr     lr, [fp, #inv_code_end-dynarec_local]
+       cmp     r0, r12
+       cmpcs   lr, r0
+       blcc    invalidate_addr
+       ldmia   fp, {r0, r1, r2, r3, r12, pc}
+       .size   invalidate_addr_call, .-invalidate_addr_call
+
        .align  2
-       .global tlb_exception
-       .type   tlb_exception, %function
-tlb_exception:
-       /* r1 = cause */
-       /* r2 = address */
-       /* r3 = instr addr/flags */
-       ldr     r4, [fp, #reg_cop0+48-dynarec_local] /* Status */
-       add     r5, fp, #memory_map-dynarec_local
-       lsr     r6, r3, #12
-       orr     r1, r1, r3, lsl #31
-       orr     r4, r4, #2
-       ldr     r7, [r5, r6, lsl #2]
-       bic     r8, r3, #3
-       str     r4, [fp, #reg_cop0+48-dynarec_local] /* Status */
-       mov     r6, #0x6000000
-       str     r1, [fp, #reg_cop0+52-dynarec_local] /* Cause */
-       orr     r6, r6, #0x22
-       ldr     r0, [r8, r7, lsl #2]
-       add     r4, r8, r1, asr #29
-       add     r5, fp, #reg-dynarec_local
-       str     r4, [fp, #reg_cop0+56-dynarec_local] /* EPC */
-       mov     r7, #0xf8
-       ldr     r8, [fp, #reg_cop0+16-dynarec_local] /* Context */
-       lsl     r1, r0, #16
-       lsr     r4, r0, #26
-       and     r7, r7, r0, lsr #18
-       mvn     r9, #0xF000000F
-       sub     r2, r2, r1, asr #16
-       bic     r9, r9, #0x0F800000
-       rors    r6, r6, r4
-       mov     r0, #0x80000000
-       ldrcs   r2, [r5, r7]
-       bic     r8, r8, r9
-       tst     r3, #2
-       str     r2, [r5, r7]
-       add     r4, r2, r1, asr #16
-       add     r6, fp, #reg+4-dynarec_local
-       asr     r3, r2, #31
-       str     r4, [fp, #reg_cop0+32-dynarec_local] /* BadVAddr */
-       add     r0, r0, #0x180
-       and     r4, r9, r4, lsr #9
-       strne   r3, [r6, r7]
-       orr     r8, r8, r4
-       str     r8, [fp, #reg_cop0+16-dynarec_local] /* Context */
+       .global new_dyna_start
+       .type   new_dyna_start, %function
+new_dyna_start:
+       /* ip is pushed as well to keep the stack 8-byte aligned, as the EABI requires */
+       stmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
+       load_var_adr fp, dynarec_local
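+       /* fp is the base pointer for all dynarec_local accesses from here on */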
+       ldr     r0, [fp, #pcaddr-dynarec_local]
        bl      get_addr_ht
        ldr     r1, [fp, #next_interupt-dynarec_local]
-       ldr     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
+       ldr     r10, [fp, #cycle-dynarec_local]
        str     r1, [fp, #last_count-dynarec_local]
        sub     r10, r10, r1
-       mov     pc, r0  
-       .size   tlb_exception, .-tlb_exception
-       .align  2
-       .global breakpoint
-       .type   breakpoint, %function
-breakpoint:
-       /* Set breakpoint here for debugging */
-       mov     pc, lr
-       .size   breakpoint, .-breakpoint
-       .section        .note.GNU-stack,"",%progbits
+       mov     pc, r0
+       .size   new_dyna_start, .-new_dyna_start
+
+/* --------------------------------------- */
+
+.align 2
+.global        jump_handler_read8
+.global        jump_handler_read16
+.global        jump_handler_read32
+.global        jump_handler_write8
+.global        jump_handler_write16
+.global        jump_handler_write32
+.global        jump_handler_write_h
+.global jump_handle_swl
+.global jump_handle_swr
+.global rcnt0_read_count_m0
+.global rcnt0_read_count_m1
+.global rcnt1_read_count_m0
+.global rcnt1_read_count_m1
+.global rcnt2_read_count_m0
+.global rcnt2_read_count_m1
+
+
+.macro pcsx_read_mem readop tab_shift
+       /* r0 = address, r1 = handler_tab, r2 = cycles */
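+       /* bit 31 of a table entry marks a handler: lsls #1 moves it into C, so on cc
+        * entry<<1 is a host base and the conditional \readop completes the access,
+        * on cs entry<<1 is the handler address taken by the bx below */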
+       lsl     r3, r0, #20
+       lsr     r3, #(20+\tab_shift)
+       ldr     r12, [fp, #last_count-dynarec_local]
+       ldr     r1, [r1, r3, lsl #2]
+       add     r2, r2, r12
+       lsls    r1, #1
+.if \tab_shift == 1
+       lsl     r3, #1
+       \readop r0, [r1, r3]
+.else
+       \readop r0, [r1, r3, lsl #\tab_shift]
+.endif
+       movcc   pc, lr
+       str     r2, [fp, #cycle-dynarec_local]
+       bx      r1
+.endm
+
+jump_handler_read8:
+       add     r1, #0x1000/4*4 + 0x1000/2*4 @ shift to r8 part
+       pcsx_read_mem ldrccb, 0
+
+jump_handler_read16:
+       add     r1, #0x1000/4*4              @ shift to r16 part
+       pcsx_read_mem ldrcch, 1
+
+jump_handler_read32:
+       pcsx_read_mem ldrcc, 2
+
+
+.macro pcsx_write_mem wrtop tab_shift
+       /* r0 = address, r1 = data, r2 = cycles, r3 = handler_tab */
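+       /* same bit-31 dispatch as pcsx_read_mem (lsls #1 + conditional store);
+        * both paths hand the cycle count back in r0 */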
+       lsl     r12,r0, #20
+       lsr     r12, #(20+\tab_shift)
+       ldr     r3, [r3, r12, lsl #2]
+       str     r0, [fp, #address-dynarec_local]      @ some handlers still need it..
+       lsls    r3, #1
+       mov     r0, r2                                @ cycle return in case of direct store
+.if \tab_shift == 1
+       lsl     r12, #1
+       \wrtop  r1, [r3, r12]
+.else
+       \wrtop  r1, [r3, r12, lsl #\tab_shift]
+.endif
+       movcc   pc, lr
+       ldr     r12, [fp, #last_count-dynarec_local]
+       mov     r0, r1
+       add     r2, r2, r12
+       push    {r2, lr}
+       str     r2, [fp, #cycle-dynarec_local]
+       blx     r3
+
+       ldr     r0, [fp, #next_interupt-dynarec_local]
+       pop     {r2, r3}
+       str     r0, [fp, #last_count-dynarec_local]
+       sub     r0, r2, r0
+       bx      r3
+.endm
+
+jump_handler_write8:
+       add     r3, #0x1000/4*4 + 0x1000/2*4 @ shift to r8 part
+       pcsx_write_mem strccb, 0
+
+jump_handler_write16:
+       add     r3, #0x1000/4*4              @ shift to r16 part
+       pcsx_write_mem strcch, 1
+
+jump_handler_write32:
+       pcsx_write_mem strcc, 2
+
+jump_handler_write_h:
+       /* r0 = address, r1 = data, r2 = cycles, r3 = handler */
+       ldr     r12, [fp, #last_count-dynarec_local]
+       str     r0, [fp, #address-dynarec_local]      @ some handlers still need it..
+       add     r2, r2, r12
+       mov     r0, r1
+       push    {r2, lr}
+       str     r2, [fp, #cycle-dynarec_local]
+       blx     r3
+
+       ldr     r0, [fp, #next_interupt-dynarec_local]
+       pop     {r2, r3}
+       str     r0, [fp, #last_count-dynarec_local]
+       sub     r0, r2, r0
+       bx      r3
+
+jump_handle_swl:
+       /* r0 = address, r1 = data, r2 = cycles */
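+       /* unaligned SWL: store the top (address&3)+1 bytes of r1, ending at the target byte */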
+       ldr     r3, [fp, #mem_wtab-dynarec_local]
+       mov     r12,r0,lsr #12
+       ldr     r3, [r3, r12, lsl #2]
+       lsls    r3, #1
+       bcs     4f
+       add     r3, r0, r3
+       mov     r0, r2
+       tst     r3, #2
+       beq     101f
+       tst     r3, #1
+       beq     2f
+3:
+       str     r1, [r3, #-3]
+       bx      lr
+2:
+       lsr     r2, r1, #8
+       lsr     r1, #24
+       strh    r2, [r3, #-2]
+       strb    r1, [r3]
+       bx      lr
+101:
+       tst     r3, #1
+       lsrne   r1, #16         @ 1
+       lsreq   r12, r1, #24    @ 0
+       strneh  r1, [r3, #-1]
+       streqb  r12, [r3]
+       bx      lr
+4:
+       mov     r0, r2
+@      b       abort
+       bx      lr              @ TODO?
+
+
+jump_handle_swr:
+       /* r0 = address, r1 = data, r2 = cycles */
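+       /* unaligned SWR: store the low 4-(address&3) bytes of r1, starting at the target byte */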
+       ldr     r3, [fp, #mem_wtab-dynarec_local]
+       mov     r12,r0,lsr #12
+       ldr     r3, [r3, r12, lsl #2]
+       lsls    r3, #1
+       bcs     4f
+       add     r3, r0, r3
+       and     r12,r3, #3
+       mov     r0, r2
+       cmp     r12,#2
+       strgtb  r1, [r3]        @ 3
+       streqh  r1, [r3]        @ 2
+       cmp     r12,#1
+       strlt   r1, [r3]        @ 0
+       bxne    lr
+       lsr     r2, r1, #8      @ 1
+       strb    r1, [r3]
+       strh    r2, [r3, #1]
+       bx      lr
+4:
+       mov     r0, r2
+@      b       abort
+       bx      lr              @ TODO?
+
+
+.macro rcntx_read_mode0 num
+       /* r0 = address, r2 = cycles */
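+       /* mode 0: count = (cycles - cycleStart) & 0xffff */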
+       ldr     r3, [fp, #rcnts-dynarec_local+6*4+7*4*\num] @ cycleStart
+       mov     r0, r2, lsl #16
+       sub     r0, r3, lsl #16
+       lsr     r0, #16
+       bx      lr
+.endm
+
+rcnt0_read_count_m0:
+       rcntx_read_mode0 0
+
+rcnt1_read_count_m0:
+       rcntx_read_mode0 1
+
+rcnt2_read_count_m0:
+       rcntx_read_mode0 2
+
+rcnt0_read_count_m1:
+       /* r0 = address, r2 = cycles */
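+       /* 0x3334 ~= 0x10000/5, so the mul + lsr #16 below approximates a divide by 5 */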
+       ldr     r3, [fp, #rcnts-dynarec_local+6*4+7*4*0] @ cycleStart
+       mov_16  r1, 0x3334
+       sub     r2, r2, r3
+       mul     r0, r1, r2              @ /= 5
+       lsr     r0, #16
+       bx      lr
+
+rcnt1_read_count_m1:
+       /* r0 = address, r2 = cycles */
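+       /* reciprocal multiply: the high word of delta * 0x1e6cde approximates delta / hsync_cycles */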
+       ldr     r3, [fp, #rcnts-dynarec_local+6*4+7*4*1]
+       mov_24  r1, 0x1e6cde
+       sub     r2, r2, r3
+       umull   r3, r0, r1, r2          @ ~ /= hsync_cycles, max ~0x1e6cdd
+       bx      lr
+
+rcnt2_read_count_m1:
+       /* r0 = address, r2 = cycles */
+       ldr     r3, [fp, #rcnts-dynarec_local+6*4+7*4*2]
+       mov     r0, r2, lsl #16-3
+       sub     r0, r3, lsl #16-3
+       lsr     r0, #16                 @ /= 8
+       bx      lr
+
+@ vim:filetype=armasm