drc: merge Ari64's patch: 11_reduce_invstub_memory_usage
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / linkage_arm.s
index f1b0f8c..57fb3d2 100644 (file)
@@ -1,6 +1,7 @@
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
- *   Mupen64plus - linkage_arm.s                                           *
- *   Copyright (C) 2009-2010 Ari64                                         *
+ *   linkage_arm.s for PCSX                                                *
+ *   Copyright (C) 2009-2011 Ari64                                         *
+ *   Copyright (C) 2010-2011 Gražvydas "notaz" Ignotas                     *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
  *   Free Software Foundation, Inc.,                                       *
  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+.equiv HAVE_ARMV7, 1
+
+.if HAVE_ARMV7
+       .cpu cortex-a8
+       .fpu vfp
+.else
        .cpu arm9tdmi
        .fpu softvfp
-       .eabi_attribute 20, 1
-       .eabi_attribute 21, 1
-       .eabi_attribute 23, 3
-       .eabi_attribute 24, 1
-       .eabi_attribute 25, 1
-       .eabi_attribute 26, 2
-       .eabi_attribute 30, 6
-       .eabi_attribute 18, 4
-       .file   "linkage_arm.s"
+.endif 
        .global rdram
 rdram = 0x80000000
        .global dynarec_local
@@ -35,6 +35,8 @@ rdram = 0x80000000
        .global hi
        .global lo
        .global reg_cop0
+       .global reg_cop2d
+       .global reg_cop2c
        .global FCR0
        .global FCR31
        .global next_interupt
@@ -58,6 +60,8 @@ rdram = 0x80000000
        .global memory_map
        /* psx */
        .global psxRegs
+       .global nd_pcsx_io
+       .global psxH_ptr
 
        .bss
        .align  4
@@ -77,10 +81,7 @@ last_count = cycle_count + 4
 pending_exception = last_count + 4
        .type   pending_exception, %object
        .size   pending_exception, 4
-pcaddr = pending_exception + 4
-       .type   pcaddr, %object
-       .size   pcaddr, 4
-stop = pcaddr + 4
+stop = pending_exception + 4
        .type   stop, %object
        .size   stop, 4
 invc_ptr = stop + 4
@@ -111,20 +112,21 @@ FCR0 = hword + 4
 FCR31 = FCR0 + 4
        .type   FCR31, %object
        .size   FCR31, 4
-reg = FCR31 + 4
+psxRegs = FCR31 + 4
 
 /* psxRegs */
-psxRegs = reg
+       .type   psxRegs, %object
+       .size   psxRegs, psxRegs_end-psxRegs
+reg = psxRegs
        .type   reg, %object
        .size   reg, 128
-       .size   psxRegs, psxRegs_end-psxRegs
-hi = reg + 128
-       .type   hi, %object
-       .size   hi, 4
-lo = hi + 4
+lo = reg + 128
        .type   lo, %object
        .size   lo, 4
-reg_cop0 = lo + 4
+hi = lo + 4
+       .type   hi, %object
+       .size   hi, 4
+reg_cop0 = hi + 4
        .type   reg_cop0, %object
        .size   reg_cop0, 128
 reg_cop2d = reg_cop0 + 128
@@ -134,6 +136,7 @@ reg_cop2c = reg_cop2d + 128
        .type   reg_cop2c, %object
        .size   reg_cop2c, 128
 PC = reg_cop2c + 128
+pcaddr = PC
        .type   PC, %object
        .size   PC, 4
 code = PC + 4
@@ -147,10 +150,43 @@ interrupt = cycle + 4
        .size   interrupt, 4
 intCycle = interrupt + 4
        .type   intCycle, %object
-       .size   intCycle, 128
-psxRegs_end = intCycle + 128
+       .size   intCycle, 256
+psxRegs_end = intCycle + 256
+
+/* nd_pcsx_io */
+nd_pcsx_io = psxRegs_end
+       .type   nd_pcsx_io, %object
+       .size   nd_pcsx_io, nd_pcsx_io_end-nd_pcsx_io
+tab_read8 = nd_pcsx_io
+       .type   tab_read8, %object
+       .size   tab_read8, 4
+tab_read16 = tab_read8 + 4
+       .type   tab_read16, %object
+       .size   tab_read16, 4
+tab_read32 = tab_read16 + 4
+       .type   tab_read32, %object
+       .size   tab_read32, 4
+tab_write8 = tab_read32 + 4
+       .type   tab_write8, %object
+       .size   tab_write8, 4
+tab_write16 = tab_write8 + 4
+       .type   tab_write16, %object
+       .size   tab_write16, 4
+tab_write32 = tab_write16 + 4
+       .type   tab_write32, %object
+       .size   tab_write32, 4
+spu_readf = tab_write32 + 4
+       .type   spu_readf, %object
+       .size   spu_readf, 4
+spu_writef = spu_readf + 4
+       .type   spu_writef, %object
+       .size   spu_writef, 4
+nd_pcsx_io_end = spu_writef + 4
 
-align0 = psxRegs_end /* just for alignment */
+psxH_ptr = nd_pcsx_io_end
+       .type   psxH_ptr, %object
+       .size   psxH_ptr, 4
+align0 = psxH_ptr + 4 /* just for alignment */
        .type   align0, %object
        .size   align0, 4
 branch_target = align0 + 4
@@ -284,6 +320,7 @@ exec_pagefault:
        bl      get_addr_ht
        mov     pc, r0
        .size   exec_pagefault, .-exec_pagefault
+
 /* Special dynamic linker for the case where a page fault
    may occur in a branch delay slot */
        .global dyna_linker_ds
@@ -386,6 +423,7 @@ dyna_linker_ds:
        .word   jump_dirty
 .htptr:
        .word   hash_table
+
        .align  2
        .global jump_vaddr_r0
        .type   jump_vaddr_r0, %function
@@ -486,6 +524,7 @@ jump_vaddr:
        ldr     r10, [fp, #cycle_count-dynarec_local]
        mov     pc, r0
        .size   jump_vaddr, .-jump_vaddr
+
        .align  2
        .global verify_code_ds
        .type   verify_code_ds, %function
@@ -495,30 +534,6 @@ verify_code_ds:
        .global verify_code_vm
        .type   verify_code_vm, %function
 verify_code_vm:
-       /* r0 = instruction pointer (virtual address) */
-       /* r1 = source (virtual address) */
-       /* r2 = target */
-       /* r3 = length */
-       cmp     r1, #0xC0000000
-       blt     verify_code
-       add     r12, fp, #memory_map-dynarec_local
-       lsr     r4, r1, #12
-       add     r5, r1, r3
-       sub     r5, #1
-       ldr     r6, [r12, r4, lsl #2]
-       lsr     r5, r5, #12
-       movs    r7, r6
-       bmi     .D5
-       add     r1, r1, r6, lsl #2
-       lsl     r6, r6, #2
-.D1:
-       add     r4, r4, #1
-       teq     r6, r7, lsl #2
-       bne     .D5
-       ldr     r7, [r12, r4, lsl #2]
-       cmp     r4, r5
-       bls     .D1
-       .size   verify_code_vm, .-verify_code_vm
        .global verify_code
        .type   verify_code, %function
 verify_code:
@@ -555,6 +570,8 @@ verify_code:
        bl      get_addr
        mov     pc, r0
        .size   verify_code, .-verify_code
+       .size   verify_code_vm, .-verify_code_vm
+
        .align  2
        .global cc_interrupt
        .type   cc_interrupt, %function
@@ -566,7 +583,8 @@ cc_interrupt:
        str     r1, [fp, #pending_exception-dynarec_local]
        and     r2, r2, r10, lsr #17
        add     r3, fp, #restore_candidate-dynarec_local
-       str     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
+       str     r10, [fp, #cycle-dynarec_local] /* PCSX cycles */
+@@     str     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
        ldr     r4, [r2, r3]
        mov     r10, lr
        tst     r4, r4
@@ -574,23 +592,20 @@ cc_interrupt:
 .E1:
        bl      gen_interupt
        mov     lr, r10
-       ldr     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
+       ldr     r10, [fp, #cycle-dynarec_local]
        ldr     r0, [fp, #next_interupt-dynarec_local]
        ldr     r1, [fp, #pending_exception-dynarec_local]
        ldr     r2, [fp, #stop-dynarec_local]
        str     r0, [fp, #last_count-dynarec_local]
        sub     r10, r10, r0
        tst     r2, r2
-       bne     .E3
+       ldmnefd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, pc}
        tst     r1, r1
        moveq   pc, lr
 .E2:
        ldr     r0, [fp, #pcaddr-dynarec_local]
        bl      get_addr_ht
        mov     pc, r0
-.E3:
-       add     r12, fp, #28
-       ldmia   r12, {r4, r5, r6, r7, r8, r9, sl, fp, pc}
 .E4:
        /* Move 'dirty' blocks to the 'clean' list */
        lsl     r5, r2, #3
@@ -603,21 +618,18 @@ cc_interrupt:
        tst     r5, #31
        bne     .E5
        b       .E1
-
        .size   cc_interrupt, .-cc_interrupt
+
        .align  2
        .global do_interrupt
        .type   do_interrupt, %function
 do_interrupt:
        ldr     r0, [fp, #pcaddr-dynarec_local]
        bl      get_addr_ht
-       ldr     r1, [fp, #next_interupt-dynarec_local]
-       ldr     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
-       str     r1, [fp, #last_count-dynarec_local]
-       sub     r10, r10, r1
        add     r10, r10, #2
        mov     pc, r0
        .size   do_interrupt, .-do_interrupt
+
        .align  2
        .global fp_exception
        .type   fp_exception, %function
@@ -631,7 +643,7 @@ fp_exception:
        add     r2, r2, #0x2c
        str     r1, [fp, #reg_cop0+48-dynarec_local] /* Status */
        str     r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */
-       add     r0, r3, #0x180
+       add     r0, r3, #0x80
        bl      get_addr_ht
        mov     pc, r0
        .size   fp_exception, .-fp_exception
@@ -642,6 +654,7 @@ fp_exception_ds:
        mov     r2, #0x90000000 /* Set high bit if delay slot */
        b       .E7
        .size   fp_exception_ds, .-fp_exception_ds
+
        .align  2
        .global jump_syscall
        .type   jump_syscall, %function
@@ -653,304 +666,486 @@ jump_syscall:
        mov     r2, #0x20
        str     r1, [fp, #reg_cop0+48-dynarec_local] /* Status */
        str     r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */
-       add     r0, r3, #0x180
+       add     r0, r3, #0x80
        bl      get_addr_ht
        mov     pc, r0
        .size   jump_syscall, .-jump_syscall
        .align  2
+
+       .align  2
+       .global jump_syscall_hle        /* HLE SYSCALL: call psxException with cause 0x20, then resume via pcsx_return */
+       .type   jump_syscall_hle, %function
+jump_syscall_hle:                       /* in: r0 = value stored to pcaddr, r10 = cycle count relative to last_count */
+       str     r0, [fp, #pcaddr-dynarec_local] /* PC must be set to EPC for psxException */
+       ldr     r2, [fp, #last_count-dynarec_local]
+       mov     r1, #0    /* second argument: "in delay slot" flag = 0 */
+       add     r2, r2, r10
+       mov     r0, #0x20 /* first argument: cause */
+       str     r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter = last_count + r10 */
+       bl      psxException
+
+       /* note: psxException might do a recursive recompiler call from its HLE code,
+        * so be ready for this */
+pcsx_return:                            /* shared return path; jump_hlecall and jump_intcall load this address into lr */
+       ldr     r1, [fp, #next_interupt-dynarec_local]
+       ldr     r10, [fp, #cycle-dynarec_local]
+       ldr     r0, [fp, #pcaddr-dynarec_local]
+       sub     r10, r10, r1            /* r10 = cycle - next_interupt */
+       str     r1, [fp, #last_count-dynarec_local] /* last_count = next_interupt */
+       bl      get_addr_ht             /* called with r0 = pcaddr */
+       mov     pc, r0                  /* jump to the address left in r0 by get_addr_ht */
+       .size   jump_syscall_hle, .-jump_syscall_hle
+
+       .align  2
+       .global jump_hlecall            /* branch to the handler in r1; it returns into pcsx_return */
+       .type   jump_hlecall, %function
+jump_hlecall:                           /* in: r0 = value stored to pcaddr, r1 = handler address, r10 = cycle count relative to last_count */
+       ldr     r2, [fp, #last_count-dynarec_local]
+       str     r0, [fp, #pcaddr-dynarec_local]
+       add     r2, r2, r10
+       adr     lr, pcsx_return         /* handler's return address */
+       str     r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter = last_count + r10 */
+       bx      r1
+       .size   jump_hlecall, .-jump_hlecall
+
+       .align  2
+       .global jump_intcall            /* branch to execI; it returns into pcsx_return */
+       .type   jump_intcall, %function
+jump_intcall:                           /* in: r0 = value stored to pcaddr, r10 = cycle count relative to last_count */
+       ldr     r2, [fp, #last_count-dynarec_local]
+       str     r0, [fp, #pcaddr-dynarec_local]
+       add     r2, r2, r10
+       adr     lr, pcsx_return         /* execI's return address */
+       str     r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter = last_count + r10 */
+       b       execI
+       .size   jump_intcall, .-jump_intcall
+
+       .align  2
+       .global new_dyna_leave          /* leave recompiled code: write back the cycle counter and return */
+       .type   new_dyna_leave, %function
+new_dyna_leave:
+       /* in: r10 = cycle count relative to last_count; sp must point at the frame pushed by new_dyna_start */
+       ldr     r0, [fp, #last_count-dynarec_local]
+       add     r10, r0, r10
+       str     r10, [fp, #cycle-dynarec_local] /* PCSX cycle counter = last_count + r10 */
+       ldmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, pc}  /* same register set new_dyna_start pushed; its lr slot is loaded into pc */
+       .size   new_dyna_leave, .-new_dyna_leave
+
+       /* these are used to call memhandlers */
+       .align  2
        .global indirect_jump_indexed
        .type   indirect_jump_indexed, %function
 indirect_jump_indexed:
        ldr     r0, [r0, r1, lsl #2]
-       .size   indirect_jump_indexed, .-indirect_jump_indexed
-       .align  2
        .global indirect_jump
        .type   indirect_jump, %function
 indirect_jump:
        ldr     r12, [fp, #last_count-dynarec_local]
        add     r2, r2, r12 
-       str     r2, [fp, #reg_cop0+36-dynarec_local] /* Count */
+       str     r2, [fp, #cycle-dynarec_local]
        mov     pc, r0
        .size   indirect_jump, .-indirect_jump
+       .size   indirect_jump_indexed, .-indirect_jump_indexed
+
        .align  2
-       .global jump_eret
-       .type   jump_eret, %function
-jump_eret:
-       ldr     r1, [fp, #reg_cop0+48-dynarec_local] /* Status */
-       ldr     r0, [fp, #last_count-dynarec_local]
-       bic     r1, r1, #2
-       add     r10, r0, r10
-       str     r1, [fp, #reg_cop0+48-dynarec_local] /* Status */
-       str     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
-       bl      check_interupt
-       ldr     r1, [fp, #next_interupt-dynarec_local]
-       ldr     r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */
-       str     r1, [fp, #last_count-dynarec_local]
-       subs    r10, r10, r1
-       bpl     .E11
-.E8:
-       add     r6, fp, #reg+256-dynarec_local
-       mov     r5, #248
-       mov     r1, #0
-.E9:
-       ldr     r2, [r6, #-8]!
-       ldr     r3, [r6, #4]
-       eor     r3, r3, r2, asr #31
-       subs    r3, r3, #1
-       adc     r1, r1, r1
-       subs    r5, r5, #8
-       bne     .E9
-       ldr     r2, [fp, #hi-dynarec_local]
-       ldr     r3, [fp, #hi+4-dynarec_local]
-       eors    r3, r3, r2, asr #31
-       ldr     r2, [fp, #lo-dynarec_local]
-       ldreq   r3, [fp, #lo+4-dynarec_local]
-       eoreq   r3, r3, r2, asr #31
-       subs    r3, r3, #1
-       adc     r1, r1, r1
-       bl      get_addr_32
-       mov     pc, r0
-.E11:
-       str     r0, [fp, #pcaddr-dynarec_local]
-       bl      cc_interrupt
-       ldr     r0, [fp, #pcaddr-dynarec_local]
-       b       .E8
-       .size   jump_eret, .-jump_eret
+       .global invalidate_addr_r0      /* invalidation stub: the written address is in r0 */
+       .type   invalidate_addr_r0, %function
+invalidate_addr_r0:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r0, #12                     /* r0 = address >> 12, the argument passed to invalidate_block */
+       b       invalidate_addr_call            /* shared tail: calls invalidate_block, restores, returns */
+       .size   invalidate_addr_r0, .-invalidate_addr_r0
        .align  2
-       .global new_dyna_start
-       .type   new_dyna_start, %function
-new_dyna_start:
-       ldr     r12, .dlptr
-       stmia   r12, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
-       sub     fp, r12, #28
-       bl      new_recompile_block
-       ldr     r0, [fp, #next_interupt-dynarec_local]
-       ldr     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
-       str     r0, [fp, #last_count-dynarec_local]
-       sub     r10, r10, r0
-       mov     pc, #0x2000000
-.dlptr:
-       .word   dynarec_local+28
-       .size   new_dyna_start, .-new_dyna_start
+       .global invalidate_addr_r1      /* invalidation stub: the written address is in r1 */
+       .type   invalidate_addr_r1, %function
+invalidate_addr_r1:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r1, #12                     /* r0 = address >> 12, the argument passed to invalidate_block */
+       b       invalidate_addr_call            /* shared tail: calls invalidate_block, restores, returns */
+       .size   invalidate_addr_r1, .-invalidate_addr_r1
        .align  2
-       .global write_rdram_new
-       .type   write_rdram_new, %function
-write_rdram_new:
-       ldr     r2, [fp, #address-dynarec_local]
-       ldr     r0, [fp, #word-dynarec_local]
-       str     r0, [r2]
-       b       .E12
-       .size   write_rdram_new, .-write_rdram_new
+       .global invalidate_addr_r2      /* invalidation stub: the written address is in r2 */
+       .type   invalidate_addr_r2, %function
+invalidate_addr_r2:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r2, #12                     /* r0 = address >> 12, the argument passed to invalidate_block */
+       b       invalidate_addr_call            /* shared tail: calls invalidate_block, restores, returns */
+       .size   invalidate_addr_r2, .-invalidate_addr_r2
        .align  2
-       .global write_rdramb_new
-       .type   write_rdramb_new, %function
-write_rdramb_new:
-       ldr     r2, [fp, #address-dynarec_local]
-       ldrb    r0, [fp, #byte-dynarec_local]
-       eor     r2, r2, #3
-       strb    r0, [r2]
-       b       .E12
-       .size   write_rdramb_new, .-write_rdramb_new
+       .global invalidate_addr_r3      /* invalidation stub: the written address is in r3 */
+       .type   invalidate_addr_r3, %function
+invalidate_addr_r3:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r3, #12                     /* r0 = address >> 12, the argument passed to invalidate_block */
+       b       invalidate_addr_call            /* shared tail: calls invalidate_block, restores, returns */
+       .size   invalidate_addr_r3, .-invalidate_addr_r3
        .align  2
-       .global write_rdramh_new
-       .type   write_rdramh_new, %function
-write_rdramh_new:
-       ldr     r2, [fp, #address-dynarec_local]
-       ldrh    r0, [fp, #hword-dynarec_local]
-       eor     r2, r2, #2
-       strh    r0, [r2]
-       b       .E12
-       .size   write_rdramh_new, .-write_rdramh_new
+       .global invalidate_addr_r4      /* invalidation stub: the written address is in r4 */
+       .type   invalidate_addr_r4, %function
+invalidate_addr_r4:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r4, #12                     /* r0 = address >> 12, the argument passed to invalidate_block */
+       b       invalidate_addr_call            /* shared tail: calls invalidate_block, restores, returns */
+       .size   invalidate_addr_r4, .-invalidate_addr_r4
        .align  2
-       .global write_rdramd_new
-       .type   write_rdramd_new, %function
-write_rdramd_new:
-       ldr     r2, [fp, #address-dynarec_local]
-/*     ldrd    r0, [fp, #dword-dynarec_local]*/
-       ldr     r0, [fp, #dword-dynarec_local]
-       ldr     r1, [fp, #dword+4-dynarec_local]
-       str     r0, [r2, #4]
-       str     r1, [r2]
-       b       .E12
-       .size   write_rdramd_new, .-write_rdramd_new
+       .global invalidate_addr_r5      /* invalidation stub: the written address is in r5 */
+       .type   invalidate_addr_r5, %function
+invalidate_addr_r5:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r5, #12                     /* r0 = address >> 12, the argument passed to invalidate_block */
+       b       invalidate_addr_call            /* shared tail: calls invalidate_block, restores, returns */
+       .size   invalidate_addr_r5, .-invalidate_addr_r5
        .align  2
-       .global do_invalidate
-       .type   do_invalidate, %function
-do_invalidate:
-       ldr     r2, [fp, #address-dynarec_local]
-.E12:
-       ldr     r1, [fp, #invc_ptr-dynarec_local]
-       lsr     r0, r2, #12
-       ldrb    r2, [r1, r0]
-       tst     r2, r2
-       beq     invalidate_block
-       mov     pc, lr
-       .size   do_invalidate, .-do_invalidate
+       .global invalidate_addr_r6      /* invalidation stub: the written address is in r6 */
+       .type   invalidate_addr_r6, %function
+invalidate_addr_r6:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r6, #12                     /* r0 = address >> 12, the argument passed to invalidate_block */
+       b       invalidate_addr_call            /* shared tail: calls invalidate_block, restores, returns */
+       .size   invalidate_addr_r6, .-invalidate_addr_r6
        .align  2
-       .global read_nomem_new
-       .type   read_nomem_new, %function
-/*read_nomem_new:*/
-read_nomemb_new:
-read_nomemh_new:
-read_nomemd_new:
-       /* should never happen */
-       b       read_nomem_new
-/*
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       lsr     r0, r2, #12
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #8
-       tst     r12, r12
-       bmi     tlb_exception
-       ldr     r0, [r2, r12, lsl #2]
-       str     r0, [fp, #readmem_dword-dynarec_local]
-       mov     pc, lr
-*/
-       .size   read_nomem_new, .-read_nomem_new
+       .global invalidate_addr_r7      /* invalidation stub: the written address is in r7 */
+       .type   invalidate_addr_r7, %function
+invalidate_addr_r7:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r7, #12                     /* r0 = address >> 12, the argument passed to invalidate_block */
+       b       invalidate_addr_call            /* shared tail: calls invalidate_block, restores, returns */
+       .size   invalidate_addr_r7, .-invalidate_addr_r7
        .align  2
-       .global read_nomemb_new
-       .type   read_nomemb_new, %function
-write_nomem_new:
-       str     r3, [fp, #24]
-       str     lr, [fp, #28]
-       bl      do_invalidate
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       ldr     lr, [fp, #28]
-       lsr     r0, r2, #12
-       ldr     r3, [fp, #24]
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #0xc
-       tst     r12, #0x40000000
-       bne     tlb_exception
-       ldr     r0, [fp, #word-dynarec_local]
-       str     r0, [r2, r12, lsl #2]
-       mov     pc, lr
-       .size   write_nomem_new, .-write_nomem_new
+       .global invalidate_addr_r8      /* invalidation stub: the written address is in r8 */
+       .type   invalidate_addr_r8, %function
+invalidate_addr_r8:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r8, #12                     /* r0 = address >> 12, the argument passed to invalidate_block */
+       b       invalidate_addr_call            /* shared tail: calls invalidate_block, restores, returns */
+       .size   invalidate_addr_r8, .-invalidate_addr_r8
        .align  2
-       .global write_nomemb_new
-       .type   write_nomemb_new, %function
-write_nomemb_new:
-       str     r3, [fp, #24]
-       str     lr, [fp, #28]
-       bl      do_invalidate
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       ldr     lr, [fp, #28]
-       lsr     r0, r2, #12
-       ldr     r3, [fp, #24]
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #0xc
-       tst     r12, #0x40000000
-       bne     tlb_exception
-       eor     r2, r2, #3
-       ldrb    r0, [fp, #byte-dynarec_local]
-       strb    r0, [r2, r12, lsl #2]
-       mov     pc, lr
-       .size   write_nomemb_new, .-write_nomemb_new
+       .global invalidate_addr_r9      /* invalidation stub: the written address is in r9 */
+       .type   invalidate_addr_r9, %function
+invalidate_addr_r9:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r9, #12                     /* r0 = address >> 12, the argument passed to invalidate_block */
+       b       invalidate_addr_call            /* shared tail: calls invalidate_block, restores, returns */
+       .size   invalidate_addr_r9, .-invalidate_addr_r9
        .align  2
-       .global write_nomemh_new
-       .type   write_nomemh_new, %function
-write_nomemh_new:
-       str     r3, [fp, #24]
-       str     lr, [fp, #28]
-       bl      do_invalidate
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       ldr     lr, [fp, #28]
-       lsr     r0, r2, #12
-       ldr     r3, [fp, #24]
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #0xc
-       lsls    r12, #2
-       bcs     tlb_exception
-       eor     r2, r2, #2
-       ldrh    r0, [fp, #hword-dynarec_local]
-       strh    r0, [r2, r12]
-       mov     pc, lr
-       .size   write_nomemh_new, .-write_nomemh_new
+       .global invalidate_addr_r10     /* invalidation stub: the written address is in r10 */
+       .type   invalidate_addr_r10, %function
+invalidate_addr_r10:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r10, #12                    /* r0 = address >> 12, the argument passed to invalidate_block */
+       b       invalidate_addr_call            /* shared tail: calls invalidate_block, restores, returns */
+       .size   invalidate_addr_r10, .-invalidate_addr_r10
        .align  2
-       .global write_nomemd_new
-       .type   write_nomemd_new, %function
-write_nomemd_new:
-       str     r3, [fp, #24]
-       str     lr, [fp, #28]
-       bl      do_invalidate
-       ldr     r2, [fp, #address-dynarec_local]
-       add     r12, fp, #memory_map-dynarec_local
-       ldr     lr, [fp, #28]
-       lsr     r0, r2, #12
-       ldr     r3, [fp, #24]
-       ldr     r12, [r12, r0, lsl #2]
-       mov     r1, #0xc
-       lsls    r12, #2
-       bcs     tlb_exception
-       add     r3, r2, #4
-       ldr     r0, [fp, #dword+4-dynarec_local]
-       ldr     r1, [fp, #dword-dynarec_local]
-/*     strd    r0, [r2, r12]*/
-       str     r0, [r2, r12]
-       str     r1, [r3, r12]
-       mov     pc, lr
-       .size   write_nomemd_new, .-write_nomemd_new
+       .global invalidate_addr_r12     /* invalidation stub: the written address is in r12 */
+       .type   invalidate_addr_r12, %function
+invalidate_addr_r12:
+       stmia   fp, {r0, r1, r2, r3, r12, lr}   /* save r0-r3, r12, lr in the first 6 words at fp */
+       lsr     r0, r12, #12                    /* r0 = address >> 12; no branch here: execution falls through into invalidate_addr_call, which must follow directly */
+       .size   invalidate_addr_r12, .-invalidate_addr_r12
        .align  2
-       .global tlb_exception
-       .type   tlb_exception, %function
-tlb_exception:
-       /* r1 = cause */
-       /* r2 = address */
-       /* r3 = instr addr/flags */
-       ldr     r4, [fp, #reg_cop0+48-dynarec_local] /* Status */
-       add     r5, fp, #memory_map-dynarec_local
-       lsr     r6, r3, #12
-       orr     r1, r1, r3, lsl #31
-       orr     r4, r4, #2
-       ldr     r7, [r5, r6, lsl #2]
-       bic     r8, r3, #3
-       str     r4, [fp, #reg_cop0+48-dynarec_local] /* Status */
-       mov     r6, #0x6000000
-       str     r1, [fp, #reg_cop0+52-dynarec_local] /* Cause */
-       orr     r6, r6, #0x22
-       ldr     r0, [r8, r7, lsl #2]
-       add     r4, r8, r1, asr #29
-       add     r5, fp, #reg-dynarec_local
-       str     r4, [fp, #reg_cop0+56-dynarec_local] /* EPC */
-       mov     r7, #0xf8
-       ldr     r8, [fp, #reg_cop0+16-dynarec_local] /* Context */
-       lsl     r1, r0, #16
-       lsr     r4, r0, #26
-       and     r7, r7, r0, lsr #18
-       mvn     r9, #0xF000000F
-       sub     r2, r2, r1, asr #16
-       bic     r9, r9, #0x0F800000
-       rors    r6, r6, r4
-       mov     r0, #0x80000000
-       ldrcs   r2, [r5, r7]
-       bic     r8, r8, r9
-       tst     r3, #2
-       str     r2, [r5, r7]
-       add     r4, r2, r1, asr #16
-       add     r6, fp, #reg+4-dynarec_local
-       asr     r3, r2, #31
-       str     r4, [fp, #reg_cop0+32-dynarec_local] /* BadVAddr */
-       add     r0, r0, #0x180
-       and     r4, r9, r4, lsr #9
-       strne   r3, [r6, r7]
-       orr     r8, r8, r4
-       str     r8, [fp, #reg_cop0+16-dynarec_local] /* Context */
+       .global invalidate_addr_call    /* common tail of the invalidate_addr_rN stubs */
+       .type   invalidate_addr_call, %function
+invalidate_addr_call:                   /* in: r0 = address >> 12; r0-r3, r12, lr already saved at fp by the stub */
+       bl      invalidate_block
+       ldmia   fp, {r0, r1, r2, r3, r12, pc}   /* restore the saved registers; the saved lr goes into pc, returning to the stub's caller */
+       .size   invalidate_addr_call, .-invalidate_addr_call
+
+       .align  2
+       .global new_dyna_start
+       .type   new_dyna_start, %function
+new_dyna_start:
+       /* ip is stored to conform EABI alignment */
+       stmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
+.if HAVE_ARMV7
+       movw    fp, #:lower16:dynarec_local
+       movt    fp, #:upper16:dynarec_local
+.else
+       ldr     fp, .dlptr
+.endif
+       ldr     r0, [fp, #pcaddr-dynarec_local]
        bl      get_addr_ht
        ldr     r1, [fp, #next_interupt-dynarec_local]
-       ldr     r10, [fp, #reg_cop0+36-dynarec_local] /* Count */
+       ldr     r10, [fp, #cycle-dynarec_local]
        str     r1, [fp, #last_count-dynarec_local]
        sub     r10, r10, r1
-       mov     pc, r0  
-       .size   tlb_exception, .-tlb_exception
-       .align  2
-       .global breakpoint
-       .type   breakpoint, %function
-breakpoint:
-       /* Set breakpoint here for debugging */
+       mov     pc, r0
+.dlptr:
+       .word   dynarec_local
+       .size   new_dyna_start, .-new_dyna_start
+
+/* --------------------------------------- */
+
+.align 2
+.global        ari_read_ram8
+.global        ari_read_ram16
+.global        ari_read_ram32
+.global        ari_read_ram_mirror8
+.global        ari_read_ram_mirror16
+.global        ari_read_ram_mirror32
+.global        ari_write_ram8
+.global        ari_write_ram16
+.global        ari_write_ram32
+.global        ari_write_ram_mirror8
+.global        ari_write_ram_mirror16
+.global        ari_write_ram_mirror32
+.global        ari_read_bios8
+.global        ari_read_bios16
+.global        ari_read_bios32
+.global        ari_read_io8
+.global        ari_read_io16
+.global        ari_read_io32
+.global        ari_write_io8
+.global        ari_write_io16
+.global        ari_write_io32
+
+.macro ari_read_ram bic_const op
+       ldr     r0, [fp, #address-dynarec_local]        /* r0 = address to read */
+.if \bic_const
+       bic     r0, r0, #\bic_const                     /* clear the low address bits given by bic_const */
+.endif
+       \op     r0, [r0]                                /* load straight from that address */
+       str     r0, [fp, #readmem_dword-dynarec_local]  /* result is handed back through readmem_dword */
+       mov     pc, lr
+.endm
+
+ari_read_ram8:                  /* byte read, no address masking */
+       ari_read_ram 0, ldrb
+
+ari_read_ram16:                 /* halfword read, address bit 0 cleared */
+       ari_read_ram 1, ldrh
+
+ari_read_ram32:                 /* word read, address bits 1:0 cleared */
+       ari_read_ram 3, ldr
+
+.macro ari_read_ram_mirror mvn_const, op
+       ldr     r0, [fp, #address-dynarec_local]        /* r0 = address to read */
+       mvn     r1, #\mvn_const                         /* r1 = all ones except the bits in mvn_const */
+       and     r0, r1, lsr #11                         /* keep the low 21 address bits, minus mvn_const >> 11 (alignment bits) */
+       orr     r0, r0, #1<<31                          /* set bit 31 to form the address actually read */
+       \op     r0, [r0]
+       str     r0, [fp, #readmem_dword-dynarec_local]  /* result is handed back through readmem_dword */
+       mov     pc, lr
+.endm
+
+ari_read_ram_mirror8:           /* byte read: mask 0x1fffff */
+       ari_read_ram_mirror 0, ldrb
+
+ari_read_ram_mirror16:          /* halfword read: mask 0x1ffffe */
+       ari_read_ram_mirror (1<<11), ldrh
+
+ari_read_ram_mirror32:          /* word read: mask 0x1ffffc */
+       ari_read_ram_mirror (3<<11), ldr
+
+/* plain RAM write; invalidation is already taken care of by the caller */
+.macro ari_write_ram bic_const var pf
+       ldr     r0, [fp, #address-dynarec_local]        /* r0 = address to write */
+       ldr\pf  r1, [fp, #\var-dynarec_local]           /* r1 = value, loaded with the access width given by pf */
+.if \bic_const
+       bic     r0, r0, #\bic_const                     /* clear the low address bits given by bic_const */
+.endif
+       str\pf  r1, [r0]                                /* store straight to that address */
+       mov     pc, lr
+.endm
+
+ari_write_ram8:                 /* byte write from 'byte', no address masking */
+       ari_write_ram 0, byte, b
+
+ari_write_ram16:                /* halfword write from 'hword', address bit 0 cleared */
+       ari_write_ram 1, hword, h
+
+ari_write_ram32:                /* word write from 'word', address bits 1:0 cleared */
+       ari_write_ram 3, word,
+
+.macro ari_write_ram_mirror mvn_const var pf
+       ldr     r0, [fp, #address-dynarec_local]        /* r0 = address to write */
+       mvn     r3, #\mvn_const                         /* r3 = all ones except the bits in mvn_const */
+       ldr\pf  r1, [fp, #\var-dynarec_local]           /* r1 = value, loaded with the access width given by pf */
+       and     r0, r3, lsr #11                         /* keep the low 21 address bits, minus mvn_const >> 11 (alignment bits) */
+       ldr     r2, [fp, #invc_ptr-dynarec_local]       /* r2 = byte table indexed by address >> 12 */
+       orr     r0, r0, #1<<31                          /* set bit 31 to form the address actually written */
+       ldrb    r2, [r2, r0, lsr #12]                   /* r2 = table byte for the block being written */
+       str\pf  r1, [r0]
+       tst     r2, r2
+       movne   pc, lr                                  /* table byte non-zero: done, no invalidation */
+       lsr     r0, r0, #12                             /* table byte zero: r0 = address >> 12 ... */
+       b       invalidate_block                        /* ... tail-call invalidate_block, which returns to our caller */
+.endm
+
+ari_write_ram_mirror8:          /* byte write from 'byte' */
+       ari_write_ram_mirror 0, byte, b
+
+ari_write_ram_mirror16:         /* halfword write from 'hword' */
+       ari_write_ram_mirror (1<<11), hword, h
+
+ari_write_ram_mirror32:         /* word write from 'word' */
+       ari_write_ram_mirror (3<<11), word,
+
+
+.macro ari_read_bios_mirror bic_const op
+       ldr     r0, [fp, #address-dynarec_local]        /* r0 = address to read */
+       orr     r0, r0, #0x80000000                     /* force bit 31 on */
+       bic     r0, r0, #(0x20000000|\bic_const)        @ map to 0x9fc... (bit 29 off, plus the alignment bits in bic_const)
+       \op     r0, [r0]
+       str     r0, [fp, #readmem_dword-dynarec_local]  /* result is handed back through readmem_dword */
        mov     pc, lr
-       .size   breakpoint, .-breakpoint
-       .section        .note.GNU-stack,"",%progbits
+.endm
+
+ari_read_bios8:                 /* byte read, no alignment bits cleared */
+       ari_read_bios_mirror 0, ldrb
+
+ari_read_bios16:                /* halfword read, address bit 0 cleared */
+       ari_read_bios_mirror 1, ldrh
+
+ari_read_bios32:                /* word read, address bits 1:0 cleared */
+       ari_read_bios_mirror 3, ldr
+
+
+@ for testing: calls the psxHwRead* C function for the access size; only referenced from a commented-out line in ari_read_io
+.macro ari_read_io_old tab_shift
+       str     lr, [sp, #-8]! @ EABI alignment.. (8 bytes pushed for one register keeps sp 8-byte aligned)
+.if \tab_shift == 0
+       bl      psxHwRead32                             /* tab_shift 0 selects the 32-bit reader */
+.endif
+.if \tab_shift == 1
+       bl      psxHwRead16                             /* tab_shift 1 selects the 16-bit reader */
+.endif
+.if \tab_shift == 2
+       bl      psxHwRead8                              /* tab_shift 2 selects the 8-bit reader */
+.endif
+       str     r0, [fp, #readmem_dword-dynarec_local]  /* result is handed back through readmem_dword */
+       ldr     pc, [sp], #8                            /* pop the saved lr into pc and drop the 8-byte slot */
+.endm
+
+.macro ari_read_io readop mem_tab tab_shift
+       ldr     r0, [fp, #address-dynarec_local]        /* r0 = address to read */
+       ldr     r1, [fp, #psxH_ptr-dynarec_local]       /* r1 = base for the direct read at label 2 */
+.if \tab_shift == 0
+       bic     r0, r0, #3                              /* 32-bit access: clear address bits 1:0 */
+.endif
+.if \tab_shift == 1
+       bic     r0, r0, #1                              /* 16-bit access: clear address bit 0 */
+.endif
+       bic     r2, r0, #0x1f800000                     /* r2 = address with the 0x1f800000 bits cleared */
+       ldr     r12,[fp, #\mem_tab-dynarec_local]       /* r12 = handler table pointer */
+       subs    r3, r2, #0x1000                         /* r3 = r2 - 0x1000, used as the table index */
+       blo     2f                                      /* r2 below 0x1000: direct read */
+@      ari_read_io_old \tab_shift
+       cmp     r3, #0x880
+       bhs     1f                                      /* index 0x880 or above: not covered by the table */
+       ldr     r12,[r12, r3, lsl #\tab_shift]          /* the shift yields one 4-byte entry per access-size unit */
+       tst     r12,r12
+       beq     2f                                      /* null entry: direct read */
+0:                                                     /* call the handler in r12 with r0 = address */
+       str     lr, [sp, #-8]! @ EABI alignment..
+       blx     r12
+       str     r0, [fp, #readmem_dword-dynarec_local]  /* handler result is handed back through readmem_dword */
+       ldr     pc, [sp], #8
+
+1:
+.if \tab_shift == 1 @ read16
+       cmp     r2, #0x1c00
+       blo     2f
+       cmp     r2, #0x1e00
+       bhs     2f
+       ldr     r12,[fp, #spu_readf-dynarec_local]      /* 0x1c00 <= r2 < 0x1e00: use spu_readf as the handler */
+       b       0b
+.endif
+2:
+       @ no handler, just read psxH
+       \readop r0, [r1, r2]
+       str     r0, [fp, #readmem_dword-dynarec_local]
+       mov     pc, lr
+.endm
+
+ari_read_io8:                   /* byte read through tab_read8 */
+       ari_read_io ldrb, tab_read8, 2
+
+ari_read_io16:                  /* halfword read through tab_read16, with the spu_readf range check */
+       ari_read_io ldrh, tab_read16, 1
+
+ari_read_io32:                  /* word read through tab_read32 */
+       ari_read_io ldr, tab_read32, 0
+
+.macro ari_write_io_old tab_shift
+.if \tab_shift == 0
+       b       psxHwWrite32                            /* tab_shift 0: tail-branch to the 32-bit writer */
+.endif
+.if \tab_shift == 1
+       b       psxHwWrite16                            /* tab_shift 1: tail-branch to the 16-bit writer */
+.endif
+.if \tab_shift == 2
+       b       psxHwWrite8                             /* tab_shift 2: tail-branch to the 8-bit writer */
+.endif
+.endm
+
+.macro ari_write_io pf var mem_tab tab_shift
+       ldr     r0, [fp, #address-dynarec_local]        /* r0 = address to write */
+       ldr\pf  r1, [fp, #\var-dynarec_local]           /* r1 = value, loaded with the access width given by pf */
+.if \tab_shift == 0
+       bic     r0, r0, #3                              /* 32-bit access: clear address bits 1:0 */
+.endif
+.if \tab_shift == 1
+       bic     r0, r0, #1                              /* 16-bit access: clear address bit 0 */
+.endif
+       bic     r2, r0, #0x1f800000                     /* r2 = address with the 0x1f800000 bits cleared */
+       ldr     r12,[fp, #\mem_tab-dynarec_local]       /* r12 = handler table pointer */
+       subs    r3, r2, #0x1000                         /* r3 = r2 - 0x1000, used as the table index */
+       blo     0f                                      /* r2 below 0x1000: plain store */
+@      ari_write_io_old \tab_shift
+       cmp     r3, #0x880
+       bhs     1f                                      /* index 0x880 or above: not covered by the table */
+       ldr     r12,[r12, r3, lsl #\tab_shift]          /* the shift yields one 4-byte entry per access-size unit */
+       mov     r0, r1                                  /* table handlers receive the value in r0 */
+       tst     r12,r12
+       bxne    r12                                     /* non-null entry: tail-call it, it returns to our caller */
+0:                                                     /* no handler: store the value at psxH_ptr + r2 */
+       ldr     r3, [fp, #psxH_ptr-dynarec_local]
+       str\pf  r1, [r2, r3]
+       mov     pc, lr
+1:
+       cmp     r2, #0x1c00
+       blo     0b
+       cmp     r2, #0x1e00
+.if \tab_shift != 0
+       ldrlo   pc, [fp, #spu_writef-dynarec_local]     /* 0x1c00 <= r2 < 0x1e00: tail-call spu_writef(r0 = address, r1 = value) */
+.else
+       @ write32 to SPU - very rare case (is this correct?)
+       bhs     0b
+       add     r2, r0, #2                              /* second call: address + 2 */
+       mov     r3, r1, lsr #16                         /* second call: upper halfword of the value */
+       push    {r2,r3,r12,lr}                          /* r12 is padding only: 4 registers keep sp 8-byte aligned across the call */
+       mov     lr, pc                                  /* return address = the pop below */
+       ldr     pc, [fp, #spu_writef-dynarec_local]     /* first call: r0 = address, r1 = full value */
+       pop     {r0,r1,r12,lr}                          /* r0/r1 = arguments saved for the second call, lr = our caller */
+       ldr     pc, [fp, #spu_writef-dynarec_local]     /* second call is a tail call */
+.endif
+       nop
+       b       0b
+.endm
+
+ari_write_io8:
+       @ PCSX always writes to psxH, so do we for consistency
+       ldr     r0, [fp, #address-dynarec_local]        /* r0 = address to write */
+       ldr     r3, [fp, #psxH_ptr-dynarec_local]
+       ldrb    r1, [fp, #byte-dynarec_local]           /* r1 = byte value */
+       bic     r2, r0, #0x1f800000                     /* r2 = address with the 0x1f800000 bits cleared */
+       ldr     r12,[fp, #tab_write8-dynarec_local]     /* r12 = handler table pointer */
+       strb    r1, [r2, r3]                            /* unconditional store at psxH_ptr + r2 */
+       subs    r3, r2, #0x1000                         /* r3 = r2 - 0x1000, used as the table index */
+       movlo   pc, lr                                  /* r2 below 0x1000: the store was all there is to do */
+@      ari_write_io_old 2
+       cmp     r3, #0x880
+       movhs   pc, lr                                  /* index 0x880 or above: not covered by the table */
+       ldr     r12,[r12, r3, lsl #2]                   /* one 4-byte entry per byte address */
+       mov     r0, r1                                  /* handler receives the value in r0 */
+       tst     r12,r12
+       bxne    r12                                     /* non-null entry: tail-call it, it returns to our caller */
+       mov     pc, lr
+
+ari_write_io16:                 /* halfword write from 'hword' through tab_write16 */
+       ari_write_io h, hword, tab_write16, 1
+
+ari_write_io32:                 /* word write from 'word' through tab_write32 (empty width suffix) */
+       ari_write_io , word, tab_write32, 0
+
+@ vim:filetype=armasm