From 0c2ca3ba2ca8a191fc3f6d9782dc2420537b1964 Mon Sep 17 00:00:00 2001
From: notaz <notasas@gmail.com>
Date: Fri, 30 Sep 2011 01:04:21 +0300
Subject: [PATCH] gte_arm: implement RTPS, RTPT

---
 libpcsxcore/gte_arm.h            |   2 +
 libpcsxcore/gte_arm.s            | 241 ++++++++++++++++++++++++++++++-
 libpcsxcore/gte_neon.s           |   5 +-
 libpcsxcore/new_dynarec/emu_if.c |   3 +
 4 files changed, 239 insertions(+), 12 deletions(-)

diff --git a/libpcsxcore/gte_arm.h b/libpcsxcore/gte_arm.h
index 69924c8d..7cd381a1 100644
--- a/libpcsxcore/gte_arm.h
+++ b/libpcsxcore/gte_arm.h
@@ -1 +1,3 @@
+void gteRTPS_nf_arm(void *cp2_regs, int opcode);
+void gteRTPT_nf_arm(void *cp2_regs, int opcode);
 void gteNCLIP_arm(void *cp2_regs, int opcode);
diff --git a/libpcsxcore/gte_arm.s b/libpcsxcore/gte_arm.s
index c8a7279a..ea1f2cd1 100644
--- a/libpcsxcore/gte_arm.s
+++ b/libpcsxcore/gte_arm.s
@@ -1,10 +1,7 @@
 /*
  * (C) GraÅ¾vydas "notaz" Ignotas, 2011
  *
- * This work is licensed under the terms of any of these licenses
- * (at your option):
- *  - GNU GPL, version 2 or later.
- *  - GNU LGPL, version 2.1 or later.
+ * This work is licensed under the terms of GNU GPL version 2 or later.
  * See the COPYING file in the top-level directory.
  */
 
@@ -13,15 +10,243 @@
 .text
 .align 2
 
-.macro sgnxt16 reg
+.macro sgnxt16 rd
 .if HAVE_ARMV7
-    sxth        \reg, \reg
+    sxth     \rd, \rd             @ sign-extend the low 16 bits of rd in place
 .else
-    lsl         \reg, \reg, #16
-    asr         \reg, \reg, #16
+    lsl      \rd, \rd, #16        @ same result without sxth: move low half to the top...
+    asr      \rd, \rd, #16        @ ...and shift it back down arithmetically
 .endif
 .endm
 
+@ prepare work reg for ssatx (emits nothing when HAVE_ARMV7 is set)
+@ in: wr = work reg to load, bit = signed bit width to saturate to
+.macro ssatx_prep wr bit
+.if !HAVE_ARMV7
+    mov      \wr, #(1<<(\bit-1))  @ wr = 2^(bit-1), the bound ssatx compares against
+.endif
+.endm
+
+.macro ssatx rd wr bit
+.if HAVE_ARMV7
+    ssat     \rd, #\bit, \rd      @ saturate rd in place to a signed \bit-bit value; wr unused
+.else
+    cmp      \rd, \wr             @ wr is expected to hold 1<<(bit-1), see ssatx_prep
+    subge    \rd, \wr, #1         @ rd >= wr  ->  rd = wr - 1
+    cmn      \rd, \wr             @ flags from rd + wr
+    rsblt    \rd, \wr, #0         @ rd < -wr  ->  rd = -wr
+.endif
+.endm
+
+.macro usat16_ rd rs
+.if HAVE_ARMV7
+    usat     \rd, #16, \rs        @ rd = rs clamped to 0..0xffff
+.else
+    subs     \rd, \rs, #0         @ rd = rs, with flags set from the value
+    movlt    \rd, #0              @ negative -> 0
+    cmp      \rd, #0x10000
+    movge    \rd, #0x0ff00        @ >= 0x10000 -> 0xffff, assembled in two steps
+    orrge    \rd, #0x000ff
+.endif
+.endm
+
+@ unsigned divide rd = rm / rs
+@ no div by 0 check
+@ in: rm, rs
+@ trash: rm rs
+.macro udiv rd rm rs
+    clz      \rd, \rs             @ rd = number of leading zero bits in the divisor
+    lsl      \rs, \rs, \rd        @ shift up divisor
+    orr      \rd, \rd, #1<<31
+    lsr      \rd, \rd, \rd        @ rd = (1<<31) >> clz (shift count = low byte of rd): marker bit
+0:
+    cmp      \rm, \rs
+    subcs    \rm, \rs             @ divisor fits: subtract it, C stays set
+    adcs     \rd, \rd, \rd        @ shift C in as next quotient bit; top bit goes to C
+    lsr      \rs, #1              @ next lower divisor position (flags untouched)
+    bcc      0b                   @ loop until the marker bit is shifted out into C
+.endm
+
+
+@ calculate RTPS/RTPT MAC values
+@ in: r0 context, r8,r9 VXYZ
+@ out: r10-r12 MAC123
+@ trash: r1-r7
+.macro do_rtpx_mac
+    add      r1, r0, #4*32        @ r1 = context + 4*32, where the gteR* words start
+    add      r2, r0, #4*(32+5)    @ gteTRX
+    ldmia    r1!,{r5-r7}          @ gteR1*,gteR2*
+    ldmia    r2, {r10-r12}        @ r10-r12 start out as the 3 words at gteTRX
+    smulbb   r2, r5, r8           @ gteR11 * gteVX0
+    smultt   r3, r5, r8           @ gteR12 * gteVY0
+    smulbb   r4, r6, r9           @ gteR13 * gteVZ0
+    qadd     r2, r2, r3           @ saturating sum of the first two products
+    asr      r4, r4, #1           @ prevent oflow, lose a bit
+    add      r3, r4, r2, asr #1   @ r3 = half of the 3-product sum (each part halved first)
+    add      r10,r10,r3, asr #11  @ gteMAC1
+    smultb   r2, r6, r8           @ gteR21 * gteVX0
+    smulbt   r3, r7, r8           @ gteR22 * gteVY0
+    smultb   r4, r7, r9           @ gteR23 * gteVZ0
+    ldmia    r1!,{r5-r6}          @ gteR3*
+    qadd     r2, r2, r3           @ same halved-sum scheme as for gteMAC1
+    asr      r4, r4, #1
+    add      r3, r4, r2, asr #1
+    add      r11,r11,r3, asr #11  @ gteMAC2
+    @ be more accurate for gteMAC3, since it's also a divider
+    smulbb   r2, r5, r8           @ gteR31 * gteVX0
+    smultt   r3, r5, r8           @ gteR32 * gteVY0
+    smulbb   r4, r6, r9           @ gteR33 * gteVZ0
+    qadd     r2, r2, r3
+    asr      r3, r4, #31          @ expand to 64bit
+    adds     r1, r2, r4
+    adc      r3, r2, asr #31      @ 64bit sum in r3,r1
+    add      r12,r12,r3, lsl #20  @ (sum >> 12) added in two parts: high word...
+    add      r12,r12,r1, lsr #12  @ gteMAC3
+.endm
+
+
+.global gteRTPS_nf_arm @ r0=CP2 (d,c), r1=opcode (not read); no flag calculation
+gteRTPS_nf_arm:
+    push     {r4-r11,lr}
+
+    ldmia    r0, {r8,r9}          @ VXYZ(0)
+    do_rtpx_mac
+    add      r1, r0, #4*25        @ gteMAC1
+    add      r2, r0, #4*17        @ gteSZ1
+    stmia    r1, {r10-r12}        @ gteMAC123 save
+    ldmia    r2, {r3-r5}          @ old gteSZ1-3, written back one slot lower below
+    add      r1, r0, #4*16        @ gteSZ0
+    add      r2, r0, #4*9         @ gteIR1
+    ssatx_prep r6, 16
+    usat16_  lr, r12              @ limD
+    ssatx    r10,r6, 16
+    ssatx    r11,r6, 16
+    ssatx    r12,r6, 16
+    stmia    r1, {r3-r5,lr}       @ gteSZ*
+    ldr      r3, [r0,#4*(32+26)]  @ gteH
+    stmia    r2, {r10,r11,r12}    @ gteIR123 save
+    cmp      r3, lr, lsl #1       @ gteH < gteSZ3*2 ?
+    mov      r9, #1<<30           @ overflow quotient, clamped to 0x1ffff at 1:
+    bhs      1f                   @ no: skip the divide (always taken when gteSZ3 == 0)
+.if 1
+    lsl      r3, #16              @ q = (gteH << 16) / gteSZ3
+    udiv     r9, r3, lr
+.else
+    push     {r0, r12}
+    mov      r0, r3
+    mov      r1, lr
+    bl       DIVIDE
+    mov      r9, r0
+    pop      {r0, r12}
+.endif
+1:
+    ldrd     r6, [r0,#4*(32+24)]  @ gteOFXY
+                                  cmp      r9, #0x20000
+    add      r1, r0, #4*12        @ gteSXY0
+                                  movhs    r9, #0x20000
+    ldmia    r1, {r2-r4}
+                   /* quotient */ subhs    r9, #1
+    mov      r2, r6, asr #31      @ sign-extend gteOFX to 64 bit (was 0: broke negative OFX)
+    smlal    r6, r2, r10, r9
+    stmia    r1!,{r3,r4}          @ shift gteSXY
+    mov      r3, r7, asr #31      @ sign-extend gteOFY to 64 bit
+    smlal    r7, r3, r11, r9
+    lsr      r6, #16
+             /* gteDQA, gteDQB */ ldrd     r10,[r0, #4*(32+27)]
+    orr      r6, r2, lsl #16      @ (gteOFX + gteIR1 * q) >> 16
+    ssatx_prep r2, 11
+    lsr      r7, #16
+        /* gteDQB + gteDQA * q */ mla      r4, r10, r9, r11
+    orr      r7, r3, lsl #16      @ (gteOFY + gteIR2 * q) >> 16
+    ssatx    r6, r2, 11           @ gteSX2
+    ssatx    r7, r2, 11           @ gteSY2
+    strh     r6, [r1]
+    strh     r7, [r1, #2]
+    str      r4, [r0,#4*24]       @ gteMAC0
+    asrs     r4, #12
+    movmi    r4, #0
+    cmp      r4, #0x1000          @ limH
+    movgt    r4, #0x1000
+    str      r4, [r0,#4*8]        @ gteIR0
+
+    pop      {r4-r11,pc}
+    .size    gteRTPS_nf_arm, .-gteRTPS_nf_arm
+
+
+.global gteRTPT_nf_arm @ r0=CP2 (d,c), r1=opcode (not read); no flag calculation
+gteRTPT_nf_arm:
+    ldr      r1, [r0, #4*19]      @ gteSZ3
+    push     {r4-r11,lr}
+    str      r1, [r0, #4*16]      @ gteSZ0
+    mov      lr, #0               @ lr = 4 * vertex index, loop counter
+
+rtpt_arm_loop:
+    add      r1, r0, lr, lsl #1   @ 8 bytes per input vector
+    ldrd     r8, [r1]             @ VXYZ(v)
+    do_rtpx_mac
+
+    ssatx_prep r6, 16
+    usat16_  r2, r12              @ limD
+    add      r1, r0, #4*25        @ gteMAC1
+    ldr      r3, [r0,#4*(32+26)]  @ gteH
+    stmia    r1, {r10-r12}        @ gteMAC123 save
+    add      r1, r0, #4*17        @ gteSZ1
+    ssatx    r10,r6, 16
+    ssatx    r11,r6, 16
+    ssatx    r12,r6, 16
+    str      r2, [r1, lr]         @ fSZ(v)
+    cmp      r3, r2, lsl #1       @ gteH < gteSZ3*2 ?
+    mov      r9, #1<<30           @ overflow quotient, clamped to 0x1ffff at 1:
+    bhs      1f                   @ no: skip the divide (always taken when SZ == 0)
+.if 1
+    lsl      r3, #16              @ q = (gteH << 16) / SZ
+    udiv     r9, r3, r2
+.else
+    push     {r0, r12, lr}
+    mov      r0, r3
+    mov      r1, r2
+    bl       DIVIDE
+    mov      r9, r0
+    pop      {r0, r12, lr}
+.endif
+1:
+                                  cmp      r9, #0x20000
+    add      r1, r0, #4*12        @ gteSXY0
+                                  movhs    r9, #0x20000
+    ldrd     r6, [r0,#4*(32+24)]  @ gteOFXY
+                   /* quotient */ subhs    r9, #1
+    mov      r2, r6, asr #31      @ sign-extend gteOFX to 64 bit (was 0: broke negative OFX)
+    smlal    r6, r2, r10, r9
+    mov      r3, r7, asr #31      @ sign-extend gteOFY to 64 bit
+    smlal    r7, r3, r11, r9
+    lsr      r6, #16
+    orr      r6, r2, lsl #16      @ (gteOFX + gteIR1 * q) >> 16
+    ssatx_prep r2, 11
+    lsr      r7, #16
+    orr      r7, r3, lsl #16      @ (gteOFY + gteIR2 * q) >> 16
+    ssatx    r6, r2, 11           @ gteSX(v)
+    ssatx    r7, r2, 11           @ gteSY(v)
+    strh     r6, [r1, lr]!
+    add      lr, #4
+    strh     r7, [r1, #2]
+    cmp      lr, #12              @ all 3 vectors done?
+    blt      rtpt_arm_loop
+
+    ldrd     r4, [r0, #4*(32+27)] @ gteDQA, gteDQB
+    add      r1, r0, #4*9         @ gteIR1
+    mla      r3, r4, r9, r5       @ gteDQB + gteDQA * q
+    stmia    r1, {r10,r11,r12}    @ gteIR123 save
+
+    str      r3, [r0,#4*24]       @ gteMAC0
+    asrs     r3, #12
+    movmi    r3, #0
+    cmp      r3, #0x1000          @ limH
+    movgt    r3, #0x1000
+    str      r3, [r0,#4*8]        @ gteIR0
+
+    pop      {r4-r11,pc}
+    .size    gteRTPT_nf_arm, .-gteRTPT_nf_arm
+
 
 .global gteNCLIP_arm @ r0=CP2 (d,c),
 gteNCLIP_arm:
diff --git a/libpcsxcore/gte_neon.s b/libpcsxcore/gte_neon.s
index ab8c1b6e..b0a8f16c 100644
--- a/libpcsxcore/gte_neon.s
+++ b/libpcsxcore/gte_neon.s
@@ -1,10 +1,7 @@
 /*
  * (C) GraÅ¾vydas "notaz" Ignotas, 2011
  *
- * This work is licensed under the terms of any of these licenses
- * (at your option):
- *  - GNU GPL, version 2 or later.
- *  - GNU LGPL, version 2.1 or later.
+ * This work is licensed under the terms of GNU GPL version 2 or later.
  * See the COPYING file in the top-level directory.
  */
 
diff --git a/libpcsxcore/new_dynarec/emu_if.c b/libpcsxcore/new_dynarec/emu_if.c
index f65e5bd5..39bbf2ad 100644
--- a/libpcsxcore/new_dynarec/emu_if.c
+++ b/libpcsxcore/new_dynarec/emu_if.c
@@ -190,9 +190,12 @@ static int ari64_init()
 #if !defined(DRC_DBG) && !defined(PCNT)
 #ifdef __arm__
 	gte_handlers[0x06] = gteNCLIP_arm;
+	gte_handlers_nf[0x01] = gteRTPS_nf_arm;
+	gte_handlers_nf[0x30] = gteRTPT_nf_arm;
 #endif
 #ifdef __ARM_NEON__
 	// compiler's _nf version is still a lot slower then neon
+	// _nf_arm RTPS is roughly the same, RTPT slower
 	gte_handlers[0x01] = gte_handlers_nf[0x01] = gteRTPS_neon;
 	gte_handlers[0x30] = gte_handlers_nf[0x30] = gteRTPT_neon;
 	gte_handlers[0x12] = gte_handlers_nf[0x12] = gteMVMVA_neon;
-- 
2.47.3