From 0c2ca3ba2ca8a191fc3f6d9782dc2420537b1964 Mon Sep 17 00:00:00 2001
From: notaz
Date: Fri, 30 Sep 2011 01:04:21 +0300
Subject: [PATCH] gte_arm: implement RTPS, RTPT

---
 libpcsxcore/gte_arm.h            |   2 +
 libpcsxcore/gte_arm.s            | 241 ++++++++++++++++++++++++++++++-
 libpcsxcore/gte_neon.s           |   5 +-
 libpcsxcore/new_dynarec/emu_if.c |   3 +
 4 files changed, 239 insertions(+), 12 deletions(-)

diff --git a/libpcsxcore/gte_arm.h b/libpcsxcore/gte_arm.h
index 69924c8d..7cd381a1 100644
--- a/libpcsxcore/gte_arm.h
+++ b/libpcsxcore/gte_arm.h
@@ -1 +1,3 @@
+void gteRTPS_nf_arm(void *cp2_regs, int opcode);
+void gteRTPT_nf_arm(void *cp2_regs, int opcode);
 void gteNCLIP_arm(void *cp2_regs, int opcode);
diff --git a/libpcsxcore/gte_arm.s b/libpcsxcore/gte_arm.s
index c8a7279a..ea1f2cd1 100644
--- a/libpcsxcore/gte_arm.s
+++ b/libpcsxcore/gte_arm.s
@@ -1,10 +1,7 @@
 /*
  * (C) Gražvydas "notaz" Ignotas, 2011
  *
- * This work is licensed under the terms of any of these licenses
- * (at your option):
- * - GNU GPL, version 2 or later.
- * - GNU LGPL, version 2.1 or later.
+ * This work is licensed under the terms of GNU GPL version 2 or later.
  * See the COPYING file in the top-level directory.
  */
 
@@ -13,15 +10,243 @@
 .text
 .align 2
 
-.macro sgnxt16 reg
+.macro sgnxt16 rd
 .if HAVE_ARMV7
-    sxth     \reg, \reg
+    sxth     \rd, \rd
 .else
-    lsl      \reg, \reg, #16
-    asr      \reg, \reg, #16
+    lsl      \rd, \rd, #16
+    asr      \rd, \rd, #16
 .endif
 .endm
 
+@ prepare work reg for ssatx
+@ in: wr reg, bit to saturate to
+.macro ssatx_prep wr bit
+.if !HAVE_ARMV7
+    mov      \wr, #(1<<(\bit-1))  @ wr = 2^(bit-1), the limit ssatx compares against
+.endif
+.endm
+
+.macro ssatx rd wr bit
+.if HAVE_ARMV7
+    ssat     \rd, #\bit, \rd
+.else
+    cmp      \rd, \wr
+    subge    \rd, \wr, #1         @ rd >= wr: clamp to wr-1
+    cmn      \rd, \wr
+    rsblt    \rd, \wr, #0         @ rd < -wr: clamp to -wr
+.endif
+.endm
+
+.macro usat16_ rd rs
+.if HAVE_ARMV7
+    usat     \rd, #16, \rs
+.else
+    subs     \rd, \rs, #0         @ rd = rs, flags set from the value
+    movlt    \rd, #0              @ negative: clamp to 0
+    cmp      \rd, #0x10000
+    movge    \rd, #0x0ff00        @ too large: build 0xffff in two steps
+    orrge    \rd, #0x000ff
+.endif
+.endm
+
+@ unsigned divide rd = rm / rs
+@ no div by 0 check
+@ in: rm, rs
+@ trash: rm rs
+.macro udiv rd rm rs
+    clz      \rd, \rs             @ rd = number of leading zeros in the divisor
+    lsl      \rs, \rs, \rd        @ shift up divisor
+    orr      \rd, \rd, #1<<31
+    lsr      \rd, \rd, \rd        @ rd = single marker bit at position 31-clz
+0:
+    cmp      \rm, \rs
+    subcs    \rm, \rs             @ subtract when the shifted divisor fits
+    adcs     \rd, \rd, \rd        @ shift quotient bit in; C gets set once the marker falls out
+    lsr      \rs, #1
+    bcc      0b
+.endm
+
+
+@ calculate RTPS/RTPT MAC values
+@ in: r0 context, r8,r9 VXYZ
+@ out: r10-r12 MAC123
+@ trash: r1-r7
+.macro do_rtpx_mac
+    add      r1, r0, #4*32
+    add      r2, r0, #4*(32+5)    @ gteTRX
+    ldmia    r1!,{r5-r7}          @ gteR1*,gteR2*
+    ldmia    r2, {r10-r12}        @ three words starting at gteTRX
+    smulbb   r2, r5, r8           @ gteR11 * gteVX0
+    smultt   r3, r5, r8           @ gteR12 * gteVY0
+    smulbb   r4, r6, r9           @ gteR13 * gteVZ0
+    qadd     r2, r2, r3           @ saturating add of the first two products
+    asr      r4, r4, #1           @ prevent oflow, lose a bit
+    add      r3, r4, r2, asr #1   @ (third >> 1) + (sum >> 1)
+    add      r10,r10,r3, asr #11  @ gteMAC1
+    smultb   r2, r6, r8           @ gteR21 * gteVX0
+    smulbt   r3, r7, r8           @ gteR22 * gteVY0
+    smultb   r4, r7, r9           @ gteR23 * gteVZ0
+    ldmia    r1!,{r5-r6}          @ gteR3*
+    qadd     r2, r2, r3
+    asr      r4, r4, #1
+    add      r3, r4, r2, asr #1
+    add      r11,r11,r3, asr #11  @ gteMAC2
+    @ be more accurate for gteMAC3, since it's also a divider
+    smulbb   r2, r5, r8           @ gteR31 * gteVX0
+    smultt   r3, r5, r8           @ gteR32 * gteVY0
+    smulbb   r4, r6, r9           @ gteR33 * gteVZ0
+    qadd     r2, r2, r3
+    asr      r3, r4, #31          @ expand to 64bit
+    adds     r1, r2, r4
+    adc      r3, r2, asr #31      @ 64bit sum in r3,r1
+    add      r12,r12,r3, lsl #20  @ high word of (64bit sum >> 12)
+    add      r12,r12,r1, lsr #12  @ gteMAC3
+.endm
+
+
+.global gteRTPS_nf_arm @ r0=CP2 (d,c),
+gteRTPS_nf_arm:                   @ transforms VXYZ(0); stores MAC0-3, IR0-3, SZ0-3, SXY0-2
+    push     {r4-r11,lr}
+
+    ldmia    r0, {r8,r9}          @ VXYZ(0)
+    do_rtpx_mac
+    add      r1, r0, #4*25        @ gteMAC1
+    add      r2, r0, #4*17        @ gteSZ1
+    stmia    r1, {r10-r12}        @ gteMAC123 save
+    ldmia    r2, {r3-r5}          @ old gteSZ1-3, stored one slot lower below
+    add      r1, r0, #4*16        @ gteSZ0
+    add      r2, r0, #4*9         @ gteIR1
+    ssatx_prep r6, 16
+    usat16_  lr, r12              @ limD
+    ssatx    r10,r6, 16
+    ssatx    r11,r6, 16
+    ssatx    r12,r6, 16
+    stmia    r1, {r3-r5,lr}       @ gteSZ*
+    ldr      r3, [r0,#4*(32+26)]  @ gteH
+    stmia    r2, {r10,r11,r12}    @ gteIR123 save
+    cmp      r3, lr, lsl #1       @ gteH < gteSZ3*2 ?
+    mov      r9, #1<<30           @ quotient used when the divide is skipped; clamped below
+    bhs      1f                   @ skip the divide when gteH >= gteSZ3*2 (unsigned)
+.if 1
+    lsl      r3, #16
+    udiv     r9, r3, lr           @ r9 = (gteH << 16) / gteSZ3
+.else                             @ alternative via external DIVIDE, not assembled while the .if above is 1
+    push     {r0, r12}
+    mov      r0, r3
+    mov      r1, lr
+    bl       DIVIDE
+    mov      r9, r0
+    pop      {r0, r12}
+.endif
+1:
+    ldrd     r6, [r0,#4*(32+24)]  @ gteOFXY
+    cmp      r9, #0x20000
+    add      r1, r0, #4*12        @ gteSXY0
+    movhs    r9, #0x20000         @ quotient >= 0x20000 becomes 0x1ffff (with the subhs below)
+    ldmia    r1, {r2-r4}          @ three words from gteSXY0; r3,r4 are stored back one slot lower
+    /* quotient */ subhs r9, #1
+    mov      r2, #0
+    smlal    r6, r2, r10, r9      @ r2:r6 = gteOFX + IR1 * quotient
+    stmia    r1!,{r3,r4}          @ shift gteSXY
+    mov      r3, #0
+    smlal    r7, r3, r11, r9      @ r3:r7 = gteOFY + IR2 * quotient
+    lsr      r6, #16
+    /* gteDQA, gteDQB */ ldrd r10,[r0, #4*(32+27)]
+    orr      r6, r2, lsl #16      @ (gteOFX + gteIR1 * q) >> 16
+    ssatx_prep r2, 11
+    lsr      r7, #16
+    /* gteDQB + gteDQA * q */ mla r4, r10, r9, r11
+    orr      r7, r3, lsl #16      @ (gteOFY + gteIR2 * q) >> 16
+    ssatx    r6, r2, 11           @ gteSX2
+    ssatx    r7, r2, 11           @ gteSY2
+    strh     r6, [r1]             @ r1 points 8 bytes past gteSXY0 after the stmia writeback
+    strh     r7, [r1, #2]
+    str      r4, [r0,#4*24]       @ gteMAC0
+    asrs     r4, #12
+    movmi    r4, #0               @ negative: clamp to 0
+    cmp      r4, #0x1000          @ limH
+    movgt    r4, #0x1000
+    str      r4, [r0,#4*8]        @ gteIR0
+
+    pop      {r4-r11,pc}
+    .size    gteRTPS_nf_arm, .-gteRTPS_nf_arm
+
+
+.global gteRTPT_nf_arm @ r0=CP2 (d,c),
+gteRTPT_nf_arm:                   @ same steps for the vertices at r0+0, r0+8, r0+16
+    ldr      r1, [r0, #4*19]      @ gteSZ3
+    push     {r4-r11,lr}
+    str      r1, [r0, #4*16]      @ gteSZ0
+    mov      lr, #0               @ lr = loop byte offset: 0, 4, 8
+
+rtpt_arm_loop:
+    add      r1, r0, lr, lsl #1   @ vertex data sits at byte offset lr*2
+    ldrd     r8, [r1]             @ VXYZ(v)
+    do_rtpx_mac
+
+    ssatx_prep r6, 16
+    usat16_  r2, r12              @ limD
+    add      r1, r0, #4*25        @ gteMAC1
+    ldr      r3, [r0,#4*(32+26)]  @ gteH
+    stmia    r1, {r10-r12}        @ gteMAC123 save
+    add      r1, r0, #4*17        @ word 17, indexed by lr below
+    ssatx    r10,r6, 16
+    ssatx    r11,r6, 16
+    ssatx    r12,r6, 16
+    str      r2, [r1, lr]         @ fSZ(v)
+    cmp      r3, r2, lsl #1       @ gteH < gteSZ3*2 ?
+    mov      r9, #1<<30           @ quotient used when the divide is skipped; clamped below
+    bhs      1f
+.if 1
+    lsl      r3, #16
+    udiv     r9, r3, r2           @ r9 = (gteH << 16) / limD value in r2
+.else                             @ alternative via external DIVIDE, not assembled while the .if above is 1
+    push     {r0, r12, lr}
+    mov      r0, r3
+    mov      r1, r2
+    bl       DIVIDE
+    mov      r9, r0
+    pop      {r0, r12, lr}
+.endif
+1:
+    cmp      r9, #0x20000
+    add      r1, r0, #4*12        @ word 12, indexed by lr at the strh below
+    movhs    r9, #0x20000         @ quotient >= 0x20000 becomes 0x1ffff (with the subhs below)
+    ldrd     r6, [r0,#4*(32+24)]  @ gteOFXY
+    /* quotient */ subhs r9, #1
+    mov      r2, #0
+    smlal    r6, r2, r10, r9      @ r2:r6 = gteOFX + IR1 * quotient
+    mov      r3, #0
+    smlal    r7, r3, r11, r9      @ r3:r7 = gteOFY + IR2 * quotient
+    lsr      r6, #16
+    orr      r6, r2, lsl #16      @ (gteOFX + gteIR1 * q) >> 16
+    ssatx_prep r2, 11
+    lsr      r7, #16
+    orr      r7, r3, lsl #16      @ (gteOFY + gteIR2 * q) >> 16
+    ssatx    r6, r2, 11           @ gteSX(v)
+    ssatx    r7, r2, 11           @ gteSY(v)
+    strh     r6, [r1, lr]!        @ writeback: r1 += lr, so the next strh hits the same word
+    add      lr, #4
+    strh     r7, [r1, #2]
+    cmp      lr, #12              @ three iterations: lr = 0, 4, 8
+    blt      rtpt_arm_loop
+
+    ldrd     r4, [r0, #4*(32+27)] @ gteDQA, gteDQB
+    add      r1, r0, #4*9         @ gteIR1
+    mla      r3, r4, r9, r5       @ gteDQB + gteDQA * q
+    stmia    r1, {r10,r11,r12}    @ gteIR123 save
+
+    str      r3, [r0,#4*24]       @ gteMAC0
+    asrs     r3, #12
+    movmi    r3, #0               @ negative: clamp to 0
+    cmp      r3, #0x1000          @ limH
+    movgt    r3, #0x1000
+    str      r3, [r0,#4*8]        @ gteIR0
+
+    pop      {r4-r11,pc}
+    .size    gteRTPT_nf_arm, .-gteRTPT_nf_arm
+
 
 .global gteNCLIP_arm @ r0=CP2 (d,c),
 gteNCLIP_arm:
diff --git a/libpcsxcore/gte_neon.s b/libpcsxcore/gte_neon.s
index ab8c1b6e..b0a8f16c 100644
--- a/libpcsxcore/gte_neon.s
+++ b/libpcsxcore/gte_neon.s
@@ -1,10 +1,7 @@
 /*
  * (C) Gražvydas "notaz" Ignotas, 2011
  *
- * This work is licensed under the terms of any of these licenses
- * (at your option):
- * - GNU GPL, version 2 or later.
- * - GNU LGPL, version 2.1 or later.
+ * This work is licensed under the terms of GNU GPL version 2 or later.
  * See the COPYING file in the top-level directory.
  */
 
diff --git a/libpcsxcore/new_dynarec/emu_if.c b/libpcsxcore/new_dynarec/emu_if.c
index f65e5bd5..39bbf2ad 100644
--- a/libpcsxcore/new_dynarec/emu_if.c
+++ b/libpcsxcore/new_dynarec/emu_if.c
@@ -190,9 +190,12 @@ static int ari64_init()
 #if !defined(DRC_DBG) && !defined(PCNT)
 #ifdef __arm__
 	gte_handlers[0x06] = gteNCLIP_arm;
+	gte_handlers_nf[0x01] = gteRTPS_nf_arm;
+	gte_handlers_nf[0x30] = gteRTPT_nf_arm;
 #endif
 #ifdef __ARM_NEON__
 	// compiler's _nf version is still a lot slower then neon
+	// _nf_arm RTPS is roughly the same, RTPT slower
 	gte_handlers[0x01] = gte_handlers_nf[0x01] = gteRTPS_neon;
 	gte_handlers[0x30] = gte_handlers_nf[0x30] = gteRTPT_neon;
 	gte_handlers[0x12] = gte_handlers_nf[0x12] = gteMVMVA_neon;
-- 
2.39.5