From: kub Date: Tue, 7 Apr 2020 20:07:38 +0000 (+0200) Subject: ym2612 ARM optimisations X-Git-Tag: v2.00~754 X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c9183791372cea2c39d0233186496c26f5c7cf3e;p=picodrive.git ym2612 ARM optimisations --- diff --git a/pico/sound/ym2612_arm.S b/pico/sound/ym2612_arm.S index 59abb74e..1370e6cf 100644 --- a/pico/sound/ym2612_arm.S +++ b/pico/sound/ym2612_arm.S @@ -15,8 +15,8 @@ #include "../arm_features.h" -@ very simple adaption YM2612 output rate to sample rate (~1M cycles @44100) -//#define INTERPOL +@ very simple YM2612 output rate to sample rate adaption (~500k cycles @44100) +#define INTERPOL .equiv SLOT1, 0 .equiv SLOT2, 2 @@ -44,7 +44,7 @@ @ r5=slot, r1=eg_cnt, trashes: r0,r2,r3 @ writes output to routp, but only if vol_out changes -.macro update_eg_phase_slot slot +.macro update_eg_phase_slot #if defined(INTERPOL) ldrh r0, [r5,#0x34] @ vol_out #endif @@ -190,21 +190,6 @@ ldrh r3, [r5,#0x18] @ tl add r0, r0, r3 @ volume += tl strh r0, [r5,#0x34] @ vol_out -.if \slot == SLOT1 - mov r6, r6, lsr #16 - orr r6, r0, r6, lsl #16 -.elseif \slot == SLOT2 - mov r6, r6, lsl #16 - mov r0, r0, lsl #16 - orr r6, r0, r6, lsr #16 -.elseif \slot == SLOT3 - mov r7, r7, lsr #16 - orr r7, r0, r7, lsl #16 -.elseif \slot == SLOT4 - mov r7, r7, lsl #16 - mov r0, r0, lsl #16 - orr r7, r0, r7, lsr #16 -.endif 0: @ EG_OFF .endm @@ -672,24 +657,16 @@ chan_render_loop: mov r11, r1 and r0, r0, #7 orr r4, r4, r0 @ (length<<8)|algo - add r0, lr, #0x44 - ldmia r0, {r8,r9} @ eg_timer, eg_timer_add + ldr r8, [lr, #0x44] @ eg_timer + ldr r9, [lr, #0x48] @ eg_timer_add ldr r10, [lr, #0x54] @ op1_out -@ ldmia lr, {r6,r7} @ load volumes - ldr r5, [lr, #0x40] @ CH - ldrh r6, [r5, #0x34] @ vol_out values for all slots - ldrh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] - ldrh r7, [r5, #0x34+SLOT_STRUCT_SIZE] - ldrh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] - orr r6, r6, r2, lsl #16 - orr r7, r7, r3, lsl #16 tst r12, #8 @ lfo? beq crl_loop crl_loop_lfo: - add r0, lr, #0x30 - ldmia r0, {r1,r2} @ lfo_cnt, lfo_inc + ldr r1, [lr, #0x30] @ lfo_cnt + ldr r2, [lr, #0x34] @ lfo_inc subs r4, r4, #0x100 bmi crl_loop_end @@ -707,37 +684,48 @@ crl_loop: bmi crl_loop_end @ -- SSG -- - add r0, lr, #0x3c - ldmia r0, {r1,r5} @ eg_cnt, CH + ldr r5, [lr, #0x40] @ CH @ r5=slot, trashes: r0,r2,r3 + mov r6, #4 +ssg_upd_loop: update_ssg_eg - add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT2 (2) - update_ssg_eg - sub r5, r5, #SLOT_STRUCT_SIZE @ SLOT3 (1) - update_ssg_eg - add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT4 (3) +#if 0 + subs r6, r6, #1 + addne r5, r5, #SLOT_STRUCT_SIZE +#else + add r5, r5, #SLOT_STRUCT_SIZE*2 update_ssg_eg + subs r6, r6, #2 + subne r5, r5, #SLOT_STRUCT_SIZE +#endif + bne ssg_upd_loop sub r5, r5, #SLOT_STRUCT_SIZE*3 @ -- EG -- add r8, r8, r9 cmp r8, #EG_TIMER_OVERFLOW bcc eg_done + ldr r1, [lr, #0x3c] @ eg_cnt eg_loop: sub r8, r8, #EG_TIMER_OVERFLOW add r1, r1, #1 cmp r1, #4096 movge r1, #1 - @ SLOT1 (0) - @ r5=slot, r1=eg_cnt, trashes: r0,r2,r3 - update_eg_phase_slot SLOT1 - add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT2 (2) - update_eg_phase_slot SLOT2 - sub r5, r5, #SLOT_STRUCT_SIZE @ SLOT3 (1) - update_eg_phase_slot SLOT3 - add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT4 (3) - update_eg_phase_slot SLOT4 + + mov r6, #4 +eg_upd_loop: + update_eg_phase_slot +#if 1 + subs r6, r6, #1 + addne r5, r5, #SLOT_STRUCT_SIZE +#else + add r5, r5, #SLOT_STRUCT_SIZE*2 + update_eg_phase_slot + subs r6, r6, #2 + subne r5, r5, #SLOT_STRUCT_SIZE +#endif + bne eg_upd_loop cmp r8, #EG_TIMER_OVERFLOW sub r5, r5, #SLOT_STRUCT_SIZE*3 @@ -754,64 +742,49 @@ eg_done: beq crl_loop @ output interpolation -#if 0 // too expensive on slow platforms +#if defined(INTERPOL) +#if 1 // possibly too expensive for slow platforms? @ basic interpolator, interpolate in middle region, else use closer value mov r3, r8, lsr #EG_SH @ eg_timer, [0..3<>EG_SH)/2 - bgt 0f @ mix is vol_out - - ldrh r0, [r5,#0x36] @ SLOT1 vol_ipol - lsleq r2, r6, #16 - addeq r0, r0, r2, lsr #16 - lsreq r0, r0, #1 - mov r6, r6, lsr #16 - orr r6, r0, r6, lsl #16 - - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol - addeq r0, r0, r6, lsr #16 - lsreq r0, r0, #1 - mov r6, r6, lsl #16 - orr r6, r6, r0 - ror r6, r6, #16 - - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol - lsleq r2, r7, #16 - addeq r0, r0, r2, lsr #16 - lsreq r0, r0, #1 - mov r7, r7, lsr #16 - orr r7, r0, r7, lsl #16 - - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol - addeq r0, r0, r7, lsr #16 - lsreq r0, r0, #1 - mov r7, r7, lsl #16 - orr r7, r7, r0 - ror r7, r7, #16 -#elif defined(INTERPOL) + bne 0f @ mix is vol_out + + ldr r6, [r5, #0x34] @ vol_out, vol_ipol for all slots + ldr r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] + ldr r7, [r5, #0x34+SLOT_STRUCT_SIZE] + ldr r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] + add r6, r6, r6, lsl #16 + lsr r6, r6, #17 + add r2, r2, r2, lsl #16 + lsr r2, r2, #17 + add r7, r7, r7, lsl #16 + lsr r7, r7, #17 + add r3, r3, r3, lsl #16 + lsr r3, r3, #17 + b 1f +#else @ super-basic... just take value closest to sample point mov r3, r8, lsr #EG_SH-1 @ eg_timer, [0..3<>EG_SH) - bge 0f @ mix is vol_out - - ldrh r0, [r5,#0x36] @ SLOT1 vol_ipol - mov r6, r6, lsr #16 - orr r6, r0, r6, lsl #16 - - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol - mov r6, r6, lsl #16 - orr r6, r6, r0 - ror r6, r6, #16 +#endif - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol - mov r7, r7, lsr #16 - orr r7, r0, r7, lsl #16 +0: ldrgeh r6, [r5, #0x34] @ vol_out values for all slots + ldrlth r6, [r5, #0x36] @ vol_ipol values for all slots + ldrgeh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] + ldrlth r2, [r5, #0x36+SLOT_STRUCT_SIZE*2] + ldrgeh r7, [r5, #0x34+SLOT_STRUCT_SIZE] + ldrlth r7, [r5, #0x36+SLOT_STRUCT_SIZE] + ldrgeh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] + ldrlth r3, [r5, #0x36+SLOT_STRUCT_SIZE*3] - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol - mov r7, r7, lsl #16 - orr r7, r7, r0 - ror r7, r7, #16 +#else + ldrh r6, [r5, #0x34] @ vol_out values for all slots + ldrh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] + ldrh r7, [r5, #0x34+SLOT_STRUCT_SIZE] + ldrh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] #endif -0: +1: orr r6, r6, r2, lsl #16 + orr r7, r7, r3, lsl #16 @ -- SLOT1 -- PIC_LDR(r3, r2, ym_tl_tab) @@ -893,34 +866,28 @@ crl_algo_done: strne r1, [r11], #4 b crl_do_phase -ctl_sample_skip: - and r1, r12, #1 - add r1, r1, #1 - add r11,r11, r1, lsl #2 - b crl_do_phase - ctl_sample_mono: ldr r1, [r11] add r1, r0, r1 str r1, [r11], #4 + b crl_do_phase + +ctl_sample_skip: + and r1, r12, #1 + add r1, r1, #1 + add r11,r11, r1, lsl #2 crl_do_phase: @ -- PHASE UPDATE -- add r5, lr, #0x10 - ldmia r5, {r0-r1} - add r5, lr, #0x20 - ldmia r5, {r2-r3} - add r5, lr, #0x10 - add r0, r0, r2 - add r1, r1, r3 - stmia r5!,{r0-r1} - ldmia r5, {r0-r1} - add r5, lr, #0x28 - ldmia r5, {r2-r3} - add r5, lr, #0x18 - add r0, r0, r2 - add r1, r1, r3 - stmia r5, {r0-r1} + ldmia r5, {r0-r3,r6-r7} + add r0, r0, r6 + add r1, r1, r7 + ldr r6, [r5, #0x18] + ldr r7, [r5, #0x1c] + add r2, r2, r6 + add r3, r3, r7 + stmia r5, {r0-r3} tst r12, #8 bne crl_loop_lfo @@ -928,7 +895,6 @@ crl_do_phase: crl_loop_end: -@ stmia lr, {r6,r7} @ save volumes (for debug) str r8, [lr, #0x44] @ eg_timer str r12, [lr, #0x4c] @ pack (for lfo_ampm) str r4, [lr, #0x50] @ was_update