From 5494fde2932a6a5d04eec5d054bd5efbc2d97e4d Mon Sep 17 00:00:00 2001
From: kub <derkub@gmail.com>
Date: Tue, 28 Dec 2021 17:43:25 +0100
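Subject: [PATCH] sound, ym2612 optimizations and fixes

C renderer (pico/sound/ym2612.c):
- update_algo_channel: in the M1/MEM/C1/C2 algorithm case, update ct->mem
  before the early-out that is taken while eg_timer >= (1<<EG_SH).
- chan_render_loop: when eg_timer holds three or more ticks and the
  feedback bits (pack & 0xf000) are clear, advance the LFO and the four
  phase counters for the skipped ticks in one step before the tick loop;
  the per-tick LFO/phase update now runs only for ticks that calculate.
- rewrite the op1 feedback sum expression.

ARM renderer (pico/sound/ym2612_arm.S):
- same LFO/phase fast-forward ahead of crl_smp_loop; skipped ticks jump
  to the new crl_ff label.
- use asr instead of lsr for the >>1 and >>17 shifts feeding lookup_tl.
- LFO macro: choose the 0x3f inversion by testing bit 0x40 instead of
  comparing against 0x40, and mask the value stored at bit 16 to 5 bits.
- EG: test the 0x30 field again after the decrement instead of relying
  on the flags of subnes.
- SSG-EG: branch over the MAX_ATT reset when bit 0x4 is set instead of
  flipping the bit in r0.
- move "bic r1, r1, #1" to just before the table index is built.
- fold an "lsr #1" into the following add.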
---
 pico/sound/ym2612.c     |  73 ++++++++++++++++-----------
 pico/sound/ym2612_arm.S | 106 +++++++++++++++++++++++++++-------------
 2 files changed, 117 insertions(+), 62 deletions(-)

diff --git a/pico/sound/ym2612.c b/pico/sound/ym2612.c
index ccdce778..418c1b36 100644
--- a/pico/sound/ym2612.c
+++ b/pico/sound/ym2612.c
@@ -1067,10 +1067,10 @@ static int update_algo_channel(chan_rend_context *ct, unsigned int eg_out, unsig
 			/*    +----C1----+     */
 			/* M1-+-MEM---M2-+-OUT */
 			/*    +----C2----+     */
-			if (ct->eg_timer >= (1<<EG_SH)) break;
-
 			m2 = ct->mem;
 			ct->mem = c1 = c2 = ct->op1_out>>16;
+			if (ct->eg_timer >= (1<<EG_SH)) break;
+
 			if( eg_out < ENV_QUIET ) {		/* SLOT 3 */
 				smp = op_calc(ct->phase3, eg_out, m2);
 			}
@@ -1138,13 +1138,24 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length)
 		unsigned int eg_out, eg_out2, eg_out4;
 
 		ct->eg_timer += ct->eg_timer_add;
-		while (ct->eg_timer >= 1<<EG_SH) {
-			ct->eg_timer -= 1<<EG_SH;
 
+		if (ct->eg_timer >= 3<<EG_SH && !(ct->pack&0xf000)) { /* 3+ EG ticks pending, no feedback: the loop below calculates only while eg_timer < (2<<EG_SH) */
+			int cnt = (ct->eg_timer>>EG_SH)-2; /* number of ticks the loop below runs without calculating */
 			if (ct->pack & 8) { /* LFO enabled ? (test Earthworm Jim in between demo 1 and 2) */
-				ct->pack = (ct->pack&0xffff) | (advance_lfo(ct->pack >> 16, ct->lfo_cnt, ct->lfo_cnt + ct->lfo_inc) << 16);
-				ct->lfo_cnt += ct->lfo_inc;
+				int inc = cnt*ct->lfo_inc; /* LFO step covering all skipped ticks at once */
+				ct->pack = (ct->pack&0xffff) | (advance_lfo(ct->pack >> 16, ct->lfo_cnt, ct->lfo_cnt + inc) << 16);
+				ct->lfo_cnt += inc;
 			}
+
+			ct->phase1 += cnt*ct->incr1; /* phases for the skipped ticks; the loop adds incrN only on ticks that calculate */
+			ct->phase2 += cnt*ct->incr2;
+			ct->phase3 += cnt*ct->incr3;
+			ct->phase4 += cnt*ct->incr4;
+		}
+
+		while (ct->eg_timer >= 1<<EG_SH) {
+			ct->eg_timer -= 1<<EG_SH;
+
 			if (ct->pack & 2)
 				update_ssg_eg_channel(ct);
 
@@ -1163,10 +1174,15 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length)
 			ct->vol_out3 =  ct->CH->SLOT[SLOT3].vol_out;
 			ct->vol_out4 =  ct->CH->SLOT[SLOT4].vol_out;
 
-			if (ct->pack & 4) goto disabled; /* output disabled */
-
-			/* calculate channel sample */
 			if (ct->eg_timer < (2<<EG_SH) || (ct->pack&0xf000)) {
+				if (ct->pack & 4) goto disabled; /* output disabled */
+
+				if (ct->pack & 8) { /* LFO enabled ? (test Earthworm Jim in between demo 1 and 2) */
+					ct->pack = (ct->pack&0xffff) | (advance_lfo(ct->pack >> 16, ct->lfo_cnt, ct->lfo_cnt + ct->lfo_inc) << 16);
+					ct->lfo_cnt += ct->lfo_inc;
+				}
+
+				/* calculate channel sample */
 				eg_out = ct->vol_out1;
 				if ( (ct->pack & 8) && (ct->pack&(1<<(SLOT1+8))) )
 					eg_out += ct->pack >> (((ct->pack&0xc0)>>6)+24);
@@ -1175,36 +1191,37 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length)
 				{
 					int out = 0;
 
-					if (ct->pack&0xf000) out = ((ct->op1_out>>16) + ((ct->op1_out<<16)>>16)) << ((ct->pack&0xf000)>>12); /* op1_out0 + op1_out1 */
+					if (ct->pack&0xf000) out = ((ct->op1_out + (ct->op1_out<<16))>>16) << ((ct->pack&0xf000)>>12); /* op1_out0 + op1_out1 */
 					ct->op1_out <<= 16;
 					ct->op1_out |= (unsigned short)op_calc1(ct->phase1, eg_out, out);
 				} else {
 					ct->op1_out <<= 16; /* op1_out0 = op1_out1; op1_out1 = 0; */
 				}
-			}
 
-			if (ct->eg_timer < (2<<EG_SH)) {
-				eg_out  = ct->vol_out3; // volume_calc(&CH->SLOT[SLOT3]);
-				eg_out2 = ct->vol_out2; // volume_calc(&CH->SLOT[SLOT2]);
-				eg_out4 = ct->vol_out4; // volume_calc(&CH->SLOT[SLOT4]);
+				if (ct->eg_timer < (2<<EG_SH)) {
+					eg_out  = ct->vol_out3; // volume_calc(&CH->SLOT[SLOT3]);
+					eg_out2 = ct->vol_out2; // volume_calc(&CH->SLOT[SLOT2]);
+					eg_out4 = ct->vol_out4; // volume_calc(&CH->SLOT[SLOT4]);
+
+					if (ct->pack & 8) {
+						unsigned int add = ct->pack >> (((ct->pack&0xc0)>>6)+24);
+						if (ct->pack & (1<<(SLOT3+8))) eg_out  += add;
+						if (ct->pack & (1<<(SLOT2+8))) eg_out2 += add;
+						if (ct->pack & (1<<(SLOT4+8))) eg_out4 += add;
+					}
 
-				if (ct->pack & 8) {
-					unsigned int add = ct->pack >> (((ct->pack&0xc0)>>6)+24);
-					if (ct->pack & (1<<(SLOT3+8))) eg_out  += add;
-					if (ct->pack & (1<<(SLOT2+8))) eg_out2 += add;
-					if (ct->pack & (1<<(SLOT4+8))) eg_out4 += add;
+					smp = update_algo_channel(ct, eg_out, eg_out2, eg_out4);
 				}
+				/* done calculating channel sample */
 
-				smp = update_algo_channel(ct, eg_out, eg_out2, eg_out4);
+disabled:
+				/* update phase counters AFTER output calculations */
+				ct->phase1 += ct->incr1;
+				ct->phase2 += ct->incr2;
+				ct->phase3 += ct->incr3;
+				ct->phase4 += ct->incr4;
 			}
-			/* done calculating channel sample */
 
-disabled:
-			/* update phase counters AFTER output calculations */
-			ct->phase1 += ct->incr1;
-			ct->phase2 += ct->incr2;
-			ct->phase3 += ct->incr3;
-			ct->phase4 += ct->incr4;
 		}
 
 		/* mix sample to output buffer */
diff --git a/pico/sound/ym2612_arm.S b/pico/sound/ym2612_arm.S
index 52611fce..16b38337 100644
--- a/pico/sound/ym2612_arm.S
+++ b/pico/sound/ym2612_arm.S
@@ -80,8 +80,7 @@
     cmp     r2, #2
     mov     r2, #1
     mov     r2, r2, lsl r3
-    mov     r2, r2, lsr #1       @ eg_inc_val
-    add     r0, r0, r2
+    add     r0, r0, r2, lsr #1   @ volume += eg_inc_val
     blt     1f                   @ EG_REL
     beq     2f                   @ EG_SUS
 
@@ -138,7 +137,7 @@
     movlt   r3, r0, lsl r3
     ldrlth  r0, [r5,#0x1a]       @ volume, unsigned (0-1023)
     movlt   r3, r3, lsr #1       @ eg_inc_val
-    addlt   r0, r0, r3, lsl #2
+    addlt   r0, r0, r3, lsl #2   @ ...*4
 
     cmp     r2, #2
     blt     1f                   @ EG_REL
@@ -213,9 +212,9 @@
     orrne   r0, r0, #0x400                @ ssgn = 4
     strneh  r0, [r5,#0x30]
 
-    eor     r0, r0, #0x4                  @ if ( !(ssg&0x04) )
-    tst     r0, #0x4
-    cmpne   r2, #EG_ATT                   @ if ( state != EG_ATT )
+    tst     r0, #0x4                      @ if ( !(ssg&0x04) )
+    bne     9f
+    cmp     r2, #EG_ATT                   @ if ( state != EG_ATT )
     movne   r3, #0x400
     subne   r3, r3, #1
     strneh  r3, [r5,#0x1a]                @ volume = MAX_ATT
@@ -273,15 +272,16 @@
     cmp     r2, r1, lsr #LFO_SH
     beq     0f
     and     r3, r2, #0x3f
-    cmp     r2, #0x40
-    eorlt   r3, r3, #0x3f
+    tst     r2, #0x40
+    eoreq   r3, r3, #0x3f
     bic     r12,r12, #0xff000000          @ lfo_ampm &= 0xff
     orr     r12,r12, r3, lsl #1+24
 
     mov     r2, r2, lsr #2
     cmp     r2, r1, lsr #LFO_SH+2
+    andne   r3, r2, #0x1f
     bicne   r12,r12, #0xff0000
-    orrne   r12,r12, r2, lsl #16
+    orrne   r12,r12, r3, lsl #16
 
 0:
 .endm
@@ -306,7 +306,6 @@
     movne   r2, r2,  lsr #6
     addne   r2, r2,  #24
     addne   r1, r1,  r12, lsr r2
-    bic     r1, r1,  #1
 .endm
 
 
@@ -316,6 +315,7 @@
     eorne   \r, \r, #0xff   @ if (sin & 0x100) sin = 0xff - (sin&0xff);
     tst     \r, #0x200
     and     \r, \r, #0xff
+    bic     r1, r1, #1
     orr     \r, \r, r1, lsl #7
     mov     \r, \r, lsl #1
     ldrh    \r, [r3, \r]    @ 2ci if ne
@@ -338,18 +338,17 @@
     ldr     r2, [lr, #0x18]
     ldr     r0, [lr, #0x38] @ mem (signed)
     mov     r2, r2, lsr #16
-    add     r0, r2, r0, lsr #1
+    add     r0, r2, r0, asr #1
     lookup_tl r0                  @ r0=c2
 
 0:
-
     @ SLOT4
     make_eg_out SLOT4
     cmp     r1, #ENV_QUIET
     movcs   r0, #0
     bcs     1f
     ldr     r2, [lr, #0x1c]
-    mov     r0, r0, lsr #1
+    mov     r0, r0, asr #1
     add     r0, r0, r2, lsr #16
     lookup_tl r0                  @ r0=output smp
 
@@ -360,7 +359,7 @@
     movcs   r2, #0
     bcs     2f
     ldr     r2, [lr, #0x14]       @ 1ci
-    mov     r5, r10, lsr #17
+    mov     r5, r10, asr #17
     add     r2, r5, r2, lsr #16
     lookup_tl r2                  @ r2=mem
 
@@ -382,7 +381,7 @@
     ldr     r2, [lr, #0x18]
     ldr     r0, [lr, #0x38] @ mem (signed)
     mov     r2, r2, lsr #16
-    add     r0, r2, r0, lsr #1
+    add     r0, r2, r0, asr #1
     lookup_tl r0                 @ r0=c2
 
 0:
@@ -392,7 +391,7 @@
     movcs   r0, #0
     bcs     1f
     ldr     r2, [lr, #0x1c]
-    mov     r0, r0, lsr #1
+    mov     r0, r0, asr #1
     add     r0, r0, r2, lsr #16
     lookup_tl r0                 @ r0=output smp
 
@@ -425,7 +424,7 @@
     ldr     r2, [lr, #0x18]
     ldr     r0, [lr, #0x38] @ mem (signed)
     mov     r2, r2, lsr #16
-    add     r0, r2, r0, lsr #1
+    add     r0, r2, r0, asr #1
     lookup_tl r0                 @ r0=c2
 
 0:
@@ -437,7 +436,7 @@
     movcs   r0, #0
     bcs     1f
     ldr     r2, [lr, #0x1c]
-    mov     r0, r0, lsr #1
+    mov     r0, r0, asr #1
     add     r0, r0, r2, lsr #16
     lookup_tl r0                 @ r0=output smp
 
@@ -480,7 +479,7 @@
     movcs   r0, #0
     bcs     1f
     ldr     r2, [lr, #0x1c]
-    mov     r0, r0, lsr #1
+    mov     r0, r0, asr #1
     add     r0, r0, r2, lsr #16
     lookup_tl r0                 @ r0=output smp
 
@@ -491,7 +490,7 @@
     movcs   r2, #0
     bcs     2f
     ldr     r2, [lr, #0x14]      @ phase2
-    mov     r5, r10, lsr #17
+    mov     r5, r10, asr #17
     add     r2, r5, r2, lsr #16
     lookup_tl r2                 @ r2=mem
 
@@ -521,7 +520,7 @@
     movcs   r0, #0
     bcs     1f
     ldr     r2, [lr, #0x1c]
-    mov     r0, r0, lsr #1
+    mov     r0, r0, asr #1
     add     r0, r0, r2, lsr #16
     lookup_tl r0                 @ r0=output smp
 
@@ -531,7 +530,7 @@
     cmp     r1, #ENV_QUIET
     bcs     2f
     ldr     r2, [lr, #0x14]
-    mov     r5, r10, lsr #17
+    mov     r5, r10, asr #17
     add     r2, r5, r2, lsr #16
     lookup_tl r2
     add     r0, r0, r2            @ add to smp
@@ -553,7 +552,7 @@
     ldr     r2, [lr, #0x18]
     ldr     r0, [lr, #0x38] @ mem (signed)
     mov     r2, r2, lsr #16
-    add     r0, r2, r0, lsr #1
+    add     r0, r2, r0, asr #1
     lookup_tl r0                 @ r0=output smp
 
 0:
@@ -562,7 +561,7 @@
     cmp     r1, #ENV_QUIET
     bcs     1f
     ldr     r2, [lr, #0x1c]
-    mov     r5, r10, lsr #17
+    mov     r5, r10, asr #17
     add     r2, r5, r2, lsr #16
     lookup_tl r2
     add     r0, r0, r2           @ add to smp
@@ -572,7 +571,7 @@
     cmp     r1, #ENV_QUIET
     bcs     2f
     ldr     r2, [lr, #0x14]
-    mov     r5, r10, lsr #17
+    mov     r5, r10, asr #17
     add     r2, r5, r2, lsr #16
     lookup_tl r2
     add     r0, r0, r2           @ add to smp
@@ -612,7 +611,7 @@
     cmp     r1, #ENV_QUIET
     bcs     2f
     ldr     r2, [lr, #0x14]
-    mov     r5, r10, lsr #17
+    mov     r5, r10, asr #17
     add     r2, r5, r2, lsr #16
     lookup_tl r2
     add     r0, r0, r2           @ add to smp
@@ -712,12 +711,20 @@ crl_loop:
     subs    r8, r8, #(1<<EG_SH)
     blt     crl_smp_loop_end
 
-crl_smp_loop:
+    cmp     r8, #(2<<EG_SH)      @ calculate only for operator memory, sample,
+    tstge   r12, #0xf000         @ ...feedback
+    bne     crl_smp_loop         @ r8 < 2 ticks, or feedback bits set: nothing to fast-forward
+
+    @ -- LFO+PHASE UPDATE, FF --
+    mov     r0, r8, lsr #EG_SH   @ whole ticks left in r8 (one tick was already subtracted above)
+    sub     r0, r0, #1           @ r0 = ticks that will leave eg_done via crl_ff (cnt in the C version)
+
     tst     r12, #8              @ lfo?
-    beq     lfo_done
+    beq     lfo_done_ff          @ no LFO: go straight to the phase fast-forward
 
     ldr     r2, [lr, #0x34]      @ lfo_inc
     ldr     r1, [lr, #0x30]      @ lfo_cnt
+    mul     r2, r0, r2           @ lfo_inc * skipped ticks (r0), applied as one LFO step
 
     add     r2, r2, r1
     str     r2, [lr, #0x30]
@@ -725,7 +732,23 @@ crl_smp_loop:
     @ r12=lfo_ampm[31:16], r1=lfo_cnt_old, r2=lfo_cnt
     advance_lfo_m
 
-lfo_done:
+lfo_done_ff:
+    add     lr, lr, #0x10        @ lr -> block at +0x10 (0x14 is phase2; presumably phase1-4, then incr1-4)
+    ldmia   lr, {r1-r3,r5-r7}    @ r1-r3,r5 = words 0-3 of the block; r6,r7 = words 4,5
+    mul     r6, r0, r6           @ scale by the skipped-tick count in r0
+    mul     r7, r0, r7
+    add     r1, r1, r6
+    add     r2, r2, r7
+    ldr     r6, [lr, #0x18]      @ words 6,7 of the block (presumably incr3, incr4)
+    ldr     r7, [lr, #0x1c]
+    mul     r6, r0, r6
+    mul     r7, r0, r7
+    add     r3, r3, r6
+    add     r5, r5, r7
+    stmia   lr, {r1-r3,r5}       @ write the four advanced words back
+    sub     lr, lr, #0x10        @ restore lr
+
+crl_smp_loop:
     ldr     r5, [lr, #0x40]      @ CH
 #if defined(SSG_EG)
     tst     r12, #0x02              @ ssg_enabled?
@@ -758,7 +781,8 @@ ssg_done:
 
     @ -- EG --
     tst     r4, #0x30
-    subnes  r4, r4, #0x10
+    subne   r4, r4, #0x10
+    tst     r4, #0x30            @ flags from the 0x30 field only; subnes set them from the whole of r4
     bne     eg_done
     orr     r4, r4, #0x30
 
@@ -784,15 +808,28 @@ eg_upd_loop:
     sub     r5, r5, #SLOT_STRUCT_SIZE*3
 
 eg_done:
+    cmp     r8, #(2<<EG_SH)      @ calculate only for operator memory, sample,
+    tstge   r12, #0xf000         @ ...feedback
+    beq     crl_ff               @ r8 >= 2 ticks and no feedback: skip calculation and the per-tick phase store
+
     @ -- disabled? --
-    tst     r12, #0x4
     mov     r0, #0
+    tst     r12, #0x4
     bne     crl_algo_done
 
-    cmp     r8, #(2<<EG_SH)      @ calculate only for operator memory, sample,
-    tstge   r12, #0xf000         @ ...feedback
-    beq     crl_algo_done
+    tst     r12, #8              @ lfo?
+    beq     lfo_done
 
+    ldr     r2, [lr, #0x34]      @ lfo_inc
+    ldr     r1, [lr, #0x30]      @ lfo_cnt
+
+    add     r2, r2, r1
+    str     r2, [lr, #0x30]
+
+    @ r12=lfo_ampm[31:16], r1=lfo_cnt_old, r2=lfo_cnt
+    advance_lfo_m
+
+lfo_done:
     ldrh    r6, [r5, #0x34]      @ vol_out values for all slots
     ldrh    r2, [r5, #0x34+SLOT_STRUCT_SIZE*2]
     ldrh    r7, [r5, #0x34+SLOT_STRUCT_SIZE]
@@ -878,6 +915,7 @@ crl_algo_done:
     stmia   lr, {r1-r3,r5}
     sub     lr, lr, #0x10
 
+crl_ff:
     subs    r8, r8, #(1<<EG_SH)
     bge     crl_smp_loop
 
-- 
2.39.5