source/mupen64plus-rsp-hle/src/musyx.c

   1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
   2  *   Mupen64plus-rsp-hle - musyx.c                                         *
   3  *   Mupen64Plus homepage: http://code.google.com/p/mupen64plus/           *
   4  *   Copyright (C) 2013 Bobby Smiles                                       *
   5  *                                                                         *
   6  *   This program is free software; you can redistribute it and/or modify  *
   7  *   it under the terms of the GNU General Public License as published by  *
   8  *   the Free Software Foundation; either version 2 of the License, or     *
   9  *   (at your option) any later version.                                   *
  10  *                                                                         *
  11  *   This program is distributed in the hope that it will be useful,       *
  12  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  13  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
  14  *   GNU General Public License for more details.                          *
  15  *                                                                         *
  16  *   You should have received a copy of the GNU General Public License     *
  17  *   along with this program; if not, write to the                         *
  18  *   Free Software Foundation, Inc.,                                       *
  19  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
  20  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  21
  22 #include <stdbool.h>
  23 #include <stdint.h>
  24 #include <string.h>
  25 #include <stddef.h>
  26
  27 #include "m64p_plugin.h"
  28 #include "m64p_types.h"
  29 #include "hle.h"
  30 #include "musyx.h"
  31
  32 /* various constants */
  33 enum { SUBFRAME_SIZE = 192 };
  34 enum { MAX_VOICES = 32 };
  35
  36 enum { SAMPLE_BUFFER_SIZE = 0x200 };
  37
  38
  39 enum {
  40     SFD_VOICE_COUNT     = 0x0,
  41     SFD_SFX_INDEX       = 0x2,
  42     SFD_VOICE_BITMASK   = 0x4,
  43     SFD_STATE_PTR       = 0x8,
  44     SFD_SFX_PTR         = 0xc,
  45
  46     SFD_VOICES          = 0x10
  47 };
  48
  49 enum {
  50     VOICE_ENV_BEGIN         = 0x00,
  51     VOICE_ENV_STEP          = 0x10,
  52     VOICE_PITCH_Q16         = 0x20,
  53     VOICE_PITCH_SHIFT       = 0x22,
  54     VOICE_CATSRC_0          = 0x24,
  55     VOICE_CATSRC_1          = 0x30,
  56     VOICE_ADPCM_FRAMES      = 0x3c,
  57     VOICE_SKIP_SAMPLES      = 0x3e,
  58
  59     /* for PCM16 */
  60     VOICE_U16_40            = 0x40,
  61     VOICE_U16_42            = 0x42,
  62
  63     /* for ADPCM */
  64     VOICE_ADPCM_TABLE_PTR   = 0x40,
  65
  66     VOICE_INTERLEAVED_PTR   = 0x44,
  67     VOICE_END_POINT         = 0x48,
  68     VOICE_RESTART_POINT     = 0x4a,
  69     VOICE_U16_4C            = 0x4c,
  70     VOICE_U16_4E            = 0x4e,
  71
  72     VOICE_SIZE              = 0x50
  73 };
  74
  75 enum {
  76     CATSRC_PTR1     = 0x00,
  77     CATSRC_PTR2     = 0x04,
  78     CATSRC_SIZE1    = 0x08,
  79     CATSRC_SIZE2    = 0x0a
  80 };
  81
  82 enum {
  83     STATE_LAST_SAMPLE   = 0x0,
  84     STATE_BASE_VOL      = 0x100,
  85     STATE_CC0           = 0x110,
  86     STATE_740_LAST4     = 0x290
  87 };
  88
  89 enum {
  90     SFX_CBUFFER_PTR     = 0x00,
  91     SFX_CBUFFER_LENGTH  = 0x04,
  92     SFX_TAP_COUNT       = 0x08,
  93     SFX_FIR4_HGAIN      = 0x0a,
  94     SFX_TAP_DELAYS      = 0x0c,
  95     SFX_TAP_GAINS       = 0x2c,
  96     /* padding          = 0x3c */
  97     SFX_FIR4_HCOEFFS    = 0x40
  98 };
  99
 100
 101 /* struct definition */
 102 typedef struct {
 103     /* internal subframes */
 104     int16_t left[SUBFRAME_SIZE];
 105     int16_t right[SUBFRAME_SIZE];
 106     int16_t cc0[SUBFRAME_SIZE];
 107     int16_t e50[SUBFRAME_SIZE];
 108
 109     /* internal subframes base volumes */
 110     int32_t base_vol[4];
 111
 112     /* */
 113     int16_t subframe_740_last4[4];
 114 } musyx_t;
 115
 116 /* helper functions prototypes */
 117 static void load_base_vol(int32_t *base_vol, uint32_t address);
 118 static void save_base_vol(const int32_t *base_vol, uint32_t address);
 119 static void update_base_vol(int32_t *base_vol, uint32_t voice_mask,
 120                             uint32_t last_sample_ptr);
 121
 122 static void init_subframes(musyx_t *musyx);
 123
 124 static uint32_t voice_stage(musyx_t *musyx, uint32_t voice_ptr,
 125                             uint32_t last_sample_ptr);
 126
 127 static void dma_cat8(uint8_t *dst, uint32_t catsrc_ptr);
 128 static void dma_cat16(uint16_t *dst, uint32_t catsrc_ptr);
 129
 130 static void load_samples_PCM16(uint32_t voice_ptr, int16_t *samples,
 131                                unsigned *segbase, unsigned *offset);
 132 static void load_samples_ADPCM(uint32_t voice_ptr, int16_t *samples,
 133                                unsigned *segbase, unsigned *offset);
 134
 135 static void adpcm_decode_frames(int16_t *dst, const uint8_t *src,
 136                                 const int16_t *table, uint8_t count,
 137                                 uint8_t skip_samples);
 138
 139 static int16_t adpcm_get_predicted_sample(uint8_t byte, uint8_t mask,
 140                                           unsigned lshift, unsigned rshift);
 141 static void adpcm_get_predicted_frame(int16_t *dst, const uint8_t *src,
 142                                       const uint8_t *nibbles,
 143                                       unsigned int rshift);
 144 static void adpcm_decode_upto_8_samples(int16_t *dst, const int16_t *src,
 145                                         const int16_t *cb_entry,
 146                                         const int16_t *last_samples,
 147                                         size_t size);
 148
 149 static void mix_voice_samples(musyx_t *musyx, uint32_t voice_ptr,
 150                               const int16_t *samples, unsigned segbase,
 151                               unsigned offset, uint32_t last_sample_ptr);
 152
 153 static void sfx_stage(musyx_t *musyx, uint32_t sfx_ptr, uint16_t idx);
 154 static void mix_subframes(int16_t *y, const int16_t *x, int16_t hgain);
 155 static void mix_fir4(int16_t *y, const int16_t *x, int16_t hgain, const int16_t *hcoeffs);
 156
 157
 158 static void interleave_stage(musyx_t *musyx, uint32_t output_ptr);
 159
 160
 161 static uint8_t  *dram_u8(uint32_t address);
 162 static uint16_t *dram_u16(uint32_t address);
 163 static uint32_t *dram_u32(uint32_t address);
 164
 165 static void load_u8(uint8_t *dst, uint32_t address, size_t count);
 166 static void load_u16(uint16_t *dst, uint32_t address, size_t count);
 167 static void load_u32(uint32_t *dst, uint32_t address, size_t count);
 168
 169 static void store_u16(const uint16_t *src, uint32_t address, size_t count);
 170
 171 static inline unsigned int align(unsigned int x, unsigned amount)
 172 {
 173     --amount;
 174     return (x + amount) & ~amount;
 175 }
 176
 177 static int32_t rdot(size_t n, const int16_t *x, const int16_t *y)
 178 {
 179     int32_t accu = 0;
 180
 181     y += n;
 182
 183     while (n != 0) {
 184         accu += ((int32_t)*(x++) * (int32_t)*(--y));
 185         --n;
 186     }
 187
 188     return accu;
 189 }
 190
 191
 192 static int32_t dot4(const int16_t *x, const int16_t *y)
 193 {
 194     size_t i;
 195     int32_t accu = 0;
 196
 197     for (i = 0; i < 4; ++i)
 198         accu = clamp_s16(accu + (((int32_t)x[i] * (int32_t)y[i]) >> 15));
 199
 200     return accu;
 201 }
 202
 203 /* Fast and dirty way of reading dram memory
 204  * Assume properly aligned access
 205  */
 206 static uint8_t *dram_u8(uint32_t address)
 207 {
 208     return (uint8_t *)&rsp.RDRAM[(address & 0xffffff) ^ S8];
 209 }
 210
 211 static uint16_t *dram_u16(uint32_t address)
 212 {
 213     return (uint16_t *)&rsp.RDRAM[(address & 0xffffff) ^ S16];
 214 }
 215
 216 static uint32_t *dram_u32(uint32_t address)
 217 {
 218     return (uint32_t *)&rsp.RDRAM[address & 0xffffff];
 219 }
 220
 221 static void load_u8(uint8_t *dst, uint32_t address, size_t count)
 222 {
 223     while (count != 0) {
 224         *(dst++) = *dram_u8(address);
 225         address += 1;
 226         --count;
 227     }
 228 }
 229
 230 static void load_u16(uint16_t *dst, uint32_t address, size_t count)
 231 {
 232     while (count != 0) {
 233         *(dst++) = *dram_u16(address);
 234         address += 2;
 235         --count;
 236     }
 237 }
 238
 239 static void load_u32(uint32_t *dst, uint32_t address, size_t count)
 240 {
 241     /* Optimization for uint32_t */
 242     const uint32_t *src = dram_u32(address);
 243
 244     memcpy(dst, src, count * sizeof(uint32_t));
 245 }
 246
 247 static void store_u16(const uint16_t *src, uint32_t address, size_t count)
 248 {
 249     while (count != 0) {
 250         *dram_u16(address) = *(src++);
 251         address += 2;
 252         --count;
 253     }
 254 }
 255
 256 /**************************************************************************
 257  * MusyX audio ucode
 258  **************************************************************************/
 259 void musyx_task(void)
 260 {
 261     const OSTask_t *const task = get_task();
 262
 263     uint32_t sfd_ptr   = task->data_ptr;
 264     uint32_t sfd_count = task->data_size;
 265     uint32_t state_ptr;
 266     musyx_t musyx;
 267
 268     DebugMessage(M64MSG_VERBOSE, "musyx_task: *data=%x, #SF=%d",
 269                  sfd_ptr,
 270                  sfd_count);
 271
 272     state_ptr = *dram_u32(sfd_ptr + SFD_STATE_PTR);
 273
 274     /* load initial state */
 275     load_base_vol(musyx.base_vol, state_ptr + STATE_BASE_VOL);
 276     load_u16((uint16_t *)musyx.cc0, state_ptr + STATE_CC0, SUBFRAME_SIZE);
 277     load_u16((uint16_t *)musyx.subframe_740_last4, state_ptr + STATE_740_LAST4,
 278              4);
 279
 280     for (;;) {
 281         /* parse SFD structure */
 282         uint16_t sfx_index   = *dram_u16(sfd_ptr + SFD_SFX_INDEX);
 283         uint32_t voice_mask  = *dram_u32(sfd_ptr + SFD_VOICE_BITMASK);
 284         uint32_t sfx_ptr     = *dram_u32(sfd_ptr + SFD_SFX_PTR);
 285         uint32_t voice_ptr       = sfd_ptr + SFD_VOICES;
 286         uint32_t last_sample_ptr = state_ptr + STATE_LAST_SAMPLE;
 287         uint32_t output_ptr;
 288
 289         /* initialize internal subframes using updated base volumes */
 290         update_base_vol(musyx.base_vol, voice_mask, last_sample_ptr);
 291         init_subframes(&musyx);
 292
 293         /* active voices get mixed into L,R,cc0,e50 subframes (optional) */
 294         output_ptr = voice_stage(&musyx, voice_ptr, last_sample_ptr);
 295
 296         /* apply delay-based effects (optional) */
 297         sfx_stage(&musyx, sfx_ptr, sfx_index);
 298
 299         /* emit interleaved L,R subframes */
 300         interleave_stage(&musyx, output_ptr);
 301
 302         --sfd_count;
 303         if (sfd_count == 0)
 304             break;
 305
 306         sfd_ptr += SFD_VOICES + MAX_VOICES * VOICE_SIZE;
 307         state_ptr = *dram_u32(sfd_ptr + SFD_STATE_PTR);
 308     }
 309
 310     /* writeback updated state */
 311     save_base_vol(musyx.base_vol, state_ptr + STATE_BASE_VOL);
 312     store_u16((uint16_t *)musyx.cc0, state_ptr + STATE_CC0, SUBFRAME_SIZE);
 313     store_u16((uint16_t *)musyx.subframe_740_last4, state_ptr + STATE_740_LAST4,
 314               4);
 315 }
 316
 317 static void load_base_vol(int32_t *base_vol, uint32_t address)
 318 {
 319     base_vol[0] = ((uint32_t)(*dram_u16(address))     << 16) | (*dram_u16(address +  8));
 320     base_vol[1] = ((uint32_t)(*dram_u16(address + 2)) << 16) | (*dram_u16(address + 10));
 321     base_vol[2] = ((uint32_t)(*dram_u16(address + 4)) << 16) | (*dram_u16(address + 12));
 322     base_vol[3] = ((uint32_t)(*dram_u16(address + 6)) << 16) | (*dram_u16(address + 14));
 323 }
 324
 325 static void save_base_vol(const int32_t *base_vol, uint32_t address)
 326 {
 327     unsigned k;
 328
 329     for (k = 0; k < 4; ++k) {
 330         *dram_u16(address) = (uint16_t)(base_vol[k] >> 16);
 331         address += 2;
 332     }
 333
 334     for (k = 0; k < 4; ++k) {
 335         *dram_u16(address) = (uint16_t)(base_vol[k]);
 336         address += 2;
 337     }
 338 }
 339
 340 static void update_base_vol(int32_t *base_vol, uint32_t voice_mask,
 341                             uint32_t last_sample_ptr)
 342 {
 343     unsigned i, k;
 344     uint32_t mask;
 345
 346     DebugMessage(M64MSG_VERBOSE, "base_vol voice_mask = %08x", voice_mask);
 347     DebugMessage(M64MSG_VERBOSE, "BEFORE: base_vol = %08x %08x %08x %08x",
 348                  base_vol[0], base_vol[1], base_vol[2], base_vol[3]);
 349
 350     /* optim: skip voices contributions entirely if voice_mask is empty */
 351     if (voice_mask != 0) {
 352         for (i = 0, mask = 1; i < MAX_VOICES;
 353              ++i, mask <<= 1, last_sample_ptr += 8) {
 354             if ((voice_mask & mask) == 0)
 355                 continue;
 356
 357             for (k = 0; k < 4; ++k)
 358                 base_vol[k] += (int16_t)*dram_u16(last_sample_ptr + k * 2);
 359         }
 360     }
 361
 362     /* apply 3% decay */
 363     for (k = 0; k < 4; ++k)
 364         base_vol[k] = (base_vol[k] * 0x0000f850) >> 16;
 365
 366     DebugMessage(M64MSG_VERBOSE, "AFTER: base_vol = %08x %08x %08x %08x",
 367                  base_vol[0], base_vol[1], base_vol[2], base_vol[3]);
 368 }
 369
 370 static void init_subframes(musyx_t *musyx)
 371 {
 372     unsigned i;
 373
 374     int16_t base_cc0 = clamp_s16(musyx->base_vol[2]);
 375     int16_t base_e50 = clamp_s16(musyx->base_vol[3]);
 376
 377     int16_t *left  = musyx->left;
 378     int16_t *right = musyx->right;
 379     int16_t *cc0   = musyx->cc0;
 380     int16_t *e50   = musyx->e50;
 381
 382     for (i = 0; i < SUBFRAME_SIZE; ++i) {
 383         *(e50++)    = base_e50;
 384         *(left++)   = clamp_s16(*cc0 + base_cc0);
 385         *(right++)  = clamp_s16(-*cc0 - base_cc0);
 386         *(cc0++)    = 0;
 387     }
 388 }
 389
 390 /* Process voices, and returns interleaved subframe destination address */
 391 static uint32_t voice_stage(musyx_t *musyx, uint32_t voice_ptr,
 392                             uint32_t last_sample_ptr)
 393 {
 394     uint32_t output_ptr;
 395     int i = 0;
 396
 397     /* voice stage can be skipped if first voice has no samples */
 398     if (*dram_u16(voice_ptr + VOICE_CATSRC_0 + CATSRC_SIZE1) == 0) {
 399         DebugMessage(M64MSG_VERBOSE, "Skipping Voice stage");
 400         output_ptr = *dram_u32(voice_ptr + VOICE_INTERLEAVED_PTR);
 401     } else {
 402         /* otherwise process voices until a non null output_ptr is encountered */
 403         for (;;) {
 404             /* load voice samples (PCM16 or APDCM) */
 405             int16_t samples[SAMPLE_BUFFER_SIZE];
 406             unsigned segbase;
 407             unsigned offset;
 408
 409             DebugMessage(M64MSG_VERBOSE, "Processing Voice #%d", i);
 410
 411             if (*dram_u8(voice_ptr + VOICE_ADPCM_FRAMES) == 0)
 412                 load_samples_PCM16(voice_ptr, samples, &segbase, &offset);
 413             else
 414                 load_samples_ADPCM(voice_ptr, samples, &segbase, &offset);
 415
 416             /* mix them with each internal subframes */
 417             mix_voice_samples(musyx, voice_ptr, samples, segbase, offset,
 418                               last_sample_ptr + i * 8);
 419
 420             /* check break condition */
 421             output_ptr = *dram_u32(voice_ptr + VOICE_INTERLEAVED_PTR);
 422             if (output_ptr != 0)
 423                 break;
 424
 425             /* next voice */
 426             ++i;
 427             voice_ptr += VOICE_SIZE;
 428         }
 429     }
 430
 431     return output_ptr;
 432 }
 433
 434 static void dma_cat8(uint8_t *dst, uint32_t catsrc_ptr)
 435 {
 436     uint32_t ptr1  = *dram_u32(catsrc_ptr + CATSRC_PTR1);
 437     uint32_t ptr2  = *dram_u32(catsrc_ptr + CATSRC_PTR2);
 438     uint16_t size1 = *dram_u16(catsrc_ptr + CATSRC_SIZE1);
 439     uint16_t size2 = *dram_u16(catsrc_ptr + CATSRC_SIZE2);
 440
 441     size_t count1 = size1;
 442     size_t count2 = size2;
 443
 444     DebugMessage(M64MSG_VERBOSE, "dma_cat: %08x %08x %04x %04x",
 445                  ptr1,
 446                  ptr2,
 447                  size1,
 448                  size2);
 449
 450     load_u8(dst, ptr1, count1);
 451
 452     if (size2 == 0)
 453         return;
 454
 455     load_u8(dst + count1, ptr2, count2);
 456 }
 457
 458 static void dma_cat16(uint16_t *dst, uint32_t catsrc_ptr)
 459 {
 460     uint32_t ptr1  = *dram_u32(catsrc_ptr + CATSRC_PTR1);
 461     uint32_t ptr2  = *dram_u32(catsrc_ptr + CATSRC_PTR2);
 462     uint16_t size1 = *dram_u16(catsrc_ptr + CATSRC_SIZE1);
 463     uint16_t size2 = *dram_u16(catsrc_ptr + CATSRC_SIZE2);
 464
 465     size_t count1 = size1 >> 1;
 466     size_t count2 = size2 >> 1;
 467
 468     DebugMessage(M64MSG_VERBOSE, "dma_cat: %08x %08x %04x %04x",
 469                  ptr1,
 470                  ptr2,
 471                  size1,
 472                  size2);
 473
 474     load_u16(dst, ptr1, count1);
 475
 476     if (size2 == 0)
 477         return;
 478
 479     load_u16(dst + count1, ptr2, count2);
 480 }
 481
 482 static void load_samples_PCM16(uint32_t voice_ptr, int16_t *samples,
 483                                unsigned *segbase, unsigned *offset)
 484 {
 485
 486     uint8_t  u8_3e  = *dram_u8(voice_ptr + VOICE_SKIP_SAMPLES);
 487     uint16_t u16_40 = *dram_u16(voice_ptr + VOICE_U16_40);
 488     uint16_t u16_42 = *dram_u16(voice_ptr + VOICE_U16_42);
 489
 490     unsigned count = align(u16_40 + u8_3e, 4);
 491
 492     DebugMessage(M64MSG_VERBOSE, "Format: PCM16");
 493
 494     *segbase = SAMPLE_BUFFER_SIZE - count;
 495     *offset  = u8_3e;
 496
 497     dma_cat16((uint16_t *)samples + *segbase, voice_ptr + VOICE_CATSRC_0);
 498
 499     if (u16_42 != 0)
 500         dma_cat16((uint16_t *)samples, voice_ptr + VOICE_CATSRC_1);
 501 }
 502
 503 static void load_samples_ADPCM(uint32_t voice_ptr, int16_t *samples,
 504                                unsigned *segbase, unsigned *offset)
 505 {
 506     /* decompressed samples cannot exceed 0x400 bytes;
 507      * ADPCM has a compression ratio of 5/16 */
 508     uint8_t buffer[SAMPLE_BUFFER_SIZE * 2 * 5 / 16];
 509     int16_t adpcm_table[128];
 510
 511     uint8_t u8_3c = *dram_u8(voice_ptr + VOICE_ADPCM_FRAMES    );
 512     uint8_t u8_3d = *dram_u8(voice_ptr + VOICE_ADPCM_FRAMES + 1);
 513     uint8_t u8_3e = *dram_u8(voice_ptr + VOICE_SKIP_SAMPLES    );
 514     uint8_t u8_3f = *dram_u8(voice_ptr + VOICE_SKIP_SAMPLES + 1);
 515     uint32_t adpcm_table_ptr = *dram_u32(voice_ptr + VOICE_ADPCM_TABLE_PTR);
 516     unsigned count;
 517
 518     DebugMessage(M64MSG_VERBOSE, "Format: ADPCM");
 519
 520     DebugMessage(M64MSG_VERBOSE, "Loading ADPCM table: %08x", adpcm_table_ptr);
 521     load_u16((uint16_t *)adpcm_table, adpcm_table_ptr, 128);
 522
 523     count = u8_3c << 5;
 524
 525     *segbase = SAMPLE_BUFFER_SIZE - count;
 526     *offset  = u8_3e & 0x1f;
 527
 528     dma_cat8(buffer, voice_ptr + VOICE_CATSRC_0);
 529     adpcm_decode_frames(samples + *segbase, buffer, adpcm_table, u8_3c, u8_3e);
 530
 531     if (u8_3d != 0) {
 532         dma_cat8(buffer, voice_ptr + VOICE_CATSRC_1);
 533         adpcm_decode_frames(samples, buffer, adpcm_table, u8_3d, u8_3f);
 534     }
 535 }
 536
 537 static void adpcm_decode_frames(int16_t *dst, const uint8_t *src,
 538                                 const int16_t *table, uint8_t count,
 539                                 uint8_t skip_samples)
 540 {
 541     int16_t frame[32];
 542     const uint8_t *nibbles = src + 8;
 543     unsigned i;
 544     bool jump_gap = false;
 545
 546     DebugMessage(M64MSG_VERBOSE, "ADPCM decode: count=%d, skip=%d", count,
 547                  skip_samples);
 548
 549     if (skip_samples >= 32) {
 550         jump_gap = true;
 551         nibbles += 16;
 552         src += 4;
 553     }
 554
 555     for (i = 0; i < count; ++i) {
 556         uint8_t c2 = nibbles[0];
 557
 558         const int16_t *book = (c2 & 0xf0) + table;
 559         unsigned int rshift = (c2 & 0x0f);
 560
 561         adpcm_get_predicted_frame(frame, src, nibbles, rshift);
 562
 563         memcpy(dst, frame, 2 * sizeof(frame[0]));
 564         adpcm_decode_upto_8_samples(dst +  2, frame +  2, book, dst     , 6);
 565         adpcm_decode_upto_8_samples(dst +  8, frame +  8, book, dst +  6, 8);
 566         adpcm_decode_upto_8_samples(dst + 16, frame + 16, book, dst + 14, 8);
 567         adpcm_decode_upto_8_samples(dst + 24, frame + 24, book, dst + 22, 8);
 568
 569         if (jump_gap) {
 570             nibbles += 8;
 571             src += 32;
 572         }
 573
 574         jump_gap = !jump_gap;
 575         nibbles += 16;
 576         src += 4;
 577         dst += 32;
 578     }
 579 }
 580
 581 static int16_t adpcm_get_predicted_sample(uint8_t byte, uint8_t mask,
 582                                           unsigned lshift, unsigned rshift)
 583 {
 584     int16_t sample = ((uint16_t)byte & (uint16_t)mask) << lshift;
 585     sample >>= rshift; /* signed */
 586     return sample;
 587 }
 588
 589 static void adpcm_get_predicted_frame(int16_t *dst, const uint8_t *src,
 590                                       const uint8_t *nibbles,
 591                                       unsigned int rshift)
 592 {
 593     unsigned int i;
 594
 595     *(dst++) = (src[0] << 8) | src[1];
 596     *(dst++) = (src[2] << 8) | src[3];
 597
 598     for (i = 1; i < 16; ++i) {
 599         uint8_t byte = nibbles[i];
 600
 601         *(dst++) = adpcm_get_predicted_sample(byte, 0xf0,  8, rshift);
 602         *(dst++) = adpcm_get_predicted_sample(byte, 0x0f, 12, rshift);
 603     }
 604 }
 605
 606 static void adpcm_decode_upto_8_samples(int16_t *dst, const int16_t *src,
 607                                         const int16_t *cb_entry,
 608                                         const int16_t *last_samples,
 609                                         size_t size)
 610 {
 611     const int16_t *const book1 = cb_entry;
 612     const int16_t *const book2 = cb_entry + 8;
 613
 614     const int16_t l1 = last_samples[0];
 615     const int16_t l2 = last_samples[1];
 616
 617     size_t i;
 618     int32_t accu;
 619
 620     for (i = 0; i < size; ++i) {
 621         accu = (int32_t)src[i] << 11;
 622         accu += book1[i] * l1 + book2[i] * l2 + rdot(i, book2, src);
 623         dst[i] = clamp_s16(accu >> 11);
 624     }
 625 }
 626
 627 static void mix_voice_samples(musyx_t *musyx, uint32_t voice_ptr,
 628                               const int16_t *samples, unsigned segbase,
 629                               unsigned offset, uint32_t last_sample_ptr)
 630 {
 631     int i, k;
 632
 633     /* parse VOICE structure */
 634     const uint16_t pitch_q16   = *dram_u16(voice_ptr + VOICE_PITCH_Q16);
 635     const uint16_t pitch_shift = *dram_u16(voice_ptr + VOICE_PITCH_SHIFT); /* Q4.12 */
 636
 637     const uint16_t end_point     = *dram_u16(voice_ptr + VOICE_END_POINT);
 638     const uint16_t restart_point = *dram_u16(voice_ptr + VOICE_RESTART_POINT);
 639
 640     const uint16_t u16_4e = *dram_u16(voice_ptr + VOICE_U16_4E);
 641
 642     /* init values and pointers */
 643     const int16_t       *sample         = samples + segbase + offset + u16_4e;
 644     const int16_t *const sample_end     = samples + segbase + end_point;
 645     const int16_t *const sample_restart = samples + (restart_point & 0x7fff) +
 646                                           (((restart_point & 0x8000) != 0) ? 0x000 : segbase);
 647
 648
 649     uint32_t pitch_accu = pitch_q16;
 650     uint32_t pitch_step = pitch_shift << 4;
 651
 652     int32_t  v4_env[4];
 653     int32_t  v4_env_step[4];
 654     int16_t *v4_dst[4];
 655     int16_t  v4[4];
 656
 657     load_u32((uint32_t *)v4_env,      voice_ptr + VOICE_ENV_BEGIN, 4);
 658     load_u32((uint32_t *)v4_env_step, voice_ptr + VOICE_ENV_STEP,  4);
 659
 660     v4_dst[0] = musyx->left;
 661     v4_dst[1] = musyx->right;
 662     v4_dst[2] = musyx->cc0;
 663     v4_dst[3] = musyx->e50;
 664
 665     DebugMessage(M64MSG_VERBOSE,
 666                  "Voice debug: segbase=%d"
 667                  "\tu16_4e=%04x\n"
 668                  "\tpitch: frac0=%04x shift=%04x\n"
 669                  "\tend_point=%04x restart_point=%04x\n"
 670                  "\tenv      = %08x %08x %08x %08x\n"
 671                  "\tenv_step = %08x %08x %08x %08x\n",
 672                  segbase,
 673                  u16_4e,
 674                  pitch_q16, pitch_shift,
 675                  end_point, restart_point,
 676                  v4_env[0],      v4_env[1],      v4_env[2],      v4_env[3],
 677                  v4_env_step[0], v4_env_step[1], v4_env_step[2], v4_env_step[3]);
 678
 679     for (i = 0; i < SUBFRAME_SIZE; ++i) {
 680         /* update sample and resample_lut pointers and then pitch_accu */
 681         const int16_t *lut = (int16_t *)(ResampleLUT + ((pitch_accu & 0xfc00) >> 8));
 682         int dist;
 683         int16_t v;
 684
 685         sample += (pitch_accu >> 16);
 686         pitch_accu &= 0xffff;
 687         pitch_accu += pitch_step;
 688
 689         /* handle end/restart points */
 690         dist = sample - sample_end;
 691         if (dist >= 0)
 692             sample = sample_restart + dist;
 693
 694         /* apply resample filter */
 695         v = clamp_s16(dot4(sample, lut));
 696
 697         for (k = 0; k < 4; ++k) {
 698             /* envmix */
 699             int32_t accu = (v * (v4_env[k] >> 16)) >> 15;
 700             v4[k] = clamp_s16(accu);
 701             *(v4_dst[k]) = clamp_s16(accu + *(v4_dst[k]));
 702
 703             /* update envelopes and dst pointers */
 704             ++(v4_dst[k]);
 705             v4_env[k] += v4_env_step[k];
 706         }
 707     }
 708
 709     /* save last resampled sample */
 710     store_u16((uint16_t *)v4, last_sample_ptr, 4);
 711
 712     DebugMessage(M64MSG_VERBOSE, "last_sample = %04x %04x %04x %04x",
 713                  v4[0], v4[1], v4[2], v4[3]);
 714 }
 715
 716
 717 static void sfx_stage(musyx_t *musyx, uint32_t sfx_ptr, uint16_t idx)
 718 {
 719     unsigned int i;
 720
 721     int16_t buffer[SUBFRAME_SIZE + 4];
 722     int16_t *subframe = buffer + 4;
 723
 724     uint32_t tap_delays[8];
 725     int16_t tap_gains[8];
 726     int16_t fir4_hcoeffs[4];
 727
 728     int16_t delayed[SUBFRAME_SIZE];
 729     int dpos, dlength;
 730
 731     const uint32_t pos = idx * SUBFRAME_SIZE;
 732
 733     uint32_t cbuffer_ptr;
 734     uint32_t cbuffer_length;
 735     uint16_t tap_count;
 736     int16_t fir4_hgain;
 737
 738     DebugMessage(M64MSG_VERBOSE, "SFX: %08x, idx=%d", sfx_ptr, idx);
 739
 740     if (sfx_ptr == 0)
 741         return;
 742
 743     /* load sfx  parameters */
 744     cbuffer_ptr    = *dram_u32(sfx_ptr + SFX_CBUFFER_PTR);
 745     cbuffer_length = *dram_u32(sfx_ptr + SFX_CBUFFER_LENGTH);
 746
 747     tap_count      = *dram_u16(sfx_ptr + SFX_TAP_COUNT);
 748
 749     load_u32(tap_delays, sfx_ptr + SFX_TAP_DELAYS, 8);
 750     load_u16((uint16_t *)tap_gains,  sfx_ptr + SFX_TAP_GAINS,  8);
 751
 752     fir4_hgain     = *dram_u16(sfx_ptr + SFX_FIR4_HGAIN);
 753     load_u16((uint16_t *)fir4_hcoeffs, sfx_ptr + SFX_FIR4_HCOEFFS, 4);
 754
 755     DebugMessage(M64MSG_VERBOSE, "cbuffer: ptr=%08x length=%x", cbuffer_ptr,
 756                  cbuffer_length);
 757
 758     DebugMessage(M64MSG_VERBOSE, "fir4: hgain=%04x hcoeff=%04x %04x %04x %04x",
 759                  fir4_hgain, fir4_hcoeffs[0], fir4_hcoeffs[1], fir4_hcoeffs[2],
 760                  fir4_hcoeffs[3]);
 761
 762     DebugMessage(M64MSG_VERBOSE,
 763                  "tap count=%d\n"
 764                  "delays: %08x %08x %08x %08x %08x %08x %08x %08x\n"
 765                  "gains:  %04x %04x %04x %04x %04x %04x %04x %04x",
 766                  tap_count,
 767                  tap_delays[0], tap_delays[1], tap_delays[2], tap_delays[3],
 768                  tap_delays[4], tap_delays[5], tap_delays[6], tap_delays[7],
 769                  tap_gains[0], tap_gains[1], tap_gains[2], tap_gains[3],
 770                  tap_gains[4], tap_gains[5], tap_gains[6], tap_gains[7]);
 771
 772     /* mix up to 8 delayed subframes */
 773     memset(subframe, 0, SUBFRAME_SIZE * sizeof(subframe[0]));
 774     for (i = 0; i < tap_count; ++i) {
 775
 776         dpos = pos - tap_delays[i];
 777         if (dpos <= 0)
 778             dpos += cbuffer_length;
 779         dlength = SUBFRAME_SIZE;
 780
 781         if (dpos + SUBFRAME_SIZE > cbuffer_length) {
 782             dlength = cbuffer_length - dpos;
 783             load_u16((uint16_t *)delayed + dlength, cbuffer_ptr, SUBFRAME_SIZE - dlength);
 784         }
 785
 786         load_u16((uint16_t *)delayed, cbuffer_ptr + dpos * 2, dlength);
 787
 788         mix_subframes(subframe, delayed, tap_gains[i]);
 789     }
 790
 791     /* add resulting subframe to L/R subframes */
 792     for (i = 0; i < SUBFRAME_SIZE; ++i) {
 793         int16_t v = subframe[i];
 794         musyx->left[i]  = clamp_s16(musyx->left[i]  + v);
 795         musyx->right[i] = clamp_s16(musyx->right[i] + v);
 796     }
 797
 798     /* apply FIR4 filter and writeback filtered result */
 799     memcpy(buffer, musyx->subframe_740_last4, 4 * sizeof(int16_t));
 800     memcpy(musyx->subframe_740_last4, subframe + SUBFRAME_SIZE - 4, 4 * sizeof(int16_t));
 801     mix_fir4(musyx->e50, buffer + 1, fir4_hgain, fir4_hcoeffs);
 802     store_u16((uint16_t *)musyx->e50, cbuffer_ptr + pos * 2, SUBFRAME_SIZE);
 803 }
 804
 805 static void mix_subframes(int16_t *y, const int16_t *x, int16_t hgain)
 806 {
 807     unsigned int i;
 808
 809     for (i = 0; i < SUBFRAME_SIZE; ++i) {
 810         int32_t v = (hgain * x[i]) >> 15;
 811         y[i] = clamp_s16(y[i] + v);
 812     }
 813 }
 814
 815 static void mix_fir4(int16_t *y, const int16_t *x, int16_t hgain, const int16_t *hcoeffs)
 816 {
 817     unsigned int i;
 818     int32_t h[4];
 819
 820     h[0] = (hgain * hcoeffs[0]) >> 15;
 821     h[1] = (hgain * hcoeffs[1]) >> 15;
 822     h[2] = (hgain * hcoeffs[2]) >> 15;
 823     h[3] = (hgain * hcoeffs[3]) >> 15;
 824
 825     for (i = 0; i < SUBFRAME_SIZE; ++i) {
 826         int32_t v = (h[0] * x[i] + h[1] * x[i + 1] + h[2] * x[i + 2] + h[3] * x[i + 3]) >> 15;
 827         y[i] = clamp_s16(y[i] + v);
 828     }
 829 }
 830
 831
 832 static void interleave_stage(musyx_t *musyx, uint32_t output_ptr)
 833 {
 834     size_t i;
 835
 836     int16_t base_left;
 837     int16_t base_right;
 838
 839     int16_t *left;
 840     int16_t *right;
 841     uint32_t *dst;
 842
 843     DebugMessage(M64MSG_VERBOSE, "interleave: %08x", output_ptr);
 844
 845     base_left  = clamp_s16(musyx->base_vol[0]);
 846     base_right = clamp_s16(musyx->base_vol[1]);
 847
 848     left  = musyx->left;
 849     right = musyx->right;
 850     dst  = dram_u32(output_ptr);
 851
 852     for (i = 0; i < SUBFRAME_SIZE; ++i) {
 853         uint16_t l = clamp_s16(*(left++)  + base_left);
 854         uint16_t r = clamp_s16(*(right++) + base_right);
 855
 856         *(dst++) = (l << 16) | r;
 857     }
 858 }