deps/libchdr/include/dr_libs/dr_flac.h

   1 /*
   2 FLAC audio decoder. Choice of public domain or MIT-0. See license statements at the end of this file.
   3 dr_flac - v0.12.28 - 2021-02-21
   4
   5 David Reid - mackron@gmail.com
   6
   7 GitHub: https://github.com/mackron/dr_libs
   8 */
   9
  10 /*
  11 RELEASE NOTES - v0.12.0
  12 =======================
  13 Version 0.12.0 has breaking API changes including changes to the existing API and the removal of deprecated APIs.
  14
  15
  16 Improved Client-Defined Memory Allocation
  17 -----------------------------------------
  18 The main change with this release is the addition of a more flexible way of implementing custom memory allocation routines. The
  19 existing system of DRFLAC_MALLOC, DRFLAC_REALLOC and DRFLAC_FREE is still in place and will be used by default when no custom
  20 allocation callbacks are specified.
  21
  22 To use the new system, you pass in a pointer to a drflac_allocation_callbacks object to drflac_open() and family, like this:
  23
  24     void* my_malloc(size_t sz, void* pUserData)
  25     {
  26         return malloc(sz);
  27     }
  28     void* my_realloc(void* p, size_t sz, void* pUserData)
  29     {
  30         return realloc(p, sz);
  31     }
  32     void my_free(void* p, void* pUserData)
  33     {
  34         free(p);
  35     }
  36
  37     ...
  38
  39     drflac_allocation_callbacks allocationCallbacks;
  40     allocationCallbacks.pUserData = &myData;
  41     allocationCallbacks.onMalloc  = my_malloc;
  42     allocationCallbacks.onRealloc = my_realloc;
  43     allocationCallbacks.onFree    = my_free;
  44     drflac* pFlac = drflac_open_file("my_file.flac", &allocationCallbacks);
  45
  46 The advantage of this new system is that it allows you to specify user data which will be passed in to the allocation routines.
  47
  48 Passing in null for the allocation callbacks object will cause dr_flac to use defaults which is the same as DRFLAC_MALLOC,
  49 DRFLAC_REALLOC and DRFLAC_FREE and the equivalent of how it worked in previous versions.
  50
  51 Every API that opens a drflac object now takes this extra parameter. These include the following:
  52
  53     drflac_open()
  54     drflac_open_relaxed()
  55     drflac_open_with_metadata()
  56     drflac_open_with_metadata_relaxed()
  57     drflac_open_file()
  58     drflac_open_file_with_metadata()
  59     drflac_open_memory()
  60     drflac_open_memory_with_metadata()
  61     drflac_open_and_read_pcm_frames_s32()
  62     drflac_open_and_read_pcm_frames_s16()
  63     drflac_open_and_read_pcm_frames_f32()
  64     drflac_open_file_and_read_pcm_frames_s32()
  65     drflac_open_file_and_read_pcm_frames_s16()
  66     drflac_open_file_and_read_pcm_frames_f32()
  67     drflac_open_memory_and_read_pcm_frames_s32()
  68     drflac_open_memory_and_read_pcm_frames_s16()
  69     drflac_open_memory_and_read_pcm_frames_f32()
  70
  71
  72
  73 Optimizations
  74 -------------
  75 Seeking performance has been greatly improved. A new binary search based seeking algorithm has been introduced which significantly
  76 improves performance over the brute force method which was used when no seek table was present. Seek table based seeking also takes
  77 advantage of the new binary search seeking system to further improve performance there as well. Note that this depends on CRC which
  78 means it will be disabled when DR_FLAC_NO_CRC is used.
  79
  80 The SSE4.1 pipeline has been cleaned up and optimized. You should see some improvements with decoding speed of 24-bit files in
  81 particular. 16-bit streams should also see some improvement.
  82
  83 drflac_read_pcm_frames_s16() has been optimized. Previously this sat on top of drflac_read_pcm_frames_s32() and performed its s32
  84 to s16 conversion in a second pass. This is now all done in a single pass. This includes SSE2 and ARM NEON optimized paths.
  85
  86 A minor optimization has been implemented for drflac_read_pcm_frames_s32(). This will now use an SSE2 optimized pipeline for stereo
  87 channel reconstruction which is the last part of the decoding process.
  88
  89 The ARM build has seen a few improvements. The CLZ (count leading zeroes) and REV (byte swap) instructions are now used when
  90 compiling with GCC and Clang which is achieved using inline assembly. The CLZ instruction requires ARM architecture version 5 at
  91 compile time and the REV instruction requires ARM architecture version 6.
  92
  93 An ARM NEON optimized pipeline has been implemented. To enable this you'll need to add -mfpu=neon to the command line when compiling.
  94
  95
  96 Removed APIs
  97 ------------
  98 The following APIs were deprecated in version 0.11.0 and have been completely removed in version 0.12.0:
  99
 100     drflac_read_s32()                   -> drflac_read_pcm_frames_s32()
 101     drflac_read_s16()                   -> drflac_read_pcm_frames_s16()
 102     drflac_read_f32()                   -> drflac_read_pcm_frames_f32()
 103     drflac_seek_to_sample()             -> drflac_seek_to_pcm_frame()
 104     drflac_open_and_decode_s32()        -> drflac_open_and_read_pcm_frames_s32()
 105     drflac_open_and_decode_s16()        -> drflac_open_and_read_pcm_frames_s16()
 106     drflac_open_and_decode_f32()        -> drflac_open_and_read_pcm_frames_f32()
 107     drflac_open_and_decode_file_s32()   -> drflac_open_file_and_read_pcm_frames_s32()
 108     drflac_open_and_decode_file_s16()   -> drflac_open_file_and_read_pcm_frames_s16()
 109     drflac_open_and_decode_file_f32()   -> drflac_open_file_and_read_pcm_frames_f32()
 110     drflac_open_and_decode_memory_s32() -> drflac_open_memory_and_read_pcm_frames_s32()
 111     drflac_open_and_decode_memory_s16() -> drflac_open_memory_and_read_pcm_frames_s16()
 112     drflac_open_and_decode_memory_f32() -> drflac_open_memory_and_read_pcm_frames_f32()
 113
 114 Prior versions of dr_flac operated on a per-sample basis whereas now it operates on PCM frames. The removed APIs all relate
 115 to the old per-sample APIs. You now need to use the "pcm_frame" versions.
 116 */
 117
 118
 119 /*
 120 Introduction
 121 ============
 122 dr_flac is a single file library. To use it, do something like the following in one .c file.
 123
 124     ```c
 125     #define DR_FLAC_IMPLEMENTATION
 126     #include "dr_flac.h"
 127     ```
 128
 129 You can then #include this file in other parts of the program as you would with any other header file. To decode audio data, do something like the following:
 130
 131     ```c
 132     drflac* pFlac = drflac_open_file("MySong.flac", NULL);
 133     if (pFlac == NULL) {
 134         // Failed to open FLAC file
 135     }
 136
 137     drflac_int32* pSamples = malloc(pFlac->totalPCMFrameCount * pFlac->channels * sizeof(drflac_int32));
 138     drflac_uint64 numberOfPCMFramesActuallyRead = drflac_read_pcm_frames_s32(pFlac, pFlac->totalPCMFrameCount, pSamples);
 139     ```
 140
 141 The drflac object represents the decoder. It is a transparent type so all the information you need, such as the number of channels and the bits per sample,
 142 should be directly accessible - just make sure you don't change their values. Samples are always output as interleaved signed 32-bit PCM. In the example above
 143 a native FLAC stream was opened, however dr_flac has seamless support for Ogg encapsulated FLAC streams as well.
 144
 145 You do not need to decode the entire stream in one go - you just specify how many samples you'd like at any given time and the decoder will give you as many
 146 samples as it can, up to the amount requested. Later on when you need the next batch of samples, just call it again. Example:
 147
 148     ```c
 149     while (drflac_read_pcm_frames_s32(pFlac, chunkSizeInPCMFrames, pChunkSamples) > 0) {
 150         do_something();
 151     }
 152     ```
 153
 154 You can seek to a specific PCM frame with `drflac_seek_to_pcm_frame()`.
 155
 156 If you just want to quickly decode an entire FLAC file in one go you can do something like this:
 157
 158     ```c
 159     unsigned int channels;
 160     unsigned int sampleRate;
 161     drflac_uint64 totalPCMFrameCount;
 162     drflac_int32* pSampleData = drflac_open_file_and_read_pcm_frames_s32("MySong.flac", &channels, &sampleRate, &totalPCMFrameCount, NULL);
 163     if (pSampleData == NULL) {
 164         // Failed to open and decode FLAC file.
 165     }
 166
 167     ...
 168
 169     drflac_free(pSampleData, NULL);
 170     ```
 171
 172 You can read samples as signed 16-bit integer and 32-bit floating-point PCM with the *_s16() and *_f32() family of APIs respectively, but note that these
 173 should be considered lossy.
 174
 175
 176 If you need access to metadata (album art, etc.), use `drflac_open_with_metadata()`, `drflac_open_file_with_metadata()` or `drflac_open_memory_with_metadata()`.
 177 The rationale for keeping these APIs separate is that they're slightly slower than the normal versions and also just a little bit harder to use. dr_flac
 178 reports metadata to the application through the use of a callback, and every metadata block is reported before `drflac_open_with_metadata()` returns.
 179
 180 The main opening APIs (`drflac_open()`, etc.) will fail if the header is not present. This presents a problem in certain scenarios such as broadcast style
 181 streams or internet radio where the header may not be present because the user has started playback mid-stream. To handle this, use the relaxed APIs:
 182
 183     `drflac_open_relaxed()`
 184     `drflac_open_with_metadata_relaxed()`
 185
 186 It is not recommended to use these APIs for file based streams because a missing header would usually indicate a corrupt or perverse file. In addition, these
 187 APIs can take a long time to initialize because they may need to spend a lot of time finding the first frame.
 188
 189
 190
 191 Build Options
 192 =============
 193 #define these options before including this file.
 194
 195 #define DR_FLAC_NO_STDIO
 196   Disable `drflac_open_file()` and family.
 197
 198 #define DR_FLAC_NO_OGG
 199   Disables support for Ogg/FLAC streams.
 200
 201 #define DR_FLAC_BUFFER_SIZE <number>
 202   Defines the size of the internal buffer to store data from onRead(). This buffer is used to reduce the number of calls back to the client for more data.
 203   Larger values mean more memory, but better performance. My tests show diminishing returns after about 4KB (which is the default). Consider reducing this if
 204   you have a very efficient implementation of onRead(), or increase it if it's very inefficient. Must be a multiple of 8.
 205
 206 #define DR_FLAC_NO_CRC
 207   Disables CRC checks. This will offer a performance boost when CRC is unnecessary. This will disable binary search seeking. When seeking, the seek table will
 208   be used if available. Otherwise the seek will be performed using brute force.
 209
 210 #define DR_FLAC_NO_SIMD
 211   Disables SIMD optimizations (SSE on x86/x64 architectures, NEON on ARM architectures). Use this if you are having compatibility issues with your compiler.
 212
 213
 214
 215 Notes
 216 =====
 217 - dr_flac does not support changing the sample rate nor channel count mid stream.
 218 - dr_flac is not thread-safe, but its APIs can be called from any thread so long as you do your own synchronization.
 219 - When using Ogg encapsulation, a corrupted metadata block will result in `drflac_open_with_metadata()` and `drflac_open()` returning inconsistent samples due
 220   to differences in corrupted stream recovery logic between the two APIs.
 221 */
 222
 223 #ifndef dr_flac_h
 224 #define dr_flac_h
 225
 226 #ifdef __cplusplus
 227 extern "C" {
 228 #endif
 229
 230 #define DRFLAC_STRINGIFY(x)      #x                      /* Turns x into a string literal exactly as written, without macro-expanding it. */
 231 #define DRFLAC_XSTRINGIFY(x)     DRFLAC_STRINGIFY(x)     /* Extra level of indirection so x is macro-expanded before it is stringified. */
 232
 233 #define DRFLAC_VERSION_MAJOR     0
 234 #define DRFLAC_VERSION_MINOR     12
 235 #define DRFLAC_VERSION_REVISION  28
 236 #define DRFLAC_VERSION_STRING    DRFLAC_XSTRINGIFY(DRFLAC_VERSION_MAJOR) "." DRFLAC_XSTRINGIFY(DRFLAC_VERSION_MINOR) "." DRFLAC_XSTRINGIFY(DRFLAC_VERSION_REVISION)   /* Adjacent string literals concatenate to "0.12.28". */
 237
 238 #include <stddef.h> /* For size_t. */
 239
 240 /* Sized types. The names assume char/short/int/long long (or __int64) are 8/16/32/64 bits wide; nothing in this section verifies those widths. */
 241 typedef   signed char           drflac_int8;
 242 typedef unsigned char           drflac_uint8;
 243 typedef   signed short          drflac_int16;
 244 typedef unsigned short          drflac_uint16;
 245 typedef   signed int            drflac_int32;
 246 typedef unsigned int            drflac_uint32;
 247 #if defined(_MSC_VER)
 248     typedef   signed __int64    drflac_int64;
 249     typedef unsigned __int64    drflac_uint64;
 250 #else
 251     #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))   /* Clang, or GCC 4.6 and newer: silence "long long" warnings around the two typedefs only. */
 252         #pragma GCC diagnostic push
 253         #pragma GCC diagnostic ignored "-Wlong-long"
 254         #if defined(__clang__)
 255             #pragma GCC diagnostic ignored "-Wc++11-long-long"
 256         #endif
 257     #endif
 258     typedef   signed long long  drflac_int64;
 259     typedef unsigned long long  drflac_uint64;
 260     #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))   /* Same condition as the push above, so every push is matched by this pop. */
 261         #pragma GCC diagnostic pop
 262     #endif
 263 #endif
 264 #if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(__powerpc64__)   /* Recognised 64-bit targets get a 64-bit pointer-sized integer; anything else falls back to 32 bits. */
 265     typedef drflac_uint64       drflac_uintptr;
 266 #else
 267     typedef drflac_uint32       drflac_uintptr;
 268 #endif
 269 typedef drflac_uint8            drflac_bool8;    /* Boolean stored in 8 bits; holds DRFLAC_TRUE or DRFLAC_FALSE. */
 270 typedef drflac_uint32           drflac_bool32;   /* Boolean stored in 32 bits; holds DRFLAC_TRUE or DRFLAC_FALSE. */
 271 #define DRFLAC_TRUE             1
 272 #define DRFLAC_FALSE            0
 273
 274 #if !defined(DRFLAC_API)   /* The application may pre-define DRFLAC_API to take over symbol decoration. NOTE(review): DRFLAC_PRIVATE is only defined inside this guard - confirm a fallback exists elsewhere for that case. */
 275     #if defined(DRFLAC_DLL)   /* Shared-library build: public symbols are exported or imported, private ones are hidden. */
 276         #if defined(_WIN32)
 277             #define DRFLAC_DLL_IMPORT  __declspec(dllimport)
 278             #define DRFLAC_DLL_EXPORT  __declspec(dllexport)
 279             #define DRFLAC_DLL_PRIVATE static
 280         #else
 281             #if defined(__GNUC__) && __GNUC__ >= 4   /* GCC 4+ style symbol visibility attributes. */
 282                 #define DRFLAC_DLL_IMPORT  __attribute__((visibility("default")))
 283                 #define DRFLAC_DLL_EXPORT  __attribute__((visibility("default")))
 284                 #define DRFLAC_DLL_PRIVATE __attribute__((visibility("hidden")))
 285             #else   /* Unknown compiler: no decoration available, so private symbols fall back to internal linkage. */
 286                 #define DRFLAC_DLL_IMPORT
 287                 #define DRFLAC_DLL_EXPORT
 288                 #define DRFLAC_DLL_PRIVATE static
 289             #endif
 290         #endif
 291
 292         #if defined(DR_FLAC_IMPLEMENTATION) || defined(DRFLAC_IMPLEMENTATION)   /* The translation unit holding the implementation exports; every other includer imports. */
 293             #define DRFLAC_API  DRFLAC_DLL_EXPORT
 294         #else
 295             #define DRFLAC_API  DRFLAC_DLL_IMPORT
 296         #endif
 297         #define DRFLAC_PRIVATE DRFLAC_DLL_PRIVATE
 298     #else   /* Default (non-DLL) build: public APIs are plain extern, private helpers are static. */
 299         #define DRFLAC_API extern
 300         #define DRFLAC_PRIVATE static
 301     #endif
 302 #endif
 303
 304 #if defined(_MSC_VER) && _MSC_VER >= 1700   /* Visual Studio 2012 */
 305     #define DRFLAC_DEPRECATED       __declspec(deprecated)
 306 #elif (defined(__GNUC__) && __GNUC__ >= 4)  /* GCC 4 and newer, plus any compiler that reports itself as such */
 307     #define DRFLAC_DEPRECATED       __attribute__((deprecated))
 308 #elif defined(__has_feature)                /* Clang-style feature detection, for compilers not matched by the tests above */
 309     #if __has_feature(attribute_deprecated)
 310         #define DRFLAC_DEPRECATED   __attribute__((deprecated))
 311     #else
 312         #define DRFLAC_DEPRECATED
 313     #endif
 314 #else                                       /* Unknown compiler: the marker expands to nothing. */
 315     #define DRFLAC_DEPRECATED
 316 #endif
 317
 318 DRFLAC_API void drflac_version(drflac_uint32* pMajor, drflac_uint32* pMinor, drflac_uint32* pRevision);   /* Presumably writes the DRFLAC_VERSION_MAJOR/MINOR/REVISION values through the three output pointers. NOTE(review): confirm in the implementation whether NULL pointers are accepted. */
 319 DRFLAC_API const char* drflac_version_string(void);   /* Returns the version as a C string; presumably DRFLAC_VERSION_STRING - confirm in the implementation. */
 320
 321 /*
 322 As data is read from the client it is placed into an internal buffer for fast access. This controls the size of that buffer. Larger values mean more speed,
 323 but also more memory. In my testing there are diminishing returns after about 4KB, but you can fiddle with this to suit your own needs. Must be a multiple of 8.
 324 */
 325 #ifndef DR_FLAC_BUFFER_SIZE   /* Only supplies the default; a value defined before including this file is kept as-is. */
 326 #define DR_FLAC_BUFFER_SIZE   4096   /* Size in bytes: drflac_bs::cacheL2 holds DR_FLAC_BUFFER_SIZE/sizeof(drflac_cache_t) entries. */
 327 #endif
 328
 329 /* Check if we can enable 64-bit optimizations. NOTE(review): this test is narrower than the one used to pick drflac_uintptr (no _M_X64, __aarch64__, etc.) - confirm that is intentional. */
 330 #if defined(_WIN64) || defined(_LP64) || defined(__LP64__)
 331 #define DRFLAC_64BIT
 332 #endif
 333
 334 #ifdef DRFLAC_64BIT
 335 typedef drflac_uint64 drflac_cache_t;   /* Cache word used by drflac_bs (cache, cacheL2, unalignedCache, crc16Cache): 64 bits wide when DRFLAC_64BIT is set... */
 336 #else
 337 typedef drflac_uint32 drflac_cache_t;   /* ...and 32 bits wide otherwise. */
 338 #endif
 339
 340 /* The various metadata block types, reported through drflac_metadata::type. Values 0-6 follow the block type codes of the FLAC format specification, which also reserves 127 as invalid. */
 341 #define DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO       0
 342 #define DRFLAC_METADATA_BLOCK_TYPE_PADDING          1
 343 #define DRFLAC_METADATA_BLOCK_TYPE_APPLICATION      2
 344 #define DRFLAC_METADATA_BLOCK_TYPE_SEEKTABLE        3
 345 #define DRFLAC_METADATA_BLOCK_TYPE_VORBIS_COMMENT   4
 346 #define DRFLAC_METADATA_BLOCK_TYPE_CUESHEET         5
 347 #define DRFLAC_METADATA_BLOCK_TYPE_PICTURE          6
 348 #define DRFLAC_METADATA_BLOCK_TYPE_INVALID          127
 349
 350 /* The various picture types specified in the PICTURE block. The FLAC format specification takes this numbering from the ID3v2 APIC frame. */
 351 #define DRFLAC_PICTURE_TYPE_OTHER                   0
 352 #define DRFLAC_PICTURE_TYPE_FILE_ICON               1
 353 #define DRFLAC_PICTURE_TYPE_OTHER_FILE_ICON         2
 354 #define DRFLAC_PICTURE_TYPE_COVER_FRONT             3
 355 #define DRFLAC_PICTURE_TYPE_COVER_BACK              4
 356 #define DRFLAC_PICTURE_TYPE_LEAFLET_PAGE            5
 357 #define DRFLAC_PICTURE_TYPE_MEDIA                   6
 358 #define DRFLAC_PICTURE_TYPE_LEAD_ARTIST             7
 359 #define DRFLAC_PICTURE_TYPE_ARTIST                  8
 360 #define DRFLAC_PICTURE_TYPE_CONDUCTOR               9
 361 #define DRFLAC_PICTURE_TYPE_BAND                    10
 362 #define DRFLAC_PICTURE_TYPE_COMPOSER                11
 363 #define DRFLAC_PICTURE_TYPE_LYRICIST                12
 364 #define DRFLAC_PICTURE_TYPE_RECORDING_LOCATION      13
 365 #define DRFLAC_PICTURE_TYPE_DURING_RECORDING        14
 366 #define DRFLAC_PICTURE_TYPE_DURING_PERFORMANCE      15
 367 #define DRFLAC_PICTURE_TYPE_SCREEN_CAPTURE          16
 368 #define DRFLAC_PICTURE_TYPE_BRIGHT_COLORED_FISH     17
 369 #define DRFLAC_PICTURE_TYPE_ILLUSTRATION            18
 370 #define DRFLAC_PICTURE_TYPE_BAND_LOGOTYPE           19
 371 #define DRFLAC_PICTURE_TYPE_PUBLISHER_LOGOTYPE      20
 372
 373 typedef enum   /* The kind of container a FLAC stream is wrapped in; stored in drflac::container. */
 374 {
 375     drflac_container_native,    /* A native FLAC stream. */
 376     drflac_container_ogg,       /* An Ogg encapsulated FLAC stream. */
 377     drflac_container_unknown    /* The container type is not known. */
 378 } drflac_container;
 379
 380 typedef enum   /* Tells a drflac_seek_proc callback what its offset is relative to. */
 381 {
 382     drflac_seek_origin_start,     /* The offset is measured from the start of the stream. */
 383     drflac_seek_origin_current    /* The offset is measured from the current read position. */
 384 } drflac_seek_origin;
 385
 386 /* Packing is important on this structure because we map this directly to the raw data within the SEEKTABLE metadata block: with 2-byte packing the size is 18 bytes (8+8+2) with no tail padding. The packing is pushed and popped so that the includer's own packing setting is restored afterwards; a bare "#pragma pack()" would reset it to the compiler default instead. */
 387 #pragma pack(push, 2)
 388 typedef struct
 389 {
 390     drflac_uint64 firstPCMFrame;
 391     drflac_uint64 flacFrameOffset;   /* The offset from the first byte of the header of the first frame. */
 392     drflac_uint16 pcmFrameCount;
 393 } drflac_seekpoint;
 394 #pragma pack(pop)
 395
 396 typedef struct   /* Contents of a STREAMINFO metadata block; exposed through drflac_metadata::data.streaminfo. */
 397 {
 398     drflac_uint16 minBlockSizeInPCMFrames;
 399     drflac_uint16 maxBlockSizeInPCMFrames;
 400     drflac_uint32 minFrameSizeInPCMFrames;   /* NOTE(review): the FLAC specification defines the STREAMINFO minimum frame size in bytes, so the "InPCMFrames" suffix looks like a misnomer - confirm against the parser. */
 401     drflac_uint32 maxFrameSizeInPCMFrames;   /* NOTE(review): same remark as for minFrameSizeInPCMFrames. */
 402     drflac_uint32 sampleRate;
 403     drflac_uint8  channels;
 404     drflac_uint8  bitsPerSample;
 405     drflac_uint64 totalPCMFrameCount;
 406     drflac_uint8  md5[16];                   /* 16 raw bytes; presumably the MD5 signature stored in the STREAMINFO block. */
 407 } drflac_streaminfo;
 408
 409 typedef struct   /* One metadata block, as handed to a drflac_meta_proc callback. */
 410 {
 411     /*
 412     The metadata type. Use this to know how to interpret the data below. Will be set to one of the
 413     DRFLAC_METADATA_BLOCK_TYPE_* tokens.
 414     */
 415     drflac_uint32 type;
 416
 417     /*
 418     A pointer to the raw data. This points to a temporary buffer so don't hold on to it. It's best to
 419     not modify the contents of this buffer. Use the structures below for more meaningful and structured
 420     information about the metadata. It's possible for this to be null.
 421     */
 422     const void* pRawData;
 423
 424     /* The size in bytes of the block and the buffer pointed to by pRawData if it's non-NULL. */
 425     drflac_uint32 rawDataSize;
 426
 427     union   /* Structured view of the block. The member names mirror the DRFLAC_METADATA_BLOCK_TYPE_* tokens; presumably only the member matching `type` is meaningful. */
 428     {
 429         drflac_streaminfo streaminfo;
 430
 431         struct
 432         {
 433             int unused;   /* Placeholder so the struct is not empty. */
 434         } padding;
 435
 436         struct
 437         {
 438             drflac_uint32 id;
 439             const void* pData;
 440             drflac_uint32 dataSize;   /* Presumably the size in bytes of the buffer at pData. */
 441         } application;
 442
 443         struct
 444         {
 445             drflac_uint32 seekpointCount;
 446             const drflac_seekpoint* pSeekpoints;   /* Presumably an array of seekpointCount entries. */
 447         } seektable;
 448
 449         struct
 450         {
 451             drflac_uint32 vendorLength;   /* NOTE(review): the explicit length suggests `vendor` need not be null-terminated - confirm. */
 452             const char* vendor;
 453             drflac_uint32 commentCount;
 454             const void* pComments;        /* Untyped comment data. NOTE(review): the layout is not visible here - confirm how it is meant to be walked. */
 455         } vorbis_comment;
 456
 457         struct
 458         {
 459             char catalog[128];
 460             drflac_uint64 leadInSampleCount;
 461             drflac_bool32 isCD;
 462             drflac_uint8 trackCount;
 463             const void* pTrackData;       /* Untyped track data. NOTE(review): the layout is not visible here - confirm how it is meant to be walked. */
 464         } cuesheet;
 465
 466         struct
 467         {
 468             drflac_uint32 type;               /* Presumably one of the DRFLAC_PICTURE_TYPE_* values. */
 469             drflac_uint32 mimeLength;         /* NOTE(review): the explicit length suggests `mime` need not be null-terminated - confirm. */
 470             const char* mime;
 471             drflac_uint32 descriptionLength;  /* NOTE(review): same remark as for mimeLength. */
 472             const char* description;
 473             drflac_uint32 width;
 474             drflac_uint32 height;
 475             drflac_uint32 colorDepth;
 476             drflac_uint32 indexColorCount;
 477             drflac_uint32 pictureDataSize;    /* Presumably the size in bytes of the buffer at pPictureData. */
 478             const drflac_uint8* pPictureData;
 479         } picture;
 480     } data;
 481 } drflac_metadata;
 482
 483
 484 /*
 485 Callback for when data needs to be read from the client.
 486
 487
 488 Parameters
 489 ----------
 490 pUserData (in)
 491     The user data that was passed to drflac_open() and family.
 492
 493 pBufferOut (out)
 494     The output buffer that the callback fills with the data it read.
 495
 496 bytesToRead (in)
 497     The number of bytes to read.
 498
 499
 500 Return Value
 501 ------------
 502 The number of bytes actually read. This is compared against bytesToRead to detect the end of the stream (see Remarks).
 503
 504
 505 Remarks
 506 -------
 507 A return value of less than bytesToRead indicates the end of the stream. Do _not_ return from this callback until either the entire bytesToRead is filled or
 508 you have reached the end of the stream.
 509 */
 510 typedef size_t (* drflac_read_proc)(void* pUserData, void* pBufferOut, size_t bytesToRead);
 511
 512 /*
 513 Callback for when data needs to be seeked.
 514
 515
 516 Parameters
 517 ----------
 518 pUserData (in)
 519     The user data that was passed to drflac_open() and family.
 520
 521 offset (in)
 522     The number of bytes to move, relative to the origin. Will never be negative. Because it is an int, a single call can request at most INT_MAX bytes.
 523
 524 origin (in)
 525     The origin of the seek - the current position or the start of the stream.
 526
 527
 528 Return Value
 529 ------------
 530 Whether or not the seek was successful, as a drflac_bool32 (DRFLAC_TRUE or DRFLAC_FALSE).
 531
 532
 533 Remarks
 534 -------
 535 The offset will never be negative. Whether or not it is relative to the beginning or current position is determined by the "origin" parameter which will be
 536 either drflac_seek_origin_start or drflac_seek_origin_current.
 537
 538 When seeking to a PCM frame using drflac_seek_to_pcm_frame(), dr_flac may call this with an offset beyond the end of the FLAC stream. This needs to be detected
 539 and handled by returning DRFLAC_FALSE.
 540 */
 541 typedef drflac_bool32 (* drflac_seek_proc)(void* pUserData, int offset, drflac_seek_origin origin);
 542
 543 /*
 544 Callback for when a metadata block is read.
 545
 546
 547 Parameters
 548 ----------
 549 pUserData (in)
 550     The user data that was passed to drflac_open() and family.
 551
 552 pMetadata (in)
 553     A pointer to a structure containing the data of the metadata block. Its pRawData member points to a temporary buffer, so copy anything you need to keep.
 554
 555
 556 Remarks
 557 -------
 558 Use pMetadata->type to determine which metadata block is being handled and how to read the data. This
 559 will be set to one of the DRFLAC_METADATA_BLOCK_TYPE_* tokens.
 560 */
 561 typedef void (* drflac_meta_proc)(void* pUserData, drflac_metadata* pMetadata);
 562
 563
 564 typedef struct   /* Client-defined memory allocation routines, passed to drflac_open() and family. Passing NULL instead of this object selects DRFLAC_MALLOC, DRFLAC_REALLOC and DRFLAC_FREE. */
 565 {
 566     void* pUserData;                                            /* Passed as the last argument to each of the callbacks below. */
 567     void* (* onMalloc)(size_t sz, void* pUserData);             /* malloc()-style allocation of sz bytes. */
 568     void* (* onRealloc)(void* p, size_t sz, void* pUserData);   /* realloc()-style resize of p to sz bytes. */
 569     void  (* onFree)(void* p, void* pUserData);                 /* free()-style release of p. */
 570 } drflac_allocation_callbacks;
 571
 572 /* Structure for internal use. Only used for decoders opened with drflac_open_memory. Embedded in drflac as `memoryStream` so that no separate allocation is needed. */
 573 typedef struct
 574 {
 575     const drflac_uint8* data;   /* Presumably the start of the caller's in-memory FLAC data; it is const, so it is only read through this pointer. */
 576     size_t dataSize;            /* Presumably the size in bytes of the buffer at `data`. */
 577     size_t currentReadPos;      /* Presumably the current read offset into `data`, in bytes. */
 578 } drflac__memory_stream;
 579
 580 /* Structure for internal use. Used for bit streaming. Embedded in drflac as `bs`. */
 581 typedef struct
 582 {
 583     /* The function to call when more data needs to be read. */
 584     drflac_read_proc onRead;
 585
 586     /* The function to call when the current read position needs to be moved. */
 587     drflac_seek_proc onSeek;
 588
 589     /* The user data to pass around to onRead and onSeek. */
 590     void* pUserData;
 591
 592
 593     /*
 594     The number of unaligned bytes in the L2 cache. This will always be 0 until the end of the stream is hit. At the end of the
 595     stream there will be a number of bytes that don't cleanly fit in an L1 cache line, so we use this variable to know whether
 596     or not the bitstreamer needs to run on a slower path to read those last bytes. This will never be more than sizeof(drflac_cache_t).
 597     */
 598     size_t unalignedByteCount;
 599
 600     /* The content of the unaligned bytes. */
 601     drflac_cache_t unalignedCache;
 602
 603     /* The index of the next valid cache line in the "L2" cache. */
 604     drflac_uint32 nextL2Line;
 605
 606     /* The number of bits that have been consumed by the cache. This is used to determine how many valid bits are remaining. */
 607     drflac_uint32 consumedBits;
 608
 609     /*
 610     The cached data which was most recently read from the client. There are two levels of cache. Data flows as such:
 611     Client -> L2 -> L1. The L2 -> L1 movement is aligned and runs on a fast path in just a few instructions.
 612     */
 613     drflac_cache_t cacheL2[DR_FLAC_BUFFER_SIZE/sizeof(drflac_cache_t)];   /* The "L2" cache: DR_FLAC_BUFFER_SIZE bytes, stored as drflac_cache_t sized lines. */
 614     drflac_cache_t cache;                                                 /* The "L1" cache: a single drflac_cache_t line. */
 615
 616     /*
 617     CRC-16. This is updated whenever bits are read from the bit stream. Manually set this to 0 to reset the CRC. For FLAC, this
 618     is reset to 0 at the beginning of each frame.
 619     */
 620     drflac_uint16 crc16;
 621     drflac_cache_t crc16Cache;              /* A cache for optimizing CRC calculations. This is filled when the L1 cache is reloaded. */
 622     drflac_uint32 crc16CacheIgnoredBytes;   /* The number of bytes to ignore when updating the CRC-16 from the CRC-16 cache. */
 623 } drflac_bs;
 624
 625 typedef struct   /* One subframe of a FLAC frame; drflac_frame holds one of these per channel. */
 626 {
 627     /* The type of the subframe: SUBFRAME_CONSTANT, SUBFRAME_VERBATIM, SUBFRAME_FIXED or SUBFRAME_LPC. */
 628     drflac_uint8 subframeType;
 629
 630     /* The number of wasted bits per sample as specified by the sub-frame header. */
 631     drflac_uint8 wastedBitsPerSample;
 632
 633     /* The order to use for the prediction stage for SUBFRAME_FIXED and SUBFRAME_LPC. */
 634     drflac_uint8 lpcOrder;
 635
 636     /* A pointer to the buffer containing the decoded samples in the subframe. This pointer is an offset from drflac::pExtraData, so this struct does not own the memory. */
 637     drflac_int32* pSamplesS32;
 638 } drflac_subframe;
 639
 640 typedef struct   /* The header of a single FLAC frame; stored in drflac_frame::header. */
 641 {
 642     /*
 643     If the stream uses variable block sizes, this will be set to the index of the first PCM frame. If fixed block sizes are used, this will
 644     always be set to 0. This is 64-bit because the decoded PCM frame number will be 36 bits.
 645     */
 646     drflac_uint64 pcmFrameNumber;
 647
 648     /*
 649     If the stream uses fixed block sizes, this will be set to the frame number. If variable block sizes are used, this will always be 0. This
 650     is 32-bit because in fixed block sizes, the maximum frame number will be 31 bits.
 651     */
 652     drflac_uint32 flacFrameNumber;
 653
 654     /* The sample rate of this frame. */
 655     drflac_uint32 sampleRate;
 656
 657     /* The number of PCM frames in each sub-frame within this frame. */
 658     drflac_uint16 blockSizeInPCMFrames;
 659
 660     /*
 661     The channel assignment of this frame. This is not always set to the channel count. If interchannel decorrelation is being used this
 662     will be set to DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE, DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE or DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE.
 663     */
 664     drflac_uint8 channelAssignment;
 665
 666     /* The number of bits per sample within this frame. */
 667     drflac_uint8 bitsPerSample;
 668
 669     /* The 8-bit CRC of this frame's header. */
 670     drflac_uint8 crc8;
 671 } drflac_frame_header;
 672
 673 typedef struct   /* A FLAC frame: its header, how much of it is still unread, and its per-channel subframes. Stored in drflac::currentFLACFrame. */
 674 {
 675     /* The header. */
 676     drflac_frame_header header;
 677
 678     /*
 679     The number of PCM frames left to be read in this FLAC frame. This is initially set to the block size. As PCM frames are read,
 680     this will be decremented. When it reaches 0, the decoder will see this frame as fully consumed and load the next frame.
 681     */
 682     drflac_uint32 pcmFramesRemaining;
 683
 684     /* The list of sub-frames within the frame. There is one sub-frame for each channel, and there's a maximum of 8 channels. */
 685     drflac_subframe subframes[8];
 686 } drflac_frame;
 687
 688 typedef struct   /* The decoder object. It is a transparent type: fields may be read directly, but do not change their values. */
 689 {
 690     /* The function to call when a metadata block is read. */
 691     drflac_meta_proc onMeta;
 692
 693     /* The user data posted to the metadata callback function. */
 694     void* pUserDataMD;
 695
 696     /* Memory allocation callbacks. */
 697     drflac_allocation_callbacks allocationCallbacks;
 698
 699
 700     /* The sample rate. Will be set to something like 44100. */
 701     drflac_uint32 sampleRate;
 702
 703     /*
 704     The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. Maximum 8. This is set based on the
 705     value specified in the STREAMINFO block.
 706     */
 707     drflac_uint8 channels;
 708
 709     /* The bits per sample. Will be set to something like 16, 24, etc. */
 710     drflac_uint8 bitsPerSample;
 711
 712     /* The maximum block size, in PCM frames. This number represents the number of samples in each channel (not combined). */
 713     drflac_uint16 maxBlockSizeInPCMFrames;
 714
 715     /*
 716     The total number of PCM Frames making up the stream. Can be 0 in which case it's still a valid stream, but just means
 717     the total PCM frame count is unknown. Likely the case with streams like internet radio.
 718     */
 719     drflac_uint64 totalPCMFrameCount;
 720
 721
 722     /* The container type. This is set based on whether or not the decoder was opened from a native or Ogg stream. */
 723     drflac_container container;
 724
 725     /* The number of seekpoints in the seektable. Presumably the number of entries at pSeekpoints. */
 726     drflac_uint32 seekpointCount;
 727
 728
 729     /* Information about the frame the decoder is currently sitting on. */
 730     drflac_frame currentFLACFrame;
 731
 732
 733     /* The index of the PCM frame the decoder is currently sitting on. This is only used for seeking. */
 734     drflac_uint64 currentPCMFrame;
 735
 736     /* The position of the first FLAC frame in the stream. This is only ever used for seeking. */
 737     drflac_uint64 firstFLACFramePosInBytes;
 738
 739
 740     /* A hack to avoid a malloc() when opening a decoder with drflac_open_memory(). */
 741     drflac__memory_stream memoryStream;
 742
 743
 744     /* A pointer to the decoded sample data. This is an offset of pExtraData. */
 745     drflac_int32* pDecodedSamples;
 746
 747     /* A pointer to the seek table. This is an offset of pExtraData, or NULL if there is no seek table. */
 748     drflac_seekpoint* pSeekpoints;
 749
 750     /* Internal use only. Only used with Ogg containers. Points to a drflac_oggbs object. This is an offset of pExtraData. */
 751     void* _oggbs;
 752
 753     /* Internal use only. Used for profiling and testing different seeking modes. Each flag is a 1-bit field; judging by the names, setting one disables that seeking mode. */
 754     drflac_bool32 _noSeekTableSeek    : 1;
 755     drflac_bool32 _noBinarySearchSeek : 1;
 756     drflac_bool32 _noBruteForceSeek   : 1;
 757
 758     /* The bit streamer. The raw FLAC data is fed through this object. */
 759     drflac_bs bs;
 760
 761     /* Variable length extra data. We attach this to the end of the object so we can avoid unnecessary mallocs. Declared with one element, so sizeof(drflac) already includes the first byte; this must remain the last member. */
 762     drflac_uint8 pExtraData[1];
 763 } drflac;
 764
 765
 766 /*
 767 Opens a FLAC decoder.
 768
 769
 770 Parameters
 771 ----------
 772 onRead (in)
 773     The function to call when data needs to be read from the client.
 774
 775 onSeek (in)
 776     The function to call when the read position of the client data needs to move.
 777
 778 pUserData (in, optional)
 779     A pointer to application defined data that will be passed to onRead and onSeek.
 780
 781 pAllocationCallbacks (in, optional)
 782     A pointer to application defined callbacks for managing memory allocations.
 783
 784
 785 Return Value
 786 ------------
 787 Returns a pointer to an object representing the decoder.
 788
 789
 790 Remarks
 791 -------
 792 Close the decoder with `drflac_close()`.
 793
 794 `pAllocationCallbacks` can be NULL in which case it will use `DRFLAC_MALLOC`, `DRFLAC_REALLOC` and `DRFLAC_FREE`.
 795
 796 This function will automatically detect whether or not you are attempting to open a native or Ogg encapsulated FLAC, both of which should work seamlessly
 797 without any manual intervention. Ogg encapsulation also works with multiplexed streams which basically means it can play FLAC encoded audio tracks in videos.
 798
 799 This is the lowest level function for opening a FLAC stream. You can also use `drflac_open_file()` and `drflac_open_memory()` to open the stream from a file or
 800 from a block of memory respectively.
 801
 802 The STREAMINFO block must be present for this to succeed. Use `drflac_open_relaxed()` to open a FLAC stream where the header may not be present.
 803
 804 Use `drflac_open_with_metadata()` if you need access to metadata.
 805
 806
 807 See Also
 808 ---------
 809 drflac_open_file()
 810 drflac_open_memory()
 811 drflac_open_with_metadata()
 812 drflac_close()
 813 */
 814 DRFLAC_API drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
 815
 816 /*
 817 Opens a FLAC stream with relaxed validation of the header block.
 818
 819
 820 Parameters
 821 ----------
 822 onRead (in)
 823     The function to call when data needs to be read from the client.
 824
 825 onSeek (in)
 826     The function to call when the read position of the client data needs to move.
 827
 828 container (in)
 829     Whether or not the FLAC stream is encapsulated using standard FLAC encapsulation or Ogg encapsulation.
 830
 831 pUserData (in, optional)
 832     A pointer to application defined data that will be passed to onRead and onSeek.
 833
 834 pAllocationCallbacks (in, optional)
 835     A pointer to application defined callbacks for managing memory allocations.
 836
 837
 838 Return Value
 839 ------------
 840 A pointer to an object representing the decoder.
 841
 842
 843 Remarks
 844 -------
 845 The same as drflac_open(), except attempts to open the stream even when a header block is not present.
 846
 847 Because the header is not necessarily available, the caller must explicitly define the container (Native or Ogg). Do not set this to `drflac_container_unknown`
 848 as that is for internal use only.
 849
 850 Opening in relaxed mode will continue reading data from onRead until it finds a valid frame. If a frame is never found it will continue forever. To abort,
 851 force your `onRead` callback to return 0, which dr_flac will use as an indicator that the end of the stream was found.
 852
 853 Use `drflac_open_with_metadata_relaxed()` if you need access to metadata.
 854 */
 855 DRFLAC_API drflac* drflac_open_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
 856
 857 /*
 858 Opens a FLAC decoder and notifies the caller of the metadata chunks (album art, etc.).
 859
 860
 861 Parameters
 862 ----------
 863 onRead (in)
 864     The function to call when data needs to be read from the client.
 865
 866 onSeek (in)
 867     The function to call when the read position of the client data needs to move.
 868
 869 onMeta (in)
 870     The function to call for every metadata block.
 871
 872 pUserData (in, optional)
 873     A pointer to application defined data that will be passed to onRead, onSeek and onMeta.
 874
 875 pAllocationCallbacks (in, optional)
 876     A pointer to application defined callbacks for managing memory allocations.
 877
 878
 879 Return Value
 880 ------------
 881 A pointer to an object representing the decoder.
 882
 883
 884 Remarks
 885 -------
 886 Close the decoder with `drflac_close()`.
 887
 888 `pAllocationCallbacks` can be NULL in which case it will use `DRFLAC_MALLOC`, `DRFLAC_REALLOC` and `DRFLAC_FREE`.
 889
 890 This is slower than `drflac_open()`, so avoid this one if you don't need metadata. Internally, this will allocate and free memory on the heap for every
 891 metadata block except for STREAMINFO and PADDING blocks.
 892
 893 The caller is notified of the metadata via the `onMeta` callback. All metadata blocks will be handled before the function returns. This callback takes a
 894 pointer to a `drflac_metadata` object which is a union containing the data of all relevant metadata blocks. Use the `type` member to discriminate between
 895 the different metadata types.
 896
 897 The STREAMINFO block must be present for this to succeed. Use `drflac_open_with_metadata_relaxed()` to open a FLAC stream where the header may not be present.
 898
 899 Note that this will behave inconsistently with `drflac_open()` if the stream is an Ogg encapsulated stream and a metadata block is corrupted. This is due to
 900 the way the Ogg stream recovers from corrupted pages. When `drflac_open_with_metadata()` is being used, the open routine will try to read the contents of the
 901 metadata block, whereas `drflac_open()` will simply seek past it (for the sake of efficiency). This inconsistency can result in different samples being
 902 returned depending on whether or not the stream is being opened with metadata.
 903
 904
 905 See Also
 906 ---------
 907 drflac_open_file_with_metadata()
 908 drflac_open_memory_with_metadata()
 909 drflac_open()
 910 drflac_close()
 911 */
 912 DRFLAC_API drflac* drflac_open_with_metadata(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
 913
 914 /*
 915 The same as drflac_open_with_metadata(), except attempts to open the stream even when a header block is not present.
 916
 917 See Also
 918 --------
 919 drflac_open_with_metadata()
 920 drflac_open_relaxed()
 921 */
 922 DRFLAC_API drflac* drflac_open_with_metadata_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
 923
 924 /*
 925 Closes the given FLAC decoder.
 926
 927
 928 Parameters
 929 ----------
 930 pFlac (in)
 931     The decoder to close.
 932
 933
 934 Remarks
 935 -------
 936 This will destroy the decoder object.
 937
 938
 939 See Also
 940 --------
 941 drflac_open()
 942 drflac_open_with_metadata()
 943 drflac_open_file()
 944 drflac_open_file_w()
 945 drflac_open_file_with_metadata()
 946 drflac_open_file_with_metadata_w()
 947 drflac_open_memory()
 948 drflac_open_memory_with_metadata()
 949 */
 950 DRFLAC_API void drflac_close(drflac* pFlac);
 951
 952
 953 /*
 954 Reads sample data from the given FLAC decoder, output as interleaved signed 32-bit PCM.
 955
 956
 957 Parameters
 958 ----------
 959 pFlac (in)
 960     The decoder.
 961
 962 framesToRead (in)
 963     The number of PCM frames to read.
 964
 965 pBufferOut (out, optional)
 966     A pointer to the buffer that will receive the decoded samples.
 967
 968
 969 Return Value
 970 ------------
 971 Returns the number of PCM frames actually read. If the return value is less than `framesToRead` it has reached the end.
 972
 973
 974 Remarks
 975 -------
 976 pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of frames seeked.
 977 */
 978 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s32(drflac* pFlac, drflac_uint64 framesToRead, drflac_int32* pBufferOut);
 979
 980
 981 /*
 982 Reads sample data from the given FLAC decoder, output as interleaved signed 16-bit PCM.
 983
 984
 985 Parameters
 986 ----------
 987 pFlac (in)
 988     The decoder.
 989
 990 framesToRead (in)
 991     The number of PCM frames to read.
 992
 993 pBufferOut (out, optional)
 994     A pointer to the buffer that will receive the decoded samples.
 995
 996
 997 Return Value
 998 ------------
 999 Returns the number of PCM frames actually read. If the return value is less than `framesToRead` it has reached the end.
1000
1001
1002 Remarks
1003 -------
1004 pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of frames seeked.
1005
1006 Note that this is lossy for streams where the bits per sample is larger than 16.
1007 */
1008 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s16(drflac* pFlac, drflac_uint64 framesToRead, drflac_int16* pBufferOut);
1009
1010 /*
1011 Reads sample data from the given FLAC decoder, output as interleaved 32-bit floating point PCM.
1012
1013
1014 Parameters
1015 ----------
1016 pFlac (in)
1017     The decoder.
1018
1019 framesToRead (in)
1020     The number of PCM frames to read.
1021
1022 pBufferOut (out, optional)
1023     A pointer to the buffer that will receive the decoded samples.
1024
1025
1026 Return Value
1027 ------------
1028 Returns the number of PCM frames actually read. If the return value is less than `framesToRead` it has reached the end.
1029
1030
1031 Remarks
1032 -------
1033 pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of frames seeked.
1034
1035 Note that this should be considered lossy due to the nature of floating point numbers not being able to exactly represent every possible number.
1036 */
1037 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_f32(drflac* pFlac, drflac_uint64 framesToRead, float* pBufferOut);
1038
1039 /*
1040 Seeks to the PCM frame at the given index.
1041
1042
1043 Parameters
1044 ----------
1045 pFlac (in)
1046     The decoder.
1047
1048 pcmFrameIndex (in)
1049     The index of the PCM frame to seek to. See notes below.
1050
1051
1052 Return Value
1053 -------------
1054 `DRFLAC_TRUE` if successful; `DRFLAC_FALSE` otherwise.
1055 */
1056 DRFLAC_API drflac_bool32 drflac_seek_to_pcm_frame(drflac* pFlac, drflac_uint64 pcmFrameIndex);
1057
1058 /*
1059 Opens a FLAC decoder from a pre-allocated block of memory
1060
1061
1062 Parameters
1063 ----------
1064 pData (in)
1065     A pointer to the raw encoded FLAC data.
1066
1067 dataSize (in)
1068     The size in bytes of `pData`.
1069
1070 pAllocationCallbacks (in, optional)
1071     A pointer to application defined callbacks for managing memory allocations.
1072
1073
1074 Return Value
1075 ------------
1076 A pointer to an object representing the decoder.
1077
1078
1079 Remarks
1080 -------
1081 This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for the lifetime of the decoder.
1082
1083
1084 See Also
1085 --------
1086 drflac_open()
1087 drflac_close()
1088 */
1089 DRFLAC_API drflac* drflac_open_memory(const void* pData, size_t dataSize, const drflac_allocation_callbacks* pAllocationCallbacks);
1090
1091 /*
1092 Opens a FLAC decoder from a pre-allocated block of memory and notifies the caller of the metadata chunks (album art, etc.)
1093
1094
1095 Parameters
1096 ----------
1097 pData (in)
1098     A pointer to the raw encoded FLAC data.
1099
1100 dataSize (in)
1101     The size in bytes of `pData`.
1102
1103 onMeta (in)
1104     The callback to fire for each metadata block.
1105
1106 pUserData (in)
1107     A pointer to the user data to pass to the metadata callback.
1108
1109 pAllocationCallbacks (in, optional)
1110     A pointer to application defined callbacks for managing memory allocations.
1111
1112
1113 Remarks
1114 -------
1115 Look at the documentation for drflac_open_with_metadata() for more information on how metadata is handled.
1116
1117
1118 See Also
1119 -------
1120 drflac_open_with_metadata()
1121 drflac_open()
1122 drflac_close()
1123 */
1124 DRFLAC_API drflac* drflac_open_memory_with_metadata(const void* pData, size_t dataSize, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
1125
1126
1127
1128 /* High Level APIs */
1129
1130 /*
1131 Opens a FLAC stream from the given callbacks and fully decodes it in a single operation. The return value is a
1132 pointer to the sample data as interleaved signed 32-bit PCM. The returned data must be freed with drflac_free().
1133
1134 You can pass in custom memory allocation callbacks via the pAllocationCallbacks parameter. This can be NULL in which
1135 case it will use DRFLAC_MALLOC, DRFLAC_REALLOC and DRFLAC_FREE.
1136
1137 Sometimes a FLAC file won't keep track of the total sample count. In this situation the function will continuously
1138 read samples into a dynamically sized buffer on the heap until no samples are left.
1139
1140 Do not call this function on a broadcast type of stream (like internet radio streams and whatnot).
1141 */
1142 DRFLAC_API drflac_int32* drflac_open_and_read_pcm_frames_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1143
1144 /* Same as drflac_open_and_read_pcm_frames_s32(), except returns signed 16-bit integer samples. */
1145 DRFLAC_API drflac_int16* drflac_open_and_read_pcm_frames_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1146
1147 /* Same as drflac_open_and_read_pcm_frames_s32(), except returns 32-bit floating-point samples. */
1148 DRFLAC_API float* drflac_open_and_read_pcm_frames_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1149
1150 /* Same as drflac_open_and_read_pcm_frames_s32() except opens the decoder from a block of memory. */
1151 DRFLAC_API drflac_int32* drflac_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1152
1153 /* Same as drflac_open_memory_and_read_pcm_frames_s32(), except returns signed 16-bit integer samples. */
1154 DRFLAC_API drflac_int16* drflac_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1155
1156 /* Same as drflac_open_memory_and_read_pcm_frames_s32(), except returns 32-bit floating-point samples. */
1157 DRFLAC_API float* drflac_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1158
1159 /*
1160 Frees memory that was allocated internally by dr_flac.
1161
1162 Set pAllocationCallbacks to the same object that was passed to drflac_open_*_and_read_pcm_frames_*(). If you originally passed in NULL, pass in NULL for this.
1163 */
1164 DRFLAC_API void drflac_free(void* p, const drflac_allocation_callbacks* pAllocationCallbacks);
1165
1166
1167 /* Structure representing an iterator for vorbis comments in a VORBIS_COMMENT metadata block. */
1168 typedef struct
1169 {
1170     drflac_uint32 countRemaining;
1171     const char* pRunningData;
1172 } drflac_vorbis_comment_iterator;
1173
1174 /*
1175 Initializes a vorbis comment iterator. This can be used for iterating over the vorbis comments in a VORBIS_COMMENT
1176 metadata block.
1177 */
1178 DRFLAC_API void drflac_init_vorbis_comment_iterator(drflac_vorbis_comment_iterator* pIter, drflac_uint32 commentCount, const void* pComments);
1179
1180 /*
1181 Goes to the next vorbis comment in the given iterator. If null is returned it means there are no more comments. The
1182 returned string is NOT null terminated.
1183 */
1184 DRFLAC_API const char* drflac_next_vorbis_comment(drflac_vorbis_comment_iterator* pIter, drflac_uint32* pCommentLengthOut);
1185
1186
1187 /* Structure representing an iterator for cuesheet tracks in a CUESHEET metadata block. */
1188 typedef struct
1189 {
1190     drflac_uint32 countRemaining;
1191     const char* pRunningData;
1192 } drflac_cuesheet_track_iterator;
1193
1194 /* Packing is important on this structure because we map this directly to the raw data within the CUESHEET metadata block. */
1195 #pragma pack(4)
1196 typedef struct
1197 {
1198     drflac_uint64 offset;
1199     drflac_uint8 index;
1200     drflac_uint8 reserved[3];
1201 } drflac_cuesheet_track_index;
1202 #pragma pack()
1203
1204 typedef struct
1205 {
1206     drflac_uint64 offset;
1207     drflac_uint8 trackNumber;
1208     char ISRC[12];
1209     drflac_bool8 isAudio;
1210     drflac_bool8 preEmphasis;
1211     drflac_uint8 indexCount;
1212     const drflac_cuesheet_track_index* pIndexPoints;
1213 } drflac_cuesheet_track;
1214
1215 /*
1216 Initializes a cuesheet track iterator. This can be used for iterating over the cuesheet tracks in a CUESHEET metadata
1217 block.
1218 */
1219 DRFLAC_API void drflac_init_cuesheet_track_iterator(drflac_cuesheet_track_iterator* pIter, drflac_uint32 trackCount, const void* pTrackData);
1220
1221 /* Goes to the next cuesheet track in the given iterator. If DRFLAC_FALSE is returned it means there are no more tracks. */
1222 DRFLAC_API drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter, drflac_cuesheet_track* pCuesheetTrack);
1223
1224
1225 #ifdef __cplusplus
1226 }
1227 #endif
1228 #endif  /* dr_flac_h */
1229
1230
1231 /************************************************************************************************************************************************************
1232  ************************************************************************************************************************************************************
1233
1234  IMPLEMENTATION
1235
1236  ************************************************************************************************************************************************************
1237  ************************************************************************************************************************************************************/
1238 #if defined(DR_FLAC_IMPLEMENTATION) || defined(DRFLAC_IMPLEMENTATION)
1239 #ifndef dr_flac_c
1240 #define dr_flac_c
1241
1242 /* Disable some annoying warnings. */
1243 #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
1244     #pragma GCC diagnostic push
1245     #if __GNUC__ >= 7
1246     #pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
1247     #endif
1248 #endif
1249
1250 #ifdef __linux__
1251     #ifndef _BSD_SOURCE
1252         #define _BSD_SOURCE
1253     #endif
1254     #ifndef _DEFAULT_SOURCE
1255         #define _DEFAULT_SOURCE
1256     #endif
1257     #ifndef __USE_BSD
1258         #define __USE_BSD
1259     #endif
1260     #include <endian.h>
1261 #endif
1262
1263 #include <stdlib.h>
1264 #include <string.h>
1265
/*
DRFLAC_INLINE expands to the compiler's "force inline" spelling, or to nothing when the compiler is not recognised.

For GCC-style compilers the always_inline attribute is always paired with an inline keyword, because GCC has been reported to
warn about functions possibly not being inlineable when the attribute appears on its own. In strict ANSI mode (-std=c89 or
-ansi) the plain "inline" keyword cannot be used, so "__inline__" is used there instead.
*/
#if defined(_MSC_VER)
    #define DRFLAC_INLINE __forceinline
#elif defined(__GNUC__) && defined(__STRICT_ANSI__)
    #define DRFLAC_INLINE __inline__ __attribute__((always_inline))
#elif defined(__GNUC__)
    #define DRFLAC_INLINE inline __attribute__((always_inline))
#elif defined(__WATCOMC__)
    #define DRFLAC_INLINE __inline
#else
    #define DRFLAC_INLINE
#endif
1286
/*
CPU architecture.

At most one of DRFLAC_X64, DRFLAC_X86 and DRFLAC_ARM is defined; none is defined for unrecognised targets. DRFLAC_ARM covers
both 32-bit and 64-bit ARM.

GCC and Clang define __aarch64__ (Apple toolchains also __arm64/__arm64__) for 64-bit ARM and do NOT define __arm__ there, so
those macros must be tested explicitly. Without them DRFLAC_ARM is never defined on such builds and the NEON detection that is
nested inside "#if defined(DRFLAC_ARM)" is skipped entirely.
*/
#if defined(__x86_64__) || defined(_M_X64)
    #define DRFLAC_X64
#elif defined(__i386) || defined(_M_IX86)
    #define DRFLAC_X86
#elif defined(__arm__) || defined(_M_ARM) || defined(__arm64) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64)
    #define DRFLAC_ARM
#endif
1295
1296 /*
1297 Intrinsics Support
1298
1299 There's a bug in GCC 4.2.x which results in an incorrect compilation error when using _mm_slli_epi32() where it complains with
1300
1301     "error: shift must be an immediate"
1302
1303 Unfortunately dr_flac depends on this for a few things so we're just going to disable SSE on GCC 4.2 and below.
1304 */
1305 #if !defined(DR_FLAC_NO_SIMD)
1306     #if defined(DRFLAC_X64) || defined(DRFLAC_X86)
1307         #if defined(_MSC_VER) && !defined(__clang__)
1308             /* MSVC. */
1309             #if _MSC_VER >= 1400 && !defined(DRFLAC_NO_SSE2)    /* 2005 */
1310                 #define DRFLAC_SUPPORT_SSE2
1311             #endif
1312             #if _MSC_VER >= 1600 && !defined(DRFLAC_NO_SSE41)   /* 2010 */
1313                 #define DRFLAC_SUPPORT_SSE41
1314             #endif
1315         #elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
1316             /* Assume GNUC-style. */
1317             #if defined(__SSE2__) && !defined(DRFLAC_NO_SSE2)
1318                 #define DRFLAC_SUPPORT_SSE2
1319             #endif
1320             #if defined(__SSE4_1__) && !defined(DRFLAC_NO_SSE41)
1321                 #define DRFLAC_SUPPORT_SSE41
1322             #endif
1323         #endif
1324
1325         /* If at this point we still haven't determined compiler support for the intrinsics just fall back to __has_include. */
1326         #if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
1327             #if !defined(DRFLAC_SUPPORT_SSE2) && !defined(DRFLAC_NO_SSE2) && __has_include(<emmintrin.h>)
1328                 #define DRFLAC_SUPPORT_SSE2
1329             #endif
1330             #if !defined(DRFLAC_SUPPORT_SSE41) && !defined(DRFLAC_NO_SSE41) && __has_include(<smmintrin.h>)
1331                 #define DRFLAC_SUPPORT_SSE41
1332             #endif
1333         #endif
1334
1335         #if defined(DRFLAC_SUPPORT_SSE41)
1336             #include <smmintrin.h>
1337         #elif defined(DRFLAC_SUPPORT_SSE2)
1338             #include <emmintrin.h>
1339         #endif
1340     #endif
1341
1342     #if defined(DRFLAC_ARM)
1343         #if !defined(DRFLAC_NO_NEON) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
1344             #define DRFLAC_SUPPORT_NEON
1345         #endif
1346
1347         /* Fall back to looking for the #include file. */
1348         #if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
1349             #if !defined(DRFLAC_SUPPORT_NEON) && !defined(DRFLAC_NO_NEON) && __has_include(<arm_neon.h>)
1350                 #define DRFLAC_SUPPORT_NEON
1351             #endif
1352         #endif
1353
1354         #if defined(DRFLAC_SUPPORT_NEON)
1355             #include <arm_neon.h>
1356         #endif
1357     #endif
1358 #endif
1359
/*
CPUID access.

When SIMD is enabled and the target is x86/x64, drflac__cpuid() is provided for MSVC (2005 and newer) and for GCC-style
compilers. It runs CPUID for leaf `fid` and writes the resulting EAX, EBX, ECX and EDX values to info[0], info[1], info[2]
and info[3] respectively. In every other configuration DRFLAC_NO_CPUID is defined and no function is emitted.
*/
#if defined(DR_FLAC_NO_SIMD) || !(defined(DRFLAC_X86) || defined(DRFLAC_X64))
    #define DRFLAC_NO_CPUID
#elif defined(_MSC_VER) && !defined(__clang__)
    #if _MSC_VER < 1400
        #define DRFLAC_NO_CPUID
    #else
        #include <intrin.h>
        static void drflac__cpuid(int info[4], int fid)
        {
            __cpuid(info, fid);
        }
    #endif
#elif defined(__GNUC__) || defined(__clang__)
    static void drflac__cpuid(int info[4], int fid)
    {
        /*
        With -fPIC on 32-bit x86, ebx is in use by the compiler and GCC complains if it is listed as an output. To get around
        that, ebx is swapped with a compiler-chosen register before and after the CPUID instruction, which amounts to saving
        and restoring ebx by hand. The "k" modifier requests the 32-bit name of that register and the {...} syntax keeps the
        template valid for both assembly dialects.
        */
    #if defined(DRFLAC_X86) && defined(__PIC__)
        __asm__ __volatile__ (
            "xchg{l} {%%}ebx, %k1;"
            "cpuid;"
            "xchg{l} {%%}ebx, %k1;"
            : "=a"(info[0]), "=&r"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(fid), "c"(0)
        );
    #else
        __asm__ __volatile__ (
            "cpuid" : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(fid), "c"(0)
        );
    #endif
    }
#else
    #define DRFLAC_NO_CPUID
#endif
1403
1404 static DRFLAC_INLINE drflac_bool32 drflac_has_sse2(void)
1405 {
1406 #if defined(DRFLAC_SUPPORT_SSE2)
1407     #if (defined(DRFLAC_X64) || defined(DRFLAC_X86)) && !defined(DRFLAC_NO_SSE2)
1408         #if defined(DRFLAC_X64)
1409             return DRFLAC_TRUE;    /* 64-bit targets always support SSE2. */
1410         #elif (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__)
1411             return DRFLAC_TRUE;    /* If the compiler is allowed to freely generate SSE2 code we can assume support. */
1412         #else
1413             #if defined(DRFLAC_NO_CPUID)
1414                 return DRFLAC_FALSE;
1415             #else
1416                 int info[4];
1417                 drflac__cpuid(info, 1);
1418                 return (info[3] & (1 << 26)) != 0;
1419             #endif
1420         #endif
1421     #else
1422         return DRFLAC_FALSE;       /* SSE2 is only supported on x86 and x64 architectures. */
1423     #endif
1424 #else
1425     return DRFLAC_FALSE;           /* No compiler support. */
1426 #endif
1427 }
1428
1429 static DRFLAC_INLINE drflac_bool32 drflac_has_sse41(void)
1430 {
1431 #if defined(DRFLAC_SUPPORT_SSE41)
1432     #if (defined(DRFLAC_X64) || defined(DRFLAC_X86)) && !defined(DRFLAC_NO_SSE41)
1433         #if defined(DRFLAC_X64)
1434             return DRFLAC_TRUE;    /* 64-bit targets always support SSE4.1. */
1435         #elif (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE4_1__)
1436             return DRFLAC_TRUE;    /* If the compiler is allowed to freely generate SSE41 code we can assume support. */
1437         #else
1438             #if defined(DRFLAC_NO_CPUID)
1439                 return DRFLAC_FALSE;
1440             #else
1441                 int info[4];
1442                 drflac__cpuid(info, 1);
1443                 return (info[2] & (1 << 19)) != 0;
1444             #endif
1445         #endif
1446     #else
1447         return DRFLAC_FALSE;       /* SSE41 is only supported on x86 and x64 architectures. */
1448     #endif
1449 #else
1450     return DRFLAC_FALSE;           /* No compiler support. */
1451 #endif
1452 }
1453
1454
/*
DRFLAC_HAS_LZCNT_INTRINSIC is defined when the compiler provides a count-leading-zeros intrinsic: MSVC 2008+ targeting x86/x64,
GCC 4.7+, or a Clang that reports __builtin_clzll or __builtin_clzl through __has_builtin.
*/
#if defined(_MSC_VER) && !defined(__clang__) && _MSC_VER >= 1500 && (defined(DRFLAC_X86) || defined(DRFLAC_X64))
    #define DRFLAC_HAS_LZCNT_INTRINSIC
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
    #define DRFLAC_HAS_LZCNT_INTRINSIC
#elif defined(__clang__) && defined(__has_builtin)
    #if __has_builtin(__builtin_clzll) || __has_builtin(__builtin_clzl)
        #define DRFLAC_HAS_LZCNT_INTRINSIC
    #endif
#endif
1466
/*
Byte swap intrinsic availability. DRFLAC_HAS_BYTESWAP16/32/64_INTRINSIC are defined individually for each width the compiler
supports: all three on MSVC 2005+, whatever __has_builtin reports on Clang, the 32/64-bit variants from GCC 4.3 and the 16-bit
variant from GCC 4.8.
*/
#if defined(_MSC_VER) && _MSC_VER >= 1400 && !defined(__clang__)
    #define DRFLAC_HAS_BYTESWAP16_INTRINSIC
    #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
    #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
#elif defined(__clang__)
    #ifdef __has_builtin
        #if __has_builtin(__builtin_bswap16)
            #define DRFLAC_HAS_BYTESWAP16_INTRINSIC
        #endif
        #if __has_builtin(__builtin_bswap32)
            #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
        #endif
        #if __has_builtin(__builtin_bswap64)
            #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
        #endif
    #endif
#elif defined(__GNUC__)
    #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
        #define DRFLAC_HAS_BYTESWAP16_INTRINSIC
        #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
        #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
    #elif (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
        #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
        #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
    #endif
#endif
1492
1493
/*
Standard library hooks. Any of these may be defined by the application before the implementation is compiled; whatever is left
undefined falls back to the C runtime equivalent below. <assert.h> is only pulled in when DRFLAC_ASSERT has not been supplied.
*/
#if !defined(DRFLAC_ASSERT)
    #include <assert.h>
    #define DRFLAC_ASSERT(expression)           assert(expression)
#endif

#if !defined(DRFLAC_MALLOC)
    #define DRFLAC_MALLOC(sz)                   malloc((sz))
#endif

#if !defined(DRFLAC_REALLOC)
    #define DRFLAC_REALLOC(p, sz)               realloc((p), (sz))
#endif

#if !defined(DRFLAC_FREE)
    #define DRFLAC_FREE(p)                      free((p))
#endif

#if !defined(DRFLAC_COPY_MEMORY)
    #define DRFLAC_COPY_MEMORY(dst, src, sz)    memcpy((dst), (src), (sz))
#endif

#if !defined(DRFLAC_ZERO_MEMORY)
    #define DRFLAC_ZERO_MEMORY(p, sz)           memset((p), 0, (sz))
#endif

/* Zeroes the object `p` points at; the size is taken from the pointed-to type. */
#if !defined(DRFLAC_ZERO_OBJECT)
    #define DRFLAC_ZERO_OBJECT(p)               DRFLAC_ZERO_MEMORY((p), sizeof(*(p)))
#endif
1517
1518 #define DRFLAC_MAX_SIMD_VECTOR_SIZE                     64  /* 64 for AVX-512 in the future. */
1519
1520 typedef drflac_int32 drflac_result;
/*
Result codes. Zero means success and every failure code is negative. The codes run contiguously from -1 to -53;
DRFLAC_CRC_MISMATCH sits on its own at -128.
*/
#define DRFLAC_SUCCESS                          0

/* Generic and argument/state failures. */
#define DRFLAC_ERROR                            -1      /* A generic error. */
#define DRFLAC_INVALID_ARGS                     -2
#define DRFLAC_INVALID_OPERATION                -3
#define DRFLAC_OUT_OF_MEMORY                    -4
#define DRFLAC_OUT_OF_RANGE                     -5
#define DRFLAC_ACCESS_DENIED                    -6
#define DRFLAC_DOES_NOT_EXIST                   -7
#define DRFLAC_ALREADY_EXISTS                   -8
#define DRFLAC_TOO_MANY_OPEN_FILES              -9
#define DRFLAC_INVALID_FILE                     -10
#define DRFLAC_TOO_BIG                          -11
#define DRFLAC_PATH_TOO_LONG                    -12
#define DRFLAC_NAME_TOO_LONG                    -13
#define DRFLAC_NOT_DIRECTORY                    -14
#define DRFLAC_IS_DIRECTORY                     -15
#define DRFLAC_DIRECTORY_NOT_EMPTY              -16
#define DRFLAC_END_OF_FILE                      -17
#define DRFLAC_NO_SPACE                         -18
#define DRFLAC_BUSY                             -19
#define DRFLAC_IO_ERROR                         -20
#define DRFLAC_INTERRUPT                        -21
#define DRFLAC_UNAVAILABLE                      -22
#define DRFLAC_ALREADY_IN_USE                   -23
#define DRFLAC_BAD_ADDRESS                      -24
#define DRFLAC_BAD_SEEK                         -25
#define DRFLAC_BAD_PIPE                         -26
#define DRFLAC_DEADLOCK                         -27
#define DRFLAC_TOO_MANY_LINKS                   -28
#define DRFLAC_NOT_IMPLEMENTED                  -29
#define DRFLAC_NO_MESSAGE                       -30
#define DRFLAC_BAD_MESSAGE                      -31
#define DRFLAC_NO_DATA_AVAILABLE                -32
#define DRFLAC_INVALID_DATA                     -33
#define DRFLAC_TIMEOUT                          -34

/* Network-flavoured failures. */
#define DRFLAC_NO_NETWORK                       -35
#define DRFLAC_NOT_UNIQUE                       -36
#define DRFLAC_NOT_SOCKET                       -37
#define DRFLAC_NO_ADDRESS                       -38
#define DRFLAC_BAD_PROTOCOL                     -39
#define DRFLAC_PROTOCOL_UNAVAILABLE             -40
#define DRFLAC_PROTOCOL_NOT_SUPPORTED           -41
#define DRFLAC_PROTOCOL_FAMILY_NOT_SUPPORTED    -42
#define DRFLAC_ADDRESS_FAMILY_NOT_SUPPORTED     -43
#define DRFLAC_SOCKET_NOT_SUPPORTED             -44
#define DRFLAC_CONNECTION_RESET                 -45
#define DRFLAC_ALREADY_CONNECTED                -46
#define DRFLAC_NOT_CONNECTED                    -47
#define DRFLAC_CONNECTION_REFUSED               -48
#define DRFLAC_NO_HOST                          -49

/* Miscellaneous. */
#define DRFLAC_IN_PROGRESS                      -50
#define DRFLAC_CANCELLED                        -51
#define DRFLAC_MEMORY_ALREADY_MAPPED            -52
#define DRFLAC_AT_END                           -53

/* Decoder specific. */
#define DRFLAC_CRC_MISMATCH                     -128
1576
/* Subframe type identifiers. NOTE(review): presumably the base values of the FLAC subframe type field - confirm against the subframe header parser. */
#define DRFLAC_SUBFRAME_CONSTANT                            0
#define DRFLAC_SUBFRAME_VERBATIM                            1
#define DRFLAC_SUBFRAME_FIXED                               8
#define DRFLAC_SUBFRAME_LPC                                 32
#define DRFLAC_SUBFRAME_RESERVED                            255

/* Residual coding method identifiers. */
#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE      0
#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2     1

/* Channel assignment identifiers. */
#define DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT               0
#define DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE                 8
#define DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE                9
#define DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE                  10

/* Rounds x up to the next multiple of a. Both arguments are evaluated more than once, so avoid side effects. */
#define drflac_align(x, a)                                  ((((x) + (a) - 1) / (a)) * (a))

1592
1593
1594 DRFLAC_API void drflac_version(drflac_uint32* pMajor, drflac_uint32* pMinor, drflac_uint32* pRevision)
1595 {
1596     if (pMajor) {
1597         *pMajor = DRFLAC_VERSION_MAJOR;
1598     }
1599
1600     if (pMinor) {
1601         *pMinor = DRFLAC_VERSION_MINOR;
1602     }
1603
1604     if (pRevision) {
1605         *pRevision = DRFLAC_VERSION_REVISION;
1606     }
1607 }
1608
1609 DRFLAC_API const char* drflac_version_string(void)
1610 {
1611     return DRFLAC_VERSION_STRING;
1612 }
1613
1614
/* CPU caps. */

/*
DRFLAC_NO_THREAD_SANITIZE expands to an attribute that disables ThreadSanitizer instrumentation when the compiler
reports the thread_sanitizer feature. In every other configuration it expands to nothing.
*/
#if defined(__has_feature)
    #if __has_feature(thread_sanitizer)
        #define DRFLAC_NO_THREAD_SANITIZE __attribute__((no_sanitize("thread")))
    #endif
#endif
#if !defined(DRFLAC_NO_THREAD_SANITIZE)
    #define DRFLAC_NO_THREAD_SANITIZE
#endif
1625
1626 #if defined(DRFLAC_HAS_LZCNT_INTRINSIC)
1627 static drflac_bool32 drflac__gIsLZCNTSupported = DRFLAC_FALSE;
1628 #endif
1629
1630 #ifndef DRFLAC_NO_CPUID
1631 static drflac_bool32 drflac__gIsSSE2Supported  = DRFLAC_FALSE;
1632 static drflac_bool32 drflac__gIsSSE41Supported = DRFLAC_FALSE;
1633
1634 /*
1635 I've had a bug report that Clang's ThreadSanitizer presents a warning in this function. Having reviewed this, this does
1636 actually make sense. However, since CPU caps should never differ for a running process, I don't think the trade off of
1637 complicating internal API's by passing around CPU caps versus just disabling the warnings is worthwhile. I'm therefore
1638 just going to disable these warnings. This is disabled via the DRFLAC_NO_THREAD_SANITIZE attribute.
1639 */
1640 DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps(void)
1641 {
1642     static drflac_bool32 isCPUCapsInitialized = DRFLAC_FALSE;
1643
1644     if (!isCPUCapsInitialized) {
1645         /* LZCNT */
1646 #if defined(DRFLAC_HAS_LZCNT_INTRINSIC)
1647         int info[4] = {0};
1648         drflac__cpuid(info, 0x80000001);
1649         drflac__gIsLZCNTSupported = (info[2] & (1 << 5)) != 0;
1650 #endif
1651
1652         /* SSE2 */
1653         drflac__gIsSSE2Supported = drflac_has_sse2();
1654
1655         /* SSE4.1 */
1656         drflac__gIsSSE41Supported = drflac_has_sse41();
1657
1658         /* Initialized. */
1659         isCPUCapsInitialized = DRFLAC_TRUE;
1660     }
1661 }
1662 #else
1663 static drflac_bool32 drflac__gIsNEONSupported  = DRFLAC_FALSE;
1664
1665 static DRFLAC_INLINE drflac_bool32 drflac__has_neon(void)
1666 {
1667 #if defined(DRFLAC_SUPPORT_NEON)
1668     #if defined(DRFLAC_ARM) && !defined(DRFLAC_NO_NEON)
1669         #if (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
1670             return DRFLAC_TRUE;    /* If the compiler is allowed to freely generate NEON code we can assume support. */
1671         #else
1672             /* TODO: Runtime check. */
1673             return DRFLAC_FALSE;
1674         #endif
1675     #else
1676         return DRFLAC_FALSE;       /* NEON is only supported on ARM architectures. */
1677     #endif
1678 #else
1679     return DRFLAC_FALSE;           /* No compiler support. */
1680 #endif
1681 }
1682
1683 DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps(void)
1684 {
1685     drflac__gIsNEONSupported = drflac__has_neon();
1686
1687 #if defined(DRFLAC_HAS_LZCNT_INTRINSIC) && defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5)
1688     drflac__gIsLZCNTSupported = DRFLAC_TRUE;
1689 #endif
1690 }
1691 #endif
1692
1693
1694 /* Endian Management */
1695 static DRFLAC_INLINE drflac_bool32 drflac__is_little_endian(void)
1696 {
1697 #if defined(DRFLAC_X86) || defined(DRFLAC_X64)
1698     return DRFLAC_TRUE;
1699 #elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN
1700     return DRFLAC_TRUE;
1701 #else
1702     int n = 1;
1703     return (*(char*)&n) == 1;
1704 #endif
1705 }
1706
1707 static DRFLAC_INLINE drflac_uint16 drflac__swap_endian_uint16(drflac_uint16 n)
1708 {
1709 #ifdef DRFLAC_HAS_BYTESWAP16_INTRINSIC
1710     #if defined(_MSC_VER) && !defined(__clang__)
1711         return _byteswap_ushort(n);
1712     #elif defined(__GNUC__) || defined(__clang__)
1713         return __builtin_bswap16(n);
1714     #else
1715         #error "This compiler does not support the byte swap intrinsic."
1716     #endif
1717 #else
1718     return ((n & 0xFF00) >> 8) |
1719            ((n & 0x00FF) << 8);
1720 #endif
1721 }
1722
1723 static DRFLAC_INLINE drflac_uint32 drflac__swap_endian_uint32(drflac_uint32 n)
1724 {
1725 #ifdef DRFLAC_HAS_BYTESWAP32_INTRINSIC
1726     #if defined(_MSC_VER) && !defined(__clang__)
1727         return _byteswap_ulong(n);
1728     #elif defined(__GNUC__) || defined(__clang__)
1729         #if defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 6) && !defined(DRFLAC_64BIT)   /* <-- 64-bit inline assembly has not been tested, so disabling for now. */
1730             /* Inline assembly optimized implementation for ARM. In my testing, GCC does not generate optimized code with __builtin_bswap32(). */
1731             drflac_uint32 r;
1732             __asm__ __volatile__ (
1733             #if defined(DRFLAC_64BIT)
1734                 "rev %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(n)   /* <-- This is untested. If someone in the community could test this, that would be appreciated! */
1735             #else
1736                 "rev %[out], %[in]" : [out]"=r"(r) : [in]"r"(n)
1737             #endif
1738             );
1739             return r;
1740         #else
1741             return __builtin_bswap32(n);
1742         #endif
1743     #else
1744         #error "This compiler does not support the byte swap intrinsic."
1745     #endif
1746 #else
1747     return ((n & 0xFF000000) >> 24) |
1748            ((n & 0x00FF0000) >>  8) |
1749            ((n & 0x0000FF00) <<  8) |
1750            ((n & 0x000000FF) << 24);
1751 #endif
1752 }
1753
1754 static DRFLAC_INLINE drflac_uint64 drflac__swap_endian_uint64(drflac_uint64 n)
1755 {
1756 #ifdef DRFLAC_HAS_BYTESWAP64_INTRINSIC
1757     #if defined(_MSC_VER) && !defined(__clang__)
1758         return _byteswap_uint64(n);
1759     #elif defined(__GNUC__) || defined(__clang__)
1760         return __builtin_bswap64(n);
1761     #else
1762         #error "This compiler does not support the byte swap intrinsic."
1763     #endif
1764 #else
1765     /* Weird "<< 32" bitshift is required for C89 because it doesn't support 64-bit constants. Should be optimized out by a good compiler. */
1766     return ((n & ((drflac_uint64)0xFF000000 << 32)) >> 56) |
1767            ((n & ((drflac_uint64)0x00FF0000 << 32)) >> 40) |
1768            ((n & ((drflac_uint64)0x0000FF00 << 32)) >> 24) |
1769            ((n & ((drflac_uint64)0x000000FF << 32)) >>  8) |
1770            ((n & ((drflac_uint64)0xFF000000      )) <<  8) |
1771            ((n & ((drflac_uint64)0x00FF0000      )) << 24) |
1772            ((n & ((drflac_uint64)0x0000FF00      )) << 40) |
1773            ((n & ((drflac_uint64)0x000000FF      )) << 56);
1774 #endif
1775 }
1776
1777
1778 static DRFLAC_INLINE drflac_uint16 drflac__be2host_16(drflac_uint16 n)
1779 {
1780     if (drflac__is_little_endian()) {
1781         return drflac__swap_endian_uint16(n);
1782     }
1783
1784     return n;
1785 }
1786
1787 static DRFLAC_INLINE drflac_uint32 drflac__be2host_32(drflac_uint32 n)
1788 {
1789     if (drflac__is_little_endian()) {
1790         return drflac__swap_endian_uint32(n);
1791     }
1792
1793     return n;
1794 }
1795
1796 static DRFLAC_INLINE drflac_uint64 drflac__be2host_64(drflac_uint64 n)
1797 {
1798     if (drflac__is_little_endian()) {
1799         return drflac__swap_endian_uint64(n);
1800     }
1801
1802     return n;
1803 }
1804
1805
1806 static DRFLAC_INLINE drflac_uint32 drflac__le2host_32(drflac_uint32 n)
1807 {
1808     if (!drflac__is_little_endian()) {
1809         return drflac__swap_endian_uint32(n);
1810     }
1811
1812     return n;
1813 }
1814
1815
1816 static DRFLAC_INLINE drflac_uint32 drflac__unsynchsafe_32(drflac_uint32 n)
1817 {
1818     drflac_uint32 result = 0;
1819     result |= (n & 0x7F000000) >> 3;
1820     result |= (n & 0x007F0000) >> 2;
1821     result |= (n & 0x00007F00) >> 1;
1822     result |= (n & 0x0000007F) >> 0;
1823
1824     return result;
1825 }
1826
1827
1828
1829 /* The CRC code below is based on this document: http://zlib.net/crc_v3.txt */
1830 static drflac_uint8 drflac__crc8_table[] = {
1831     0x00, 0x07, 0x0E, 0x09, 0x1C, 0x1B, 0x12, 0x15, 0x38, 0x3F, 0x36, 0x31, 0x24, 0x23, 0x2A, 0x2D,
1832     0x70, 0x77, 0x7E, 0x79, 0x6C, 0x6B, 0x62, 0x65, 0x48, 0x4F, 0x46, 0x41, 0x54, 0x53, 0x5A, 0x5D,
1833     0xE0, 0xE7, 0xEE, 0xE9, 0xFC, 0xFB, 0xF2, 0xF5, 0xD8, 0xDF, 0xD6, 0xD1, 0xC4, 0xC3, 0xCA, 0xCD,
1834     0x90, 0x97, 0x9E, 0x99, 0x8C, 0x8B, 0x82, 0x85, 0xA8, 0xAF, 0xA6, 0xA1, 0xB4, 0xB3, 0xBA, 0xBD,
1835     0xC7, 0xC0, 0xC9, 0xCE, 0xDB, 0xDC, 0xD5, 0xD2, 0xFF, 0xF8, 0xF1, 0xF6, 0xE3, 0xE4, 0xED, 0xEA,
1836     0xB7, 0xB0, 0xB9, 0xBE, 0xAB, 0xAC, 0xA5, 0xA2, 0x8F, 0x88, 0x81, 0x86, 0x93, 0x94, 0x9D, 0x9A,
1837     0x27, 0x20, 0x29, 0x2E, 0x3B, 0x3C, 0x35, 0x32, 0x1F, 0x18, 0x11, 0x16, 0x03, 0x04, 0x0D, 0x0A,
1838     0x57, 0x50, 0x59, 0x5E, 0x4B, 0x4C, 0x45, 0x42, 0x6F, 0x68, 0x61, 0x66, 0x73, 0x74, 0x7D, 0x7A,
1839     0x89, 0x8E, 0x87, 0x80, 0x95, 0x92, 0x9B, 0x9C, 0xB1, 0xB6, 0xBF, 0xB8, 0xAD, 0xAA, 0xA3, 0xA4,
1840     0xF9, 0xFE, 0xF7, 0xF0, 0xE5, 0xE2, 0xEB, 0xEC, 0xC1, 0xC6, 0xCF, 0xC8, 0xDD, 0xDA, 0xD3, 0xD4,
1841     0x69, 0x6E, 0x67, 0x60, 0x75, 0x72, 0x7B, 0x7C, 0x51, 0x56, 0x5F, 0x58, 0x4D, 0x4A, 0x43, 0x44,
1842     0x19, 0x1E, 0x17, 0x10, 0x05, 0x02, 0x0B, 0x0C, 0x21, 0x26, 0x2F, 0x28, 0x3D, 0x3A, 0x33, 0x34,
1843     0x4E, 0x49, 0x40, 0x47, 0x52, 0x55, 0x5C, 0x5B, 0x76, 0x71, 0x78, 0x7F, 0x6A, 0x6D, 0x64, 0x63,
1844     0x3E, 0x39, 0x30, 0x37, 0x22, 0x25, 0x2C, 0x2B, 0x06, 0x01, 0x08, 0x0F, 0x1A, 0x1D, 0x14, 0x13,
1845     0xAE, 0xA9, 0xA0, 0xA7, 0xB2, 0xB5, 0xBC, 0xBB, 0x96, 0x91, 0x98, 0x9F, 0x8A, 0x8D, 0x84, 0x83,
1846     0xDE, 0xD9, 0xD0, 0xD7, 0xC2, 0xC5, 0xCC, 0xCB, 0xE6, 0xE1, 0xE8, 0xEF, 0xFA, 0xFD, 0xF4, 0xF3
1847 };
1848
1849 static drflac_uint16 drflac__crc16_table[] = {
1850     0x0000, 0x8005, 0x800F, 0x000A, 0x801B, 0x001E, 0x0014, 0x8011,
1851     0x8033, 0x0036, 0x003C, 0x8039, 0x0028, 0x802D, 0x8027, 0x0022,
1852     0x8063, 0x0066, 0x006C, 0x8069, 0x0078, 0x807D, 0x8077, 0x0072,
1853     0x0050, 0x8055, 0x805F, 0x005A, 0x804B, 0x004E, 0x0044, 0x8041,
1854     0x80C3, 0x00C6, 0x00CC, 0x80C9, 0x00D8, 0x80DD, 0x80D7, 0x00D2,
1855     0x00F0, 0x80F5, 0x80FF, 0x00FA, 0x80EB, 0x00EE, 0x00E4, 0x80E1,
1856     0x00A0, 0x80A5, 0x80AF, 0x00AA, 0x80BB, 0x00BE, 0x00B4, 0x80B1,
1857     0x8093, 0x0096, 0x009C, 0x8099, 0x0088, 0x808D, 0x8087, 0x0082,
1858     0x8183, 0x0186, 0x018C, 0x8189, 0x0198, 0x819D, 0x8197, 0x0192,
1859     0x01B0, 0x81B5, 0x81BF, 0x01BA, 0x81AB, 0x01AE, 0x01A4, 0x81A1,
1860     0x01E0, 0x81E5, 0x81EF, 0x01EA, 0x81FB, 0x01FE, 0x01F4, 0x81F1,
1861     0x81D3, 0x01D6, 0x01DC, 0x81D9, 0x01C8, 0x81CD, 0x81C7, 0x01C2,
1862     0x0140, 0x8145, 0x814F, 0x014A, 0x815B, 0x015E, 0x0154, 0x8151,
1863     0x8173, 0x0176, 0x017C, 0x8179, 0x0168, 0x816D, 0x8167, 0x0162,
1864     0x8123, 0x0126, 0x012C, 0x8129, 0x0138, 0x813D, 0x8137, 0x0132,
1865     0x0110, 0x8115, 0x811F, 0x011A, 0x810B, 0x010E, 0x0104, 0x8101,
1866     0x8303, 0x0306, 0x030C, 0x8309, 0x0318, 0x831D, 0x8317, 0x0312,
1867     0x0330, 0x8335, 0x833F, 0x033A, 0x832B, 0x032E, 0x0324, 0x8321,
1868     0x0360, 0x8365, 0x836F, 0x036A, 0x837B, 0x037E, 0x0374, 0x8371,
1869     0x8353, 0x0356, 0x035C, 0x8359, 0x0348, 0x834D, 0x8347, 0x0342,
1870     0x03C0, 0x83C5, 0x83CF, 0x03CA, 0x83DB, 0x03DE, 0x03D4, 0x83D1,
1871     0x83F3, 0x03F6, 0x03FC, 0x83F9, 0x03E8, 0x83ED, 0x83E7, 0x03E2,
1872     0x83A3, 0x03A6, 0x03AC, 0x83A9, 0x03B8, 0x83BD, 0x83B7, 0x03B2,
1873     0x0390, 0x8395, 0x839F, 0x039A, 0x838B, 0x038E, 0x0384, 0x8381,
1874     0x0280, 0x8285, 0x828F, 0x028A, 0x829B, 0x029E, 0x0294, 0x8291,
1875     0x82B3, 0x02B6, 0x02BC, 0x82B9, 0x02A8, 0x82AD, 0x82A7, 0x02A2,
1876     0x82E3, 0x02E6, 0x02EC, 0x82E9, 0x02F8, 0x82FD, 0x82F7, 0x02F2,
1877     0x02D0, 0x82D5, 0x82DF, 0x02DA, 0x82CB, 0x02CE, 0x02C4, 0x82C1,
1878     0x8243, 0x0246, 0x024C, 0x8249, 0x0258, 0x825D, 0x8257, 0x0252,
1879     0x0270, 0x8275, 0x827F, 0x027A, 0x826B, 0x026E, 0x0264, 0x8261,
1880     0x0220, 0x8225, 0x822F, 0x022A, 0x823B, 0x023E, 0x0234, 0x8231,
1881     0x8213, 0x0216, 0x021C, 0x8219, 0x0208, 0x820D, 0x8207, 0x0202
1882 };
1883
1884 static DRFLAC_INLINE drflac_uint8 drflac_crc8_byte(drflac_uint8 crc, drflac_uint8 data)
1885 {
1886     return drflac__crc8_table[crc ^ data];
1887 }
1888
1889 static DRFLAC_INLINE drflac_uint8 drflac_crc8(drflac_uint8 crc, drflac_uint32 data, drflac_uint32 count)
1890 {
1891 #ifdef DR_FLAC_NO_CRC
1892     (void)crc;
1893     (void)data;
1894     (void)count;
1895     return 0;
1896 #else
1897     drflac_uint32 wholeBytes;
1898     drflac_uint32 leftoverBits;
1899     drflac_uint64 leftoverDataMask;
1900
1901     static drflac_uint64 leftoverDataMaskTable[8] = {
1902         0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F
1903     };
1904
1905     DRFLAC_ASSERT(count <= 32);
1906
1907     wholeBytes = count >> 3;
1908     leftoverBits = count - (wholeBytes*8);
1909     leftoverDataMask = leftoverDataMaskTable[leftoverBits];
1910
1911     switch (wholeBytes) {
1912         case 4: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0xFF000000UL << leftoverBits)) >> (24 + leftoverBits)));
1913         case 3: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0x00FF0000UL << leftoverBits)) >> (16 + leftoverBits)));
1914         case 2: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0x0000FF00UL << leftoverBits)) >> ( 8 + leftoverBits)));
1915         case 1: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0x000000FFUL << leftoverBits)) >> ( 0 + leftoverBits)));
1916         case 0: if (leftoverBits > 0) crc = (drflac_uint8)((crc << leftoverBits) ^ drflac__crc8_table[(crc >> (8 - leftoverBits)) ^ (data & leftoverDataMask)]);
1917     }
1918     return crc;
1919 #endif
1920 }
1921
1922 static DRFLAC_INLINE drflac_uint16 drflac_crc16_byte(drflac_uint16 crc, drflac_uint8 data)
1923 {
1924     return (crc << 8) ^ drflac__crc16_table[(drflac_uint8)(crc >> 8) ^ data];
1925 }
1926
1927 static DRFLAC_INLINE drflac_uint16 drflac_crc16_cache(drflac_uint16 crc, drflac_cache_t data)
1928 {
1929 #ifdef DRFLAC_64BIT
1930     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 56) & 0xFF));
1931     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 48) & 0xFF));
1932     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 40) & 0xFF));
1933     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 32) & 0xFF));
1934 #endif
1935     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 24) & 0xFF));
1936     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 16) & 0xFF));
1937     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  8) & 0xFF));
1938     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  0) & 0xFF));
1939
1940     return crc;
1941 }
1942
1943 static DRFLAC_INLINE drflac_uint16 drflac_crc16_bytes(drflac_uint16 crc, drflac_cache_t data, drflac_uint32 byteCount)
1944 {
1945     switch (byteCount)
1946     {
1947 #ifdef DRFLAC_64BIT
1948     case 8: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 56) & 0xFF));
1949     case 7: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 48) & 0xFF));
1950     case 6: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 40) & 0xFF));
1951     case 5: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 32) & 0xFF));
1952 #endif
1953     case 4: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 24) & 0xFF));
1954     case 3: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 16) & 0xFF));
1955     case 2: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  8) & 0xFF));
1956     case 1: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  0) & 0xFF));
1957     }
1958
1959     return crc;
1960 }
1961
/* Big-endian to host conversion for one cache line: the 64-bit converter when DRFLAC_64BIT is defined, the 32-bit one otherwise. */
#ifndef DRFLAC_64BIT
    #define drflac__be2host__cache_line drflac__be2host_32
#else
    #define drflac__be2host__cache_line drflac__be2host_64
#endif
1967
1968 /*
1969 BIT READING ATTEMPT #2
1970
1971 This uses a 32- or 64-bit bit-shifted cache - as bits are read, the cache is shifted such that the first valid bit is sitting
1972 on the most significant bit. It uses the notion of an L1 and L2 cache (borrowed from CPU architecture), where the L1 cache
1973 is a 32- or 64-bit unsigned integer (depending on whether or not a 32- or 64-bit build is being compiled) and the L2 is an
1974 array of "cache lines", with each cache line being the same size as the L1. The L2 is a buffer of about 4KB and is where data
1975 from onRead() is read into.
1976 */
/* L1 geometry: size of the cache integer in bytes and bits, and how many of its bits have not been consumed yet. */
#define DRFLAC_CACHE_L1_SIZE_BYTES(bs)                          (sizeof((bs)->cache))
#define DRFLAC_CACHE_L1_SIZE_BITS(bs)                           (DRFLAC_CACHE_L1_SIZE_BYTES(bs) * 8)
#define DRFLAC_CACHE_L1_BITS_REMAINING(bs)                      (DRFLAC_CACHE_L1_SIZE_BITS(bs) - (bs)->consumedBits)

/* A mask with the top _bitCount bits set. _bitCount must be smaller than the width of drflac_cache_t. */
#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)               (~((~(drflac_cache_t)0) >> (_bitCount)))

/* Distance needed to move the top _bitCount bits of the cache down to the least significant end. */
#define DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, _bitCount)          (DRFLAC_CACHE_L1_SIZE_BITS(bs) - (_bitCount))

/* The top _bitCount bits of the cache, left in place (SELECT) or moved down to the bottom (SELECT_AND_SHIFT). */
#define DRFLAC_CACHE_L1_SELECT(bs, _bitCount)                   (((bs)->cache) & DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount))
#define DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, _bitCount)         (DRFLAC_CACHE_L1_SELECT((bs), (_bitCount)) >>  DRFLAC_CACHE_L1_SELECTION_SHIFT((bs), (_bitCount)))

/* Same as SELECT_AND_SHIFT, but the shift distance is wrapped into [0, width-1] so a _bitCount of zero does not shift by the full width. */
#define DRFLAC_CACHE_L1_SELECT_AND_SHIFT_SAFE(bs, _bitCount)    (DRFLAC_CACHE_L1_SELECT((bs), (_bitCount)) >> (DRFLAC_CACHE_L1_SELECTION_SHIFT((bs), (_bitCount)) & (DRFLAC_CACHE_L1_SIZE_BITS(bs)-1)))

/* L2 geometry: total size in bytes, number of cache lines, and how many lines have not been handed to the L1 yet. */
#define DRFLAC_CACHE_L2_SIZE_BYTES(bs)                          (sizeof((bs)->cacheL2))
#define DRFLAC_CACHE_L2_LINE_COUNT(bs)                          (DRFLAC_CACHE_L2_SIZE_BYTES(bs) / sizeof((bs)->cacheL2[0]))
#define DRFLAC_CACHE_L2_LINES_REMAINING(bs)                     (DRFLAC_CACHE_L2_LINE_COUNT(bs) - (bs)->nextL2Line)
1988
1989
1990 #ifndef DR_FLAC_NO_CRC
1991 static DRFLAC_INLINE void drflac__reset_crc16(drflac_bs* bs)
1992 {
1993     bs->crc16 = 0;
1994     bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
1995 }
1996
1997 static DRFLAC_INLINE void drflac__update_crc16(drflac_bs* bs)
1998 {
1999     if (bs->crc16CacheIgnoredBytes == 0) {
2000         bs->crc16 = drflac_crc16_cache(bs->crc16, bs->crc16Cache);
2001     } else {
2002         bs->crc16 = drflac_crc16_bytes(bs->crc16, bs->crc16Cache, DRFLAC_CACHE_L1_SIZE_BYTES(bs) - bs->crc16CacheIgnoredBytes);
2003         bs->crc16CacheIgnoredBytes = 0;
2004     }
2005 }
2006
2007 static DRFLAC_INLINE drflac_uint16 drflac__flush_crc16(drflac_bs* bs)
2008 {
2009     /* We should never be flushing in a situation where we are not aligned on a byte boundary. */
2010     DRFLAC_ASSERT((DRFLAC_CACHE_L1_BITS_REMAINING(bs) & 7) == 0);
2011
2012     /*
2013     The bits that were read from the L1 cache need to be accumulated. The number of bytes needing to be accumulated is determined
2014     by the number of bits that have been consumed.
2015     */
2016     if (DRFLAC_CACHE_L1_BITS_REMAINING(bs) == 0) {
2017         drflac__update_crc16(bs);
2018     } else {
2019         /* We only accumulate the consumed bits. */
2020         bs->crc16 = drflac_crc16_bytes(bs->crc16, bs->crc16Cache >> DRFLAC_CACHE_L1_BITS_REMAINING(bs), (bs->consumedBits >> 3) - bs->crc16CacheIgnoredBytes);
2021
2022         /*
2023         The bits that we just accumulated should never be accumulated again. We need to keep track of how many bytes were accumulated
2024         so we can handle that later.
2025         */
2026         bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
2027     }
2028
2029     return bs->crc16;
2030 }
2031 #endif
2032
2033 static DRFLAC_INLINE drflac_bool32 drflac__reload_l1_cache_from_l2(drflac_bs* bs)
2034 {
2035     size_t bytesRead;
2036     size_t alignedL1LineCount;
2037
2038     /* Fast path. Try loading straight from L2. */
2039     if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
2040         bs->cache = bs->cacheL2[bs->nextL2Line++];
2041         return DRFLAC_TRUE;
2042     }
2043
2044     /*
2045     If we get here it means we've run out of data in the L2 cache. We'll need to fetch more from the client, if there's
2046     any left.
2047     */
2048     if (bs->unalignedByteCount > 0) {
2049         return DRFLAC_FALSE;   /* If we have any unaligned bytes it means there's no more aligned bytes left in the client. */
2050     }
2051
2052     bytesRead = bs->onRead(bs->pUserData, bs->cacheL2, DRFLAC_CACHE_L2_SIZE_BYTES(bs));
2053
2054     bs->nextL2Line = 0;
2055     if (bytesRead == DRFLAC_CACHE_L2_SIZE_BYTES(bs)) {
2056         bs->cache = bs->cacheL2[bs->nextL2Line++];
2057         return DRFLAC_TRUE;
2058     }
2059
2060
2061     /*
2062     If we get here it means we were unable to retrieve enough data to fill the entire L2 cache. It probably
2063     means we've just reached the end of the file. We need to move the valid data down to the end of the buffer
2064     and adjust the index of the next line accordingly. Also keep in mind that the L2 cache must be aligned to
2065     the size of the L1 so we'll need to seek backwards by any misaligned bytes.
2066     */
2067     alignedL1LineCount = bytesRead / DRFLAC_CACHE_L1_SIZE_BYTES(bs);
2068
2069     /* We need to keep track of any unaligned bytes for later use. */
2070     bs->unalignedByteCount = bytesRead - (alignedL1LineCount * DRFLAC_CACHE_L1_SIZE_BYTES(bs));
2071     if (bs->unalignedByteCount > 0) {
2072         bs->unalignedCache = bs->cacheL2[alignedL1LineCount];
2073     }
2074
2075     if (alignedL1LineCount > 0) {
2076         size_t offset = DRFLAC_CACHE_L2_LINE_COUNT(bs) - alignedL1LineCount;
2077         size_t i;
2078         for (i = alignedL1LineCount; i > 0; --i) {
2079             bs->cacheL2[i-1 + offset] = bs->cacheL2[i-1];
2080         }
2081
2082         bs->nextL2Line = (drflac_uint32)offset;
2083         bs->cache = bs->cacheL2[bs->nextL2Line++];
2084         return DRFLAC_TRUE;
2085     } else {
2086         /* If we get into this branch it means we weren't able to load any L1-aligned data. */
2087         bs->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT(bs);
2088         return DRFLAC_FALSE;
2089     }
2090 }
2091
2092 static drflac_bool32 drflac__reload_cache(drflac_bs* bs)
2093 {
2094     size_t bytesRead;
2095
2096 #ifndef DR_FLAC_NO_CRC
2097     drflac__update_crc16(bs);
2098 #endif
2099
2100     /* Fast path. Try just moving the next value in the L2 cache to the L1 cache. */
2101     if (drflac__reload_l1_cache_from_l2(bs)) {
2102         bs->cache = drflac__be2host__cache_line(bs->cache);
2103         bs->consumedBits = 0;
2104 #ifndef DR_FLAC_NO_CRC
2105         bs->crc16Cache = bs->cache;
2106 #endif
2107         return DRFLAC_TRUE;
2108     }
2109
2110     /* Slow path. */
2111
2112     /*
2113     If we get here it means we have failed to load the L1 cache from the L2. Likely we've just reached the end of the stream and the last
2114     few bytes did not meet the alignment requirements for the L2 cache. In this case we need to fall back to a slower path and read the
2115     data from the unaligned cache.
2116     */
2117     bytesRead = bs->unalignedByteCount;
2118     if (bytesRead == 0) {
2119         bs->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs);   /* <-- The stream has been exhausted, so marked the bits as consumed. */
2120         return DRFLAC_FALSE;
2121     }
2122
2123     DRFLAC_ASSERT(bytesRead < DRFLAC_CACHE_L1_SIZE_BYTES(bs));
2124     bs->consumedBits = (drflac_uint32)(DRFLAC_CACHE_L1_SIZE_BYTES(bs) - bytesRead) * 8;
2125
2126     bs->cache = drflac__be2host__cache_line(bs->unalignedCache);
2127     bs->cache &= DRFLAC_CACHE_L1_SELECTION_MASK(DRFLAC_CACHE_L1_BITS_REMAINING(bs));    /* <-- Make sure the consumed bits are always set to zero. Other parts of the library depend on this property. */
2128     bs->unalignedByteCount = 0;     /* <-- At this point the unaligned bytes have been moved into the cache and we thus have no more unaligned bytes. */
2129
2130 #ifndef DR_FLAC_NO_CRC
2131     bs->crc16Cache = bs->cache >> bs->consumedBits;
2132     bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
2133 #endif
2134     return DRFLAC_TRUE;
2135 }
2136
2137 static void drflac__reset_cache(drflac_bs* bs)
2138 {
2139     bs->nextL2Line   = DRFLAC_CACHE_L2_LINE_COUNT(bs);  /* <-- This clears the L2 cache. */
2140     bs->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs);   /* <-- This clears the L1 cache. */
2141     bs->cache = 0;
2142     bs->unalignedByteCount = 0;                         /* <-- This clears the trailing unaligned bytes. */
2143     bs->unalignedCache = 0;
2144
2145 #ifndef DR_FLAC_NO_CRC
2146     bs->crc16Cache = 0;
2147     bs->crc16CacheIgnoredBytes = 0;
2148 #endif
2149 }
2150
2151
2152 static DRFLAC_INLINE drflac_bool32 drflac__read_uint32(drflac_bs* bs, unsigned int bitCount, drflac_uint32* pResultOut)
2153 {
2154     DRFLAC_ASSERT(bs != NULL);
2155     DRFLAC_ASSERT(pResultOut != NULL);
2156     DRFLAC_ASSERT(bitCount > 0);
2157     DRFLAC_ASSERT(bitCount <= 32);
2158
2159     if (bs->consumedBits == DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
2160         if (!drflac__reload_cache(bs)) {
2161             return DRFLAC_FALSE;
2162         }
2163     }
2164
2165     if (bitCount <= DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
2166         /*
2167         If we want to load all 32-bits from a 32-bit cache we need to do it slightly differently because we can't do
2168         a 32-bit shift on a 32-bit integer. This will never be the case on 64-bit caches, so we can have a slightly
2169         more optimal solution for this.
2170         */
2171 #ifdef DRFLAC_64BIT
2172         *pResultOut = (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCount);
2173         bs->consumedBits += bitCount;
2174         bs->cache <<= bitCount;
2175 #else
2176         if (bitCount < DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
2177             *pResultOut = (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCount);
2178             bs->consumedBits += bitCount;
2179             bs->cache <<= bitCount;
2180         } else {
2181             /* Cannot shift by 32-bits, so need to do it differently. */
2182             *pResultOut = (drflac_uint32)bs->cache;
2183             bs->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs);
2184             bs->cache = 0;
2185         }
2186 #endif
2187
2188         return DRFLAC_TRUE;
2189     } else {
2190         /* It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them. */
2191         drflac_uint32 bitCountHi = DRFLAC_CACHE_L1_BITS_REMAINING(bs);
2192         drflac_uint32 bitCountLo = bitCount - bitCountHi;
2193         drflac_uint32 resultHi;
2194
2195         DRFLAC_ASSERT(bitCountHi > 0);
2196         DRFLAC_ASSERT(bitCountHi < 32);
2197         resultHi = (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCountHi);
2198
2199         if (!drflac__reload_cache(bs)) {
2200             return DRFLAC_FALSE;
2201         }
2202
2203         *pResultOut = (resultHi << bitCountLo) | (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCountLo);
2204         bs->consumedBits += bitCountLo;
2205         bs->cache <<= bitCountLo;
2206         return DRFLAC_TRUE;
2207     }
2208 }
2209
2210 static drflac_bool32 drflac__read_int32(drflac_bs* bs, unsigned int bitCount, drflac_int32* pResult)
2211 {
2212     drflac_uint32 result;
2213
2214     DRFLAC_ASSERT(bs != NULL);
2215     DRFLAC_ASSERT(pResult != NULL);
2216     DRFLAC_ASSERT(bitCount > 0);
2217     DRFLAC_ASSERT(bitCount <= 32);
2218
2219     if (!drflac__read_uint32(bs, bitCount, &result)) {
2220         return DRFLAC_FALSE;
2221     }
2222
2223     /* Do not attempt to shift by 32 as it's undefined. */
2224     if (bitCount < 32) {
2225         drflac_uint32 signbit;
2226         signbit = ((result >> (bitCount-1)) & 0x01);
2227         result |= (~signbit + 1) << bitCount;
2228     }
2229
2230     *pResult = (drflac_int32)result;
2231     return DRFLAC_TRUE;
2232 }
2233
2234 #ifdef DRFLAC_64BIT
2235 static drflac_bool32 drflac__read_uint64(drflac_bs* bs, unsigned int bitCount, drflac_uint64* pResultOut)
2236 {
2237     drflac_uint32 resultHi;
2238     drflac_uint32 resultLo;
2239
2240     DRFLAC_ASSERT(bitCount <= 64);
2241     DRFLAC_ASSERT(bitCount >  32);
2242
2243     if (!drflac__read_uint32(bs, bitCount - 32, &resultHi)) {
2244         return DRFLAC_FALSE;
2245     }
2246
2247     if (!drflac__read_uint32(bs, 32, &resultLo)) {
2248         return DRFLAC_FALSE;
2249     }
2250
2251     *pResultOut = (((drflac_uint64)resultHi) << 32) | ((drflac_uint64)resultLo);
2252     return DRFLAC_TRUE;
2253 }
2254 #endif
2255
2256 static drflac_bool32 drflac__read_uint16(drflac_bs* bs, unsigned int bitCount, drflac_uint16* pResult)
2257 {
2258     drflac_uint32 result;
2259
2260     DRFLAC_ASSERT(bs != NULL);
2261     DRFLAC_ASSERT(pResult != NULL);
2262     DRFLAC_ASSERT(bitCount > 0);
2263     DRFLAC_ASSERT(bitCount <= 16);
2264
2265     if (!drflac__read_uint32(bs, bitCount, &result)) {
2266         return DRFLAC_FALSE;
2267     }
2268
2269     *pResult = (drflac_uint16)result;
2270     return DRFLAC_TRUE;
2271 }
2272
2273 static drflac_bool32 drflac__read_uint8(drflac_bs* bs, unsigned int bitCount, drflac_uint8* pResult)
2274 {
2275     drflac_uint32 result;
2276
2277     DRFLAC_ASSERT(bs != NULL);
2278     DRFLAC_ASSERT(pResult != NULL);
2279     DRFLAC_ASSERT(bitCount > 0);
2280     DRFLAC_ASSERT(bitCount <= 8);
2281
2282     if (!drflac__read_uint32(bs, bitCount, &result)) {
2283         return DRFLAC_FALSE;
2284     }
2285
2286     *pResult = (drflac_uint8)result;
2287     return DRFLAC_TRUE;
2288 }
2289
2290 static drflac_bool32 drflac__read_int8(drflac_bs* bs, unsigned int bitCount, drflac_int8* pResult)
2291 {
2292     drflac_int32 result;
2293
2294     DRFLAC_ASSERT(bs != NULL);
2295     DRFLAC_ASSERT(pResult != NULL);
2296     DRFLAC_ASSERT(bitCount > 0);
2297     DRFLAC_ASSERT(bitCount <= 8);
2298
2299     if (!drflac__read_int32(bs, bitCount, &result)) {
2300         return DRFLAC_FALSE;
2301     }
2302
2303     *pResult = (drflac_int8)result;
2304     return DRFLAC_TRUE;
2305 }
2306
2307
2308 static drflac_bool32 drflac__seek_bits(drflac_bs* bs, size_t bitsToSeek)
2309 {
2310     if (bitsToSeek <= DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
2311         bs->consumedBits += (drflac_uint32)bitsToSeek;
2312         bs->cache <<= bitsToSeek;
2313         return DRFLAC_TRUE;
2314     } else {
2315         /* It straddles the cached data. This function isn't called too frequently so I'm favouring simplicity here. */
2316         bitsToSeek       -= DRFLAC_CACHE_L1_BITS_REMAINING(bs);
2317         bs->consumedBits += DRFLAC_CACHE_L1_BITS_REMAINING(bs);
2318         bs->cache         = 0;
2319
2320         /* Simple case. Seek in groups of the same number as bits that fit within a cache line. */
2321 #ifdef DRFLAC_64BIT
2322         while (bitsToSeek >= DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
2323             drflac_uint64 bin;
2324             if (!drflac__read_uint64(bs, DRFLAC_CACHE_L1_SIZE_BITS(bs), &bin)) {
2325                 return DRFLAC_FALSE;
2326             }
2327             bitsToSeek -= DRFLAC_CACHE_L1_SIZE_BITS(bs);
2328         }
2329 #else
2330         while (bitsToSeek >= DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
2331             drflac_uint32 bin;
2332             if (!drflac__read_uint32(bs, DRFLAC_CACHE_L1_SIZE_BITS(bs), &bin)) {
2333                 return DRFLAC_FALSE;
2334             }
2335             bitsToSeek -= DRFLAC_CACHE_L1_SIZE_BITS(bs);
2336         }
2337 #endif
2338
2339         /* Whole leftover bytes. */
2340         while (bitsToSeek >= 8) {
2341             drflac_uint8 bin;
2342             if (!drflac__read_uint8(bs, 8, &bin)) {
2343                 return DRFLAC_FALSE;
2344             }
2345             bitsToSeek -= 8;
2346         }
2347
2348         /* Leftover bits. */
2349         if (bitsToSeek > 0) {
2350             drflac_uint8 bin;
2351             if (!drflac__read_uint8(bs, (drflac_uint32)bitsToSeek, &bin)) {
2352                 return DRFLAC_FALSE;
2353             }
2354             bitsToSeek = 0; /* <-- Necessary for the assert below. */
2355         }
2356
2357         DRFLAC_ASSERT(bitsToSeek == 0);
2358         return DRFLAC_TRUE;
2359     }
2360 }
2361
2362
2363 /* This function moves the bit streamer to the first bit after the sync code (bit 15 of the of the frame header). It will also update the CRC-16. */
2364 static drflac_bool32 drflac__find_and_seek_to_next_sync_code(drflac_bs* bs)
2365 {
2366     DRFLAC_ASSERT(bs != NULL);
2367
2368     /*
2369     The sync code is always aligned to 8 bits. This is convenient for us because it means we can do byte-aligned movements. The first
2370     thing to do is align to the next byte.
2371     */
2372     if (!drflac__seek_bits(bs, DRFLAC_CACHE_L1_BITS_REMAINING(bs) & 7)) {
2373         return DRFLAC_FALSE;
2374     }
2375
2376     for (;;) {
2377         drflac_uint8 hi;
2378
2379 #ifndef DR_FLAC_NO_CRC
2380         drflac__reset_crc16(bs);
2381 #endif
2382
2383         if (!drflac__read_uint8(bs, 8, &hi)) {
2384             return DRFLAC_FALSE;
2385         }
2386
2387         if (hi == 0xFF) {
2388             drflac_uint8 lo;
2389             if (!drflac__read_uint8(bs, 6, &lo)) {
2390                 return DRFLAC_FALSE;
2391             }
2392
2393             if (lo == 0x3E) {
2394                 return DRFLAC_TRUE;
2395             } else {
2396                 if (!drflac__seek_bits(bs, DRFLAC_CACHE_L1_BITS_REMAINING(bs) & 7)) {
2397                     return DRFLAC_FALSE;
2398                 }
2399             }
2400         }
2401     }
2402
2403     /* Should never get here. */
2404     /*return DRFLAC_FALSE;*/
2405 }
2406
2407
/* Select which hardware-assisted count-leading-zeros helpers get compiled in. */
#ifdef DRFLAC_HAS_LZCNT_INTRINSIC
    #define DRFLAC_IMPLEMENT_CLZ_LZCNT
#endif

/* The _BitScanReverse based helper is limited to real MSVC (2005 or newer, not Clang) targeting x86 or x64. */
#if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(__clang__)
    #if defined(DRFLAC_X64) || defined(DRFLAC_X86)
        #define DRFLAC_IMPLEMENT_CLZ_MSVC
    #endif
#endif
2414
2415 static DRFLAC_INLINE drflac_uint32 drflac__clz_software(drflac_cache_t x)
2416 {
2417     drflac_uint32 n;
2418     static drflac_uint32 clz_table_4[] = {
2419         0,
2420         4,
2421         3, 3,
2422         2, 2, 2, 2,
2423         1, 1, 1, 1, 1, 1, 1, 1
2424     };
2425
2426     if (x == 0) {
2427         return sizeof(x)*8;
2428     }
2429
2430     n = clz_table_4[x >> (sizeof(x)*8 - 4)];
2431     if (n == 0) {
2432 #ifdef DRFLAC_64BIT
2433         if ((x & ((drflac_uint64)0xFFFFFFFF << 32)) == 0) { n  = 32; x <<= 32; }
2434         if ((x & ((drflac_uint64)0xFFFF0000 << 32)) == 0) { n += 16; x <<= 16; }
2435         if ((x & ((drflac_uint64)0xFF000000 << 32)) == 0) { n += 8;  x <<= 8;  }
2436         if ((x & ((drflac_uint64)0xF0000000 << 32)) == 0) { n += 4;  x <<= 4;  }
2437 #else
2438         if ((x & 0xFFFF0000) == 0) { n  = 16; x <<= 16; }
2439         if ((x & 0xFF000000) == 0) { n += 8;  x <<= 8;  }
2440         if ((x & 0xF0000000) == 0) { n += 4;  x <<= 4;  }
2441 #endif
2442         n += clz_table_4[x >> (sizeof(x)*8 - 4)];
2443     }
2444
2445     return n - 1;
2446 }
2447
2448 #ifdef DRFLAC_IMPLEMENT_CLZ_LZCNT
2449 static DRFLAC_INLINE drflac_bool32 drflac__is_lzcnt_supported(void)
2450 {
2451     /* Fast compile time check for ARM. */
2452 #if defined(DRFLAC_HAS_LZCNT_INTRINSIC) && defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5)
2453     return DRFLAC_TRUE;
2454 #else
2455     /* If the compiler itself does not support the intrinsic then we'll need to return false. */
2456     #ifdef DRFLAC_HAS_LZCNT_INTRINSIC
2457         return drflac__gIsLZCNTSupported;
2458     #else
2459         return DRFLAC_FALSE;
2460     #endif
2461 #endif
2462 }
2463
2464 static DRFLAC_INLINE drflac_uint32 drflac__clz_lzcnt(drflac_cache_t x)
2465 {
2466     /*
2467     It's critical for competitive decoding performance that this function be highly optimal. With MSVC we can use the __lzcnt64() and __lzcnt() intrinsics
2468     to achieve good performance, however on GCC and Clang it's a little bit more annoying. The __builtin_clzl() and __builtin_clzll() intrinsics leave
2469     it undefined as to the return value when `x` is 0. We need this to be well defined as returning 32 or 64, depending on whether or not it's a 32- or
2470     64-bit build. To work around this we would need to add a conditional to check for the x = 0 case, but this creates unnecessary inefficiency. To work
2471     around this problem I have written some inline assembly to emit the LZCNT (x86) or CLZ (ARM) instruction directly which removes the need to include
2472     the conditional. This has worked well in the past, but for some reason Clang's MSVC compatible driver, clang-cl, does not seem to be handling this
2473     in the same way as the normal Clang driver. It seems that `clang-cl` is just outputting the wrong results sometimes, maybe due to some register
2474     getting clobbered?
2475
2476     I'm not sure if this is a bug with dr_flac's inlined assembly (most likely), a bug in `clang-cl` or just a misunderstanding on my part with inline
2477     assembly rules for `clang-cl`. If somebody can identify an error in dr_flac's inlined assembly I'm happy to get that fixed.
2478
2479     Fortunately there is an easy workaround for this. Clang implements MSVC-specific intrinsics for compatibility. It also defines _MSC_VER for extra
2480     compatibility. We can therefore just check for _MSC_VER and use the MSVC intrinsic which, fortunately for us, Clang supports. It would still be nice
2481     to know how to fix the inlined assembly for correctness sake, however.
2482     */
2483
2484 #if defined(_MSC_VER) /*&& !defined(__clang__)*/    /* <-- Intentionally wanting Clang to use the MSVC __lzcnt64/__lzcnt intrinsics due to above ^. */
2485     #ifdef DRFLAC_64BIT
2486         return (drflac_uint32)__lzcnt64(x);
2487     #else
2488         return (drflac_uint32)__lzcnt(x);
2489     #endif
2490 #else
2491     #if defined(__GNUC__) || defined(__clang__)
2492         #if defined(DRFLAC_X64)
2493             {
2494                 drflac_uint64 r;
2495                 __asm__ __volatile__ (
2496                     "lzcnt{ %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc"
2497                 );
2498
2499                 return (drflac_uint32)r;
2500             }
2501         #elif defined(DRFLAC_X86)
2502             {
2503                 drflac_uint32 r;
2504                 __asm__ __volatile__ (
2505                     "lzcnt{l %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc"
2506                 );
2507
2508                 return r;
2509             }
2510         #elif defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5) && !defined(DRFLAC_64BIT)   /* <-- I haven't tested 64-bit inline assembly, so only enabling this for the 32-bit build for now. */
2511             {
2512                 unsigned int r;
2513                 __asm__ __volatile__ (
2514                 #if defined(DRFLAC_64BIT)
2515                     "clz %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(x)   /* <-- This is untested. If someone in the community could test this, that would be appreciated! */
2516                 #else
2517                     "clz %[out], %[in]" : [out]"=r"(r) : [in]"r"(x)
2518                 #endif
2519                 );
2520
2521                 return r;
2522             }
2523         #else
2524             if (x == 0) {
2525                 return sizeof(x)*8;
2526             }
2527             #ifdef DRFLAC_64BIT
2528                 return (drflac_uint32)__builtin_clzll((drflac_uint64)x);
2529             #else
2530                 return (drflac_uint32)__builtin_clzl((drflac_uint32)x);
2531             #endif
2532         #endif
2533     #else
2534         /* Unsupported compiler. */
2535         #error "This compiler does not support the lzcnt intrinsic."
2536     #endif
2537 #endif
2538 }
2539 #endif
2540
2541 #ifdef DRFLAC_IMPLEMENT_CLZ_MSVC
2542 #include <intrin.h> /* For BitScanReverse(). */
2543
2544 static DRFLAC_INLINE drflac_uint32 drflac__clz_msvc(drflac_cache_t x)
2545 {
2546     drflac_uint32 n;
2547
2548     if (x == 0) {
2549         return sizeof(x)*8;
2550     }
2551
2552 #ifdef DRFLAC_64BIT
2553     _BitScanReverse64((unsigned long*)&n, x);
2554 #else
2555     _BitScanReverse((unsigned long*)&n, x);
2556 #endif
2557     return sizeof(x)*8 - n - 1;
2558 }
2559 #endif
2560
2561 static DRFLAC_INLINE drflac_uint32 drflac__clz(drflac_cache_t x)
2562 {
2563 #ifdef DRFLAC_IMPLEMENT_CLZ_LZCNT
2564     if (drflac__is_lzcnt_supported()) {
2565         return drflac__clz_lzcnt(x);
2566     } else
2567 #endif
2568     {
2569 #ifdef DRFLAC_IMPLEMENT_CLZ_MSVC
2570         return drflac__clz_msvc(x);
2571 #else
2572         return drflac__clz_software(x);
2573 #endif
2574     }
2575 }
2576
2577
2578 static DRFLAC_INLINE drflac_bool32 drflac__seek_past_next_set_bit(drflac_bs* bs, unsigned int* pOffsetOut)
2579 {
2580     drflac_uint32 zeroCounter = 0;
2581     drflac_uint32 setBitOffsetPlus1;
2582
2583     while (bs->cache == 0) {
2584         zeroCounter += (drflac_uint32)DRFLAC_CACHE_L1_BITS_REMAINING(bs);
2585         if (!drflac__reload_cache(bs)) {
2586             return DRFLAC_FALSE;
2587         }
2588     }
2589
2590     setBitOffsetPlus1 = drflac__clz(bs->cache);
2591     setBitOffsetPlus1 += 1;
2592
2593     bs->consumedBits += setBitOffsetPlus1;
2594     bs->cache <<= setBitOffsetPlus1;
2595
2596     *pOffsetOut = zeroCounter + setBitOffsetPlus1 - 1;
2597     return DRFLAC_TRUE;
2598 }
2599
2600
2601
2602 static drflac_bool32 drflac__seek_to_byte(drflac_bs* bs, drflac_uint64 offsetFromStart)
2603 {
2604     DRFLAC_ASSERT(bs != NULL);
2605     DRFLAC_ASSERT(offsetFromStart > 0);
2606
2607     /*
2608     Seeking from the start is not quite as trivial as it sounds because the onSeek callback takes a signed 32-bit integer (which
2609     is intentional because it simplifies the implementation of the onSeek callbacks), however offsetFromStart is unsigned 64-bit.
2610     To resolve we just need to do an initial seek from the start, and then a series of offset seeks to make up the remainder.
2611     */
2612     if (offsetFromStart > 0x7FFFFFFF) {
2613         drflac_uint64 bytesRemaining = offsetFromStart;
2614         if (!bs->onSeek(bs->pUserData, 0x7FFFFFFF, drflac_seek_origin_start)) {
2615             return DRFLAC_FALSE;
2616         }
2617         bytesRemaining -= 0x7FFFFFFF;
2618
2619         while (bytesRemaining > 0x7FFFFFFF) {
2620             if (!bs->onSeek(bs->pUserData, 0x7FFFFFFF, drflac_seek_origin_current)) {
2621                 return DRFLAC_FALSE;
2622             }
2623             bytesRemaining -= 0x7FFFFFFF;
2624         }
2625
2626         if (bytesRemaining > 0) {
2627             if (!bs->onSeek(bs->pUserData, (int)bytesRemaining, drflac_seek_origin_current)) {
2628                 return DRFLAC_FALSE;
2629             }
2630         }
2631     } else {
2632         if (!bs->onSeek(bs->pUserData, (int)offsetFromStart, drflac_seek_origin_start)) {
2633             return DRFLAC_FALSE;
2634         }
2635     }
2636
2637     /* The cache should be reset to force a reload of fresh data from the client. */
2638     drflac__reset_cache(bs);
2639     return DRFLAC_TRUE;
2640 }
2641
2642
2643 static drflac_result drflac__read_utf8_coded_number(drflac_bs* bs, drflac_uint64* pNumberOut, drflac_uint8* pCRCOut)
2644 {
2645     drflac_uint8 crc;
2646     drflac_uint64 result;
2647     drflac_uint8 utf8[7] = {0};
2648     int byteCount;
2649     int i;
2650
2651     DRFLAC_ASSERT(bs != NULL);
2652     DRFLAC_ASSERT(pNumberOut != NULL);
2653     DRFLAC_ASSERT(pCRCOut != NULL);
2654
2655     crc = *pCRCOut;
2656
2657     if (!drflac__read_uint8(bs, 8, utf8)) {
2658         *pNumberOut = 0;
2659         return DRFLAC_AT_END;
2660     }
2661     crc = drflac_crc8(crc, utf8[0], 8);
2662
2663     if ((utf8[0] & 0x80) == 0) {
2664         *pNumberOut = utf8[0];
2665         *pCRCOut = crc;
2666         return DRFLAC_SUCCESS;
2667     }
2668
2669     /*byteCount = 1;*/
2670     if ((utf8[0] & 0xE0) == 0xC0) {
2671         byteCount = 2;
2672     } else if ((utf8[0] & 0xF0) == 0xE0) {
2673         byteCount = 3;
2674     } else if ((utf8[0] & 0xF8) == 0xF0) {
2675         byteCount = 4;
2676     } else if ((utf8[0] & 0xFC) == 0xF8) {
2677         byteCount = 5;
2678     } else if ((utf8[0] & 0xFE) == 0xFC) {
2679         byteCount = 6;
2680     } else if ((utf8[0] & 0xFF) == 0xFE) {
2681         byteCount = 7;
2682     } else {
2683         *pNumberOut = 0;
2684         return DRFLAC_CRC_MISMATCH;     /* Bad UTF-8 encoding. */
2685     }
2686
2687     /* Read extra bytes. */
2688     DRFLAC_ASSERT(byteCount > 1);
2689
2690     result = (drflac_uint64)(utf8[0] & (0xFF >> (byteCount + 1)));
2691     for (i = 1; i < byteCount; ++i) {
2692         if (!drflac__read_uint8(bs, 8, utf8 + i)) {
2693             *pNumberOut = 0;
2694             return DRFLAC_AT_END;
2695         }
2696         crc = drflac_crc8(crc, utf8[i], 8);
2697
2698         result = (result << 6) | (utf8[i] & 0x3F);
2699     }
2700
2701     *pNumberOut = result;
2702     *pCRCOut = crc;
2703     return DRFLAC_SUCCESS;
2704 }
2705
2706
2707
2708 /*
2709 The next two functions are responsible for calculating the prediction.
2710
2711 When the bits per sample is >16 we need to use 64-bit integer arithmetic because otherwise we'll run out of precision. It's
2712 safe to assume this will be slower on 32-bit platforms so we use a more optimal solution when the bits per sample is <=16.
2713 */
2714 static DRFLAC_INLINE drflac_int32 drflac__calculate_prediction_32(drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pDecodedSamples)
2715 {
2716     drflac_int32 prediction = 0;
2717
2718     DRFLAC_ASSERT(order <= 32);
2719
2720     /* 32-bit version. */
2721
2722     /* VC++ optimizes this to a single jmp. I've not yet verified this for other compilers. */
2723     switch (order)
2724     {
2725     case 32: prediction += coefficients[31] * pDecodedSamples[-32];
2726     case 31: prediction += coefficients[30] * pDecodedSamples[-31];
2727     case 30: prediction += coefficients[29] * pDecodedSamples[-30];
2728     case 29: prediction += coefficients[28] * pDecodedSamples[-29];
2729     case 28: prediction += coefficients[27] * pDecodedSamples[-28];
2730     case 27: prediction += coefficients[26] * pDecodedSamples[-27];
2731     case 26: prediction += coefficients[25] * pDecodedSamples[-26];
2732     case 25: prediction += coefficients[24] * pDecodedSamples[-25];
2733     case 24: prediction += coefficients[23] * pDecodedSamples[-24];
2734     case 23: prediction += coefficients[22] * pDecodedSamples[-23];
2735     case 22: prediction += coefficients[21] * pDecodedSamples[-22];
2736     case 21: prediction += coefficients[20] * pDecodedSamples[-21];
2737     case 20: prediction += coefficients[19] * pDecodedSamples[-20];
2738     case 19: prediction += coefficients[18] * pDecodedSamples[-19];
2739     case 18: prediction += coefficients[17] * pDecodedSamples[-18];
2740     case 17: prediction += coefficients[16] * pDecodedSamples[-17];
2741     case 16: prediction += coefficients[15] * pDecodedSamples[-16];
2742     case 15: prediction += coefficients[14] * pDecodedSamples[-15];
2743     case 14: prediction += coefficients[13] * pDecodedSamples[-14];
2744     case 13: prediction += coefficients[12] * pDecodedSamples[-13];
2745     case 12: prediction += coefficients[11] * pDecodedSamples[-12];
2746     case 11: prediction += coefficients[10] * pDecodedSamples[-11];
2747     case 10: prediction += coefficients[ 9] * pDecodedSamples[-10];
2748     case  9: prediction += coefficients[ 8] * pDecodedSamples[- 9];
2749     case  8: prediction += coefficients[ 7] * pDecodedSamples[- 8];
2750     case  7: prediction += coefficients[ 6] * pDecodedSamples[- 7];
2751     case  6: prediction += coefficients[ 5] * pDecodedSamples[- 6];
2752     case  5: prediction += coefficients[ 4] * pDecodedSamples[- 5];
2753     case  4: prediction += coefficients[ 3] * pDecodedSamples[- 4];
2754     case  3: prediction += coefficients[ 2] * pDecodedSamples[- 3];
2755     case  2: prediction += coefficients[ 1] * pDecodedSamples[- 2];
2756     case  1: prediction += coefficients[ 0] * pDecodedSamples[- 1];
2757     }
2758
2759     return (drflac_int32)(prediction >> shift);
2760 }
2761
2762 static DRFLAC_INLINE drflac_int32 drflac__calculate_prediction_64(drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pDecodedSamples)
2763 {
2764     drflac_int64 prediction;
2765
2766     DRFLAC_ASSERT(order <= 32);
2767
2768     /* 64-bit version. */
2769
2770     /* This method is faster on the 32-bit build when compiling with VC++. See note below. */
2771 #ifndef DRFLAC_64BIT
2772     if (order == 8)
2773     {
2774         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
2775         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
2776         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
2777         prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
2778         prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
2779         prediction += coefficients[5] * (drflac_int64)pDecodedSamples[-6];
2780         prediction += coefficients[6] * (drflac_int64)pDecodedSamples[-7];
2781         prediction += coefficients[7] * (drflac_int64)pDecodedSamples[-8];
2782     }
2783     else if (order == 7)
2784     {
2785         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
2786         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
2787         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
2788         prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
2789         prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
2790         prediction += coefficients[5] * (drflac_int64)pDecodedSamples[-6];
2791         prediction += coefficients[6] * (drflac_int64)pDecodedSamples[-7];
2792     }
2793     else if (order == 3)
2794     {
2795         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
2796         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
2797         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
2798     }
2799     else if (order == 6)
2800     {
2801         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
2802         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
2803         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
2804         prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
2805         prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
2806         prediction += coefficients[5] * (drflac_int64)pDecodedSamples[-6];
2807     }
2808     else if (order == 5)
2809     {
2810         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
2811         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
2812         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
2813         prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
2814         prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
2815     }
2816     else if (order == 4)
2817     {
2818         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
2819         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
2820         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
2821         prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
2822     }
2823     else if (order == 12)
2824     {
2825         prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
2826         prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
2827         prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
2828         prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
2829         prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
2830         prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
2831         prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
2832         prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
2833         prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
2834         prediction += coefficients[9]  * (drflac_int64)pDecodedSamples[-10];
2835         prediction += coefficients[10] * (drflac_int64)pDecodedSamples[-11];
2836         prediction += coefficients[11] * (drflac_int64)pDecodedSamples[-12];
2837     }
2838     else if (order == 2)
2839     {
2840         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
2841         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
2842     }
2843     else if (order == 1)
2844     {
2845         prediction = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
2846     }
2847     else if (order == 10)
2848     {
2849         prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
2850         prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
2851         prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
2852         prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
2853         prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
2854         prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
2855         prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
2856         prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
2857         prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
2858         prediction += coefficients[9]  * (drflac_int64)pDecodedSamples[-10];
2859     }
2860     else if (order == 9)
2861     {
2862         prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
2863         prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
2864         prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
2865         prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
2866         prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
2867         prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
2868         prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
2869         prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
2870         prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
2871     }
2872     else if (order == 11)
2873     {
2874         prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
2875         prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
2876         prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
2877         prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
2878         prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
2879         prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
2880         prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
2881         prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
2882         prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
2883         prediction += coefficients[9]  * (drflac_int64)pDecodedSamples[-10];
2884         prediction += coefficients[10] * (drflac_int64)pDecodedSamples[-11];
2885     }
2886     else
2887     {
2888         int j;
2889
2890         prediction = 0;
2891         for (j = 0; j < (int)order; ++j) {
2892             prediction += coefficients[j] * (drflac_int64)pDecodedSamples[-j-1];
2893         }
2894     }
2895 #endif
2896
2897     /*
2898     VC++ optimizes this to a single jmp instruction, but only the 64-bit build. The 32-bit build generates less efficient code for some
2899     reason. The ugly version above is faster so we'll just switch between the two depending on the target platform.
2900     */
2901 #ifdef DRFLAC_64BIT
2902     prediction = 0;
2903     switch (order)
2904     {
2905     case 32: prediction += coefficients[31] * (drflac_int64)pDecodedSamples[-32];
2906     case 31: prediction += coefficients[30] * (drflac_int64)pDecodedSamples[-31];
2907     case 30: prediction += coefficients[29] * (drflac_int64)pDecodedSamples[-30];
2908     case 29: prediction += coefficients[28] * (drflac_int64)pDecodedSamples[-29];
2909     case 28: prediction += coefficients[27] * (drflac_int64)pDecodedSamples[-28];
2910     case 27: prediction += coefficients[26] * (drflac_int64)pDecodedSamples[-27];
2911     case 26: prediction += coefficients[25] * (drflac_int64)pDecodedSamples[-26];
2912     case 25: prediction += coefficients[24] * (drflac_int64)pDecodedSamples[-25];
2913     case 24: prediction += coefficients[23] * (drflac_int64)pDecodedSamples[-24];
2914     case 23: prediction += coefficients[22] * (drflac_int64)pDecodedSamples[-23];
2915     case 22: prediction += coefficients[21] * (drflac_int64)pDecodedSamples[-22];
2916     case 21: prediction += coefficients[20] * (drflac_int64)pDecodedSamples[-21];
2917     case 20: prediction += coefficients[19] * (drflac_int64)pDecodedSamples[-20];
2918     case 19: prediction += coefficients[18] * (drflac_int64)pDecodedSamples[-19];
2919     case 18: prediction += coefficients[17] * (drflac_int64)pDecodedSamples[-18];
2920     case 17: prediction += coefficients[16] * (drflac_int64)pDecodedSamples[-17];
2921     case 16: prediction += coefficients[15] * (drflac_int64)pDecodedSamples[-16];
2922     case 15: prediction += coefficients[14] * (drflac_int64)pDecodedSamples[-15];
2923     case 14: prediction += coefficients[13] * (drflac_int64)pDecodedSamples[-14];
2924     case 13: prediction += coefficients[12] * (drflac_int64)pDecodedSamples[-13];
2925     case 12: prediction += coefficients[11] * (drflac_int64)pDecodedSamples[-12];
2926     case 11: prediction += coefficients[10] * (drflac_int64)pDecodedSamples[-11];
2927     case 10: prediction += coefficients[ 9] * (drflac_int64)pDecodedSamples[-10];
2928     case  9: prediction += coefficients[ 8] * (drflac_int64)pDecodedSamples[- 9];
2929     case  8: prediction += coefficients[ 7] * (drflac_int64)pDecodedSamples[- 8];
2930     case  7: prediction += coefficients[ 6] * (drflac_int64)pDecodedSamples[- 7];
2931     case  6: prediction += coefficients[ 5] * (drflac_int64)pDecodedSamples[- 6];
2932     case  5: prediction += coefficients[ 4] * (drflac_int64)pDecodedSamples[- 5];
2933     case  4: prediction += coefficients[ 3] * (drflac_int64)pDecodedSamples[- 4];
2934     case  3: prediction += coefficients[ 2] * (drflac_int64)pDecodedSamples[- 3];
2935     case  2: prediction += coefficients[ 1] * (drflac_int64)pDecodedSamples[- 2];
2936     case  1: prediction += coefficients[ 0] * (drflac_int64)pDecodedSamples[- 1];
2937     }
2938 #endif
2939
2940     return (drflac_int32)(prediction >> shift);
2941 }
2942
2943 static DRFLAC_INLINE drflac_bool32 drflac__read_rice_parts_x1(drflac_bs* bs, drflac_uint8 riceParam, drflac_uint32* pZeroCounterOut, drflac_uint32* pRiceParamPartOut)
2944 {
2945     drflac_uint32  riceParamPlus1 = riceParam + 1;
2946     /*drflac_cache_t riceParamPlus1Mask  = DRFLAC_CACHE_L1_SELECTION_MASK(riceParamPlus1);*/
2947     drflac_uint32  riceParamPlus1Shift = DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPlus1);
2948     drflac_uint32  riceParamPlus1MaxConsumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs) - riceParamPlus1;
2949
2950     /*
2951     The idea here is to use local variables for the cache in an attempt to encourage the compiler to store them in registers. I have
2952     no idea how this will work in practice...
2953     */
2954     drflac_cache_t bs_cache = bs->cache;
2955     drflac_uint32  bs_consumedBits = bs->consumedBits;
2956
2957     /* The first thing to do is find the first unset bit. Most likely a bit will be set in the current cache line. */
2958     drflac_uint32  lzcount = drflac__clz(bs_cache);
2959     if (lzcount < sizeof(bs_cache)*8) {
2960         pZeroCounterOut[0] = lzcount;
2961
2962         /*
2963         It is most likely that the riceParam part (which comes after the zero counter) is also on this cache line. When extracting
2964         this, we include the set bit from the unary coded part because it simplifies cache management. This bit will be handled
2965         outside of this function at a higher level.
2966         */
2967     extract_rice_param_part:
2968         bs_cache       <<= lzcount;
2969         bs_consumedBits += lzcount;
2970
2971         if (bs_consumedBits <= riceParamPlus1MaxConsumedBits) {
2972             /* Getting here means the rice parameter part is wholly contained within the current cache line. */
2973             pRiceParamPartOut[0] = (drflac_uint32)(bs_cache >> riceParamPlus1Shift);
2974             bs_cache       <<= riceParamPlus1;
2975             bs_consumedBits += riceParamPlus1;
2976         } else {
2977             drflac_uint32 riceParamPartHi;
2978             drflac_uint32 riceParamPartLo;
2979             drflac_uint32 riceParamPartLoBitCount;
2980
2981             /*
2982             Getting here means the rice parameter part straddles the cache line. We need to read from the tail of the current cache
2983             line, reload the cache, and then combine it with the head of the next cache line.
2984             */
2985
2986             /* Grab the high part of the rice parameter part. */
2987             riceParamPartHi = (drflac_uint32)(bs_cache >> riceParamPlus1Shift);
2988
2989             /* Before reloading the cache we need to grab the size in bits of the low part. */
2990             riceParamPartLoBitCount = bs_consumedBits - riceParamPlus1MaxConsumedBits;
2991             DRFLAC_ASSERT(riceParamPartLoBitCount > 0 && riceParamPartLoBitCount < 32);
2992
2993             /* Now reload the cache. */
2994             if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
2995             #ifndef DR_FLAC_NO_CRC
2996                 drflac__update_crc16(bs);
2997             #endif
2998                 bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
2999                 bs_consumedBits = riceParamPartLoBitCount;
3000             #ifndef DR_FLAC_NO_CRC
3001                 bs->crc16Cache = bs_cache;
3002             #endif
3003             } else {
3004                 /* Slow path. We need to fetch more data from the client. */
3005                 if (!drflac__reload_cache(bs)) {
3006                     return DRFLAC_FALSE;
3007                 }
3008
3009                 bs_cache = bs->cache;
3010                 bs_consumedBits = bs->consumedBits + riceParamPartLoBitCount;
3011             }
3012
3013             /* We should now have enough information to construct the rice parameter part. */
3014             riceParamPartLo = (drflac_uint32)(bs_cache >> (DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPartLoBitCount)));
3015             pRiceParamPartOut[0] = riceParamPartHi | riceParamPartLo;
3016
3017             bs_cache <<= riceParamPartLoBitCount;
3018         }
3019     } else {
3020         /*
3021         Getting here means there are no bits set on the cache line. This is a less optimal case because we just wasted a call
3022         to drflac__clz() and we need to reload the cache.
3023         */
3024         drflac_uint32 zeroCounter = (drflac_uint32)(DRFLAC_CACHE_L1_SIZE_BITS(bs) - bs_consumedBits);
3025         for (;;) {
3026             if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
3027             #ifndef DR_FLAC_NO_CRC
3028                 drflac__update_crc16(bs);
3029             #endif
3030                 bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
3031                 bs_consumedBits = 0;
3032             #ifndef DR_FLAC_NO_CRC
3033                 bs->crc16Cache = bs_cache;
3034             #endif
3035             } else {
3036                 /* Slow path. We need to fetch more data from the client. */
3037                 if (!drflac__reload_cache(bs)) {
3038                     return DRFLAC_FALSE;
3039                 }
3040
3041                 bs_cache = bs->cache;
3042                 bs_consumedBits = bs->consumedBits;
3043             }
3044
3045             lzcount = drflac__clz(bs_cache);
3046             zeroCounter += lzcount;
3047
3048             if (lzcount < sizeof(bs_cache)*8) {
3049                 break;
3050             }
3051         }
3052
3053         pZeroCounterOut[0] = zeroCounter;
3054         goto extract_rice_param_part;
3055     }
3056
3057     /* Make sure the cache is restored at the end of it all. */
3058     bs->cache = bs_cache;
3059     bs->consumedBits = bs_consumedBits;
3060
3061     return DRFLAC_TRUE;
3062 }
3063
3064 static DRFLAC_INLINE drflac_bool32 drflac__seek_rice_parts(drflac_bs* bs, drflac_uint8 riceParam)
3065 {
3066     drflac_uint32  riceParamPlus1 = riceParam + 1;
3067     drflac_uint32  riceParamPlus1MaxConsumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs) - riceParamPlus1;
3068
3069     /*
3070     The idea here is to use local variables for the cache in an attempt to encourage the compiler to store them in registers. I have
3071     no idea how this will work in practice...
3072     */
3073     drflac_cache_t bs_cache = bs->cache;
3074     drflac_uint32  bs_consumedBits = bs->consumedBits;
3075
3076     /* The first thing to do is find the first unset bit. Most likely a bit will be set in the current cache line. */
3077     drflac_uint32  lzcount = drflac__clz(bs_cache);
3078     if (lzcount < sizeof(bs_cache)*8) {
3079         /*
3080         It is most likely that the riceParam part (which comes after the zero counter) is also on this cache line. When extracting
3081         this, we include the set bit from the unary coded part because it simplifies cache management. This bit will be handled
3082         outside of this function at a higher level.
3083         */
3084     extract_rice_param_part:
3085         bs_cache       <<= lzcount;
3086         bs_consumedBits += lzcount;
3087
3088         if (bs_consumedBits <= riceParamPlus1MaxConsumedBits) {
3089             /* Getting here means the rice parameter part is wholly contained within the current cache line. */
3090             bs_cache       <<= riceParamPlus1;
3091             bs_consumedBits += riceParamPlus1;
3092         } else {
3093             /*
3094             Getting here means the rice parameter part straddles the cache line. We need to read from the tail of the current cache
3095             line, reload the cache, and then combine it with the head of the next cache line.
3096             */
3097
3098             /* Before reloading the cache we need to grab the size in bits of the low part. */
3099             drflac_uint32 riceParamPartLoBitCount = bs_consumedBits - riceParamPlus1MaxConsumedBits;
3100             DRFLAC_ASSERT(riceParamPartLoBitCount > 0 && riceParamPartLoBitCount < 32);
3101
3102             /* Now reload the cache. */
3103             if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
3104             #ifndef DR_FLAC_NO_CRC
3105                 drflac__update_crc16(bs);
3106             #endif
3107                 bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
3108                 bs_consumedBits = riceParamPartLoBitCount;
3109             #ifndef DR_FLAC_NO_CRC
3110                 bs->crc16Cache = bs_cache;
3111             #endif
3112             } else {
3113                 /* Slow path. We need to fetch more data from the client. */
3114                 if (!drflac__reload_cache(bs)) {
3115                     return DRFLAC_FALSE;
3116                 }
3117
3118                 bs_cache = bs->cache;
3119                 bs_consumedBits = bs->consumedBits + riceParamPartLoBitCount;
3120             }
3121
3122             bs_cache <<= riceParamPartLoBitCount;
3123         }
3124     } else {
3125         /*
3126         Getting here means there are no bits set on the cache line. This is a less optimal case because we just wasted a call
3127         to drflac__clz() and we need to reload the cache.
3128         */
3129         for (;;) {
3130             if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
3131             #ifndef DR_FLAC_NO_CRC
3132                 drflac__update_crc16(bs);
3133             #endif
3134                 bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
3135                 bs_consumedBits = 0;
3136             #ifndef DR_FLAC_NO_CRC
3137                 bs->crc16Cache = bs_cache;
3138             #endif
3139             } else {
3140                 /* Slow path. We need to fetch more data from the client. */
3141                 if (!drflac__reload_cache(bs)) {
3142                     return DRFLAC_FALSE;
3143                 }
3144
3145                 bs_cache = bs->cache;
3146                 bs_consumedBits = bs->consumedBits;
3147             }
3148
3149             lzcount = drflac__clz(bs_cache);
3150             if (lzcount < sizeof(bs_cache)*8) {
3151                 break;
3152             }
3153         }
3154
3155         goto extract_rice_param_part;
3156     }
3157
3158     /* Make sure the cache is restored at the end of it all. */
3159     bs->cache = bs_cache;
3160     bs->consumedBits = bs_consumedBits;
3161
3162     return DRFLAC_TRUE;
3163 }
3164
3165
3166 static drflac_bool32 drflac__decode_samples_with_residual__rice__scalar_zeroorder(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
3167 {
3168     drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
3169     drflac_uint32 zeroCountPart0;
3170     drflac_uint32 riceParamPart0;
3171     drflac_uint32 riceParamMask;
3172     drflac_uint32 i;
3173
3174     DRFLAC_ASSERT(bs != NULL);
3175     DRFLAC_ASSERT(count > 0);
3176     DRFLAC_ASSERT(pSamplesOut != NULL);
3177
3178     (void)bitsPerSample;
3179     (void)order;
3180     (void)shift;
3181     (void)coefficients;
3182
3183     riceParamMask  = (drflac_uint32)~((~0UL) << riceParam);
3184
3185     i = 0;
3186     while (i < count) {
3187         /* Rice extraction. */
3188         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0)) {
3189             return DRFLAC_FALSE;
3190         }
3191
3192         /* Rice reconstruction. */
3193         riceParamPart0 &= riceParamMask;
3194         riceParamPart0 |= (zeroCountPart0 << riceParam);
3195         riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
3196
3197         pSamplesOut[i] = riceParamPart0;
3198
3199         i += 1;
3200     }
3201
3202     return DRFLAC_TRUE;
3203 }
3204
3205 static drflac_bool32 drflac__decode_samples_with_residual__rice__scalar(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
3206 {
3207     drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
3208     drflac_uint32 zeroCountPart0 = 0;
3209     drflac_uint32 zeroCountPart1 = 0;
3210     drflac_uint32 zeroCountPart2 = 0;
3211     drflac_uint32 zeroCountPart3 = 0;
3212     drflac_uint32 riceParamPart0 = 0;
3213     drflac_uint32 riceParamPart1 = 0;
3214     drflac_uint32 riceParamPart2 = 0;
3215     drflac_uint32 riceParamPart3 = 0;
3216     drflac_uint32 riceParamMask;
3217     const drflac_int32* pSamplesOutEnd;
3218     drflac_uint32 i;
3219
3220     DRFLAC_ASSERT(bs != NULL);
3221     DRFLAC_ASSERT(count > 0);
3222     DRFLAC_ASSERT(pSamplesOut != NULL);
3223
3224     if (order == 0) {
3225         return drflac__decode_samples_with_residual__rice__scalar_zeroorder(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
3226     }
3227
3228     riceParamMask  = (drflac_uint32)~((~0UL) << riceParam);
3229     pSamplesOutEnd = pSamplesOut + (count & ~3);
3230
3231     if (bitsPerSample+shift > 32) {
3232         while (pSamplesOut < pSamplesOutEnd) {
3233             /*
3234             Rice extraction. It's faster to do this one at a time against local variables than it is to use the x4 version
3235             against an array. Not sure why, but perhaps it's making more efficient use of registers?
3236             */
3237             if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0) ||
3238                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart1, &riceParamPart1) ||
3239                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart2, &riceParamPart2) ||
3240                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart3, &riceParamPart3)) {
3241                 return DRFLAC_FALSE;
3242             }
3243
3244             riceParamPart0 &= riceParamMask;
3245             riceParamPart1 &= riceParamMask;
3246             riceParamPart2 &= riceParamMask;
3247             riceParamPart3 &= riceParamMask;
3248
3249             riceParamPart0 |= (zeroCountPart0 << riceParam);
3250             riceParamPart1 |= (zeroCountPart1 << riceParam);
3251             riceParamPart2 |= (zeroCountPart2 << riceParam);
3252             riceParamPart3 |= (zeroCountPart3 << riceParam);
3253
3254             riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
3255             riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
3256             riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
3257             riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
3258
3259             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 0);
3260             pSamplesOut[1] = riceParamPart1 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 1);
3261             pSamplesOut[2] = riceParamPart2 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 2);
3262             pSamplesOut[3] = riceParamPart3 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 3);
3263
3264             pSamplesOut += 4;
3265         }
3266     } else {
3267         while (pSamplesOut < pSamplesOutEnd) {
3268             if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0) ||
3269                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart1, &riceParamPart1) ||
3270                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart2, &riceParamPart2) ||
3271                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart3, &riceParamPart3)) {
3272                 return DRFLAC_FALSE;
3273             }
3274
3275             riceParamPart0 &= riceParamMask;
3276             riceParamPart1 &= riceParamMask;
3277             riceParamPart2 &= riceParamMask;
3278             riceParamPart3 &= riceParamMask;
3279
3280             riceParamPart0 |= (zeroCountPart0 << riceParam);
3281             riceParamPart1 |= (zeroCountPart1 << riceParam);
3282             riceParamPart2 |= (zeroCountPart2 << riceParam);
3283             riceParamPart3 |= (zeroCountPart3 << riceParam);
3284
3285             riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
3286             riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
3287             riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
3288             riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
3289
3290             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 0);
3291             pSamplesOut[1] = riceParamPart1 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 1);
3292             pSamplesOut[2] = riceParamPart2 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 2);
3293             pSamplesOut[3] = riceParamPart3 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 3);
3294
3295             pSamplesOut += 4;
3296         }
3297     }
3298
3299     i = (count & ~3);
3300     while (i < count) {
3301         /* Rice extraction. */
3302         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0)) {
3303             return DRFLAC_FALSE;
3304         }
3305
3306         /* Rice reconstruction. */
3307         riceParamPart0 &= riceParamMask;
3308         riceParamPart0 |= (zeroCountPart0 << riceParam);
3309         riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
3310         /*riceParamPart0  = (riceParamPart0 >> 1) ^ (~(riceParamPart0 & 0x01) + 1);*/
3311
3312         /* Sample reconstruction. */
3313         if (bitsPerSample+shift > 32) {
3314             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 0);
3315         } else {
3316             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 0);
3317         }
3318
3319         i += 1;
3320         pSamplesOut += 1;
3321     }
3322
3323     return DRFLAC_TRUE;
3324 }
3325
3326 #if defined(DRFLAC_SUPPORT_SSE2)
3327 static DRFLAC_INLINE __m128i drflac__mm_packs_interleaved_epi32(__m128i a, __m128i b)
3328 {
3329     __m128i r;
3330
3331     /* Pack. */
3332     r = _mm_packs_epi32(a, b);
3333
3334     /* a3a2 a1a0 b3b2 b1b0 -> a3a2 b3b2 a1a0 b1b0 */
3335     r = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 1, 2, 0));
3336
3337     /* a3a2 b3b2 a1a0 b1b0 -> a3b3 a2b2 a1b1 a0b0 */
3338     r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(3, 1, 2, 0));
3339     r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(3, 1, 2, 0));
3340
3341     return r;
3342 }
3343 #endif
3344
3345 #if defined(DRFLAC_SUPPORT_SSE41)
3346 static DRFLAC_INLINE __m128i drflac__mm_not_si128(__m128i a)
3347 {
3348     return _mm_xor_si128(a, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
3349 }
3350
3351 static DRFLAC_INLINE __m128i drflac__mm_hadd_epi32(__m128i x)
3352 {
3353     __m128i x64 = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
3354     __m128i x32 = _mm_shufflelo_epi16(x64, _MM_SHUFFLE(1, 0, 3, 2));
3355     return _mm_add_epi32(x64, x32);
3356 }
3357
3358 static DRFLAC_INLINE __m128i drflac__mm_hadd_epi64(__m128i x)
3359 {
3360     return _mm_add_epi64(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
3361 }
3362
3363 static DRFLAC_INLINE __m128i drflac__mm_srai_epi64(__m128i x, int count)
3364 {
3365     /*
3366     To simplify this we are assuming count < 32. This restriction allows us to work on a low side and a high side. The low side
3367     is shifted with zero bits, whereas the right side is shifted with sign bits.
3368     */
3369     __m128i lo = _mm_srli_epi64(x, count);
3370     __m128i hi = _mm_srai_epi32(x, count);
3371
3372     hi = _mm_and_si128(hi, _mm_set_epi32(0xFFFFFFFF, 0, 0xFFFFFFFF, 0));    /* The high part needs to have the low part cleared. */
3373
3374     return _mm_or_si128(lo, hi);
3375 }
3376
3377 static drflac_bool32 drflac__decode_samples_with_residual__rice__sse41_32(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
3378 {
3379     int i;
3380     drflac_uint32 riceParamMask;
3381     drflac_int32* pDecodedSamples    = pSamplesOut;
3382     drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
3383     drflac_uint32 zeroCountParts0 = 0;
3384     drflac_uint32 zeroCountParts1 = 0;
3385     drflac_uint32 zeroCountParts2 = 0;
3386     drflac_uint32 zeroCountParts3 = 0;
3387     drflac_uint32 riceParamParts0 = 0;
3388     drflac_uint32 riceParamParts1 = 0;
3389     drflac_uint32 riceParamParts2 = 0;
3390     drflac_uint32 riceParamParts3 = 0;
3391     __m128i coefficients128_0;
3392     __m128i coefficients128_4;
3393     __m128i coefficients128_8;
3394     __m128i samples128_0;
3395     __m128i samples128_4;
3396     __m128i samples128_8;
3397     __m128i riceParamMask128;
3398
3399     const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
3400
3401     riceParamMask    = (drflac_uint32)~((~0UL) << riceParam);
3402     riceParamMask128 = _mm_set1_epi32(riceParamMask);
3403
3404     /* Pre-load. */
3405     coefficients128_0 = _mm_setzero_si128();
3406     coefficients128_4 = _mm_setzero_si128();
3407     coefficients128_8 = _mm_setzero_si128();
3408
3409     samples128_0 = _mm_setzero_si128();
3410     samples128_4 = _mm_setzero_si128();
3411     samples128_8 = _mm_setzero_si128();
3412
3413     /*
3414     Pre-loading the coefficients and prior samples is annoying because we need to ensure we don't try reading more than
3415     what's available in the input buffers. It would be convenient to use a fall-through switch to do this, but this results
3416     in strict aliasing warnings with GCC. To work around this I'm just doing something hacky. This feels a bit convoluted
3417     so I think there's opportunity for this to be simplified.
3418     */
3419 #if 1
3420     {
3421         int runningOrder = order;
3422
3423         /* 0 - 3. */
3424         if (runningOrder >= 4) {
3425             coefficients128_0 = _mm_loadu_si128((const __m128i*)(coefficients + 0));
3426             samples128_0      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 4));
3427             runningOrder -= 4;
3428         } else {
3429             switch (runningOrder) {
3430                 case 3: coefficients128_0 = _mm_set_epi32(0, coefficients[2], coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], pSamplesOut[-3], 0); break;
3431                 case 2: coefficients128_0 = _mm_set_epi32(0, 0,               coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], 0,               0); break;
3432                 case 1: coefficients128_0 = _mm_set_epi32(0, 0,               0,               coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], 0,               0,               0); break;
3433             }
3434             runningOrder = 0;
3435         }
3436
3437         /* 4 - 7 */
3438         if (runningOrder >= 4) {
3439             coefficients128_4 = _mm_loadu_si128((const __m128i*)(coefficients + 4));
3440             samples128_4      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 8));
3441             runningOrder -= 4;
3442         } else {
3443             switch (runningOrder) {
3444                 case 3: coefficients128_4 = _mm_set_epi32(0, coefficients[6], coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], pSamplesOut[-7], 0); break;
3445                 case 2: coefficients128_4 = _mm_set_epi32(0, 0,               coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], 0,               0); break;
3446                 case 1: coefficients128_4 = _mm_set_epi32(0, 0,               0,               coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], 0,               0,               0); break;
3447             }
3448             runningOrder = 0;
3449         }
3450
3451         /* 8 - 11 */
3452         if (runningOrder == 4) {
3453             coefficients128_8 = _mm_loadu_si128((const __m128i*)(coefficients + 8));
3454             samples128_8      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 12));
3455             runningOrder -= 4;
3456         } else {
3457             switch (runningOrder) {
3458                 case 3: coefficients128_8 = _mm_set_epi32(0, coefficients[10], coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], pSamplesOut[-11], 0); break;
3459                 case 2: coefficients128_8 = _mm_set_epi32(0, 0,                coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], 0,                0); break;
3460                 case 1: coefficients128_8 = _mm_set_epi32(0, 0,                0,               coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], 0,                0,                0); break;
3461             }
3462             runningOrder = 0;
3463         }
3464
3465         /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
3466         coefficients128_0 = _mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(0, 1, 2, 3));
3467         coefficients128_4 = _mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(0, 1, 2, 3));
3468         coefficients128_8 = _mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(0, 1, 2, 3));
3469     }
3470 #else
3471     /* This causes strict-aliasing warnings with GCC. */
3472     switch (order)
3473     {
3474     case 12: ((drflac_int32*)&coefficients128_8)[0] = coefficients[11]; ((drflac_int32*)&samples128_8)[0] = pDecodedSamples[-12];
3475     case 11: ((drflac_int32*)&coefficients128_8)[1] = coefficients[10]; ((drflac_int32*)&samples128_8)[1] = pDecodedSamples[-11];
3476     case 10: ((drflac_int32*)&coefficients128_8)[2] = coefficients[ 9]; ((drflac_int32*)&samples128_8)[2] = pDecodedSamples[-10];
3477     case 9:  ((drflac_int32*)&coefficients128_8)[3] = coefficients[ 8]; ((drflac_int32*)&samples128_8)[3] = pDecodedSamples[- 9];
3478     case 8:  ((drflac_int32*)&coefficients128_4)[0] = coefficients[ 7]; ((drflac_int32*)&samples128_4)[0] = pDecodedSamples[- 8];
3479     case 7:  ((drflac_int32*)&coefficients128_4)[1] = coefficients[ 6]; ((drflac_int32*)&samples128_4)[1] = pDecodedSamples[- 7];
3480     case 6:  ((drflac_int32*)&coefficients128_4)[2] = coefficients[ 5]; ((drflac_int32*)&samples128_4)[2] = pDecodedSamples[- 6];
3481     case 5:  ((drflac_int32*)&coefficients128_4)[3] = coefficients[ 4]; ((drflac_int32*)&samples128_4)[3] = pDecodedSamples[- 5];
3482     case 4:  ((drflac_int32*)&coefficients128_0)[0] = coefficients[ 3]; ((drflac_int32*)&samples128_0)[0] = pDecodedSamples[- 4];
3483     case 3:  ((drflac_int32*)&coefficients128_0)[1] = coefficients[ 2]; ((drflac_int32*)&samples128_0)[1] = pDecodedSamples[- 3];
3484     case 2:  ((drflac_int32*)&coefficients128_0)[2] = coefficients[ 1]; ((drflac_int32*)&samples128_0)[2] = pDecodedSamples[- 2];
3485     case 1:  ((drflac_int32*)&coefficients128_0)[3] = coefficients[ 0]; ((drflac_int32*)&samples128_0)[3] = pDecodedSamples[- 1];
3486     }
3487 #endif
3488
3489     /* For this version we are doing one sample at a time. */
3490     while (pDecodedSamples < pDecodedSamplesEnd) {
3491         __m128i prediction128;
3492         __m128i zeroCountPart128;
3493         __m128i riceParamPart128;
3494
3495         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0) ||
3496             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts1, &riceParamParts1) ||
3497             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts2, &riceParamParts2) ||
3498             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts3, &riceParamParts3)) {
3499             return DRFLAC_FALSE;
3500         }
3501
3502         zeroCountPart128 = _mm_set_epi32(zeroCountParts3, zeroCountParts2, zeroCountParts1, zeroCountParts0);
3503         riceParamPart128 = _mm_set_epi32(riceParamParts3, riceParamParts2, riceParamParts1, riceParamParts0);
3504
3505         riceParamPart128 = _mm_and_si128(riceParamPart128, riceParamMask128);
3506         riceParamPart128 = _mm_or_si128(riceParamPart128, _mm_slli_epi32(zeroCountPart128, riceParam));
3507         riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_add_epi32(drflac__mm_not_si128(_mm_and_si128(riceParamPart128, _mm_set1_epi32(0x01))), _mm_set1_epi32(0x01)));  /* <-- SSE2 compatible */
3508         /*riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_mullo_epi32(_mm_and_si128(riceParamPart128, _mm_set1_epi32(0x01)), _mm_set1_epi32(0xFFFFFFFF)));*/   /* <-- Only supported from SSE4.1 and is slower in my testing... */
3509
3510         if (order <= 4) {
3511             for (i = 0; i < 4; i += 1) {
3512                 prediction128 = _mm_mullo_epi32(coefficients128_0, samples128_0);
3513
3514                 /* Horizontal add and shift. */
3515                 prediction128 = drflac__mm_hadd_epi32(prediction128);
3516                 prediction128 = _mm_srai_epi32(prediction128, shift);
3517                 prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
3518
3519                 samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
3520                 riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
3521             }
3522         } else if (order <= 8) {
3523             for (i = 0; i < 4; i += 1) {
3524                 prediction128 =                              _mm_mullo_epi32(coefficients128_4, samples128_4);
3525                 prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_0, samples128_0));
3526
3527                 /* Horizontal add and shift. */
3528                 prediction128 = drflac__mm_hadd_epi32(prediction128);
3529                 prediction128 = _mm_srai_epi32(prediction128, shift);
3530                 prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
3531
3532                 samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
3533                 samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
3534                 riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
3535             }
3536         } else {
3537             for (i = 0; i < 4; i += 1) {
3538                 prediction128 =                              _mm_mullo_epi32(coefficients128_8, samples128_8);
3539                 prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_4, samples128_4));
3540                 prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_0, samples128_0));
3541
3542                 /* Horizontal add and shift. */
3543                 prediction128 = drflac__mm_hadd_epi32(prediction128);
3544                 prediction128 = _mm_srai_epi32(prediction128, shift);
3545                 prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
3546
3547                 samples128_8 = _mm_alignr_epi8(samples128_4,  samples128_8, 4);
3548                 samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
3549                 samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
3550                 riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
3551             }
3552         }
3553
3554         /* We store samples in groups of 4. */
3555         _mm_storeu_si128((__m128i*)pDecodedSamples, samples128_0);
3556         pDecodedSamples += 4;
3557     }
3558
3559     /* Make sure we process the last few samples. */
3560     i = (count & ~3);
3561     while (i < (int)count) {
3562         /* Rice extraction. */
3563         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0)) {
3564             return DRFLAC_FALSE;
3565         }
3566
3567         /* Rice reconstruction. */
3568         riceParamParts0 &= riceParamMask;
3569         riceParamParts0 |= (zeroCountParts0 << riceParam);
3570         riceParamParts0  = (riceParamParts0 >> 1) ^ t[riceParamParts0 & 0x01];
3571
3572         /* Sample reconstruction. */
3573         pDecodedSamples[0] = riceParamParts0 + drflac__calculate_prediction_32(order, shift, coefficients, pDecodedSamples);
3574
3575         i += 1;
3576         pDecodedSamples += 1;
3577     }
3578
3579     return DRFLAC_TRUE;
3580 }
3581
3582 static drflac_bool32 drflac__decode_samples_with_residual__rice__sse41_64(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
3583 {
         /*
         SSE4.1 Rice residual decoder that accumulates the prediction in 64-bit lanes (_mm_mul_epi32 + _mm_add_epi64).

         Reads `count` Rice-coded residuals from `bs` using `riceParam`, adds to each one the prediction built from the
         `order` preceding samples and `coefficients` (passed through drflac__mm_srai_epi64() with `shift`), and writes
         the results to pSamplesOut[0 .. count-1]. The pre-load below reads pSamplesOut[-1] down to pSamplesOut[-order],
         so those samples must already be present in front of the output pointer.

         Complete groups of 4 samples go through the vector loop; the remaining (count % 4) samples go through the
         scalar tail at the bottom.

         Returns DRFLAC_FALSE as soon as a call to drflac__read_rice_parts_x1() fails, DRFLAC_TRUE otherwise.
         */
3584     int i;
3585     drflac_uint32 riceParamMask;
3586     drflac_int32* pDecodedSamples    = pSamplesOut;
3587     drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);    /* End of the last complete group of 4. */
3588     drflac_uint32 zeroCountParts0 = 0;
3589     drflac_uint32 zeroCountParts1 = 0;
3590     drflac_uint32 zeroCountParts2 = 0;
3591     drflac_uint32 zeroCountParts3 = 0;
3592     drflac_uint32 riceParamParts0 = 0;
3593     drflac_uint32 riceParamParts1 = 0;
3594     drflac_uint32 riceParamParts2 = 0;
3595     drflac_uint32 riceParamParts3 = 0;
3596     __m128i coefficients128_0;
3597     __m128i coefficients128_4;
3598     __m128i coefficients128_8;
3599     __m128i samples128_0;
3600     __m128i samples128_4;
3601     __m128i samples128_8;
3602     __m128i prediction128;
3603     __m128i riceParamMask128;
3604
         /* Used by the scalar tail only: XOR-ing with t[b] leaves the value alone for b == 0 and inverts every bit for b == 1. */
3605     const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
3606
         /* The pre-load and the switch in the main loop only cover up to 12 taps (three vectors of 4). */
3607     DRFLAC_ASSERT(order <= 12);
3608
         /* Mask with the low `riceParam` bits set. */
3609     riceParamMask    = (drflac_uint32)~((~0UL) << riceParam);
3610     riceParamMask128 = _mm_set1_epi32(riceParamMask);
3611
3612     prediction128 = _mm_setzero_si128();
3613
3614     /* Pre-load. Everything starts at zero so that lanes beyond `order` contribute nothing to the prediction. */
3615     coefficients128_0  = _mm_setzero_si128();
3616     coefficients128_4  = _mm_setzero_si128();
3617     coefficients128_8  = _mm_setzero_si128();
3618
3619     samples128_0  = _mm_setzero_si128();
3620     samples128_4  = _mm_setzero_si128();
3621     samples128_8  = _mm_setzero_si128();
3622
         /* Only this first branch is compiled; the #else variant further down is disabled. */
3623 #if 1
3624     {
             /*
             Taps are consumed four at a time. The samples are laid out oldest-first, so the most recent sample of each
             group sits in lane 3. The coefficients are loaded with the lowest index in lane 0 and reversed afterwards.
             */
3625         int runningOrder = order;
3626
3627         /* 0 - 3. */
3628         if (runningOrder >= 4) {
3629             coefficients128_0 = _mm_loadu_si128((const __m128i*)(coefficients + 0));
3630             samples128_0      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 4));
3631             runningOrder -= 4;
3632         } else {
                 /* Fewer than 4 taps left: build the vectors element by element so nothing outside the valid range is read. */
3633             switch (runningOrder) {
3634                 case 3: coefficients128_0 = _mm_set_epi32(0, coefficients[2], coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], pSamplesOut[-3], 0); break;
3635                 case 2: coefficients128_0 = _mm_set_epi32(0, 0,               coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], 0,               0); break;
3636                 case 1: coefficients128_0 = _mm_set_epi32(0, 0,               0,               coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], 0,               0,               0); break;
3637             }
3638             runningOrder = 0;
3639         }
3640
3641         /* 4 - 7 */
3642         if (runningOrder >= 4) {
3643             coefficients128_4 = _mm_loadu_si128((const __m128i*)(coefficients + 4));
3644             samples128_4      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 8));
3645             runningOrder -= 4;
3646         } else {
3647             switch (runningOrder) {
3648                 case 3: coefficients128_4 = _mm_set_epi32(0, coefficients[6], coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], pSamplesOut[-7], 0); break;
3649                 case 2: coefficients128_4 = _mm_set_epi32(0, 0,               coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], 0,               0); break;
3650                 case 1: coefficients128_4 = _mm_set_epi32(0, 0,               0,               coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], 0,               0,               0); break;
3651             }
3652             runningOrder = 0;
3653         }
3654
3655         /* 8 - 11 */
3656         if (runningOrder == 4) {
3657             coefficients128_8 = _mm_loadu_si128((const __m128i*)(coefficients + 8));
3658             samples128_8      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 12));
3659             runningOrder -= 4;
3660         } else {
3661             switch (runningOrder) {
3662                 case 3: coefficients128_8 = _mm_set_epi32(0, coefficients[10], coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], pSamplesOut[-11], 0); break;
3663                 case 2: coefficients128_8 = _mm_set_epi32(0, 0,                coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], 0,                0); break;
3664                 case 1: coefficients128_8 = _mm_set_epi32(0, 0,                0,               coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], 0,                0,                0); break;
3665             }
3666             runningOrder = 0;
3667         }
3668
3669         /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
             /* After the reversal, lane 3 of each coefficient vector holds the lowest-index coefficient of its group, lining it up with the most recent sample. */
3670         coefficients128_0 = _mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(0, 1, 2, 3));
3671         coefficients128_4 = _mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(0, 1, 2, 3));
3672         coefficients128_8 = _mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(0, 1, 2, 3));
3673     }
3674 #else
3675     switch (order)
3676     {
3677     case 12: ((drflac_int32*)&coefficients128_8)[0] = coefficients[11]; ((drflac_int32*)&samples128_8)[0] = pDecodedSamples[-12];
3678     case 11: ((drflac_int32*)&coefficients128_8)[1] = coefficients[10]; ((drflac_int32*)&samples128_8)[1] = pDecodedSamples[-11];
3679     case 10: ((drflac_int32*)&coefficients128_8)[2] = coefficients[ 9]; ((drflac_int32*)&samples128_8)[2] = pDecodedSamples[-10];
3680     case 9:  ((drflac_int32*)&coefficients128_8)[3] = coefficients[ 8]; ((drflac_int32*)&samples128_8)[3] = pDecodedSamples[- 9];
3681     case 8:  ((drflac_int32*)&coefficients128_4)[0] = coefficients[ 7]; ((drflac_int32*)&samples128_4)[0] = pDecodedSamples[- 8];
3682     case 7:  ((drflac_int32*)&coefficients128_4)[1] = coefficients[ 6]; ((drflac_int32*)&samples128_4)[1] = pDecodedSamples[- 7];
3683     case 6:  ((drflac_int32*)&coefficients128_4)[2] = coefficients[ 5]; ((drflac_int32*)&samples128_4)[2] = pDecodedSamples[- 6];
3684     case 5:  ((drflac_int32*)&coefficients128_4)[3] = coefficients[ 4]; ((drflac_int32*)&samples128_4)[3] = pDecodedSamples[- 5];
3685     case 4:  ((drflac_int32*)&coefficients128_0)[0] = coefficients[ 3]; ((drflac_int32*)&samples128_0)[0] = pDecodedSamples[- 4];
3686     case 3:  ((drflac_int32*)&coefficients128_0)[1] = coefficients[ 2]; ((drflac_int32*)&samples128_0)[1] = pDecodedSamples[- 3];
3687     case 2:  ((drflac_int32*)&coefficients128_0)[2] = coefficients[ 1]; ((drflac_int32*)&samples128_0)[2] = pDecodedSamples[- 2];
3688     case 1:  ((drflac_int32*)&coefficients128_0)[3] = coefficients[ 0]; ((drflac_int32*)&samples128_0)[3] = pDecodedSamples[- 1];
3689     }
3690 #endif
3691
3692     /* For this version we are doing one sample at a time. */
         /* Residuals are still read and reconstructed four at a time; only the prediction is computed sample by sample, because each one depends on the previous output. */
3693     while (pDecodedSamples < pDecodedSamplesEnd) {
3694         __m128i zeroCountPart128;
3695         __m128i riceParamPart128;
3696
             /* Pull the raw parts of the next four residuals; any failure aborts the whole decode. */
3697         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0) ||
3698             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts1, &riceParamParts1) ||
3699             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts2, &riceParamParts2) ||
3700             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts3, &riceParamParts3)) {
3701             return DRFLAC_FALSE;
3702         }
3703
             /* Lane 0 holds the first residual read, lane 3 the last. */
3704         zeroCountPart128 = _mm_set_epi32(zeroCountParts3, zeroCountParts2, zeroCountParts1, zeroCountParts0);
3705         riceParamPart128 = _mm_set_epi32(riceParamParts3, riceParamParts2, riceParamParts1, riceParamParts0);
3706
             /*
             Vector form of the reconstruction done in the scalar tail: keep the low riceParam bits, OR in the zero count
             shifted left by riceParam, then compute (v >> 1) ^ (~(v & 1) + 1).
             NOTE(review): this assumes drflac__mm_not_si128() is a bitwise NOT; it is defined outside this section.
             */
3707         riceParamPart128 = _mm_and_si128(riceParamPart128, riceParamMask128);
3708         riceParamPart128 = _mm_or_si128(riceParamPart128, _mm_slli_epi32(zeroCountPart128, riceParam));
3709         riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_add_epi32(drflac__mm_not_si128(_mm_and_si128(riceParamPart128, _mm_set1_epi32(1))), _mm_set1_epi32(1)));
3710
3711         for (i = 0; i < 4; i += 1) {
3712             prediction128 = _mm_xor_si128(prediction128, prediction128);    /* Reset to 0. */
3713
                 /*
                 Every case falls through on purpose. Each statement multiplies two taps: the shuffles copy a pair of
                 lanes into the even positions that _mm_mul_epi32() reads, and the two 64-bit products are accumulated.
                 For odd orders the extra lane has a zero coefficient from the pre-load.
                 */
3714             switch (order)
3715             {
3716             case 12:
3717             case 11: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_8, _MM_SHUFFLE(1, 1, 0, 0))));
3718             case 10:
3719             case  9: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_8, _MM_SHUFFLE(3, 3, 2, 2))));
3720             case  8:
3721             case  7: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_4, _MM_SHUFFLE(1, 1, 0, 0))));
3722             case  6:
3723             case  5: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_4, _MM_SHUFFLE(3, 3, 2, 2))));
3724             case  4:
3725             case  3: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_0, _MM_SHUFFLE(1, 1, 0, 0))));
3726             case  2:
3727             case  1: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_0, _MM_SHUFFLE(3, 3, 2, 2))));
3728             }
3729
3730             /* Horizontal add and shift. */
3731             prediction128 = drflac__mm_hadd_epi64(prediction128);
3732             prediction128 = drflac__mm_srai_epi64(prediction128, shift);
                 /* Add the residual; only the 32-bit lane 0 of the sum is carried forward below. */
3733             prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
3734
3735             /* Our value should be sitting in prediction128[0]. We need to combine this with our SSE samples. */
                 /* Each alignr drops lane 0 of its second argument and appends lane 0 of its first, so the 12-sample history slides by one. */
3736             samples128_8 = _mm_alignr_epi8(samples128_4,  samples128_8, 4);
3737             samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
3738             samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
3739
3740             /* Slide our rice parameter down so that the value in position 0 contains the next one to process. */
3741             riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
3742         }
3743
3744         /* We store samples in groups of 4. */
             /* After four iterations samples128_0 holds exactly the four samples just decoded, oldest in lane 0. */
3745         _mm_storeu_si128((__m128i*)pDecodedSamples, samples128_0);
3746         pDecodedSamples += 4;
3747     }
3748
3749     /* Make sure we process the last few samples. */
         /* Scalar path for the 0 to 3 samples that do not fill a group of 4. */
3750     i = (count & ~3);
3751     while (i < (int)count) {
3752         /* Rice extraction. */
3753         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0)) {
3754             return DRFLAC_FALSE;
3755         }
3756
3757         /* Rice reconstruction. */
3758         riceParamParts0 &= riceParamMask;
3759         riceParamParts0 |= (zeroCountParts0 << riceParam);
3760         riceParamParts0  = (riceParamParts0 >> 1) ^ t[riceParamParts0 & 0x01];
3761
3762         /* Sample reconstruction. */
3763         pDecodedSamples[0] = riceParamParts0 + drflac__calculate_prediction_64(order, shift, coefficients, pDecodedSamples);
3764
3765         i += 1;
3766         pDecodedSamples += 1;
3767     }
3768
3769     return DRFLAC_TRUE;
3770 }
3771
3772 static drflac_bool32 drflac__decode_samples_with_residual__rice__sse41(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
3773 {
         /*
         Picks the decoder for a Rice-coded residual partition when SSE4.1 is in use and returns that decoder's result:
           - order outside 1..12      -> scalar decoder
           - bitsPerSample+shift > 32 -> SSE4.1 kernel accumulating in 64 bits
           - otherwise                -> SSE4.1 kernel accumulating in 32 bits
         */
3774     DRFLAC_ASSERT(bs != NULL);
3775     DRFLAC_ASSERT(count > 0);
3776     DRFLAC_ASSERT(pSamplesOut != NULL);
3777
3778     /* The SSE kernels are kept simple by only covering orders up to 12, which is the common case; anything else is left to the scalar decoder. */
3779     if (order == 0 || order > 12) {
3780         return drflac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
3781     }
3782
3783     /* Choose the accumulator width for the prediction. */
3784     if (bitsPerSample+shift > 32) {
3785         return drflac__decode_samples_with_residual__rice__sse41_64(bs, count, riceParam, order, shift, coefficients, pSamplesOut);
3786     }
3787     return drflac__decode_samples_with_residual__rice__sse41_32(bs, count, riceParam, order, shift, coefficients, pSamplesOut);
3788 }
3789 #endif
3790
3791 #if defined(DRFLAC_SUPPORT_NEON)
3792 static DRFLAC_INLINE void drflac__vst2q_s32(drflac_int32* p, int32x4x2_t x)
3793 {
         /* Stores the register pair back to back, without interleaving: x.val[0] goes to p[0..3], x.val[1] to p[4..7]. */
3794     vst1q_s32(&p[0], x.val[0]);
3795     vst1q_s32(&p[4], x.val[1]);
3796 }
3797
3798 static DRFLAC_INLINE void drflac__vst2q_u32(drflac_uint32* p, uint32x4x2_t x)
3799 {
         /* Stores the register pair back to back, without interleaving: x.val[0] goes to p[0..3], x.val[1] to p[4..7]. */
3800     vst1q_u32(&p[0], x.val[0]);
3801     vst1q_u32(&p[4], x.val[1]);
3802 }
3803
3804 static DRFLAC_INLINE void drflac__vst2q_f32(float* p, float32x4x2_t x)
3805 {
         /* Stores the register pair back to back, without interleaving: x.val[0] goes to p[0..3], x.val[1] to p[4..7]. */
3806     vst1q_f32(&p[0], x.val[0]);
3807     vst1q_f32(&p[4], x.val[1]);
3808 }
3809
3810 static DRFLAC_INLINE void drflac__vst2q_s16(drflac_int16* p, int16x4x2_t x)
3811 {
         /* Joins the two 4-lane halves into one 8-lane vector (x.val[0] low, x.val[1] high) and stores it at p[0..7]. */
3812     vst1q_s16(&p[0], vcombine_s16(x.val[0], x.val[1]));
3813 }
3814
3815 static DRFLAC_INLINE void drflac__vst2q_u16(drflac_uint16* p, uint16x4x2_t x)
3816 {
         /* Joins the two 4-lane halves into one 8-lane vector (x.val[0] low, x.val[1] high) and stores it at p[0..7]. */
3817     vst1q_u16(&p[0], vcombine_u16(x.val[0], x.val[1]));
3818 }
3819
3820 static DRFLAC_INLINE int32x4_t drflac__vdupq_n_s32x4(drflac_int32 x3, drflac_int32 x2, drflac_int32 x1, drflac_int32 x0)
3821 {
         /* Builds a vector from four scalars. Arguments are given from the highest lane down: lane 0 = x0, ..., lane 3 = x3. */
3822     drflac_int32 lanes[4];
3823     lanes[0] = x0;
3824     lanes[1] = x1;
3825     lanes[2] = x2;
3826     lanes[3] = x3;
3827     return vld1q_s32(lanes);
3828 }
3829
3830 static DRFLAC_INLINE int32x4_t drflac__valignrq_s32_1(int32x4_t a, int32x4_t b)
3831 {
3832     /* Equivalent to SSE's _mm_alignr_epi8(a, b, 4): drops lane 0 of b, moves its other lanes down by one and appends lane 0 of a, giving { b[1], b[2], b[3], a[0] }. */
3833
3834     /* Reference (the arguments are lanes 3, 2, 1, 0 of the result, in that order) */
3835     /*return drflac__vdupq_n_s32x4(
3836         vgetq_lane_s32(a, 0),
3837         vgetq_lane_s32(b, 3),
3838         vgetq_lane_s32(b, 2),
3839         vgetq_lane_s32(b, 1)
3840     );*/
3841
3842     return vextq_s32(b, a, 1);
3843 }
3844
3845 static DRFLAC_INLINE uint32x4_t drflac__valignrq_u32_1(uint32x4_t a, uint32x4_t b)
3846 {
3847     /* Equivalent to SSE's _mm_alignr_epi8(a, b, 4): drops lane 0 of b, moves its other lanes down by one and appends lane 0 of a, giving { b[1], b[2], b[3], a[0] }. */
3848
3849     /* Reference (written with the s32 helpers; the unsigned lanes move the same way. Arguments are lanes 3, 2, 1, 0 of the result) */
3850     /*return drflac__vdupq_n_s32x4(
3851         vgetq_lane_s32(a, 0),
3852         vgetq_lane_s32(b, 3),
3853         vgetq_lane_s32(b, 2),
3854         vgetq_lane_s32(b, 1)
3855     );*/
3856
3857     return vextq_u32(b, a, 1);
3858 }
3859
3860 static DRFLAC_INLINE int32x2_t drflac__vhaddq_s32(int32x4_t x)
3861 {
3862     /* Adds the four lanes of x together. The total ends up in both lanes of the result, so lane 0 always holds it. */
3863
3864     /* Scalar equivalent */
3865     /*return vdup_n_s32(
3866         vgetq_lane_s32(x, 0) +
3867         vgetq_lane_s32(x, 1) +
3868         vgetq_lane_s32(x, 2) +
3869         vgetq_lane_s32(x, 3)
3870     );*/
3871
3872     int32x2_t pairSums = vadd_s32(vget_low_s32(x), vget_high_s32(x));
3873     return vpadd_s32(pairSums, pairSums);
3874 }
3875
3876 static DRFLAC_INLINE int64x1_t drflac__vhaddq_s64(int64x2_t x)
3877 {
         /* Horizontal add: returns lane 0 + lane 1 of x as a single 64-bit lane. */
3878     return vadd_s64(vget_low_s64(x), vget_high_s64(x));
3879 }
3880
3881 static DRFLAC_INLINE int32x4_t drflac__vrevq_s32(int32x4_t x)
3882 {
         /* Reverses the lane order of x: the result is { x[3], x[2], x[1], x[0] }. */
3883     /* Reference (the arguments are lanes 3, 2, 1, 0 of the result, in that order) */
3884     /*return drflac__vdupq_n_s32x4(
3885         vgetq_lane_s32(x, 0),
3886         vgetq_lane_s32(x, 1),
3887         vgetq_lane_s32(x, 2),
3888         vgetq_lane_s32(x, 3)
3889     );*/
3890
         /* Swap the 64-bit halves first, then swap the two 32-bit lanes inside each half. */
3891     return vrev64q_s32(vcombine_s32(vget_high_s32(x), vget_low_s32(x)));
3892 }
3893
3894 static DRFLAC_INLINE int32x4_t drflac__vnotq_s32(int32x4_t x)
3895 {
         /* Bitwise NOT of every lane, done as an XOR against an all-ones vector. */
3896     return veorq_s32(vdupq_n_s32(0xFFFFFFFF), x);
3897 }
3898
3899 static DRFLAC_INLINE uint32x4_t drflac__vnotq_u32(uint32x4_t x)
3900 {
         /* Bitwise NOT of every lane, done as an XOR against an all-ones vector. */
3901     return veorq_u32(vdupq_n_u32(0xFFFFFFFF), x);
3902 }
3903
3904 static drflac_bool32 drflac__decode_samples_with_residual__rice__neon_32(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
3905 {
         /*
         NEON Rice residual decoder that computes the prediction entirely in 32-bit lanes (vmulq_s32 / vmlaq_s32).

         Reads `count` Rice-coded residuals from `bs` using `riceParam`, adds to each one the prediction built from the
         `order` preceding samples and `coefficients` (shifted right by `shift`), and writes the results to
         pSamplesOut[0 .. count-1]. The pre-load below reads pSamplesOut[-1] down to pSamplesOut[-order], so those
         samples must already be present in front of the output pointer.

         Complete groups of 4 samples go through the vector loop; the remaining (count % 4) samples go through the
         scalar tail at the bottom.

         Returns DRFLAC_FALSE as soon as a call to drflac__read_rice_parts_x1() fails, DRFLAC_TRUE otherwise.

         NOTE(review): the pre-load and the three-way branch in the main loop only cover up to 12 taps, but unlike the
         SSE4.1 64-bit variant there is no DRFLAC_ASSERT(order <= 12) here - confirm that the caller enforces it.
         */
3906     int i;
3907     drflac_uint32 riceParamMask;
3908     drflac_int32* pDecodedSamples    = pSamplesOut;
3909     drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);    /* End of the last complete group of 4. */
3910     drflac_uint32 zeroCountParts[4];
3911     drflac_uint32 riceParamParts[4];
3912     int32x4_t coefficients128_0;
3913     int32x4_t coefficients128_4;
3914     int32x4_t coefficients128_8;
3915     int32x4_t samples128_0;
3916     int32x4_t samples128_4;
3917     int32x4_t samples128_8;
3918     uint32x4_t riceParamMask128;
3919     int32x4_t riceParam128;
3920     int32x2_t shift64;
3921     uint32x4_t one128;
3922
         /* Used by the scalar tail only: XOR-ing with t[b] leaves the value alone for b == 0 and inverts every bit for b == 1. */
3923     const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
3924
         /* Mask with the low `riceParam` bits set. */
3925     riceParamMask    = ~((~0UL) << riceParam);
3926     riceParamMask128 = vdupq_n_u32(riceParamMask);
3927
3928     riceParam128 = vdupq_n_s32(riceParam);
3929     shift64 = vdup_n_s32(-shift); /* Negate the shift because we'll be doing a variable shift using vshl_s32(), which shifts right for negative counts. */
3930     one128 = vdupq_n_u32(1);
3931
3932     /*
3933     Pre-loading the coefficients and prior samples is annoying because we need to ensure we don't try reading more than
3934     what's available in the input buffers. It would be convenient to use a fall-through switch to do this, but this results
3935     in strict aliasing warnings with GCC. To work around this I'm just doing something hacky. This feels a bit convoluted
3936     so I think there's opportunity for this to be simplified.
3937     */
3938     {
             /*
             Taps are consumed four at a time. A full group is loaded straight from memory; a partial group is staged in
             tempC/tempS, which start out zeroed so that unused lanes contribute nothing. Samples are placed with the most
             recent one in lane 3; coefficients are placed lowest index first and reversed at the end of this block.
             The temporaries are not cleared between groups. A group with no taps left therefore reloads whatever an
             earlier partial group staged, but the main loop never reads a vector beyond the ones `order` requires.
             */
3939         int runningOrder = order;
3940         drflac_int32 tempC[4] = {0, 0, 0, 0};
3941         drflac_int32 tempS[4] = {0, 0, 0, 0};
3942
3943         /* 0 - 3. */
3944         if (runningOrder >= 4) {
3945             coefficients128_0 = vld1q_s32(coefficients + 0);
3946             samples128_0      = vld1q_s32(pSamplesOut  - 4);
3947             runningOrder -= 4;
3948         } else {
3949             switch (runningOrder) {
3950                 case 3: tempC[2] = coefficients[2]; tempS[1] = pSamplesOut[-3]; /* fallthrough */
3951                 case 2: tempC[1] = coefficients[1]; tempS[2] = pSamplesOut[-2]; /* fallthrough */
3952                 case 1: tempC[0] = coefficients[0]; tempS[3] = pSamplesOut[-1]; /* fallthrough */
3953             }
3954
3955             coefficients128_0 = vld1q_s32(tempC);
3956             samples128_0      = vld1q_s32(tempS);
3957             runningOrder = 0;
3958         }
3959
3960         /* 4 - 7 */
3961         if (runningOrder >= 4) {
3962             coefficients128_4 = vld1q_s32(coefficients + 4);
3963             samples128_4      = vld1q_s32(pSamplesOut  - 8);
3964             runningOrder -= 4;
3965         } else {
3966             switch (runningOrder) {
3967                 case 3: tempC[2] = coefficients[6]; tempS[1] = pSamplesOut[-7]; /* fallthrough */
3968                 case 2: tempC[1] = coefficients[5]; tempS[2] = pSamplesOut[-6]; /* fallthrough */
3969                 case 1: tempC[0] = coefficients[4]; tempS[3] = pSamplesOut[-5]; /* fallthrough */
3970             }
3971
3972             coefficients128_4 = vld1q_s32(tempC);
3973             samples128_4      = vld1q_s32(tempS);
3974             runningOrder = 0;
3975         }
3976
3977         /* 8 - 11 */
3978         if (runningOrder == 4) {
3979             coefficients128_8 = vld1q_s32(coefficients + 8);
3980             samples128_8      = vld1q_s32(pSamplesOut  - 12);
3981             runningOrder -= 4;
3982         } else {
3983             switch (runningOrder) {
3984                 case 3: tempC[2] = coefficients[10]; tempS[1] = pSamplesOut[-11]; /* fallthrough */
3985                 case 2: tempC[1] = coefficients[ 9]; tempS[2] = pSamplesOut[-10]; /* fallthrough */
3986                 case 1: tempC[0] = coefficients[ 8]; tempS[3] = pSamplesOut[- 9]; /* fallthrough */
3987             }
3988
3989             coefficients128_8 = vld1q_s32(tempC);
3990             samples128_8      = vld1q_s32(tempS);
3991             runningOrder = 0;
3992         }
3993
3994         /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
             /* After the reversal, lane 3 of each coefficient vector holds the lowest-index coefficient of its group, lining it up with the most recent sample. */
3995         coefficients128_0 = drflac__vrevq_s32(coefficients128_0);
3996         coefficients128_4 = drflac__vrevq_s32(coefficients128_4);
3997         coefficients128_8 = drflac__vrevq_s32(coefficients128_8);
3998     }
3999
4000     /* For this version we are doing one sample at a time. */
         /* Residuals are still read and reconstructed four at a time; only the prediction is computed sample by sample, because each one depends on the previous output. */
4001     while (pDecodedSamples < pDecodedSamplesEnd) {
4002         int32x4_t prediction128;
4003         int32x2_t prediction64;
4004         uint32x4_t zeroCountPart128;
4005         uint32x4_t riceParamPart128;
4006
             /* Pull the raw parts of the next four residuals; any failure aborts the whole decode. */
4007         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0]) ||
4008             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[1], &riceParamParts[1]) ||
4009             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[2], &riceParamParts[2]) ||
4010             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[3], &riceParamParts[3])) {
4011             return DRFLAC_FALSE;
4012         }
4013
             /* Lane 0 holds the first residual read, lane 3 the last. */
4014         zeroCountPart128 = vld1q_u32(zeroCountParts);
4015         riceParamPart128 = vld1q_u32(riceParamParts);
4016
             /*
             Vector form of the reconstruction done in the scalar tail: keep the low riceParam bits, OR in the zero count
             shifted left by riceParam, then compute (v >> 1) ^ (~(v & 1) + 1), which is (v >> 1) ^ -(v & 1).
             */
4017         riceParamPart128 = vandq_u32(riceParamPart128, riceParamMask128);
4018         riceParamPart128 = vorrq_u32(riceParamPart128, vshlq_u32(zeroCountPart128, riceParam128));
4019         riceParamPart128 = veorq_u32(vshrq_n_u32(riceParamPart128, 1), vaddq_u32(drflac__vnotq_u32(vandq_u32(riceParamPart128, one128)), one128));
4020
             /* The three branches differ only in how many 4-tap vectors take part in the prediction (1, 2 or 3). */
4021         if (order <= 4) {
4022             for (i = 0; i < 4; i += 1) {
4023                 prediction128 = vmulq_s32(coefficients128_0, samples128_0);
4024
4025                 /* Horizontal add and shift. */
4026                 prediction64 = drflac__vhaddq_s32(prediction128);
4027                 prediction64 = vshl_s32(prediction64, shift64);
                     /* Add the residuals; only lane 0 (the current residual) is carried forward below. */
4028                 prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
4029
                     /* Append the new sample to the history and move the next residual into lane 0. */
4030                 samples128_0 = drflac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
4031                 riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
4032             }
4033         } else if (order <= 8) {
4034             for (i = 0; i < 4; i += 1) {
4035                 prediction128 =                vmulq_s32(coefficients128_4, samples128_4);
4036                 prediction128 = vmlaq_s32(prediction128, coefficients128_0, samples128_0);
4037
4038                 /* Horizontal add and shift. */
4039                 prediction64 = drflac__vhaddq_s32(prediction128);
4040                 prediction64 = vshl_s32(prediction64, shift64);
4041                 prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
4042
                     /* Slide the 8-sample history by one: the older vector takes lane 0 of the newer one, which in turn takes the new sample. */
4043                 samples128_4 = drflac__valignrq_s32_1(samples128_0, samples128_4);
4044                 samples128_0 = drflac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
4045                 riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
4046             }
4047         } else {
4048             for (i = 0; i < 4; i += 1) {
4049                 prediction128 =                vmulq_s32(coefficients128_8, samples128_8);
4050                 prediction128 = vmlaq_s32(prediction128, coefficients128_4, samples128_4);
4051                 prediction128 = vmlaq_s32(prediction128, coefficients128_0, samples128_0);
4052
4053                 /* Horizontal add and shift. */
4054                 prediction64 = drflac__vhaddq_s32(prediction128);
4055                 prediction64 = vshl_s32(prediction64, shift64);
4056                 prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
4057
                     /* Slide the 12-sample history by one, oldest vector first. */
4058                 samples128_8 = drflac__valignrq_s32_1(samples128_4, samples128_8);
4059                 samples128_4 = drflac__valignrq_s32_1(samples128_0, samples128_4);
4060                 samples128_0 = drflac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
4061                 riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
4062             }
4063         }
4064
4065         /* We store samples in groups of 4. */
             /* After four iterations samples128_0 holds exactly the four samples just decoded, oldest in lane 0. */
4066         vst1q_s32(pDecodedSamples, samples128_0);
4067         pDecodedSamples += 4;
4068     }
4069
4070     /* Make sure we process the last few samples. */
         /* Scalar path for the 0 to 3 samples that do not fill a group of 4. */
4071     i = (count & ~3);
4072     while (i < (int)count) {
4073         /* Rice extraction. */
4074         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0])) {
4075             return DRFLAC_FALSE;
4076         }
4077
4078         /* Rice reconstruction. */
4079         riceParamParts[0] &= riceParamMask;
4080         riceParamParts[0] |= (zeroCountParts[0] << riceParam);
4081         riceParamParts[0]  = (riceParamParts[0] >> 1) ^ t[riceParamParts[0] & 0x01];
4082
4083         /* Sample reconstruction. */
4084         pDecodedSamples[0] = riceParamParts[0] + drflac__calculate_prediction_32(order, shift, coefficients, pDecodedSamples);
4085
4086         i += 1;
4087         pDecodedSamples += 1;
4088     }
4089
4090     return DRFLAC_TRUE;
4091 }
4092
4093 static drflac_bool32 drflac__decode_samples_with_residual__rice__neon_64(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
4094 {
4095     int i;
4096     drflac_uint32 riceParamMask;
4097     drflac_int32* pDecodedSamples    = pSamplesOut;
4098     drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
4099     drflac_uint32 zeroCountParts[4];
4100     drflac_uint32 riceParamParts[4];
4101     int32x4_t coefficients128_0;
4102     int32x4_t coefficients128_4;
4103     int32x4_t coefficients128_8;
4104     int32x4_t samples128_0;
4105     int32x4_t samples128_4;
4106     int32x4_t samples128_8;
4107     uint32x4_t riceParamMask128;
4108     int32x4_t riceParam128;
4109     int64x1_t shift64;
4110     uint32x4_t one128;
4111
4112     const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
4113
4114     riceParamMask    = ~((~0UL) << riceParam);
4115     riceParamMask128 = vdupq_n_u32(riceParamMask);
4116
4117     riceParam128 = vdupq_n_s32(riceParam);
4118     shift64 = vdup_n_s64(-shift); /* Negate the shift because we'll be doing a variable shift using vshlq_s32(). */
4119     one128 = vdupq_n_u32(1);
4120
4121     /*
4122     Pre-loading the coefficients and prior samples is annoying because we need to ensure we don't try reading more than
4123     what's available in the input buffers. It would be conenient to use a fall-through switch to do this, but this results
4124     in strict aliasing warnings with GCC. To work around this I'm just doing something hacky. This feels a bit convoluted
4125     so I think there's opportunity for this to be simplified.
4126     */
4127     {
4128         int runningOrder = order;
4129         drflac_int32 tempC[4] = {0, 0, 0, 0};
4130         drflac_int32 tempS[4] = {0, 0, 0, 0};
4131
4132         /* 0 - 3. */
4133         if (runningOrder >= 4) {
4134             coefficients128_0 = vld1q_s32(coefficients + 0);
4135             samples128_0      = vld1q_s32(pSamplesOut  - 4);
4136             runningOrder -= 4;
4137         } else {
4138             switch (runningOrder) {
4139                 case 3: tempC[2] = coefficients[2]; tempS[1] = pSamplesOut[-3]; /* fallthrough */
4140                 case 2: tempC[1] = coefficients[1]; tempS[2] = pSamplesOut[-2]; /* fallthrough */
4141                 case 1: tempC[0] = coefficients[0]; tempS[3] = pSamplesOut[-1]; /* fallthrough */
4142             }
4143
4144             coefficients128_0 = vld1q_s32(tempC);
4145             samples128_0      = vld1q_s32(tempS);
4146             runningOrder = 0;
4147         }
4148
4149         /* 4 - 7 */
4150         if (runningOrder >= 4) {
4151             coefficients128_4 = vld1q_s32(coefficients + 4);
4152             samples128_4      = vld1q_s32(pSamplesOut  - 8);
4153             runningOrder -= 4;
4154         } else {
4155             switch (runningOrder) {
4156                 case 3: tempC[2] = coefficients[6]; tempS[1] = pSamplesOut[-7]; /* fallthrough */
4157                 case 2: tempC[1] = coefficients[5]; tempS[2] = pSamplesOut[-6]; /* fallthrough */
4158                 case 1: tempC[0] = coefficients[4]; tempS[3] = pSamplesOut[-5]; /* fallthrough */
4159             }
4160
4161             coefficients128_4 = vld1q_s32(tempC);
4162             samples128_4      = vld1q_s32(tempS);
4163             runningOrder = 0;
4164         }
4165
4166         /* 8 - 11 */
4167         if (runningOrder == 4) {
4168             coefficients128_8 = vld1q_s32(coefficients + 8);
4169             samples128_8      = vld1q_s32(pSamplesOut  - 12);
4170             runningOrder -= 4;
4171         } else {
4172             switch (runningOrder) {
4173                 case 3: tempC[2] = coefficients[10]; tempS[1] = pSamplesOut[-11]; /* fallthrough */
4174                 case 2: tempC[1] = coefficients[ 9]; tempS[2] = pSamplesOut[-10]; /* fallthrough */
4175                 case 1: tempC[0] = coefficients[ 8]; tempS[3] = pSamplesOut[- 9]; /* fallthrough */
4176             }
4177
4178             coefficients128_8 = vld1q_s32(tempC);
4179             samples128_8      = vld1q_s32(tempS);
4180             runningOrder = 0;
4181         }
4182
4183         /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
4184         coefficients128_0 = drflac__vrevq_s32(coefficients128_0);
4185         coefficients128_4 = drflac__vrevq_s32(coefficients128_4);
4186         coefficients128_8 = drflac__vrevq_s32(coefficients128_8);
4187     }
4188
4189     /* For this version we are doing one sample at a time. */
4190     while (pDecodedSamples < pDecodedSamplesEnd) {
4191         int64x2_t prediction128;
4192         uint32x4_t zeroCountPart128;
4193         uint32x4_t riceParamPart128;
4194
4195         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0]) ||
4196             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[1], &riceParamParts[1]) ||
4197             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[2], &riceParamParts[2]) ||
4198             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[3], &riceParamParts[3])) {
4199             return DRFLAC_FALSE;
4200         }
4201
4202         zeroCountPart128 = vld1q_u32(zeroCountParts);
4203         riceParamPart128 = vld1q_u32(riceParamParts);
4204
4205         riceParamPart128 = vandq_u32(riceParamPart128, riceParamMask128);
4206         riceParamPart128 = vorrq_u32(riceParamPart128, vshlq_u32(zeroCountPart128, riceParam128));
4207         riceParamPart128 = veorq_u32(vshrq_n_u32(riceParamPart128, 1), vaddq_u32(drflac__vnotq_u32(vandq_u32(riceParamPart128, one128)), one128));
4208
4209         for (i = 0; i < 4; i += 1) {
4210             int64x1_t prediction64;
4211
4212             prediction128 = veorq_s64(prediction128, prediction128);    /* Reset to 0. */
4213             switch (order)
4214             {
4215             case 12:
4216             case 11: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_8), vget_low_s32(samples128_8)));
4217             case 10:
4218             case  9: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_8), vget_high_s32(samples128_8)));
4219             case  8:
4220             case  7: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_4), vget_low_s32(samples128_4)));
4221             case  6:
4222             case  5: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_4), vget_high_s32(samples128_4)));
4223             case  4:
4224             case  3: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_0), vget_low_s32(samples128_0)));
4225             case  2:
4226             case  1: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_0), vget_high_s32(samples128_0)));
4227             }
4228
4229             /* Horizontal add and shift. */
4230             prediction64 = drflac__vhaddq_s64(prediction128);
4231             prediction64 = vshl_s64(prediction64, shift64);
4232             prediction64 = vadd_s64(prediction64, vdup_n_s64(vgetq_lane_u32(riceParamPart128, 0)));
4233
4234             /* Our value should be sitting in prediction64[0]. We need to combine this with our SSE samples. */
4235             samples128_8 = drflac__valignrq_s32_1(samples128_4, samples128_8);
4236             samples128_4 = drflac__valignrq_s32_1(samples128_0, samples128_4);
4237             samples128_0 = drflac__valignrq_s32_1(vcombine_s32(vreinterpret_s32_s64(prediction64), vdup_n_s32(0)), samples128_0);
4238
4239             /* Slide our rice parameter down so that the value in position 0 contains the next one to process. */
4240             riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
4241         }
4242
4243         /* We store samples in groups of 4. */
4244         vst1q_s32(pDecodedSamples, samples128_0);
4245         pDecodedSamples += 4;
4246     }
4247
4248     /* Make sure we process the last few samples. */
4249     i = (count & ~3);
4250     while (i < (int)count) {
4251         /* Rice extraction. */
4252         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0])) {
4253             return DRFLAC_FALSE;
4254         }
4255
4256         /* Rice reconstruction. */
4257         riceParamParts[0] &= riceParamMask;
4258         riceParamParts[0] |= (zeroCountParts[0] << riceParam);
4259         riceParamParts[0]  = (riceParamParts[0] >> 1) ^ t[riceParamParts[0] & 0x01];
4260
4261         /* Sample reconstruction. */
4262         pDecodedSamples[0] = riceParamParts[0] + drflac__calculate_prediction_64(order, shift, coefficients, pDecodedSamples);
4263
4264         i += 1;
4265         pDecodedSamples += 1;
4266     }
4267
4268     return DRFLAC_TRUE;
4269 }
4270
4271 static drflac_bool32 drflac__decode_samples_with_residual__rice__neon(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
4272 {
4273     DRFLAC_ASSERT(bs != NULL);
4274     DRFLAC_ASSERT(count > 0);
4275     DRFLAC_ASSERT(pSamplesOut != NULL);
4276
4277     /* In my testing the order is rarely > 12, so in this case I'm going to simplify the NEON implementation by only handling order <= 12. */
4278     if (order > 0 && order <= 12) {
4279         if (bitsPerSample+shift > 32) {
4280             return drflac__decode_samples_with_residual__rice__neon_64(bs, count, riceParam, order, shift, coefficients, pSamplesOut);
4281         } else {
4282             return drflac__decode_samples_with_residual__rice__neon_32(bs, count, riceParam, order, shift, coefficients, pSamplesOut);
4283         }
4284     } else {
4285         return drflac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
4286     }
4287 }
4288 #endif
4289
4290 static drflac_bool32 drflac__decode_samples_with_residual__rice(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
4291 {
4292 #if defined(DRFLAC_SUPPORT_SSE41)
4293     if (drflac__gIsSSE41Supported) {
4294         return drflac__decode_samples_with_residual__rice__sse41(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
4295     } else
4296 #elif defined(DRFLAC_SUPPORT_NEON)
4297     if (drflac__gIsNEONSupported) {
4298         return drflac__decode_samples_with_residual__rice__neon(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
4299     } else
4300 #endif
4301     {
4302         /* Scalar fallback. */
4303         return drflac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
4304     }
4305 }
4306
4307 /* Reads and seeks past a string of residual values as Rice codes. The decoder should be sitting on the first bit of the Rice codes. */
4308 static drflac_bool32 drflac__read_and_seek_residual__rice(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam)
4309 {
4310     drflac_uint32 i;
4311
4312     DRFLAC_ASSERT(bs != NULL);
4313     DRFLAC_ASSERT(count > 0);
4314
4315     for (i = 0; i < count; ++i) {
4316         if (!drflac__seek_rice_parts(bs, riceParam)) {
4317             return DRFLAC_FALSE;
4318         }
4319     }
4320
4321     return DRFLAC_TRUE;
4322 }
4323
4324 static drflac_bool32 drflac__decode_samples_with_residual__unencoded(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 unencodedBitsPerSample, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
4325 {
4326     drflac_uint32 i;
4327
4328     DRFLAC_ASSERT(bs != NULL);
4329     DRFLAC_ASSERT(count > 0);
4330     DRFLAC_ASSERT(unencodedBitsPerSample <= 31);    /* <-- unencodedBitsPerSample is a 5 bit number, so cannot exceed 31. */
4331     DRFLAC_ASSERT(pSamplesOut != NULL);
4332
4333     for (i = 0; i < count; ++i) {
4334         if (unencodedBitsPerSample > 0) {
4335             if (!drflac__read_int32(bs, unencodedBitsPerSample, pSamplesOut + i)) {
4336                 return DRFLAC_FALSE;
4337             }
4338         } else {
4339             pSamplesOut[i] = 0;
4340         }
4341
4342         if (bitsPerSample >= 24) {
4343             pSamplesOut[i] += drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + i);
4344         } else {
4345             pSamplesOut[i] += drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + i);
4346         }
4347     }
4348
4349     return DRFLAC_TRUE;
4350 }
4351
4352
4353 /*
4354 Reads and decodes the residual for the sub-frame the decoder is currently sitting on. This function should be called
4355 when the decoder is sitting at the very start of the RESIDUAL block. The first <order> residuals will be ignored. The
4356 <blockSize> and <order> parameters are used to determine how many residual values need to be decoded.
4357 */
4358 static drflac_bool32 drflac__decode_samples_with_residual(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 blockSize, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pDecodedSamples)
4359 {
4360     drflac_uint8 residualMethod;
4361     drflac_uint8 partitionOrder;
4362     drflac_uint32 samplesInPartition;
4363     drflac_uint32 partitionsRemaining;
4364
4365     DRFLAC_ASSERT(bs != NULL);
4366     DRFLAC_ASSERT(blockSize != 0);
4367     DRFLAC_ASSERT(pDecodedSamples != NULL);       /* <-- Should we allow NULL, in which case we just seek past the residual rather than do a full decode? */
4368
4369     if (!drflac__read_uint8(bs, 2, &residualMethod)) {
4370         return DRFLAC_FALSE;
4371     }
4372
4373     if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
4374         return DRFLAC_FALSE;    /* Unknown or unsupported residual coding method. */
4375     }
4376
4377     /* Ignore the first <order> values. */
4378     pDecodedSamples += order;
4379
4380     if (!drflac__read_uint8(bs, 4, &partitionOrder)) {
4381         return DRFLAC_FALSE;
4382     }
4383
4384     /*
4385     From the FLAC spec:
4386       The Rice partition order in a Rice-coded residual section must be less than or equal to 8.
4387     */
4388     if (partitionOrder > 8) {
4389         return DRFLAC_FALSE;
4390     }
4391
4392     /* Validation check. */
4393     if ((blockSize / (1 << partitionOrder)) <= order) {
4394         return DRFLAC_FALSE;
4395     }
4396
4397     samplesInPartition = (blockSize / (1 << partitionOrder)) - order;
4398     partitionsRemaining = (1 << partitionOrder);
4399     for (;;) {
4400         drflac_uint8 riceParam = 0;
4401         if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
4402             if (!drflac__read_uint8(bs, 4, &riceParam)) {
4403                 return DRFLAC_FALSE;
4404             }
4405             if (riceParam == 15) {
4406                 riceParam = 0xFF;
4407             }
4408         } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
4409             if (!drflac__read_uint8(bs, 5, &riceParam)) {
4410                 return DRFLAC_FALSE;
4411             }
4412             if (riceParam == 31) {
4413                 riceParam = 0xFF;
4414             }
4415         }
4416
4417         if (riceParam != 0xFF) {
4418             if (!drflac__decode_samples_with_residual__rice(bs, bitsPerSample, samplesInPartition, riceParam, order, shift, coefficients, pDecodedSamples)) {
4419                 return DRFLAC_FALSE;
4420             }
4421         } else {
4422             drflac_uint8 unencodedBitsPerSample = 0;
4423             if (!drflac__read_uint8(bs, 5, &unencodedBitsPerSample)) {
4424                 return DRFLAC_FALSE;
4425             }
4426
4427             if (!drflac__decode_samples_with_residual__unencoded(bs, bitsPerSample, samplesInPartition, unencodedBitsPerSample, order, shift, coefficients, pDecodedSamples)) {
4428                 return DRFLAC_FALSE;
4429             }
4430         }
4431
4432         pDecodedSamples += samplesInPartition;
4433
4434         if (partitionsRemaining == 1) {
4435             break;
4436         }
4437
4438         partitionsRemaining -= 1;
4439
4440         if (partitionOrder != 0) {
4441             samplesInPartition = blockSize / (1 << partitionOrder);
4442         }
4443     }
4444
4445     return DRFLAC_TRUE;
4446 }
4447
4448 /*
4449 Reads and seeks past the residual for the sub-frame the decoder is currently sitting on. This function should be called
4450 when the decoder is sitting at the very start of the RESIDUAL block. The first <order> residuals will be set to 0. The
4451 <blockSize> and <order> parameters are used to determine how many residual values need to be decoded.
4452 */
4453 static drflac_bool32 drflac__read_and_seek_residual(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 order)
4454 {
4455     drflac_uint8 residualMethod;
4456     drflac_uint8 partitionOrder;
4457     drflac_uint32 samplesInPartition;
4458     drflac_uint32 partitionsRemaining;
4459
4460     DRFLAC_ASSERT(bs != NULL);
4461     DRFLAC_ASSERT(blockSize != 0);
4462
4463     if (!drflac__read_uint8(bs, 2, &residualMethod)) {
4464         return DRFLAC_FALSE;
4465     }
4466
4467     if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
4468         return DRFLAC_FALSE;    /* Unknown or unsupported residual coding method. */
4469     }
4470
4471     if (!drflac__read_uint8(bs, 4, &partitionOrder)) {
4472         return DRFLAC_FALSE;
4473     }
4474
4475     /*
4476     From the FLAC spec:
4477       The Rice partition order in a Rice-coded residual section must be less than or equal to 8.
4478     */
4479     if (partitionOrder > 8) {
4480         return DRFLAC_FALSE;
4481     }
4482
4483     /* Validation check. */
4484     if ((blockSize / (1 << partitionOrder)) <= order) {
4485         return DRFLAC_FALSE;
4486     }
4487
4488     samplesInPartition = (blockSize / (1 << partitionOrder)) - order;
4489     partitionsRemaining = (1 << partitionOrder);
4490     for (;;)
4491     {
4492         drflac_uint8 riceParam = 0;
4493         if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
4494             if (!drflac__read_uint8(bs, 4, &riceParam)) {
4495                 return DRFLAC_FALSE;
4496             }
4497             if (riceParam == 15) {
4498                 riceParam = 0xFF;
4499             }
4500         } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
4501             if (!drflac__read_uint8(bs, 5, &riceParam)) {
4502                 return DRFLAC_FALSE;
4503             }
4504             if (riceParam == 31) {
4505                 riceParam = 0xFF;
4506             }
4507         }
4508
4509         if (riceParam != 0xFF) {
4510             if (!drflac__read_and_seek_residual__rice(bs, samplesInPartition, riceParam)) {
4511                 return DRFLAC_FALSE;
4512             }
4513         } else {
4514             drflac_uint8 unencodedBitsPerSample = 0;
4515             if (!drflac__read_uint8(bs, 5, &unencodedBitsPerSample)) {
4516                 return DRFLAC_FALSE;
4517             }
4518
4519             if (!drflac__seek_bits(bs, unencodedBitsPerSample * samplesInPartition)) {
4520                 return DRFLAC_FALSE;
4521             }
4522         }
4523
4524
4525         if (partitionsRemaining == 1) {
4526             break;
4527         }
4528
4529         partitionsRemaining -= 1;
4530         samplesInPartition = blockSize / (1 << partitionOrder);
4531     }
4532
4533     return DRFLAC_TRUE;
4534 }
4535
4536
4537 static drflac_bool32 drflac__decode_samples__constant(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 subframeBitsPerSample, drflac_int32* pDecodedSamples)
4538 {
4539     drflac_uint32 i;
4540
4541     /* Only a single sample needs to be decoded here. */
4542     drflac_int32 sample;
4543     if (!drflac__read_int32(bs, subframeBitsPerSample, &sample)) {
4544         return DRFLAC_FALSE;
4545     }
4546
4547     /*
4548     We don't really need to expand this, but it does simplify the process of reading samples. If this becomes a performance issue (unlikely)
4549     we'll want to look at a more efficient way.
4550     */
4551     for (i = 0; i < blockSize; ++i) {
4552         pDecodedSamples[i] = sample;
4553     }
4554
4555     return DRFLAC_TRUE;
4556 }
4557
4558 static drflac_bool32 drflac__decode_samples__verbatim(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 subframeBitsPerSample, drflac_int32* pDecodedSamples)
4559 {
4560     drflac_uint32 i;
4561
4562     for (i = 0; i < blockSize; ++i) {
4563         drflac_int32 sample;
4564         if (!drflac__read_int32(bs, subframeBitsPerSample, &sample)) {
4565             return DRFLAC_FALSE;
4566         }
4567
4568         pDecodedSamples[i] = sample;
4569     }
4570
4571     return DRFLAC_TRUE;
4572 }
4573
4574 static drflac_bool32 drflac__decode_samples__fixed(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 subframeBitsPerSample, drflac_uint8 lpcOrder, drflac_int32* pDecodedSamples)
4575 {
4576     drflac_uint32 i;
4577
4578     static drflac_int32 lpcCoefficientsTable[5][4] = {
4579         {0,  0, 0,  0},
4580         {1,  0, 0,  0},
4581         {2, -1, 0,  0},
4582         {3, -3, 1,  0},
4583         {4, -6, 4, -1}
4584     };
4585
4586     /* Warm up samples and coefficients. */
4587     for (i = 0; i < lpcOrder; ++i) {
4588         drflac_int32 sample;
4589         if (!drflac__read_int32(bs, subframeBitsPerSample, &sample)) {
4590             return DRFLAC_FALSE;
4591         }
4592
4593         pDecodedSamples[i] = sample;
4594     }
4595
4596     if (!drflac__decode_samples_with_residual(bs, subframeBitsPerSample, blockSize, lpcOrder, 0, lpcCoefficientsTable[lpcOrder], pDecodedSamples)) {
4597         return DRFLAC_FALSE;
4598     }
4599
4600     return DRFLAC_TRUE;
4601 }
4602
4603 static drflac_bool32 drflac__decode_samples__lpc(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 bitsPerSample, drflac_uint8 lpcOrder, drflac_int32* pDecodedSamples)
4604 {
4605     drflac_uint8 i;
4606     drflac_uint8 lpcPrecision;
4607     drflac_int8 lpcShift;
4608     drflac_int32 coefficients[32];
4609
4610     /* Warm up samples. */
4611     for (i = 0; i < lpcOrder; ++i) {
4612         drflac_int32 sample;
4613         if (!drflac__read_int32(bs, bitsPerSample, &sample)) {
4614             return DRFLAC_FALSE;
4615         }
4616
4617         pDecodedSamples[i] = sample;
4618     }
4619
4620     if (!drflac__read_uint8(bs, 4, &lpcPrecision)) {
4621         return DRFLAC_FALSE;
4622     }
4623     if (lpcPrecision == 15) {
4624         return DRFLAC_FALSE;    /* Invalid. */
4625     }
4626     lpcPrecision += 1;
4627
4628     if (!drflac__read_int8(bs, 5, &lpcShift)) {
4629         return DRFLAC_FALSE;
4630     }
4631
4632     /*
4633     From the FLAC specification:
4634
4635         Quantized linear predictor coefficient shift needed in bits (NOTE: this number is signed two's-complement)
4636
4637     Emphasis on the "signed two's-complement". In practice there does not seem to be any encoders nor decoders supporting negative shifts. For now dr_flac is
4638     not going to support negative shifts as I don't have any reference files. However, when a reference file comes through I will consider adding support.
4639     */
4640     if (lpcShift < 0) {
4641         return DRFLAC_FALSE;
4642     }
4643
4644     DRFLAC_ZERO_MEMORY(coefficients, sizeof(coefficients));
4645     for (i = 0; i < lpcOrder; ++i) {
4646         if (!drflac__read_int32(bs, lpcPrecision, coefficients + i)) {
4647             return DRFLAC_FALSE;
4648         }
4649     }
4650
4651     if (!drflac__decode_samples_with_residual(bs, bitsPerSample, blockSize, lpcOrder, lpcShift, coefficients, pDecodedSamples)) {
4652         return DRFLAC_FALSE;
4653     }
4654
4655     return DRFLAC_TRUE;
4656 }
4657
4658
4659 static drflac_bool32 drflac__read_next_flac_frame_header(drflac_bs* bs, drflac_uint8 streaminfoBitsPerSample, drflac_frame_header* header)
4660 {
4661     const drflac_uint32 sampleRateTable[12]  = {0, 88200, 176400, 192000, 8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000};
4662     const drflac_uint8 bitsPerSampleTable[8] = {0, 8, 12, (drflac_uint8)-1, 16, 20, 24, (drflac_uint8)-1};   /* -1 = reserved. */
4663
4664     DRFLAC_ASSERT(bs != NULL);
4665     DRFLAC_ASSERT(header != NULL);
4666
4667     /* Keep looping until we find a valid sync code. */
4668     for (;;) {
4669         drflac_uint8 crc8 = 0xCE; /* 0xCE = drflac_crc8(0, 0x3FFE, 14); */
4670         drflac_uint8 reserved = 0;
4671         drflac_uint8 blockingStrategy = 0;
4672         drflac_uint8 blockSize = 0;
4673         drflac_uint8 sampleRate = 0;
4674         drflac_uint8 channelAssignment = 0;
4675         drflac_uint8 bitsPerSample = 0;
4676         drflac_bool32 isVariableBlockSize;
4677
4678         if (!drflac__find_and_seek_to_next_sync_code(bs)) {
4679             return DRFLAC_FALSE;
4680         }
4681
4682         if (!drflac__read_uint8(bs, 1, &reserved)) {
4683             return DRFLAC_FALSE;
4684         }
4685         if (reserved == 1) {
4686             continue;
4687         }
4688         crc8 = drflac_crc8(crc8, reserved, 1);
4689
4690         if (!drflac__read_uint8(bs, 1, &blockingStrategy)) {
4691             return DRFLAC_FALSE;
4692         }
4693         crc8 = drflac_crc8(crc8, blockingStrategy, 1);
4694
4695         if (!drflac__read_uint8(bs, 4, &blockSize)) {
4696             return DRFLAC_FALSE;
4697         }
4698         if (blockSize == 0) {
4699             continue;
4700         }
4701         crc8 = drflac_crc8(crc8, blockSize, 4);
4702
4703         if (!drflac__read_uint8(bs, 4, &sampleRate)) {
4704             return DRFLAC_FALSE;
4705         }
4706         crc8 = drflac_crc8(crc8, sampleRate, 4);
4707
4708         if (!drflac__read_uint8(bs, 4, &channelAssignment)) {
4709             return DRFLAC_FALSE;
4710         }
4711         if (channelAssignment > 10) {
4712             continue;
4713         }
4714         crc8 = drflac_crc8(crc8, channelAssignment, 4);
4715
4716         if (!drflac__read_uint8(bs, 3, &bitsPerSample)) {
4717             return DRFLAC_FALSE;
4718         }
4719         if (bitsPerSample == 3 || bitsPerSample == 7) {
4720             continue;
4721         }
4722         crc8 = drflac_crc8(crc8, bitsPerSample, 3);
4723
4724
4725         if (!drflac__read_uint8(bs, 1, &reserved)) {
4726             return DRFLAC_FALSE;
4727         }
4728         if (reserved == 1) {
4729             continue;
4730         }
4731         crc8 = drflac_crc8(crc8, reserved, 1);
4732
4733
4734         isVariableBlockSize = blockingStrategy == 1;
4735         if (isVariableBlockSize) {
4736             drflac_uint64 pcmFrameNumber;
4737             drflac_result result = drflac__read_utf8_coded_number(bs, &pcmFrameNumber, &crc8);
4738             if (result != DRFLAC_SUCCESS) {
4739                 if (result == DRFLAC_AT_END) {
4740                     return DRFLAC_FALSE;
4741                 } else {
4742                     continue;
4743                 }
4744             }
4745             header->flacFrameNumber  = 0;
4746             header->pcmFrameNumber = pcmFrameNumber;
4747         } else {
4748             drflac_uint64 flacFrameNumber = 0;
4749             drflac_result result = drflac__read_utf8_coded_number(bs, &flacFrameNumber, &crc8);
4750             if (result != DRFLAC_SUCCESS) {
4751                 if (result == DRFLAC_AT_END) {
4752                     return DRFLAC_FALSE;
4753                 } else {
4754                     continue;
4755                 }
4756             }
4757             header->flacFrameNumber  = (drflac_uint32)flacFrameNumber;   /* <-- Safe cast. */
4758             header->pcmFrameNumber = 0;
4759         }
4760
4761
4762         DRFLAC_ASSERT(blockSize > 0);
4763         if (blockSize == 1) {
4764             header->blockSizeInPCMFrames = 192;
4765         } else if (blockSize <= 5) {
4766             DRFLAC_ASSERT(blockSize >= 2);
4767             header->blockSizeInPCMFrames = 576 * (1 << (blockSize - 2));
4768         } else if (blockSize == 6) {
4769             if (!drflac__read_uint16(bs, 8, &header->blockSizeInPCMFrames)) {
4770                 return DRFLAC_FALSE;
4771             }
4772             crc8 = drflac_crc8(crc8, header->blockSizeInPCMFrames, 8);
4773             header->blockSizeInPCMFrames += 1;
4774         } else if (blockSize == 7) {
4775             if (!drflac__read_uint16(bs, 16, &header->blockSizeInPCMFrames)) {
4776                 return DRFLAC_FALSE;
4777             }
4778             crc8 = drflac_crc8(crc8, header->blockSizeInPCMFrames, 16);
4779             header->blockSizeInPCMFrames += 1;
4780         } else {
4781             DRFLAC_ASSERT(blockSize >= 8);
4782             header->blockSizeInPCMFrames = 256 * (1 << (blockSize - 8));
4783         }
4784
4785
4786         if (sampleRate <= 11) {
4787             header->sampleRate = sampleRateTable[sampleRate];
4788         } else if (sampleRate == 12) {
4789             if (!drflac__read_uint32(bs, 8, &header->sampleRate)) {
4790                 return DRFLAC_FALSE;
4791             }
4792             crc8 = drflac_crc8(crc8, header->sampleRate, 8);
4793             header->sampleRate *= 1000;
4794         } else if (sampleRate == 13) {
4795             if (!drflac__read_uint32(bs, 16, &header->sampleRate)) {
4796                 return DRFLAC_FALSE;
4797             }
4798             crc8 = drflac_crc8(crc8, header->sampleRate, 16);
4799         } else if (sampleRate == 14) {
4800             if (!drflac__read_uint32(bs, 16, &header->sampleRate)) {
4801                 return DRFLAC_FALSE;
4802             }
4803             crc8 = drflac_crc8(crc8, header->sampleRate, 16);
4804             header->sampleRate *= 10;
4805         } else {
4806             continue;  /* Invalid. Assume an invalid block. */
4807         }
4808
4809
4810         header->channelAssignment = channelAssignment;
4811
4812         header->bitsPerSample = bitsPerSampleTable[bitsPerSample];
4813         if (header->bitsPerSample == 0) {
4814             header->bitsPerSample = streaminfoBitsPerSample;
4815         }
4816
4817         if (!drflac__read_uint8(bs, 8, &header->crc8)) {
4818             return DRFLAC_FALSE;
4819         }
4820
4821 #ifndef DR_FLAC_NO_CRC
4822         if (header->crc8 != crc8) {
4823             continue;    /* CRC mismatch. Loop back to the top and find the next sync code. */
4824         }
4825 #endif
4826         return DRFLAC_TRUE;
4827     }
4828 }
4829
4830 static drflac_bool32 drflac__read_subframe_header(drflac_bs* bs, drflac_subframe* pSubframe)
4831 {
4832     drflac_uint8 header;
4833     int type;
4834
4835     if (!drflac__read_uint8(bs, 8, &header)) {
4836         return DRFLAC_FALSE;
4837     }
4838
4839     /* First bit should always be 0. */
4840     if ((header & 0x80) != 0) {
4841         return DRFLAC_FALSE;
4842     }
4843
4844     type = (header & 0x7E) >> 1;
4845     if (type == 0) {
4846         pSubframe->subframeType = DRFLAC_SUBFRAME_CONSTANT;
4847     } else if (type == 1) {
4848         pSubframe->subframeType = DRFLAC_SUBFRAME_VERBATIM;
4849     } else {
4850         if ((type & 0x20) != 0) {
4851             pSubframe->subframeType = DRFLAC_SUBFRAME_LPC;
4852             pSubframe->lpcOrder = (drflac_uint8)(type & 0x1F) + 1;
4853         } else if ((type & 0x08) != 0) {
4854             pSubframe->subframeType = DRFLAC_SUBFRAME_FIXED;
4855             pSubframe->lpcOrder = (drflac_uint8)(type & 0x07);
4856             if (pSubframe->lpcOrder > 4) {
4857                 pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
4858                 pSubframe->lpcOrder = 0;
4859             }
4860         } else {
4861             pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
4862         }
4863     }
4864
4865     if (pSubframe->subframeType == DRFLAC_SUBFRAME_RESERVED) {
4866         return DRFLAC_FALSE;
4867     }
4868
4869     /* Wasted bits per sample. */
4870     pSubframe->wastedBitsPerSample = 0;
4871     if ((header & 0x01) == 1) {
4872         unsigned int wastedBitsPerSample;
4873         if (!drflac__seek_past_next_set_bit(bs, &wastedBitsPerSample)) {
4874             return DRFLAC_FALSE;
4875         }
4876         pSubframe->wastedBitsPerSample = (drflac_uint8)wastedBitsPerSample + 1;
4877     }
4878
4879     return DRFLAC_TRUE;
4880 }
4881
4882 static drflac_bool32 drflac__decode_subframe(drflac_bs* bs, drflac_frame* frame, int subframeIndex, drflac_int32* pDecodedSamplesOut)
4883 {
4884     drflac_subframe* pSubframe;
4885     drflac_uint32 subframeBitsPerSample;
4886
4887     DRFLAC_ASSERT(bs != NULL);
4888     DRFLAC_ASSERT(frame != NULL);
4889
4890     pSubframe = frame->subframes + subframeIndex;
4891     if (!drflac__read_subframe_header(bs, pSubframe)) {
4892         return DRFLAC_FALSE;
4893     }
4894
4895     /* Side channels require an extra bit per sample. Took a while to figure that one out... */
4896     subframeBitsPerSample = frame->header.bitsPerSample;
4897     if ((frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
4898         subframeBitsPerSample += 1;
4899     } else if (frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
4900         subframeBitsPerSample += 1;
4901     }
4902
4903     /* Need to handle wasted bits per sample. */
4904     if (pSubframe->wastedBitsPerSample >= subframeBitsPerSample) {
4905         return DRFLAC_FALSE;
4906     }
4907     subframeBitsPerSample -= pSubframe->wastedBitsPerSample;
4908
4909     pSubframe->pSamplesS32 = pDecodedSamplesOut;
4910
4911     switch (pSubframe->subframeType)
4912     {
4913         case DRFLAC_SUBFRAME_CONSTANT:
4914         {
4915             drflac__decode_samples__constant(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->pSamplesS32);
4916         } break;
4917
4918         case DRFLAC_SUBFRAME_VERBATIM:
4919         {
4920             drflac__decode_samples__verbatim(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->pSamplesS32);
4921         } break;
4922
4923         case DRFLAC_SUBFRAME_FIXED:
4924         {
4925             drflac__decode_samples__fixed(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->lpcOrder, pSubframe->pSamplesS32);
4926         } break;
4927
4928         case DRFLAC_SUBFRAME_LPC:
4929         {
4930             drflac__decode_samples__lpc(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->lpcOrder, pSubframe->pSamplesS32);
4931         } break;
4932
4933         default: return DRFLAC_FALSE;
4934     }
4935
4936     return DRFLAC_TRUE;
4937 }
4938
4939 static drflac_bool32 drflac__seek_subframe(drflac_bs* bs, drflac_frame* frame, int subframeIndex)
4940 {
4941     drflac_subframe* pSubframe;
4942     drflac_uint32 subframeBitsPerSample;
4943
4944     DRFLAC_ASSERT(bs != NULL);
4945     DRFLAC_ASSERT(frame != NULL);
4946
4947     pSubframe = frame->subframes + subframeIndex;
4948     if (!drflac__read_subframe_header(bs, pSubframe)) {
4949         return DRFLAC_FALSE;
4950     }
4951
4952     /* Side channels require an extra bit per sample. Took a while to figure that one out... */
4953     subframeBitsPerSample = frame->header.bitsPerSample;
4954     if ((frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
4955         subframeBitsPerSample += 1;
4956     } else if (frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
4957         subframeBitsPerSample += 1;
4958     }
4959
4960     /* Need to handle wasted bits per sample. */
4961     if (pSubframe->wastedBitsPerSample >= subframeBitsPerSample) {
4962         return DRFLAC_FALSE;
4963     }
4964     subframeBitsPerSample -= pSubframe->wastedBitsPerSample;
4965
4966     pSubframe->pSamplesS32 = NULL;
4967
4968     switch (pSubframe->subframeType)
4969     {
4970         case DRFLAC_SUBFRAME_CONSTANT:
4971         {
4972             if (!drflac__seek_bits(bs, subframeBitsPerSample)) {
4973                 return DRFLAC_FALSE;
4974             }
4975         } break;
4976
4977         case DRFLAC_SUBFRAME_VERBATIM:
4978         {
4979             unsigned int bitsToSeek = frame->header.blockSizeInPCMFrames * subframeBitsPerSample;
4980             if (!drflac__seek_bits(bs, bitsToSeek)) {
4981                 return DRFLAC_FALSE;
4982             }
4983         } break;
4984
4985         case DRFLAC_SUBFRAME_FIXED:
4986         {
4987             unsigned int bitsToSeek = pSubframe->lpcOrder * subframeBitsPerSample;
4988             if (!drflac__seek_bits(bs, bitsToSeek)) {
4989                 return DRFLAC_FALSE;
4990             }
4991
4992             if (!drflac__read_and_seek_residual(bs, frame->header.blockSizeInPCMFrames, pSubframe->lpcOrder)) {
4993                 return DRFLAC_FALSE;
4994             }
4995         } break;
4996
4997         case DRFLAC_SUBFRAME_LPC:
4998         {
4999             drflac_uint8 lpcPrecision;
5000
5001             unsigned int bitsToSeek = pSubframe->lpcOrder * subframeBitsPerSample;
5002             if (!drflac__seek_bits(bs, bitsToSeek)) {
5003                 return DRFLAC_FALSE;
5004             }
5005
5006             if (!drflac__read_uint8(bs, 4, &lpcPrecision)) {
5007                 return DRFLAC_FALSE;
5008             }
5009             if (lpcPrecision == 15) {
5010                 return DRFLAC_FALSE;    /* Invalid. */
5011             }
5012             lpcPrecision += 1;
5013
5014
5015             bitsToSeek = (pSubframe->lpcOrder * lpcPrecision) + 5;    /* +5 for shift. */
5016             if (!drflac__seek_bits(bs, bitsToSeek)) {
5017                 return DRFLAC_FALSE;
5018             }
5019
5020             if (!drflac__read_and_seek_residual(bs, frame->header.blockSizeInPCMFrames, pSubframe->lpcOrder)) {
5021                 return DRFLAC_FALSE;
5022             }
5023         } break;
5024
5025         default: return DRFLAC_FALSE;
5026     }
5027
5028     return DRFLAC_TRUE;
5029 }
5030
5031
5032 static DRFLAC_INLINE drflac_uint8 drflac__get_channel_count_from_channel_assignment(drflac_int8 channelAssignment)
5033 {
5034     drflac_uint8 lookup[] = {1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2};
5035
5036     DRFLAC_ASSERT(channelAssignment <= 10);
5037     return lookup[channelAssignment];
5038 }
5039
5040 static drflac_result drflac__decode_flac_frame(drflac* pFlac)
5041 {
5042     int channelCount;
5043     int i;
5044     drflac_uint8 paddingSizeInBits;
5045     drflac_uint16 desiredCRC16;
5046 #ifndef DR_FLAC_NO_CRC
5047     drflac_uint16 actualCRC16;
5048 #endif
5049
5050     /* This function should be called while the stream is sitting on the first byte after the frame header. */
5051     DRFLAC_ZERO_MEMORY(pFlac->currentFLACFrame.subframes, sizeof(pFlac->currentFLACFrame.subframes));
5052
5053     /* The frame block size must never be larger than the maximum block size defined by the FLAC stream. */
5054     if (pFlac->currentFLACFrame.header.blockSizeInPCMFrames > pFlac->maxBlockSizeInPCMFrames) {
5055         return DRFLAC_ERROR;
5056     }
5057
5058     /* The number of channels in the frame must match the channel count from the STREAMINFO block. */
5059     channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
5060     if (channelCount != (int)pFlac->channels) {
5061         return DRFLAC_ERROR;
5062     }
5063
5064     for (i = 0; i < channelCount; ++i) {
5065         if (!drflac__decode_subframe(&pFlac->bs, &pFlac->currentFLACFrame, i, pFlac->pDecodedSamples + (pFlac->currentFLACFrame.header.blockSizeInPCMFrames * i))) {
5066             return DRFLAC_ERROR;
5067         }
5068     }
5069
5070     paddingSizeInBits = (drflac_uint8)(DRFLAC_CACHE_L1_BITS_REMAINING(&pFlac->bs) & 7);
5071     if (paddingSizeInBits > 0) {
5072         drflac_uint8 padding = 0;
5073         if (!drflac__read_uint8(&pFlac->bs, paddingSizeInBits, &padding)) {
5074             return DRFLAC_AT_END;
5075         }
5076     }
5077
5078 #ifndef DR_FLAC_NO_CRC
5079     actualCRC16 = drflac__flush_crc16(&pFlac->bs);
5080 #endif
5081     if (!drflac__read_uint16(&pFlac->bs, 16, &desiredCRC16)) {
5082         return DRFLAC_AT_END;
5083     }
5084
5085 #ifndef DR_FLAC_NO_CRC
5086     if (actualCRC16 != desiredCRC16) {
5087         return DRFLAC_CRC_MISMATCH;    /* CRC mismatch. */
5088     }
5089 #endif
5090
5091     pFlac->currentFLACFrame.pcmFramesRemaining = pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
5092
5093     return DRFLAC_SUCCESS;
5094 }
5095
5096 static drflac_result drflac__seek_flac_frame(drflac* pFlac)
5097 {
5098     int channelCount;
5099     int i;
5100     drflac_uint16 desiredCRC16;
5101 #ifndef DR_FLAC_NO_CRC
5102     drflac_uint16 actualCRC16;
5103 #endif
5104
5105     channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
5106     for (i = 0; i < channelCount; ++i) {
5107         if (!drflac__seek_subframe(&pFlac->bs, &pFlac->currentFLACFrame, i)) {
5108             return DRFLAC_ERROR;
5109         }
5110     }
5111
5112     /* Padding. */
5113     if (!drflac__seek_bits(&pFlac->bs, DRFLAC_CACHE_L1_BITS_REMAINING(&pFlac->bs) & 7)) {
5114         return DRFLAC_ERROR;
5115     }
5116
5117     /* CRC. */
5118 #ifndef DR_FLAC_NO_CRC
5119     actualCRC16 = drflac__flush_crc16(&pFlac->bs);
5120 #endif
5121     if (!drflac__read_uint16(&pFlac->bs, 16, &desiredCRC16)) {
5122         return DRFLAC_AT_END;
5123     }
5124
5125 #ifndef DR_FLAC_NO_CRC
5126     if (actualCRC16 != desiredCRC16) {
5127         return DRFLAC_CRC_MISMATCH;    /* CRC mismatch. */
5128     }
5129 #endif
5130
5131     return DRFLAC_SUCCESS;
5132 }
5133
5134 static drflac_bool32 drflac__read_and_decode_next_flac_frame(drflac* pFlac)
5135 {
5136     DRFLAC_ASSERT(pFlac != NULL);
5137
5138     for (;;) {
5139         drflac_result result;
5140
5141         if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
5142             return DRFLAC_FALSE;
5143         }
5144
5145         result = drflac__decode_flac_frame(pFlac);
5146         if (result != DRFLAC_SUCCESS) {
5147             if (result == DRFLAC_CRC_MISMATCH) {
5148                 continue;   /* CRC mismatch. Skip to the next frame. */
5149             } else {
5150                 return DRFLAC_FALSE;
5151             }
5152         }
5153
5154         return DRFLAC_TRUE;
5155     }
5156 }
5157
5158 static void drflac__get_pcm_frame_range_of_current_flac_frame(drflac* pFlac, drflac_uint64* pFirstPCMFrame, drflac_uint64* pLastPCMFrame)
5159 {
5160     drflac_uint64 firstPCMFrame;
5161     drflac_uint64 lastPCMFrame;
5162
5163     DRFLAC_ASSERT(pFlac != NULL);
5164
5165     firstPCMFrame = pFlac->currentFLACFrame.header.pcmFrameNumber;
5166     if (firstPCMFrame == 0) {
5167         firstPCMFrame = ((drflac_uint64)pFlac->currentFLACFrame.header.flacFrameNumber) * pFlac->maxBlockSizeInPCMFrames;
5168     }
5169
5170     lastPCMFrame = firstPCMFrame + pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
5171     if (lastPCMFrame > 0) {
5172         lastPCMFrame -= 1; /* Needs to be zero based. */
5173     }
5174
5175     if (pFirstPCMFrame) {
5176         *pFirstPCMFrame = firstPCMFrame;
5177     }
5178     if (pLastPCMFrame) {
5179         *pLastPCMFrame = lastPCMFrame;
5180     }
5181 }
5182
5183 static drflac_bool32 drflac__seek_to_first_frame(drflac* pFlac)
5184 {
5185     drflac_bool32 result;
5186
5187     DRFLAC_ASSERT(pFlac != NULL);
5188
5189     result = drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes);
5190
5191     DRFLAC_ZERO_MEMORY(&pFlac->currentFLACFrame, sizeof(pFlac->currentFLACFrame));
5192     pFlac->currentPCMFrame = 0;
5193
5194     return result;
5195 }
5196
5197 static DRFLAC_INLINE drflac_result drflac__seek_to_next_flac_frame(drflac* pFlac)
5198 {
5199     /* This function should only ever be called while the decoder is sitting on the first byte past the FRAME_HEADER section. */
5200     DRFLAC_ASSERT(pFlac != NULL);
5201     return drflac__seek_flac_frame(pFlac);
5202 }
5203
5204
5205 static drflac_uint64 drflac__seek_forward_by_pcm_frames(drflac* pFlac, drflac_uint64 pcmFramesToSeek)
5206 {
5207     drflac_uint64 pcmFramesRead = 0;
5208     while (pcmFramesToSeek > 0) {
5209         if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
5210             if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
5211                 break;  /* Couldn't read the next frame, so just break from the loop and return. */
5212             }
5213         } else {
5214             if (pFlac->currentFLACFrame.pcmFramesRemaining > pcmFramesToSeek) {
5215                 pcmFramesRead   += pcmFramesToSeek;
5216                 pFlac->currentFLACFrame.pcmFramesRemaining -= (drflac_uint32)pcmFramesToSeek;   /* <-- Safe cast. Will always be < currentFrame.pcmFramesRemaining < 65536. */
5217                 pcmFramesToSeek  = 0;
5218             } else {
5219                 pcmFramesRead   += pFlac->currentFLACFrame.pcmFramesRemaining;
5220                 pcmFramesToSeek -= pFlac->currentFLACFrame.pcmFramesRemaining;
5221                 pFlac->currentFLACFrame.pcmFramesRemaining = 0;
5222             }
5223         }
5224     }
5225
5226     pFlac->currentPCMFrame += pcmFramesRead;
5227     return pcmFramesRead;
5228 }
5229
5230
/*
Seeks to the PCM frame at index pcmFrameIndex by walking the stream one FLAC frame at a time.

When seeking forward the walk starts from the current position. When seeking backward the stream is first rewound to the
first FLAC frame. Frames that do not contain the target are skipped without decoding their samples. The frame that does
contain the target is fully decoded and then the leading PCM frames are skipped to land exactly on the target. A frame
that fails its CRC check is ignored as if it were not in the stream.

Returns DRFLAC_TRUE if the decoder is now positioned on the target. Returns DRFLAC_FALSE if a frame header cannot be read,
if rewinding to the first frame fails, if decoding or skipping a frame fails for any reason other than a CRC mismatch, or
if fewer PCM frames than required could be skipped.
*/
static drflac_bool32 drflac__seek_to_pcm_frame__brute_force(drflac* pFlac, drflac_uint64 pcmFrameIndex)
{
    drflac_bool32 isMidFrame = DRFLAC_FALSE;    /* Set when the walk starts from inside a frame that has already been decoded. */
    drflac_uint64 runningPCMFrameCount;         /* The PCM frame index the walk has accounted for so far. */

    DRFLAC_ASSERT(pFlac != NULL);

    /* If we are seeking forward we start from the current position. Otherwise we need to start all the way from the start of the file. */
    if (pcmFrameIndex >= pFlac->currentPCMFrame) {
        /* Seeking forward. Need to seek from the current position. */
        runningPCMFrameCount = pFlac->currentPCMFrame;

        /* The frame header for the first frame may not yet have been read. We need to do that if necessary. */
        if (pFlac->currentPCMFrame == 0 && pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
            if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
                return DRFLAC_FALSE;
            }
        } else {
            isMidFrame = DRFLAC_TRUE;
        }
    } else {
        /* Seeking backwards. Need to seek from the start of the file. */
        runningPCMFrameCount = 0;

        /* Move back to the start. */
        if (!drflac__seek_to_first_frame(pFlac)) {
            return DRFLAC_FALSE;
        }

        /* Decode the first frame in preparation for sample-exact seeking below. */
        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
            return DRFLAC_FALSE;
        }
    }

    /*
    We need to as quickly as possible find the frame that contains the target sample. To do this, we iterate over each frame and inspect its
    header. If based on the header we can determine that the frame contains the sample, we do a full decode of that frame.
    */
    for (;;) {
        drflac_uint64 pcmFrameCountInThisFLACFrame;
        drflac_uint64 firstPCMFrameInFLACFrame = 0;
        drflac_uint64 lastPCMFrameInFLACFrame = 0;

        drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);

        /* The range is inclusive at both ends, hence the +1. */
        pcmFrameCountInThisFLACFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;
        if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFLACFrame)) {
            /*
            The sample should be in this frame. We need to fully decode it, however if it's an invalid frame (a CRC mismatch), we need to pretend
            it never existed and keep iterating.
            */
            drflac_uint64 pcmFramesToDecode = pcmFrameIndex - runningPCMFrameCount;

            if (!isMidFrame) {
                drflac_result result = drflac__decode_flac_frame(pFlac);
                if (result == DRFLAC_SUCCESS) {
                    /* The frame is valid. We just need to skip over some samples to ensure it's sample-exact. */
                    return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;  /* <-- If this fails, something bad has happened (it should never fail). */
                } else {
                    if (result == DRFLAC_CRC_MISMATCH) {
                        goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
                    } else {
                        return DRFLAC_FALSE;
                    }
                }
            } else {
                /* We started seeking mid-frame which means we need to skip the frame decoding part. */
                return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;
            }
        } else {
            /*
            It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
            frame never existed and leave the running sample count untouched.
            */
            if (!isMidFrame) {
                drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
                if (result == DRFLAC_SUCCESS) {
                    runningPCMFrameCount += pcmFrameCountInThisFLACFrame;
                } else {
                    if (result == DRFLAC_CRC_MISMATCH) {
                        goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
                    } else {
                        return DRFLAC_FALSE;
                    }
                }
            } else {
                /*
                We started seeking mid-frame which means we need to seek by reading to the end of the frame instead of with
                drflac__seek_to_next_flac_frame() which only works if the decoder is sitting on the byte just after the frame header.
                */
                runningPCMFrameCount += pFlac->currentFLACFrame.pcmFramesRemaining;
                pFlac->currentFLACFrame.pcmFramesRemaining = 0;
                isMidFrame = DRFLAC_FALSE;  /* Only the starting frame can be mid-frame. Every frame after it is handled from its header. */
            }

            /* If we are seeking to the end of the file and we've just hit it, we're done. */
            if (pcmFrameIndex == pFlac->totalPCMFrameCount && runningPCMFrameCount == pFlac->totalPCMFrameCount) {
                return DRFLAC_TRUE;
            }
        }

    next_iteration:
        /* Grab the next frame in preparation for the next iteration. */
        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
            return DRFLAC_FALSE;
        }
    }
}
5340
5341
5342 #if !defined(DR_FLAC_NO_CRC)
5343 /*
5344 We use an average compression ratio to determine our approximate start location. FLAC files are generally about 50%-70% the size of their
5345 uncompressed counterparts so we'll use this as a basis. I'm going to split the middle and use a factor of 0.6 to determine the starting
5346 location.
5347 */
5348 #define DRFLAC_BINARY_SEARCH_APPROX_COMPRESSION_RATIO 0.6f
5349
5350 static drflac_bool32 drflac__seek_to_approximate_flac_frame_to_byte(drflac* pFlac, drflac_uint64 targetByte, drflac_uint64 rangeLo, drflac_uint64 rangeHi, drflac_uint64* pLastSuccessfulSeekOffset)
5351 {
5352     DRFLAC_ASSERT(pFlac != NULL);
5353     DRFLAC_ASSERT(pLastSuccessfulSeekOffset != NULL);
5354     DRFLAC_ASSERT(targetByte >= rangeLo);
5355     DRFLAC_ASSERT(targetByte <= rangeHi);
5356
5357     *pLastSuccessfulSeekOffset = pFlac->firstFLACFramePosInBytes;
5358
5359     for (;;) {
5360         /* After rangeLo == rangeHi == targetByte fails, we need to break out. */
5361         drflac_uint64 lastTargetByte = targetByte;
5362
5363         /* When seeking to a byte, failure probably means we've attempted to seek beyond the end of the stream. To counter this we just halve it each attempt. */
5364         if (!drflac__seek_to_byte(&pFlac->bs, targetByte)) {
5365             /* If we couldn't even seek to the first byte in the stream we have a problem. Just abandon the whole thing. */
5366             if (targetByte == 0) {
5367                 drflac__seek_to_first_frame(pFlac); /* Try to recover. */
5368                 return DRFLAC_FALSE;
5369             }
5370
5371             /* Halve the byte location and continue. */
5372             targetByte = rangeLo + ((rangeHi - rangeLo)/2);
5373             rangeHi = targetByte;
5374         } else {
5375             /* Getting here should mean that we have seeked to an appropriate byte. */
5376
5377             /* Clear the details of the FLAC frame so we don't misreport data. */
5378             DRFLAC_ZERO_MEMORY(&pFlac->currentFLACFrame, sizeof(pFlac->currentFLACFrame));
5379
5380             /*
5381             Now seek to the next FLAC frame. We need to decode the entire frame (not just the header) because it's possible for the header to incorrectly pass the
5382             CRC check and return bad data. We need to decode the entire frame to be more certain. Although this seems unlikely, this has happened to me in testing
5383             so it needs to stay this way for now.
5384             */
5385 #if 1
5386             if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
5387                 /* Halve the byte location and continue. */
5388                 targetByte = rangeLo + ((rangeHi - rangeLo)/2);
5389                 rangeHi = targetByte;
5390             } else {
5391                 break;
5392             }
5393 #else
5394             if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
5395                 /* Halve the byte location and continue. */
5396                 targetByte = rangeLo + ((rangeHi - rangeLo)/2);
5397                 rangeHi = targetByte;
5398             } else {
5399                 break;
5400             }
5401 #endif
5402         }
5403
5404         /* We already tried this byte and there are no more to try, break out. */
5405         if(targetByte == lastTargetByte) {
5406             return DRFLAC_FALSE;
5407         }
5408     }
5409
5410     /* The current PCM frame needs to be updated based on the frame we just seeked to. */
5411     drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &pFlac->currentPCMFrame, NULL);
5412
5413     DRFLAC_ASSERT(targetByte <= rangeHi);
5414
5415     *pLastSuccessfulSeekOffset = targetByte;
5416     return DRFLAC_TRUE;
5417 }
5418
5419 static drflac_bool32 drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(drflac* pFlac, drflac_uint64 offset)
5420 {
5421     return drflac__seek_forward_by_pcm_frames(pFlac, offset) == offset;
5422 }
5423
5424
5425 static drflac_bool32 drflac__seek_to_pcm_frame__binary_search_internal(drflac* pFlac, drflac_uint64 pcmFrameIndex, drflac_uint64 byteRangeLo, drflac_uint64 byteRangeHi)
5426 {
5427     /* This assumes pFlac->currentPCMFrame is sitting on byteRangeLo upon entry. */
5428
5429     drflac_uint64 targetByte;
5430     drflac_uint64 pcmRangeLo = pFlac->totalPCMFrameCount;
5431     drflac_uint64 pcmRangeHi = 0;
5432     drflac_uint64 lastSuccessfulSeekOffset = (drflac_uint64)-1;
5433     drflac_uint64 closestSeekOffsetBeforeTargetPCMFrame = byteRangeLo;
5434     drflac_uint32 seekForwardThreshold = (pFlac->maxBlockSizeInPCMFrames != 0) ? pFlac->maxBlockSizeInPCMFrames*2 : 4096;
5435
5436     targetByte = byteRangeLo + (drflac_uint64)(((drflac_int64)((pcmFrameIndex - pFlac->currentPCMFrame) * pFlac->channels * pFlac->bitsPerSample)/8.0f) * DRFLAC_BINARY_SEARCH_APPROX_COMPRESSION_RATIO);
5437     if (targetByte > byteRangeHi) {
5438         targetByte = byteRangeHi;
5439     }
5440
5441     for (;;) {
5442         if (drflac__seek_to_approximate_flac_frame_to_byte(pFlac, targetByte, byteRangeLo, byteRangeHi, &lastSuccessfulSeekOffset)) {
5443             /* We found a FLAC frame. We need to check if it contains the sample we're looking for. */
5444             drflac_uint64 newPCMRangeLo;
5445             drflac_uint64 newPCMRangeHi;
5446             drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &newPCMRangeLo, &newPCMRangeHi);
5447
5448             /* If we selected the same frame, it means we should be pretty close. Just decode the rest. */
5449             if (pcmRangeLo == newPCMRangeLo) {
5450                 if (!drflac__seek_to_approximate_flac_frame_to_byte(pFlac, closestSeekOffsetBeforeTargetPCMFrame, closestSeekOffsetBeforeTargetPCMFrame, byteRangeHi, &lastSuccessfulSeekOffset)) {
5451                     break;  /* Failed to seek to closest frame. */
5452                 }
5453
5454                 if (drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame)) {
5455                     return DRFLAC_TRUE;
5456                 } else {
5457                     break;  /* Failed to seek forward. */
5458                 }
5459             }
5460
5461             pcmRangeLo = newPCMRangeLo;
5462             pcmRangeHi = newPCMRangeHi;
5463
5464             if (pcmRangeLo <= pcmFrameIndex && pcmRangeHi >= pcmFrameIndex) {
5465                 /* The target PCM frame is in this FLAC frame. */
5466                 if (drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame) ) {
5467                     return DRFLAC_TRUE;
5468                 } else {
5469                     break;  /* Failed to seek to FLAC frame. */
5470                 }
5471             } else {
5472                 const float approxCompressionRatio = (drflac_int64)(lastSuccessfulSeekOffset - pFlac->firstFLACFramePosInBytes) / ((drflac_int64)(pcmRangeLo * pFlac->channels * pFlac->bitsPerSample)/8.0f);
5473
5474                 if (pcmRangeLo > pcmFrameIndex) {
5475                     /* We seeked too far forward. We need to move our target byte backward and try again. */
5476                     byteRangeHi = lastSuccessfulSeekOffset;
5477                     if (byteRangeLo > byteRangeHi) {
5478                         byteRangeLo = byteRangeHi;
5479                     }
5480
5481                     targetByte = byteRangeLo + ((byteRangeHi - byteRangeLo) / 2);
5482                     if (targetByte < byteRangeLo) {
5483                         targetByte = byteRangeLo;
5484                     }
5485                 } else /*if (pcmRangeHi < pcmFrameIndex)*/ {
5486                     /* We didn't seek far enough. We need to move our target byte forward and try again. */
5487
5488                     /* If we're close enough we can just seek forward. */
5489                     if ((pcmFrameIndex - pcmRangeLo) < seekForwardThreshold) {
5490                         if (drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame)) {
5491                             return DRFLAC_TRUE;
5492                         } else {
5493                             break;  /* Failed to seek to FLAC frame. */
5494                         }
5495                     } else {
5496                         byteRangeLo = lastSuccessfulSeekOffset;
5497                         if (byteRangeHi < byteRangeLo) {
5498                             byteRangeHi = byteRangeLo;
5499                         }
5500
5501                         targetByte = lastSuccessfulSeekOffset + (drflac_uint64)(((drflac_int64)((pcmFrameIndex-pcmRangeLo) * pFlac->channels * pFlac->bitsPerSample)/8.0f) * approxCompressionRatio);
5502                         if (targetByte > byteRangeHi) {
5503                             targetByte = byteRangeHi;
5504                         }
5505
5506                         if (closestSeekOffsetBeforeTargetPCMFrame < lastSuccessfulSeekOffset) {
5507                             closestSeekOffsetBeforeTargetPCMFrame = lastSuccessfulSeekOffset;
5508                         }
5509                     }
5510                 }
5511             }
5512         } else {
5513             /* Getting here is really bad. We just recover as best we can, but moving to the first frame in the stream, and then abort. */
5514             break;
5515         }
5516     }
5517
5518     drflac__seek_to_first_frame(pFlac); /* <-- Try to recover. */
5519     return DRFLAC_FALSE;
5520 }
5521
5522 static drflac_bool32 drflac__seek_to_pcm_frame__binary_search(drflac* pFlac, drflac_uint64 pcmFrameIndex)
5523 {
5524     drflac_uint64 byteRangeLo;
5525     drflac_uint64 byteRangeHi;
5526     drflac_uint32 seekForwardThreshold = (pFlac->maxBlockSizeInPCMFrames != 0) ? pFlac->maxBlockSizeInPCMFrames*2 : 4096;
5527
5528     /* Our algorithm currently assumes the FLAC stream is currently sitting at the start. */
5529     if (drflac__seek_to_first_frame(pFlac) == DRFLAC_FALSE) {
5530         return DRFLAC_FALSE;
5531     }
5532
5533     /* If we're close enough to the start, just move to the start and seek forward. */
5534     if (pcmFrameIndex < seekForwardThreshold) {
5535         return drflac__seek_forward_by_pcm_frames(pFlac, pcmFrameIndex) == pcmFrameIndex;
5536     }
5537
5538     /*
5539     Our starting byte range is the byte position of the first FLAC frame and the approximate end of the file as if it were completely uncompressed. This ensures
5540     the entire file is included, even though most of the time it'll exceed the end of the actual stream. This is OK as the frame searching logic will handle it.
5541     */
5542     byteRangeLo = pFlac->firstFLACFramePosInBytes;
5543     byteRangeHi = pFlac->firstFLACFramePosInBytes + (drflac_uint64)((drflac_int64)(pFlac->totalPCMFrameCount * pFlac->channels * pFlac->bitsPerSample)/8.0f);
5544
5545     return drflac__seek_to_pcm_frame__binary_search_internal(pFlac, pcmFrameIndex, byteRangeLo, byteRangeHi);
5546 }
5547 #endif  /* !DR_FLAC_NO_CRC */
5548
5549 static drflac_bool32 drflac__seek_to_pcm_frame__seek_table(drflac* pFlac, drflac_uint64 pcmFrameIndex)
5550 {
5551     drflac_uint32 iClosestSeekpoint = 0;
5552     drflac_bool32 isMidFrame = DRFLAC_FALSE;
5553     drflac_uint64 runningPCMFrameCount;
5554     drflac_uint32 iSeekpoint;
5555
5556
5557     DRFLAC_ASSERT(pFlac != NULL);
5558
5559     if (pFlac->pSeekpoints == NULL || pFlac->seekpointCount == 0) {
5560         return DRFLAC_FALSE;
5561     }
5562
5563     for (iSeekpoint = 0; iSeekpoint < pFlac->seekpointCount; ++iSeekpoint) {
5564         if (pFlac->pSeekpoints[iSeekpoint].firstPCMFrame >= pcmFrameIndex) {
5565             break;
5566         }
5567
5568         iClosestSeekpoint = iSeekpoint;
5569     }
5570
5571     /* There's been cases where the seek table contains only zeros. We need to do some basic validation on the closest seekpoint. */
5572     if (pFlac->pSeekpoints[iClosestSeekpoint].pcmFrameCount == 0 || pFlac->pSeekpoints[iClosestSeekpoint].pcmFrameCount > pFlac->maxBlockSizeInPCMFrames) {
5573         return DRFLAC_FALSE;
5574     }
5575     if (pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame > pFlac->totalPCMFrameCount && pFlac->totalPCMFrameCount > 0) {
5576         return DRFLAC_FALSE;
5577     }
5578
5579 #if !defined(DR_FLAC_NO_CRC)
5580     /* At this point we should know the closest seek point. We can use a binary search for this. We need to know the total sample count for this. */
5581     if (pFlac->totalPCMFrameCount > 0) {
5582         drflac_uint64 byteRangeLo;
5583         drflac_uint64 byteRangeHi;
5584
5585         byteRangeHi = pFlac->firstFLACFramePosInBytes + (drflac_uint64)((drflac_int64)(pFlac->totalPCMFrameCount * pFlac->channels * pFlac->bitsPerSample)/8.0f);
5586         byteRangeLo = pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset;
5587
5588         /*
5589         If our closest seek point is not the last one, we only need to search between it and the next one. The section below calculates an appropriate starting
5590         value for byteRangeHi which will clamp it appropriately.
5591
5592         Note that the next seekpoint must have an offset greater than the closest seekpoint because otherwise our binary search algorithm will break down. There
5593         have been cases where a seektable consists of seek points where every byte offset is set to 0 which causes problems. If this happens we need to abort.
5594         */
5595         if (iClosestSeekpoint < pFlac->seekpointCount-1) {
5596             drflac_uint32 iNextSeekpoint = iClosestSeekpoint + 1;
5597
5598             /* Basic validation on the seekpoints to ensure they're usable. */
5599             if (pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset >= pFlac->pSeekpoints[iNextSeekpoint].flacFrameOffset || pFlac->pSeekpoints[iNextSeekpoint].pcmFrameCount == 0) {
5600                 return DRFLAC_FALSE;    /* The next seekpoint doesn't look right. The seek table cannot be trusted from here. Abort. */
5601             }
5602
5603             if (pFlac->pSeekpoints[iNextSeekpoint].firstPCMFrame != (((drflac_uint64)0xFFFFFFFF << 32) | 0xFFFFFFFF)) { /* Make sure it's not a placeholder seekpoint. */
5604                 byteRangeHi = pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iNextSeekpoint].flacFrameOffset - 1; /* byteRangeHi must be zero based. */
5605             }
5606         }
5607
5608         if (drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset)) {
5609             if (drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
5610                 drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &pFlac->currentPCMFrame, NULL);
5611
5612                 if (drflac__seek_to_pcm_frame__binary_search_internal(pFlac, pcmFrameIndex, byteRangeLo, byteRangeHi)) {
5613                     return DRFLAC_TRUE;
5614                 }
5615             }
5616         }
5617     }
5618 #endif  /* !DR_FLAC_NO_CRC */
5619
5620     /* Getting here means we need to use a slower algorithm because the binary search method failed or cannot be used. */
5621
5622     /*
5623     If we are seeking forward and the closest seekpoint is _before_ the current sample, we just seek forward from where we are. Otherwise we start seeking
5624     from the seekpoint's first sample.
5625     */
5626     if (pcmFrameIndex >= pFlac->currentPCMFrame && pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame <= pFlac->currentPCMFrame) {
5627         /* Optimized case. Just seek forward from where we are. */
5628         runningPCMFrameCount = pFlac->currentPCMFrame;
5629
5630         /* The frame header for the first frame may not yet have been read. We need to do that if necessary. */
5631         if (pFlac->currentPCMFrame == 0 && pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
5632             if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
5633                 return DRFLAC_FALSE;
5634             }
5635         } else {
5636             isMidFrame = DRFLAC_TRUE;
5637         }
5638     } else {
5639         /* Slower case. Seek to the start of the seekpoint and then seek forward from there. */
5640         runningPCMFrameCount = pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame;
5641
5642         if (!drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset)) {
5643             return DRFLAC_FALSE;
5644         }
5645
5646         /* Grab the frame the seekpoint is sitting on in preparation for the sample-exact seeking below. */
5647         if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
5648             return DRFLAC_FALSE;
5649         }
5650     }
5651
5652     for (;;) {
5653         drflac_uint64 pcmFrameCountInThisFLACFrame;
5654         drflac_uint64 firstPCMFrameInFLACFrame = 0;
5655         drflac_uint64 lastPCMFrameInFLACFrame = 0;
5656
5657         drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);
5658
5659         pcmFrameCountInThisFLACFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;
5660         if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFLACFrame)) {
5661             /*
5662             The sample should be in this frame. We need to fully decode it, but if it's an invalid frame (a CRC mismatch) we need to pretend
5663             it never existed and keep iterating.
5664             */
5665             drflac_uint64 pcmFramesToDecode = pcmFrameIndex - runningPCMFrameCount;
5666
5667             if (!isMidFrame) {
5668                 drflac_result result = drflac__decode_flac_frame(pFlac);
5669                 if (result == DRFLAC_SUCCESS) {
5670                     /* The frame is valid. We just need to skip over some samples to ensure it's sample-exact. */
5671                     return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;  /* <-- If this fails, something bad has happened (it should never fail). */
5672                 } else {
5673                     if (result == DRFLAC_CRC_MISMATCH) {
5674                         goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
5675                     } else {
5676                         return DRFLAC_FALSE;
5677                     }
5678                 }
5679             } else {
5680                 /* We started seeking mid-frame which means we need to skip the frame decoding part. */
5681                 return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;
5682             }
5683         } else {
5684             /*
5685             It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
5686             frame never existed and leave the running sample count untouched.
5687             */
5688             if (!isMidFrame) {
5689                 drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
5690                 if (result == DRFLAC_SUCCESS) {
5691                     runningPCMFrameCount += pcmFrameCountInThisFLACFrame;
5692                 } else {
5693                     if (result == DRFLAC_CRC_MISMATCH) {
5694                         goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
5695                     } else {
5696                         return DRFLAC_FALSE;
5697                     }
5698                 }
5699             } else {
5700                 /*
5701                 We started seeking mid-frame which means we need to seek by reading to the end of the frame instead of with
5702                 drflac__seek_to_next_flac_frame() which only works if the decoder is sitting on the byte just after the frame header.
5703                 */
5704                 runningPCMFrameCount += pFlac->currentFLACFrame.pcmFramesRemaining;
5705                 pFlac->currentFLACFrame.pcmFramesRemaining = 0;
5706                 isMidFrame = DRFLAC_FALSE;
5707             }
5708
5709             /* If we are seeking to the end of the file and we've just hit it, we're done. */
5710             if (pcmFrameIndex == pFlac->totalPCMFrameCount && runningPCMFrameCount == pFlac->totalPCMFrameCount) {
5711                 return DRFLAC_TRUE;
5712             }
5713         }
5714
5715     next_iteration:
5716         /* Grab the next frame in preparation for the next iteration. */
5717         if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
5718             return DRFLAC_FALSE;
5719         }
5720     }
5721 }
5722
5723
5724 #ifndef DR_FLAC_NO_OGG  /* The Ogg page header type is only declared when DR_FLAC_NO_OGG is not defined. */
5725 typedef struct  /* Holds the individual fields of one Ogg page header. */
5726 {
5727     drflac_uint8 capturePattern[4];  /* Should be "OggS" */
5728     drflac_uint8 structureVersion;   /* Always 0. */
5729     drflac_uint8 headerType;         /* NOTE(review): presumably the Ogg header-type flags byte - confirm against the page parser. */
5730     drflac_uint64 granulePosition;   /* NOTE(review): presumably the page's granule position as stored in the stream - confirm. */
5731     drflac_uint32 serialNumber;      /* NOTE(review): presumably identifies the logical bitstream the page belongs to - confirm. */
5732     drflac_uint32 sequenceNumber;    /* NOTE(review): presumably the page sequence counter - confirm. */
5733     drflac_uint32 checksum;          /* NOTE(review): presumably the page CRC as read from the stream - confirm. */
5734     drflac_uint8 segmentCount;       /* NOTE(review): presumably the number of valid entries in segmentTable - confirm. */
5735     drflac_uint8 segmentTable[255];  /* Sized for the largest value an 8-bit segmentCount can hold. */
5736 } drflac_ogg_page_header;
5737 #endif
5738
5739 typedef struct  /* Information gathered while opening a stream, before the decoder object itself exists. */
5740 {
5741     drflac_read_proc onRead;                /* Read callback supplied by the caller. */
5742     drflac_seek_proc onSeek;                /* Seek callback supplied by the caller. */
5743     drflac_meta_proc onMeta;                /* Metadata callback supplied by the caller. NOTE(review): appears to be optional (may be NULL) - confirm. */
5744     drflac_container container;             /* Container format. drflac__init_private__native() sets this to drflac_container_native. */
5745     void* pUserData;                        /* NOTE(review): presumably the user data handed to onRead/onSeek - confirm. */
5746     void* pUserDataMD;                      /* NOTE(review): presumably the user data handed to onMeta - confirm. */
5747     drflac_uint32 sampleRate;               /* Taken from the STREAMINFO block, or from the first frame header in relaxed mode. */
5748     drflac_uint8  channels;                 /* Taken from the STREAMINFO block, or derived from the first frame's channel assignment in relaxed mode. */
5749     drflac_uint8  bitsPerSample;            /* Taken from the STREAMINFO block, or from the first frame header in relaxed mode. */
5750     drflac_uint64 totalPCMFrameCount;       /* Taken from the STREAMINFO block. Not assigned by the relaxed-mode path of drflac__init_private__native(). */
5751     drflac_uint16 maxBlockSizeInPCMFrames;  /* Taken from the STREAMINFO block, or set to 65535 in relaxed mode. */
5752     drflac_uint64 runningFilePos;           /* NOTE(review): presumably a running byte position within the stream during initialization - confirm. */
5753     drflac_bool32 hasStreamInfoBlock;       /* DRFLAC_TRUE when the stream started with a valid STREAMINFO block. */
5754     drflac_bool32 hasMetadataBlocks;        /* DRFLAC_TRUE when the STREAMINFO block was not flagged as the last metadata block. */
5755     drflac_bs bs;                           /* <-- A bit streamer is required for loading data during initialization. */
5756     drflac_frame_header firstFrameHeader;   /* <-- The header of the first frame that was read during relaxed initialization. Only set if there is no STREAMINFO block. */
5757
5758 #ifndef DR_FLAC_NO_OGG  /* The fields below only exist when Ogg support is compiled in. */
5759     drflac_uint32 oggSerial;                /* NOTE(review): presumably the serial number of the Ogg stream being decoded - confirm. */
5760     drflac_uint64 oggFirstBytePos;          /* NOTE(review): presumably the byte position where the Ogg data begins - confirm. */
5761     drflac_ogg_page_header oggBosHeader;    /* NOTE(review): presumably the header of the beginning-of-stream page - confirm. */
5762 #endif
5763 } drflac_init_info;
5764
5765 static DRFLAC_INLINE void drflac__decode_block_header(drflac_uint32 blockHeader, drflac_uint8* isLastBlock, drflac_uint8* blockType, drflac_uint32* blockSize)
5766 {
5767     blockHeader = drflac__be2host_32(blockHeader);
5768     *isLastBlock = (drflac_uint8)((blockHeader & 0x80000000UL) >> 31);
5769     *blockType   = (drflac_uint8)((blockHeader & 0x7F000000UL) >> 24);
5770     *blockSize   =                (blockHeader & 0x00FFFFFFUL);
5771 }
5772
5773 static DRFLAC_INLINE drflac_bool32 drflac__read_and_decode_block_header(drflac_read_proc onRead, void* pUserData, drflac_uint8* isLastBlock, drflac_uint8* blockType, drflac_uint32* blockSize)
5774 {
5775     drflac_uint32 blockHeader;
5776
5777     *blockSize = 0;
5778     if (onRead(pUserData, &blockHeader, 4) != 4) {
5779         return DRFLAC_FALSE;
5780     }
5781
5782     drflac__decode_block_header(blockHeader, isLastBlock, blockType, blockSize);
5783     return DRFLAC_TRUE;
5784 }
5785
5786 static drflac_bool32 drflac__read_streaminfo(drflac_read_proc onRead, void* pUserData, drflac_streaminfo* pStreamInfo)
5787 {
5788     drflac_uint32 blockSizes;
5789     drflac_uint64 frameSizes = 0;
5790     drflac_uint64 importantProps;
5791     drflac_uint8 md5[16];
5792
5793     /* min/max block size. */
5794     if (onRead(pUserData, &blockSizes, 4) != 4) {
5795         return DRFLAC_FALSE;
5796     }
5797
5798     /* min/max frame size. */
5799     if (onRead(pUserData, &frameSizes, 6) != 6) {
5800         return DRFLAC_FALSE;
5801     }
5802
5803     /* Sample rate, channels, bits per sample and total sample count. */
5804     if (onRead(pUserData, &importantProps, 8) != 8) {
5805         return DRFLAC_FALSE;
5806     }
5807
5808     /* MD5 */
5809     if (onRead(pUserData, md5, sizeof(md5)) != sizeof(md5)) {
5810         return DRFLAC_FALSE;
5811     }
5812
5813     blockSizes     = drflac__be2host_32(blockSizes);
5814     frameSizes     = drflac__be2host_64(frameSizes);
5815     importantProps = drflac__be2host_64(importantProps);
5816
5817     pStreamInfo->minBlockSizeInPCMFrames = (drflac_uint16)((blockSizes & 0xFFFF0000) >> 16);
5818     pStreamInfo->maxBlockSizeInPCMFrames = (drflac_uint16) (blockSizes & 0x0000FFFF);
5819     pStreamInfo->minFrameSizeInPCMFrames = (drflac_uint32)((frameSizes     &  (((drflac_uint64)0x00FFFFFF << 16) << 24)) >> 40);
5820     pStreamInfo->maxFrameSizeInPCMFrames = (drflac_uint32)((frameSizes     &  (((drflac_uint64)0x00FFFFFF << 16) <<  0)) >> 16);
5821     pStreamInfo->sampleRate              = (drflac_uint32)((importantProps &  (((drflac_uint64)0x000FFFFF << 16) << 28)) >> 44);
5822     pStreamInfo->channels                = (drflac_uint8 )((importantProps &  (((drflac_uint64)0x0000000E << 16) << 24)) >> 41) + 1;
5823     pStreamInfo->bitsPerSample           = (drflac_uint8 )((importantProps &  (((drflac_uint64)0x0000001F << 16) << 20)) >> 36) + 1;
5824     pStreamInfo->totalPCMFrameCount      =                ((importantProps & ((((drflac_uint64)0x0000000F << 16) << 16) | 0xFFFFFFFF)));
5825     DRFLAC_COPY_MEMORY(pStreamInfo->md5, md5, sizeof(md5));
5826
5827     return DRFLAC_TRUE;
5828 }
5829
5830
/* Allocation callback that forwards to DRFLAC_MALLOC(). The user data pointer is ignored. */
static void* drflac__malloc_default(size_t sz, void* pUserData)
{
    void* pBlock;

    (void)pUserData;

    pBlock = DRFLAC_MALLOC(sz);
    return pBlock;
}
5836
/* Reallocation callback that forwards to DRFLAC_REALLOC(). The user data pointer is ignored. */
static void* drflac__realloc_default(void* p, size_t sz, void* pUserData)
{
    void* pBlock;

    (void)pUserData;

    pBlock = DRFLAC_REALLOC(p, sz);
    return pBlock;
}
5842
/* Deallocation callback that forwards to DRFLAC_FREE(). The user data pointer is ignored. */
static void drflac__free_default(void* p, void* pUserData)
{
    DRFLAC_FREE(p);
    (void)pUserData;
}
5848
5849
5850 static void* drflac__malloc_from_callbacks(size_t sz, const drflac_allocation_callbacks* pAllocationCallbacks)
5851 {
5852     if (pAllocationCallbacks == NULL) {
5853         return NULL;
5854     }
5855
5856     if (pAllocationCallbacks->onMalloc != NULL) {
5857         return pAllocationCallbacks->onMalloc(sz, pAllocationCallbacks->pUserData);
5858     }
5859
5860     /* Try using realloc(). */
5861     if (pAllocationCallbacks->onRealloc != NULL) {
5862         return pAllocationCallbacks->onRealloc(NULL, sz, pAllocationCallbacks->pUserData);
5863     }
5864
5865     return NULL;
5866 }
5867
5868 static void* drflac__realloc_from_callbacks(void* p, size_t szNew, size_t szOld, const drflac_allocation_callbacks* pAllocationCallbacks)
5869 {
5870     if (pAllocationCallbacks == NULL) {
5871         return NULL;
5872     }
5873
5874     if (pAllocationCallbacks->onRealloc != NULL) {
5875         return pAllocationCallbacks->onRealloc(p, szNew, pAllocationCallbacks->pUserData);
5876     }
5877
5878     /* Try emulating realloc() in terms of malloc()/free(). */
5879     if (pAllocationCallbacks->onMalloc != NULL && pAllocationCallbacks->onFree != NULL) {
5880         void* p2;
5881
5882         p2 = pAllocationCallbacks->onMalloc(szNew, pAllocationCallbacks->pUserData);
5883         if (p2 == NULL) {
5884             return NULL;
5885         }
5886
5887         if (p != NULL) {
5888             DRFLAC_COPY_MEMORY(p2, p, szOld);
5889             pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
5890         }
5891
5892         return p2;
5893     }
5894
5895     return NULL;
5896 }
5897
5898 static void drflac__free_from_callbacks(void* p, const drflac_allocation_callbacks* pAllocationCallbacks)
5899 {
5900     if (p == NULL || pAllocationCallbacks == NULL) {
5901         return;
5902     }
5903
5904     if (pAllocationCallbacks->onFree != NULL) {
5905         pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
5906     }
5907 }
5908
5909
5910 static drflac_bool32 drflac__read_and_decode_metadata(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, void* pUserDataMD, drflac_uint64* pFirstFramePos, drflac_uint64* pSeektablePos, drflac_uint32* pSeektableSize, drflac_allocation_callbacks* pAllocationCallbacks)
5911 {
5912     /*
5913     We want to keep track of the byte position in the stream of the seektable. At the time of calling this function we know that
5914     we'll be sitting on byte 42.
5915     */
5916     drflac_uint64 runningFilePos = 42;
5917     drflac_uint64 seektablePos   = 0;
5918     drflac_uint32 seektableSize  = 0;
5919
5920     for (;;) {
5921         drflac_metadata metadata;
5922         drflac_uint8 isLastBlock = 0;
5923         drflac_uint8 blockType;
5924         drflac_uint32 blockSize;
5925         if (drflac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize) == DRFLAC_FALSE) {
5926             return DRFLAC_FALSE;
5927         }
5928         runningFilePos += 4;
5929
5930         metadata.type = blockType;
5931         metadata.pRawData = NULL;
5932         metadata.rawDataSize = 0;
5933
5934         switch (blockType)
5935         {
5936             case DRFLAC_METADATA_BLOCK_TYPE_APPLICATION:
5937             {
5938                 if (blockSize < 4) {
5939                     return DRFLAC_FALSE;
5940                 }
5941
5942                 if (onMeta) {
5943                     void* pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
5944                     if (pRawData == NULL) {
5945                         return DRFLAC_FALSE;
5946                     }
5947
5948                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
5949                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
5950                         return DRFLAC_FALSE;
5951                     }
5952
5953                     metadata.pRawData = pRawData;
5954                     metadata.rawDataSize = blockSize;
5955                     metadata.data.application.id       = drflac__be2host_32(*(drflac_uint32*)pRawData);
5956                     metadata.data.application.pData    = (const void*)((drflac_uint8*)pRawData + sizeof(drflac_uint32));
5957                     metadata.data.application.dataSize = blockSize - sizeof(drflac_uint32);
5958                     onMeta(pUserDataMD, &metadata);
5959
5960                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
5961                 }
5962             } break;
5963
5964             case DRFLAC_METADATA_BLOCK_TYPE_SEEKTABLE:
5965             {
5966                 seektablePos  = runningFilePos;
5967                 seektableSize = blockSize;
5968
5969                 if (onMeta) {
5970                     drflac_uint32 iSeekpoint;
5971                     void* pRawData;
5972
5973                     pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
5974                     if (pRawData == NULL) {
5975                         return DRFLAC_FALSE;
5976                     }
5977
5978                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
5979                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
5980                         return DRFLAC_FALSE;
5981                     }
5982
5983                     metadata.pRawData = pRawData;
5984                     metadata.rawDataSize = blockSize;
5985                     metadata.data.seektable.seekpointCount = blockSize/sizeof(drflac_seekpoint);
5986                     metadata.data.seektable.pSeekpoints = (const drflac_seekpoint*)pRawData;
5987
5988                     /* Endian swap. */
5989                     for (iSeekpoint = 0; iSeekpoint < metadata.data.seektable.seekpointCount; ++iSeekpoint) {
5990                         drflac_seekpoint* pSeekpoint = (drflac_seekpoint*)pRawData + iSeekpoint;
5991                         pSeekpoint->firstPCMFrame   = drflac__be2host_64(pSeekpoint->firstPCMFrame);
5992                         pSeekpoint->flacFrameOffset = drflac__be2host_64(pSeekpoint->flacFrameOffset);
5993                         pSeekpoint->pcmFrameCount   = drflac__be2host_16(pSeekpoint->pcmFrameCount);
5994                     }
5995
5996                     onMeta(pUserDataMD, &metadata);
5997
5998                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
5999                 }
6000             } break;
6001
6002             case DRFLAC_METADATA_BLOCK_TYPE_VORBIS_COMMENT:
6003             {
6004                 if (blockSize < 8) {
6005                     return DRFLAC_FALSE;
6006                 }
6007
6008                 if (onMeta) {
6009                     void* pRawData;
6010                     const char* pRunningData;
6011                     const char* pRunningDataEnd;
6012                     drflac_uint32 i;
6013
6014                     pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
6015                     if (pRawData == NULL) {
6016                         return DRFLAC_FALSE;
6017                     }
6018
6019                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
6020                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6021                         return DRFLAC_FALSE;
6022                     }
6023
6024                     metadata.pRawData = pRawData;
6025                     metadata.rawDataSize = blockSize;
6026
6027                     pRunningData    = (const char*)pRawData;
6028                     pRunningDataEnd = (const char*)pRawData + blockSize;
6029
6030                     metadata.data.vorbis_comment.vendorLength = drflac__le2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6031
6032                     /* Need space for the rest of the block */
6033                     if ((pRunningDataEnd - pRunningData) - 4 < (drflac_int64)metadata.data.vorbis_comment.vendorLength) { /* <-- Note the order of operations to avoid overflow to a valid value */
6034                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6035                         return DRFLAC_FALSE;
6036                     }
6037                     metadata.data.vorbis_comment.vendor       = pRunningData;                                            pRunningData += metadata.data.vorbis_comment.vendorLength;
6038                     metadata.data.vorbis_comment.commentCount = drflac__le2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6039
6040                     /* Need space for 'commentCount' comments after the block, which at minimum is a drflac_uint32 per comment */
6041                     if ((pRunningDataEnd - pRunningData) / sizeof(drflac_uint32) < metadata.data.vorbis_comment.commentCount) { /* <-- Note the order of operations to avoid overflow to a valid value */
6042                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6043                         return DRFLAC_FALSE;
6044                     }
6045                     metadata.data.vorbis_comment.pComments    = pRunningData;
6046
6047                     /* Check that the comments section is valid before passing it to the callback */
6048                     for (i = 0; i < metadata.data.vorbis_comment.commentCount; ++i) {
6049                         drflac_uint32 commentLength;
6050
6051                         if (pRunningDataEnd - pRunningData < 4) {
6052                             drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6053                             return DRFLAC_FALSE;
6054                         }
6055
6056                         commentLength = drflac__le2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6057                         if (pRunningDataEnd - pRunningData < (drflac_int64)commentLength) { /* <-- Note the order of operations to avoid overflow to a valid value */
6058                             drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6059                             return DRFLAC_FALSE;
6060                         }
6061                         pRunningData += commentLength;
6062                     }
6063
6064                     onMeta(pUserDataMD, &metadata);
6065
6066                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6067                 }
6068             } break;
6069
6070             case DRFLAC_METADATA_BLOCK_TYPE_CUESHEET:
6071             {
6072                 if (blockSize < 396) {
6073                     return DRFLAC_FALSE;
6074                 }
6075
6076                 if (onMeta) {
6077                     void* pRawData;
6078                     const char* pRunningData;
6079                     const char* pRunningDataEnd;
6080                     drflac_uint8 iTrack;
6081                     drflac_uint8 iIndex;
6082
6083                     pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
6084                     if (pRawData == NULL) {
6085                         return DRFLAC_FALSE;
6086                     }
6087
6088                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
6089                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6090                         return DRFLAC_FALSE;
6091                     }
6092
6093                     metadata.pRawData = pRawData;
6094                     metadata.rawDataSize = blockSize;
6095
6096                     pRunningData    = (const char*)pRawData;
6097                     pRunningDataEnd = (const char*)pRawData + blockSize;
6098
6099                     DRFLAC_COPY_MEMORY(metadata.data.cuesheet.catalog, pRunningData, 128);                              pRunningData += 128;
6100                     metadata.data.cuesheet.leadInSampleCount = drflac__be2host_64(*(const drflac_uint64*)pRunningData); pRunningData += 8;
6101                     metadata.data.cuesheet.isCD              = (pRunningData[0] & 0x80) != 0;                           pRunningData += 259;
6102                     metadata.data.cuesheet.trackCount        = pRunningData[0];                                         pRunningData += 1;
6103                     metadata.data.cuesheet.pTrackData        = pRunningData;
6104
6105                     /* Check that the cuesheet tracks are valid before passing it to the callback */
6106                     for (iTrack = 0; iTrack < metadata.data.cuesheet.trackCount; ++iTrack) {
6107                         drflac_uint8 indexCount;
6108                         drflac_uint32 indexPointSize;
6109
6110                         if (pRunningDataEnd - pRunningData < 36) {
6111                             drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6112                             return DRFLAC_FALSE;
6113                         }
6114
6115                         /* Skip to the index point count */
6116                         pRunningData += 35;
6117                         indexCount = pRunningData[0]; pRunningData += 1;
6118                         indexPointSize = indexCount * sizeof(drflac_cuesheet_track_index);
6119                         if (pRunningDataEnd - pRunningData < (drflac_int64)indexPointSize) {
6120                             drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6121                             return DRFLAC_FALSE;
6122                         }
6123
6124                         /* Endian swap. */
6125                         for (iIndex = 0; iIndex < indexCount; ++iIndex) {
6126                             drflac_cuesheet_track_index* pTrack = (drflac_cuesheet_track_index*)pRunningData;
6127                             pRunningData += sizeof(drflac_cuesheet_track_index);
6128                             pTrack->offset = drflac__be2host_64(pTrack->offset);
6129                         }
6130                     }
6131
6132                     onMeta(pUserDataMD, &metadata);
6133
6134                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6135                 }
6136             } break;
6137
6138             case DRFLAC_METADATA_BLOCK_TYPE_PICTURE:
6139             {
6140                 if (blockSize < 32) {
6141                     return DRFLAC_FALSE;
6142                 }
6143
6144                 if (onMeta) {
6145                     void* pRawData;
6146                     const char* pRunningData;
6147                     const char* pRunningDataEnd;
6148
6149                     pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
6150                     if (pRawData == NULL) {
6151                         return DRFLAC_FALSE;
6152                     }
6153
6154                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
6155                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6156                         return DRFLAC_FALSE;
6157                     }
6158
6159                     metadata.pRawData = pRawData;
6160                     metadata.rawDataSize = blockSize;
6161
6162                     pRunningData    = (const char*)pRawData;
6163                     pRunningDataEnd = (const char*)pRawData + blockSize;
6164
6165                     metadata.data.picture.type       = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6166                     metadata.data.picture.mimeLength = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6167
6168                     /* Need space for the rest of the block */
6169                     if ((pRunningDataEnd - pRunningData) - 24 < (drflac_int64)metadata.data.picture.mimeLength) { /* <-- Note the order of operations to avoid overflow to a valid value */
6170                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6171                         return DRFLAC_FALSE;
6172                     }
6173                     metadata.data.picture.mime              = pRunningData;                                            pRunningData += metadata.data.picture.mimeLength;
6174                     metadata.data.picture.descriptionLength = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6175
6176                     /* Need space for the rest of the block */
6177                     if ((pRunningDataEnd - pRunningData) - 20 < (drflac_int64)metadata.data.picture.descriptionLength) { /* <-- Note the order of operations to avoid overflow to a valid value */
6178                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6179                         return DRFLAC_FALSE;
6180                     }
6181                     metadata.data.picture.description     = pRunningData;                                            pRunningData += metadata.data.picture.descriptionLength;
6182                     metadata.data.picture.width           = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6183                     metadata.data.picture.height          = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6184                     metadata.data.picture.colorDepth      = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6185                     metadata.data.picture.indexColorCount = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6186                     metadata.data.picture.pictureDataSize = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6187                     metadata.data.picture.pPictureData    = (const drflac_uint8*)pRunningData;
6188
6189                     /* Need space for the picture after the block */
6190                     if (pRunningDataEnd - pRunningData < (drflac_int64)metadata.data.picture.pictureDataSize) { /* <-- Note the order of operations to avoid overflow to a valid value */
6191                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6192                         return DRFLAC_FALSE;
6193                     }
6194
6195                     onMeta(pUserDataMD, &metadata);
6196
6197                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6198                 }
6199             } break;
6200
6201             case DRFLAC_METADATA_BLOCK_TYPE_PADDING:
6202             {
6203                 if (onMeta) {
6204                     metadata.data.padding.unused = 0;
6205
6206                     /* Padding doesn't have anything meaningful in it, so just skip over it, but make sure the caller is aware of it by firing the callback. */
6207                     if (!onSeek(pUserData, blockSize, drflac_seek_origin_current)) {
6208                         isLastBlock = DRFLAC_TRUE;  /* An error occurred while seeking. Attempt to recover by treating this as the last block which will in turn terminate the loop. */
6209                     } else {
6210                         onMeta(pUserDataMD, &metadata);
6211                     }
6212                 }
6213             } break;
6214
6215             case DRFLAC_METADATA_BLOCK_TYPE_INVALID:
6216             {
6217                 /* Invalid chunk. Just skip over this one. */
6218                 if (onMeta) {
6219                     if (!onSeek(pUserData, blockSize, drflac_seek_origin_current)) {
6220                         isLastBlock = DRFLAC_TRUE;  /* An error occurred while seeking. Attempt to recover by treating this as the last block which will in turn terminate the loop. */
6221                     }
6222                 }
6223             } break;
6224
6225             default:
6226             {
6227                 /*
6228                 It's an unknown chunk, but not necessarily invalid. There's a chance more metadata blocks might be defined later on, so we
6229                 can at the very least report the chunk to the application and let it look at the raw data.
6230                 */
6231                 if (onMeta) {
6232                     void* pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
6233                     if (pRawData == NULL) {
6234                         return DRFLAC_FALSE;
6235                     }
6236
6237                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
6238                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6239                         return DRFLAC_FALSE;
6240                     }
6241
6242                     metadata.pRawData = pRawData;
6243                     metadata.rawDataSize = blockSize;
6244                     onMeta(pUserDataMD, &metadata);
6245
6246                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6247                 }
6248             } break;
6249         }
6250
6251         /* If we're not handling metadata, just skip over the block. If we are, it will have been handled earlier in the switch statement above. */
6252         if (onMeta == NULL && blockSize > 0) {
6253             if (!onSeek(pUserData, blockSize, drflac_seek_origin_current)) {
6254                 isLastBlock = DRFLAC_TRUE;
6255             }
6256         }
6257
6258         runningFilePos += blockSize;
6259         if (isLastBlock) {
6260             break;
6261         }
6262     }
6263
6264     *pSeektablePos = seektablePos;
6265     *pSeektableSize = seektableSize;
6266     *pFirstFramePos = runningFilePos;
6267
6268     return DRFLAC_TRUE;
6269 }
6270
6271 static drflac_bool32 drflac__init_private__native(drflac_init_info* pInit, drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, void* pUserDataMD, drflac_bool32 relaxed)
6272 {
6273     /* Pre Condition: The bit stream should be sitting just past the 4-byte id header. */
6274
6275     drflac_uint8 isLastBlock;
6276     drflac_uint8 blockType;
6277     drflac_uint32 blockSize;
6278
6279     (void)onSeek;
6280
6281     pInit->container = drflac_container_native;
6282
6283     /* The first metadata block should be the STREAMINFO block. */
6284     if (!drflac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize)) {
6285         return DRFLAC_FALSE;
6286     }
6287
6288     if (blockType != DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO || blockSize != 34) {
6289         if (!relaxed) {
6290             /* We're opening in strict mode and the first block is not the STREAMINFO block. Error. */
6291             return DRFLAC_FALSE;
6292         } else {
6293             /*
6294             Relaxed mode. To open from here we need to just find the first frame and set the sample rate, etc. to whatever is defined
6295             for that frame.
6296             */
6297             pInit->hasStreamInfoBlock = DRFLAC_FALSE;
6298             pInit->hasMetadataBlocks  = DRFLAC_FALSE;
6299
6300             if (!drflac__read_next_flac_frame_header(&pInit->bs, 0, &pInit->firstFrameHeader)) {
6301                 return DRFLAC_FALSE;    /* Couldn't find a frame. */
6302             }
6303
6304             if (pInit->firstFrameHeader.bitsPerSample == 0) {
6305                 return DRFLAC_FALSE;    /* Failed to initialize because the first frame depends on the STREAMINFO block, which does not exist. */
6306             }
6307
6308             pInit->sampleRate              = pInit->firstFrameHeader.sampleRate;
6309             pInit->channels                = drflac__get_channel_count_from_channel_assignment(pInit->firstFrameHeader.channelAssignment);
6310             pInit->bitsPerSample           = pInit->firstFrameHeader.bitsPerSample;
6311             pInit->maxBlockSizeInPCMFrames = 65535;   /* <-- See notes here: https://xiph.org/flac/format.html#metadata_block_streaminfo */
6312             return DRFLAC_TRUE;
6313         }
6314     } else {
6315         drflac_streaminfo streaminfo;
6316         if (!drflac__read_streaminfo(onRead, pUserData, &streaminfo)) {
6317             return DRFLAC_FALSE;
6318         }
6319
6320         pInit->hasStreamInfoBlock      = DRFLAC_TRUE;
6321         pInit->sampleRate              = streaminfo.sampleRate;
6322         pInit->channels                = streaminfo.channels;
6323         pInit->bitsPerSample           = streaminfo.bitsPerSample;
6324         pInit->totalPCMFrameCount      = streaminfo.totalPCMFrameCount;
6325         pInit->maxBlockSizeInPCMFrames = streaminfo.maxBlockSizeInPCMFrames;    /* Don't care about the min block size - only the max (used for determining the size of the memory allocation). */
6326         pInit->hasMetadataBlocks       = !isLastBlock;
6327
6328         if (onMeta) {
6329             drflac_metadata metadata;
6330             metadata.type = DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO;
6331             metadata.pRawData = NULL;
6332             metadata.rawDataSize = 0;
6333             metadata.data.streaminfo = streaminfo;
6334             onMeta(pUserDataMD, &metadata);
6335         }
6336
6337         return DRFLAC_TRUE;
6338     }
6339 }
6340
6341 #ifndef DR_FLAC_NO_OGG
/*
Upper bound on the size of one Ogg page. The value equals 27 (fixed header bytes) + 255 (largest segment table) +
255*255 (largest body those segments can describe). It is used as the capacity of the page buffer.
*/
#define DRFLAC_OGG_MAX_PAGE_SIZE            65307

/* CRC-32 of the four capture pattern bytes "OggS"; the starting value for the checksum of the rest of a page. */
#define DRFLAC_OGG_CAPTURE_PATTERN_CRC32    1605413199

/* Selects what drflac_oggbs__goto_next_page() does when a page's body does not match the checksum in its header. */
typedef enum
{
    drflac_ogg_recover_on_crc_mismatch = 0,  /* Drop the bad page and keep looking for the next one. */
    drflac_ogg_fail_on_crc_mismatch    = 1   /* Move on to the next good page, but report failure to the caller. */
} drflac_ogg_crc_mismatch_recovery;
6350
6351 #ifndef DR_FLAC_NO_CRC
6352 static drflac_uint32 drflac__crc32_table[] = {
6353     0x00000000L, 0x04C11DB7L, 0x09823B6EL, 0x0D4326D9L,
6354     0x130476DCL, 0x17C56B6BL, 0x1A864DB2L, 0x1E475005L,
6355     0x2608EDB8L, 0x22C9F00FL, 0x2F8AD6D6L, 0x2B4BCB61L,
6356     0x350C9B64L, 0x31CD86D3L, 0x3C8EA00AL, 0x384FBDBDL,
6357     0x4C11DB70L, 0x48D0C6C7L, 0x4593E01EL, 0x4152FDA9L,
6358     0x5F15ADACL, 0x5BD4B01BL, 0x569796C2L, 0x52568B75L,
6359     0x6A1936C8L, 0x6ED82B7FL, 0x639B0DA6L, 0x675A1011L,
6360     0x791D4014L, 0x7DDC5DA3L, 0x709F7B7AL, 0x745E66CDL,
6361     0x9823B6E0L, 0x9CE2AB57L, 0x91A18D8EL, 0x95609039L,
6362     0x8B27C03CL, 0x8FE6DD8BL, 0x82A5FB52L, 0x8664E6E5L,
6363     0xBE2B5B58L, 0xBAEA46EFL, 0xB7A96036L, 0xB3687D81L,
6364     0xAD2F2D84L, 0xA9EE3033L, 0xA4AD16EAL, 0xA06C0B5DL,
6365     0xD4326D90L, 0xD0F37027L, 0xDDB056FEL, 0xD9714B49L,
6366     0xC7361B4CL, 0xC3F706FBL, 0xCEB42022L, 0xCA753D95L,
6367     0xF23A8028L, 0xF6FB9D9FL, 0xFBB8BB46L, 0xFF79A6F1L,
6368     0xE13EF6F4L, 0xE5FFEB43L, 0xE8BCCD9AL, 0xEC7DD02DL,
6369     0x34867077L, 0x30476DC0L, 0x3D044B19L, 0x39C556AEL,
6370     0x278206ABL, 0x23431B1CL, 0x2E003DC5L, 0x2AC12072L,
6371     0x128E9DCFL, 0x164F8078L, 0x1B0CA6A1L, 0x1FCDBB16L,
6372     0x018AEB13L, 0x054BF6A4L, 0x0808D07DL, 0x0CC9CDCAL,
6373     0x7897AB07L, 0x7C56B6B0L, 0x71159069L, 0x75D48DDEL,
6374     0x6B93DDDBL, 0x6F52C06CL, 0x6211E6B5L, 0x66D0FB02L,
6375     0x5E9F46BFL, 0x5A5E5B08L, 0x571D7DD1L, 0x53DC6066L,
6376     0x4D9B3063L, 0x495A2DD4L, 0x44190B0DL, 0x40D816BAL,
6377     0xACA5C697L, 0xA864DB20L, 0xA527FDF9L, 0xA1E6E04EL,
6378     0xBFA1B04BL, 0xBB60ADFCL, 0xB6238B25L, 0xB2E29692L,
6379     0x8AAD2B2FL, 0x8E6C3698L, 0x832F1041L, 0x87EE0DF6L,
6380     0x99A95DF3L, 0x9D684044L, 0x902B669DL, 0x94EA7B2AL,
6381     0xE0B41DE7L, 0xE4750050L, 0xE9362689L, 0xEDF73B3EL,
6382     0xF3B06B3BL, 0xF771768CL, 0xFA325055L, 0xFEF34DE2L,
6383     0xC6BCF05FL, 0xC27DEDE8L, 0xCF3ECB31L, 0xCBFFD686L,
6384     0xD5B88683L, 0xD1799B34L, 0xDC3ABDEDL, 0xD8FBA05AL,
6385     0x690CE0EEL, 0x6DCDFD59L, 0x608EDB80L, 0x644FC637L,
6386     0x7A089632L, 0x7EC98B85L, 0x738AAD5CL, 0x774BB0EBL,
6387     0x4F040D56L, 0x4BC510E1L, 0x46863638L, 0x42472B8FL,
6388     0x5C007B8AL, 0x58C1663DL, 0x558240E4L, 0x51435D53L,
6389     0x251D3B9EL, 0x21DC2629L, 0x2C9F00F0L, 0x285E1D47L,
6390     0x36194D42L, 0x32D850F5L, 0x3F9B762CL, 0x3B5A6B9BL,
6391     0x0315D626L, 0x07D4CB91L, 0x0A97ED48L, 0x0E56F0FFL,
6392     0x1011A0FAL, 0x14D0BD4DL, 0x19939B94L, 0x1D528623L,
6393     0xF12F560EL, 0xF5EE4BB9L, 0xF8AD6D60L, 0xFC6C70D7L,
6394     0xE22B20D2L, 0xE6EA3D65L, 0xEBA91BBCL, 0xEF68060BL,
6395     0xD727BBB6L, 0xD3E6A601L, 0xDEA580D8L, 0xDA649D6FL,
6396     0xC423CD6AL, 0xC0E2D0DDL, 0xCDA1F604L, 0xC960EBB3L,
6397     0xBD3E8D7EL, 0xB9FF90C9L, 0xB4BCB610L, 0xB07DABA7L,
6398     0xAE3AFBA2L, 0xAAFBE615L, 0xA7B8C0CCL, 0xA379DD7BL,
6399     0x9B3660C6L, 0x9FF77D71L, 0x92B45BA8L, 0x9675461FL,
6400     0x8832161AL, 0x8CF30BADL, 0x81B02D74L, 0x857130C3L,
6401     0x5D8A9099L, 0x594B8D2EL, 0x5408ABF7L, 0x50C9B640L,
6402     0x4E8EE645L, 0x4A4FFBF2L, 0x470CDD2BL, 0x43CDC09CL,
6403     0x7B827D21L, 0x7F436096L, 0x7200464FL, 0x76C15BF8L,
6404     0x68860BFDL, 0x6C47164AL, 0x61043093L, 0x65C52D24L,
6405     0x119B4BE9L, 0x155A565EL, 0x18197087L, 0x1CD86D30L,
6406     0x029F3D35L, 0x065E2082L, 0x0B1D065BL, 0x0FDC1BECL,
6407     0x3793A651L, 0x3352BBE6L, 0x3E119D3FL, 0x3AD08088L,
6408     0x2497D08DL, 0x2056CD3AL, 0x2D15EBE3L, 0x29D4F654L,
6409     0xC5A92679L, 0xC1683BCEL, 0xCC2B1D17L, 0xC8EA00A0L,
6410     0xD6AD50A5L, 0xD26C4D12L, 0xDF2F6BCBL, 0xDBEE767CL,
6411     0xE3A1CBC1L, 0xE760D676L, 0xEA23F0AFL, 0xEEE2ED18L,
6412     0xF0A5BD1DL, 0xF464A0AAL, 0xF9278673L, 0xFDE69BC4L,
6413     0x89B8FD09L, 0x8D79E0BEL, 0x803AC667L, 0x84FBDBD0L,
6414     0x9ABC8BD5L, 0x9E7D9662L, 0x933EB0BBL, 0x97FFAD0CL,
6415     0xAFB010B1L, 0xAB710D06L, 0xA6322BDFL, 0xA2F33668L,
6416     0xBCB4666DL, 0xB8757BDAL, 0xB5365D03L, 0xB1F740B4L
6417 };
6418 #endif
6419
6420 static DRFLAC_INLINE drflac_uint32 drflac_crc32_byte(drflac_uint32 crc32, drflac_uint8 data)
6421 {
6422 #ifndef DR_FLAC_NO_CRC
6423     return (crc32 << 8) ^ drflac__crc32_table[(drflac_uint8)((crc32 >> 24) & 0xFF) ^ data];
6424 #else
6425     (void)data;
6426     return crc32;
6427 #endif
6428 }
6429
6430 static DRFLAC_INLINE drflac_uint32 drflac_crc32_buffer(drflac_uint32 crc32, drflac_uint8* pData, drflac_uint32 dataSize)
6431 {
6432     /* This can be optimized. */
6433     drflac_uint32 i;
6434     for (i = 0; i < dataSize; ++i) {
6435         crc32 = drflac_crc32_byte(crc32, pData[i]);
6436     }
6437     return crc32;
6438 }
6439
6440
6441 static DRFLAC_INLINE drflac_bool32 drflac_ogg__is_capture_pattern(drflac_uint8 pattern[4])
6442 {
6443     return pattern[0] == 'O' && pattern[1] == 'g' && pattern[2] == 'g' && pattern[3] == 'S';
6444 }
6445
6446 static DRFLAC_INLINE drflac_uint32 drflac_ogg__get_page_header_size(drflac_ogg_page_header* pHeader)
6447 {
6448     return 27 + pHeader->segmentCount;
6449 }
6450
6451 static DRFLAC_INLINE drflac_uint32 drflac_ogg__get_page_body_size(drflac_ogg_page_header* pHeader)
6452 {
6453     drflac_uint32 pageBodySize = 0;
6454     int i;
6455
6456     for (i = 0; i < pHeader->segmentCount; ++i) {
6457         pageBodySize += pHeader->segmentTable[i];
6458     }
6459
6460     return pageBodySize;
6461 }
6462
6463 static drflac_result drflac_ogg__read_page_header_after_capture_pattern(drflac_read_proc onRead, void* pUserData, drflac_ogg_page_header* pHeader, drflac_uint32* pBytesRead, drflac_uint32* pCRC32)
6464 {
6465     drflac_uint8 data[23];
6466     drflac_uint32 i;
6467
6468     DRFLAC_ASSERT(*pCRC32 == DRFLAC_OGG_CAPTURE_PATTERN_CRC32);
6469
6470     if (onRead(pUserData, data, 23) != 23) {
6471         return DRFLAC_AT_END;
6472     }
6473     *pBytesRead += 23;
6474
6475     /*
6476     It's not actually used, but set the capture pattern to 'OggS' for completeness. Not doing this will cause static analysers to complain about
6477     us trying to access uninitialized data. We could alternatively just comment out this member of the drflac_ogg_page_header structure, but I
6478     like to have it map to the structure of the underlying data.
6479     */
6480     pHeader->capturePattern[0] = 'O';
6481     pHeader->capturePattern[1] = 'g';
6482     pHeader->capturePattern[2] = 'g';
6483     pHeader->capturePattern[3] = 'S';
6484
6485     pHeader->structureVersion = data[0];
6486     pHeader->headerType       = data[1];
6487     DRFLAC_COPY_MEMORY(&pHeader->granulePosition, &data[ 2], 8);
6488     DRFLAC_COPY_MEMORY(&pHeader->serialNumber,    &data[10], 4);
6489     DRFLAC_COPY_MEMORY(&pHeader->sequenceNumber,  &data[14], 4);
6490     DRFLAC_COPY_MEMORY(&pHeader->checksum,        &data[18], 4);
6491     pHeader->segmentCount     = data[22];
6492
6493     /* Calculate the CRC. Note that for the calculation the checksum part of the page needs to be set to 0. */
6494     data[18] = 0;
6495     data[19] = 0;
6496     data[20] = 0;
6497     data[21] = 0;
6498
6499     for (i = 0; i < 23; ++i) {
6500         *pCRC32 = drflac_crc32_byte(*pCRC32, data[i]);
6501     }
6502
6503
6504     if (onRead(pUserData, pHeader->segmentTable, pHeader->segmentCount) != pHeader->segmentCount) {
6505         return DRFLAC_AT_END;
6506     }
6507     *pBytesRead += pHeader->segmentCount;
6508
6509     for (i = 0; i < pHeader->segmentCount; ++i) {
6510         *pCRC32 = drflac_crc32_byte(*pCRC32, pHeader->segmentTable[i]);
6511     }
6512
6513     return DRFLAC_SUCCESS;
6514 }
6515
6516 static drflac_result drflac_ogg__read_page_header(drflac_read_proc onRead, void* pUserData, drflac_ogg_page_header* pHeader, drflac_uint32* pBytesRead, drflac_uint32* pCRC32)
6517 {
6518     drflac_uint8 id[4];
6519
6520     *pBytesRead = 0;
6521
6522     if (onRead(pUserData, id, 4) != 4) {
6523         return DRFLAC_AT_END;
6524     }
6525     *pBytesRead += 4;
6526
6527     /* We need to read byte-by-byte until we find the OggS capture pattern. */
6528     for (;;) {
6529         if (drflac_ogg__is_capture_pattern(id)) {
6530             drflac_result result;
6531
6532             *pCRC32 = DRFLAC_OGG_CAPTURE_PATTERN_CRC32;
6533
6534             result = drflac_ogg__read_page_header_after_capture_pattern(onRead, pUserData, pHeader, pBytesRead, pCRC32);
6535             if (result == DRFLAC_SUCCESS) {
6536                 return DRFLAC_SUCCESS;
6537             } else {
6538                 if (result == DRFLAC_CRC_MISMATCH) {
6539                     continue;
6540                 } else {
6541                     return result;
6542                 }
6543             }
6544         } else {
6545             /* The first 4 bytes did not equal the capture pattern. Read the next byte and try again. */
6546             id[0] = id[1];
6547             id[1] = id[2];
6548             id[2] = id[3];
6549             if (onRead(pUserData, &id[3], 1) != 1) {
6550                 return DRFLAC_AT_END;
6551             }
6552             *pBytesRead += 1;
6553         }
6554     }
6555 }
6556
6557
6558 /*
6559 The main part of the Ogg encapsulation is the conversion from the physical Ogg bitstream to the native FLAC bitstream. It works
6560 in three general stages: Ogg Physical Bitstream -> Ogg/FLAC Logical Bitstream -> FLAC Native Bitstream. dr_flac is designed
6561 in such a way that the core sections assume everything is delivered in native format. Therefore, for each encapsulation type
6562 dr_flac is supporting there needs to be a layer sitting on top of the onRead and onSeek callbacks that ensures the bits read from
6563 the physical Ogg bitstream are converted and delivered in native FLAC format.
6564 */
6565 typedef struct
6566 {
6567     drflac_read_proc onRead;                /* The original onRead callback from drflac_open() and family. */
6568     drflac_seek_proc onSeek;                /* The original onSeek callback from drflac_open() and family. */
6569     void* pUserData;                        /* The user data passed on onRead and onSeek. This is the user data that was passed on drflac_open() and family. */
6570     drflac_uint64 currentBytePos;           /* The position of the byte we are sitting on in the physical byte stream. Used for efficient seeking. */
6571     drflac_uint64 firstBytePos;             /* The position of the first byte in the physical bitstream. Points to the start of the "OggS" identifier of the FLAC bos page. */
6572     drflac_uint32 serialNumber;             /* The serial number of the FLAC audio pages. This is determined by the initial header page that was read during initialization. */
6573     drflac_ogg_page_header bosPageHeader;   /* Used for seeking. */
6574     drflac_ogg_page_header currentPageHeader;
6575     drflac_uint32 bytesRemainingInPage;
6576     drflac_uint32 pageDataSize;
6577     drflac_uint8 pageData[DRFLAC_OGG_MAX_PAGE_SIZE];
6578 } drflac_oggbs; /* oggbs = Ogg Bitstream */
6579
6580 static size_t drflac_oggbs__read_physical(drflac_oggbs* oggbs, void* bufferOut, size_t bytesToRead)
6581 {
6582     size_t bytesActuallyRead = oggbs->onRead(oggbs->pUserData, bufferOut, bytesToRead);
6583     oggbs->currentBytePos += bytesActuallyRead;
6584
6585     return bytesActuallyRead;
6586 }
6587
6588 static drflac_bool32 drflac_oggbs__seek_physical(drflac_oggbs* oggbs, drflac_uint64 offset, drflac_seek_origin origin)
6589 {
6590     if (origin == drflac_seek_origin_start) {
6591         if (offset <= 0x7FFFFFFF) {
6592             if (!oggbs->onSeek(oggbs->pUserData, (int)offset, drflac_seek_origin_start)) {
6593                 return DRFLAC_FALSE;
6594             }
6595             oggbs->currentBytePos = offset;
6596
6597             return DRFLAC_TRUE;
6598         } else {
6599             if (!oggbs->onSeek(oggbs->pUserData, 0x7FFFFFFF, drflac_seek_origin_start)) {
6600                 return DRFLAC_FALSE;
6601             }
6602             oggbs->currentBytePos = offset;
6603
6604             return drflac_oggbs__seek_physical(oggbs, offset - 0x7FFFFFFF, drflac_seek_origin_current);
6605         }
6606     } else {
6607         while (offset > 0x7FFFFFFF) {
6608             if (!oggbs->onSeek(oggbs->pUserData, 0x7FFFFFFF, drflac_seek_origin_current)) {
6609                 return DRFLAC_FALSE;
6610             }
6611             oggbs->currentBytePos += 0x7FFFFFFF;
6612             offset -= 0x7FFFFFFF;
6613         }
6614
6615         if (!oggbs->onSeek(oggbs->pUserData, (int)offset, drflac_seek_origin_current)) {    /* <-- Safe cast thanks to the loop above. */
6616             return DRFLAC_FALSE;
6617         }
6618         oggbs->currentBytePos += offset;
6619
6620         return DRFLAC_TRUE;
6621     }
6622 }
6623
6624 static drflac_bool32 drflac_oggbs__goto_next_page(drflac_oggbs* oggbs, drflac_ogg_crc_mismatch_recovery recoveryMethod)
6625 {
6626     drflac_ogg_page_header header;
6627     for (;;) {
6628         drflac_uint32 crc32 = 0;
6629         drflac_uint32 bytesRead;
6630         drflac_uint32 pageBodySize;
6631 #ifndef DR_FLAC_NO_CRC
6632         drflac_uint32 actualCRC32;
6633 #endif
6634
6635         if (drflac_ogg__read_page_header(oggbs->onRead, oggbs->pUserData, &header, &bytesRead, &crc32) != DRFLAC_SUCCESS) {
6636             return DRFLAC_FALSE;
6637         }
6638         oggbs->currentBytePos += bytesRead;
6639
6640         pageBodySize = drflac_ogg__get_page_body_size(&header);
6641         if (pageBodySize > DRFLAC_OGG_MAX_PAGE_SIZE) {
6642             continue;   /* Invalid page size. Assume it's corrupted and just move to the next page. */
6643         }
6644
6645         if (header.serialNumber != oggbs->serialNumber) {
6646             /* It's not a FLAC page. Skip it. */
6647             if (pageBodySize > 0 && !drflac_oggbs__seek_physical(oggbs, pageBodySize, drflac_seek_origin_current)) {
6648                 return DRFLAC_FALSE;
6649             }
6650             continue;
6651         }
6652
6653
6654         /* We need to read the entire page and then do a CRC check on it. If there's a CRC mismatch we need to skip this page. */
6655         if (drflac_oggbs__read_physical(oggbs, oggbs->pageData, pageBodySize) != pageBodySize) {
6656             return DRFLAC_FALSE;
6657         }
6658         oggbs->pageDataSize = pageBodySize;
6659
6660 #ifndef DR_FLAC_NO_CRC
6661         actualCRC32 = drflac_crc32_buffer(crc32, oggbs->pageData, oggbs->pageDataSize);
6662         if (actualCRC32 != header.checksum) {
6663             if (recoveryMethod == drflac_ogg_recover_on_crc_mismatch) {
6664                 continue;   /* CRC mismatch. Skip this page. */
6665             } else {
6666                 /*
6667                 Even though we are failing on a CRC mismatch, we still want our stream to be in a good state. Therefore we
6668                 go to the next valid page to ensure we're in a good state, but return false to let the caller know that the
6669                 seek did not fully complete.
6670                 */
6671                 drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch);
6672                 return DRFLAC_FALSE;
6673             }
6674         }
6675 #else
6676         (void)recoveryMethod;   /* <-- Silence a warning. */
6677 #endif
6678
6679         oggbs->currentPageHeader = header;
6680         oggbs->bytesRemainingInPage = pageBodySize;
6681         return DRFLAC_TRUE;
6682     }
6683 }
6684
6685 static size_t drflac__on_read_ogg(void* pUserData, void* bufferOut, size_t bytesToRead)
6686 {
6687     drflac_oggbs* oggbs = (drflac_oggbs*)pUserData;
6688     drflac_uint8* pRunningBufferOut = (drflac_uint8*)bufferOut;
6689     size_t bytesRead = 0;
6690
6691     DRFLAC_ASSERT(oggbs != NULL);
6692     DRFLAC_ASSERT(pRunningBufferOut != NULL);
6693
6694     /* Reading is done page-by-page. If we've run out of bytes in the page we need to move to the next one. */
6695     while (bytesRead < bytesToRead) {
6696         size_t bytesRemainingToRead = bytesToRead - bytesRead;
6697
6698         if (oggbs->bytesRemainingInPage >= bytesRemainingToRead) {
6699             DRFLAC_COPY_MEMORY(pRunningBufferOut, oggbs->pageData + (oggbs->pageDataSize - oggbs->bytesRemainingInPage), bytesRemainingToRead);
6700             bytesRead += bytesRemainingToRead;
6701             oggbs->bytesRemainingInPage -= (drflac_uint32)bytesRemainingToRead;
6702             break;
6703         }
6704
6705         /* If we get here it means some of the requested data is contained in the next pages. */
6706         if (oggbs->bytesRemainingInPage > 0) {
6707             DRFLAC_COPY_MEMORY(pRunningBufferOut, oggbs->pageData + (oggbs->pageDataSize - oggbs->bytesRemainingInPage), oggbs->bytesRemainingInPage);
6708             bytesRead += oggbs->bytesRemainingInPage;
6709             pRunningBufferOut += oggbs->bytesRemainingInPage;
6710             oggbs->bytesRemainingInPage = 0;
6711         }
6712
6713         DRFLAC_ASSERT(bytesRemainingToRead > 0);
6714         if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch)) {
6715             break;  /* Failed to go to the next page. Might have simply hit the end of the stream. */
6716         }
6717     }
6718
6719     return bytesRead;
6720 }
6721
6722 static drflac_bool32 drflac__on_seek_ogg(void* pUserData, int offset, drflac_seek_origin origin)
6723 {
6724     drflac_oggbs* oggbs = (drflac_oggbs*)pUserData;
6725     int bytesSeeked = 0;
6726
6727     DRFLAC_ASSERT(oggbs != NULL);
6728     DRFLAC_ASSERT(offset >= 0);  /* <-- Never seek backwards. */
6729
6730     /* Seeking is always forward which makes things a lot simpler. */
6731     if (origin == drflac_seek_origin_start) {
6732         if (!drflac_oggbs__seek_physical(oggbs, (int)oggbs->firstBytePos, drflac_seek_origin_start)) {
6733             return DRFLAC_FALSE;
6734         }
6735
6736         if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_fail_on_crc_mismatch)) {
6737             return DRFLAC_FALSE;
6738         }
6739
6740         return drflac__on_seek_ogg(pUserData, offset, drflac_seek_origin_current);
6741     }
6742
6743     DRFLAC_ASSERT(origin == drflac_seek_origin_current);
6744
6745     while (bytesSeeked < offset) {
6746         int bytesRemainingToSeek = offset - bytesSeeked;
6747         DRFLAC_ASSERT(bytesRemainingToSeek >= 0);
6748
6749         if (oggbs->bytesRemainingInPage >= (size_t)bytesRemainingToSeek) {
6750             bytesSeeked += bytesRemainingToSeek;
6751             (void)bytesSeeked;  /* <-- Silence a dead store warning emitted by Clang Static Analyzer. */
6752             oggbs->bytesRemainingInPage -= bytesRemainingToSeek;
6753             break;
6754         }
6755
6756         /* If we get here it means some of the requested data is contained in the next pages. */
6757         if (oggbs->bytesRemainingInPage > 0) {
6758             bytesSeeked += (int)oggbs->bytesRemainingInPage;
6759             oggbs->bytesRemainingInPage = 0;
6760         }
6761
6762         DRFLAC_ASSERT(bytesRemainingToSeek > 0);
6763         if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_fail_on_crc_mismatch)) {
6764             /* Failed to go to the next page. We either hit the end of the stream or had a CRC mismatch. */
6765             return DRFLAC_FALSE;
6766         }
6767     }
6768
6769     return DRFLAC_TRUE;
6770 }
6771
6772
6773 static drflac_bool32 drflac_ogg__seek_to_pcm_frame(drflac* pFlac, drflac_uint64 pcmFrameIndex)
6774 {
6775     drflac_oggbs* oggbs = (drflac_oggbs*)pFlac->_oggbs;
6776     drflac_uint64 originalBytePos;
6777     drflac_uint64 runningGranulePosition;
6778     drflac_uint64 runningFrameBytePos;
6779     drflac_uint64 runningPCMFrameCount;
6780
6781     DRFLAC_ASSERT(oggbs != NULL);
6782
6783     originalBytePos = oggbs->currentBytePos;   /* For recovery. Points to the OggS identifier. */
6784
6785     /* First seek to the first frame. */
6786     if (!drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes)) {
6787         return DRFLAC_FALSE;
6788     }
6789     oggbs->bytesRemainingInPage = 0;
6790
6791     runningGranulePosition = 0;
6792     for (;;) {
6793         if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch)) {
6794             drflac_oggbs__seek_physical(oggbs, originalBytePos, drflac_seek_origin_start);
6795             return DRFLAC_FALSE;   /* Never did find that sample... */
6796         }
6797
6798         runningFrameBytePos = oggbs->currentBytePos - drflac_ogg__get_page_header_size(&oggbs->currentPageHeader) - oggbs->pageDataSize;
6799         if (oggbs->currentPageHeader.granulePosition >= pcmFrameIndex) {
6800             break; /* The sample is somewhere in the previous page. */
6801         }
6802
6803         /*
6804         At this point we know the sample is not in the previous page. It could possibly be in this page. For simplicity we
6805         disregard any pages that do not begin a fresh packet.
6806         */
6807         if ((oggbs->currentPageHeader.headerType & 0x01) == 0) {    /* <-- Is it a fresh page? */
6808             if (oggbs->currentPageHeader.segmentTable[0] >= 2) {
6809                 drflac_uint8 firstBytesInPage[2];
6810                 firstBytesInPage[0] = oggbs->pageData[0];
6811                 firstBytesInPage[1] = oggbs->pageData[1];
6812
6813                 if ((firstBytesInPage[0] == 0xFF) && (firstBytesInPage[1] & 0xFC) == 0xF8) {    /* <-- Does the page begin with a frame's sync code? */
6814                     runningGranulePosition = oggbs->currentPageHeader.granulePosition;
6815                 }
6816
6817                 continue;
6818             }
6819         }
6820     }
6821
6822     /*
6823     We found the page that that is closest to the sample, so now we need to find it. The first thing to do is seek to the
6824     start of that page. In the loop above we checked that it was a fresh page which means this page is also the start of
6825     a new frame. This property means that after we've seeked to the page we can immediately start looping over frames until
6826     we find the one containing the target sample.
6827     */
6828     if (!drflac_oggbs__seek_physical(oggbs, runningFrameBytePos, drflac_seek_origin_start)) {
6829         return DRFLAC_FALSE;
6830     }
6831     if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch)) {
6832         return DRFLAC_FALSE;
6833     }
6834
6835     /*
6836     At this point we'll be sitting on the first byte of the frame header of the first frame in the page. We just keep
6837     looping over these frames until we find the one containing the sample we're after.
6838     */
6839     runningPCMFrameCount = runningGranulePosition;
6840     for (;;) {
6841         /*
6842         There are two ways to find the sample and seek past irrelevant frames:
6843           1) Use the native FLAC decoder.
6844           2) Use Ogg's framing system.
6845
6846         Both of these options have their own pros and cons. Using the native FLAC decoder is slower because it needs to
6847         do a full decode of the frame. Using Ogg's framing system is faster, but more complicated and involves some code
6848         duplication for the decoding of frame headers.
6849
6850         Another thing to consider is that using the Ogg framing system will perform direct seeking of the physical Ogg
6851         bitstream. This is important to consider because it means we cannot read data from the drflac_bs object using the
6852         standard drflac__*() APIs because that will read in extra data for its own internal caching which in turn breaks
6853         the positioning of the read pointer of the physical Ogg bitstream. Therefore, anything that would normally be read
6854         using the native FLAC decoding APIs, such as drflac__read_next_flac_frame_header(), need to be re-implemented so as to
6855         avoid the use of the drflac_bs object.
6856
6857         Considering these issues, I have decided to use the slower native FLAC decoding method for the following reasons:
6858           1) Seeking is already partially accelerated using Ogg's paging system in the code block above.
6859           2) Seeking in an Ogg encapsulated FLAC stream is probably quite uncommon.
6860           3) Simplicity.
6861         */
6862         drflac_uint64 firstPCMFrameInFLACFrame = 0;
6863         drflac_uint64 lastPCMFrameInFLACFrame = 0;
6864         drflac_uint64 pcmFrameCountInThisFrame;
6865
6866         if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
6867             return DRFLAC_FALSE;
6868         }
6869
6870         drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);
6871
6872         pcmFrameCountInThisFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;
6873
6874         /* If we are seeking to the end of the file and we've just hit it, we're done. */
6875         if (pcmFrameIndex == pFlac->totalPCMFrameCount && (runningPCMFrameCount + pcmFrameCountInThisFrame) == pFlac->totalPCMFrameCount) {
6876             drflac_result result = drflac__decode_flac_frame(pFlac);
6877             if (result == DRFLAC_SUCCESS) {
6878                 pFlac->currentPCMFrame = pcmFrameIndex;
6879                 pFlac->currentFLACFrame.pcmFramesRemaining = 0;
6880                 return DRFLAC_TRUE;
6881             } else {
6882                 return DRFLAC_FALSE;
6883             }
6884         }
6885
6886         if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFrame)) {
6887             /*
6888             The sample should be in this FLAC frame. We need to fully decode it, however if it's an invalid frame (a CRC mismatch), we need to pretend
6889             it never existed and keep iterating.
6890             */
6891             drflac_result result = drflac__decode_flac_frame(pFlac);
6892             if (result == DRFLAC_SUCCESS) {
6893                 /* The frame is valid. We just need to skip over some samples to ensure it's sample-exact. */
6894                 drflac_uint64 pcmFramesToDecode = (size_t)(pcmFrameIndex - runningPCMFrameCount);    /* <-- Safe cast because the maximum number of samples in a frame is 65535. */
6895                 if (pcmFramesToDecode == 0) {
6896                     return DRFLAC_TRUE;
6897                 }
6898
6899                 pFlac->currentPCMFrame = runningPCMFrameCount;
6900
6901                 return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;  /* <-- If this fails, something bad has happened (it should never fail). */
6902             } else {
6903                 if (result == DRFLAC_CRC_MISMATCH) {
6904                     continue;   /* CRC mismatch. Pretend this frame never existed. */
6905                 } else {
6906                     return DRFLAC_FALSE;
6907                 }
6908             }
6909         } else {
6910             /*
6911             It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
6912             frame never existed and leave the running sample count untouched.
6913             */
6914             drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
6915             if (result == DRFLAC_SUCCESS) {
6916                 runningPCMFrameCount += pcmFrameCountInThisFrame;
6917             } else {
6918                 if (result == DRFLAC_CRC_MISMATCH) {
6919                     continue;   /* CRC mismatch. Pretend this frame never existed. */
6920                 } else {
6921                     return DRFLAC_FALSE;
6922                 }
6923             }
6924         }
6925     }
6926 }
6927
6928
6929
6930 static drflac_bool32 drflac__init_private__ogg(drflac_init_info* pInit, drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, void* pUserDataMD, drflac_bool32 relaxed)
6931 {
6932     drflac_ogg_page_header header;
6933     drflac_uint32 crc32 = DRFLAC_OGG_CAPTURE_PATTERN_CRC32;
6934     drflac_uint32 bytesRead = 0;
6935
6936     /* Pre Condition: The bit stream should be sitting just past the 4-byte OggS capture pattern. */
6937     (void)relaxed;
6938
6939     pInit->container = drflac_container_ogg;
6940     pInit->oggFirstBytePos = 0;
6941
6942     /*
6943     We'll get here if the first 4 bytes of the stream were the OggS capture pattern, however it doesn't necessarily mean the
6944     stream includes FLAC encoded audio. To check for this we need to scan the beginning-of-stream page markers and check if
6945     any match the FLAC specification. Important to keep in mind that the stream may be multiplexed.
6946     */
6947     if (drflac_ogg__read_page_header_after_capture_pattern(onRead, pUserData, &header, &bytesRead, &crc32) != DRFLAC_SUCCESS) {
6948         return DRFLAC_FALSE;
6949     }
6950     pInit->runningFilePos += bytesRead;
6951
6952     for (;;) {
6953         int pageBodySize;
6954
6955         /* Break if we're past the beginning of stream page. */
6956         if ((header.headerType & 0x02) == 0) {
6957             return DRFLAC_FALSE;
6958         }
6959
6960         /* Check if it's a FLAC header. */
6961         pageBodySize = drflac_ogg__get_page_body_size(&header);
6962         if (pageBodySize == 51) {   /* 51 = the lacing value of the FLAC header packet. */
6963             /* It could be a FLAC page... */
6964             drflac_uint32 bytesRemainingInPage = pageBodySize;
6965             drflac_uint8 packetType;
6966
6967             if (onRead(pUserData, &packetType, 1) != 1) {
6968                 return DRFLAC_FALSE;
6969             }
6970
6971             bytesRemainingInPage -= 1;
6972             if (packetType == 0x7F) {
6973                 /* Increasingly more likely to be a FLAC page... */
6974                 drflac_uint8 sig[4];
6975                 if (onRead(pUserData, sig, 4) != 4) {
6976                     return DRFLAC_FALSE;
6977                 }
6978
6979                 bytesRemainingInPage -= 4;
6980                 if (sig[0] == 'F' && sig[1] == 'L' && sig[2] == 'A' && sig[3] == 'C') {
6981                     /* Almost certainly a FLAC page... */
6982                     drflac_uint8 mappingVersion[2];
6983                     if (onRead(pUserData, mappingVersion, 2) != 2) {
6984                         return DRFLAC_FALSE;
6985                     }
6986
6987                     if (mappingVersion[0] != 1) {
6988                         return DRFLAC_FALSE;   /* Only supporting version 1.x of the Ogg mapping. */
6989                     }
6990
6991                     /*
6992                     The next 2 bytes are the non-audio packets, not including this one. We don't care about this because we're going to
6993                     be handling it in a generic way based on the serial number and packet types.
6994                     */
6995                     if (!onSeek(pUserData, 2, drflac_seek_origin_current)) {
6996                         return DRFLAC_FALSE;
6997                     }
6998
6999                     /* Expecting the native FLAC signature "fLaC". */
7000                     if (onRead(pUserData, sig, 4) != 4) {
7001                         return DRFLAC_FALSE;
7002                     }
7003
7004                     if (sig[0] == 'f' && sig[1] == 'L' && sig[2] == 'a' && sig[3] == 'C') {
7005                         /* The remaining data in the page should be the STREAMINFO block. */
7006                         drflac_streaminfo streaminfo;
7007                         drflac_uint8 isLastBlock;
7008                         drflac_uint8 blockType;
7009                         drflac_uint32 blockSize;
7010                         if (!drflac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize)) {
7011                             return DRFLAC_FALSE;
7012                         }
7013
7014                         if (blockType != DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO || blockSize != 34) {
7015                             return DRFLAC_FALSE;    /* Invalid block type. First block must be the STREAMINFO block. */
7016                         }
7017
7018                         if (drflac__read_streaminfo(onRead, pUserData, &streaminfo)) {
7019                             /* Success! */
7020                             pInit->hasStreamInfoBlock      = DRFLAC_TRUE;
7021                             pInit->sampleRate              = streaminfo.sampleRate;
7022                             pInit->channels                = streaminfo.channels;
7023                             pInit->bitsPerSample           = streaminfo.bitsPerSample;
7024                             pInit->totalPCMFrameCount      = streaminfo.totalPCMFrameCount;
7025                             pInit->maxBlockSizeInPCMFrames = streaminfo.maxBlockSizeInPCMFrames;
7026                             pInit->hasMetadataBlocks       = !isLastBlock;
7027
7028                             if (onMeta) {
7029                                 drflac_metadata metadata;
7030                                 metadata.type = DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO;
7031                                 metadata.pRawData = NULL;
7032                                 metadata.rawDataSize = 0;
7033                                 metadata.data.streaminfo = streaminfo;
7034                                 onMeta(pUserDataMD, &metadata);
7035                             }
7036
7037                             pInit->runningFilePos  += pageBodySize;
7038                             pInit->oggFirstBytePos  = pInit->runningFilePos - 79;   /* Subtracting 79 will place us right on top of the "OggS" identifier of the FLAC bos page. */
7039                             pInit->oggSerial        = header.serialNumber;
7040                             pInit->oggBosHeader     = header;
7041                             break;
7042                         } else {
7043                             /* Failed to read STREAMINFO block. Aww, so close... */
7044                             return DRFLAC_FALSE;
7045                         }
7046                     } else {
7047                         /* Invalid file. */
7048                         return DRFLAC_FALSE;
7049                     }
7050                 } else {
7051                     /* Not a FLAC header. Skip it. */
7052                     if (!onSeek(pUserData, bytesRemainingInPage, drflac_seek_origin_current)) {
7053                         return DRFLAC_FALSE;
7054                     }
7055                 }
7056             } else {
7057                 /* Not a FLAC header. Seek past the entire page and move on to the next. */
7058                 if (!onSeek(pUserData, bytesRemainingInPage, drflac_seek_origin_current)) {
7059                     return DRFLAC_FALSE;
7060                 }
7061             }
7062         } else {
7063             if (!onSeek(pUserData, pageBodySize, drflac_seek_origin_current)) {
7064                 return DRFLAC_FALSE;
7065             }
7066         }
7067
7068         pInit->runningFilePos += pageBodySize;
7069
7070
7071         /* Read the header of the next page. */
7072         if (drflac_ogg__read_page_header(onRead, pUserData, &header, &bytesRead, &crc32) != DRFLAC_SUCCESS) {
7073             return DRFLAC_FALSE;
7074         }
7075         pInit->runningFilePos += bytesRead;
7076     }
7077
7078     /*
7079     If we get here it means we found a FLAC audio stream. We should be sitting on the first byte of the header of the next page. The next
7080     packets in the FLAC logical stream contain the metadata. The only thing left to do in the initialization phase for Ogg is to create the
7081     Ogg bistream object.
7082     */
7083     pInit->hasMetadataBlocks = DRFLAC_TRUE;    /* <-- Always have at least VORBIS_COMMENT metadata block. */
7084     return DRFLAC_TRUE;
7085 }
7086 #endif
7087
7088 static drflac_bool32 drflac__init_private(drflac_init_info* pInit, drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, drflac_container container, void* pUserData, void* pUserDataMD)
7089 {
7090     drflac_bool32 relaxed;
7091     drflac_uint8 id[4];
7092
7093     if (pInit == NULL || onRead == NULL || onSeek == NULL) {
7094         return DRFLAC_FALSE;
7095     }
7096
7097     DRFLAC_ZERO_MEMORY(pInit, sizeof(*pInit));
7098     pInit->onRead       = onRead;
7099     pInit->onSeek       = onSeek;
7100     pInit->onMeta       = onMeta;
7101     pInit->container    = container;
7102     pInit->pUserData    = pUserData;
7103     pInit->pUserDataMD  = pUserDataMD;
7104
7105     pInit->bs.onRead    = onRead;
7106     pInit->bs.onSeek    = onSeek;
7107     pInit->bs.pUserData = pUserData;
7108     drflac__reset_cache(&pInit->bs);
7109
7110
7111     /* If the container is explicitly defined then we can try opening in relaxed mode. */
7112     relaxed = container != drflac_container_unknown;
7113
7114     /* Skip over any ID3 tags. */
7115     for (;;) {
7116         if (onRead(pUserData, id, 4) != 4) {
7117             return DRFLAC_FALSE;    /* Ran out of data. */
7118         }
7119         pInit->runningFilePos += 4;
7120
7121         if (id[0] == 'I' && id[1] == 'D' && id[2] == '3') {
7122             drflac_uint8 header[6];
7123             drflac_uint8 flags;
7124             drflac_uint32 headerSize;
7125
7126             if (onRead(pUserData, header, 6) != 6) {
7127                 return DRFLAC_FALSE;    /* Ran out of data. */
7128             }
7129             pInit->runningFilePos += 6;
7130
7131             flags = header[1];
7132
7133             DRFLAC_COPY_MEMORY(&headerSize, header+2, 4);
7134             headerSize = drflac__unsynchsafe_32(drflac__be2host_32(headerSize));
7135             if (flags & 0x10) {
7136                 headerSize += 10;
7137             }
7138
7139             if (!onSeek(pUserData, headerSize, drflac_seek_origin_current)) {
7140                 return DRFLAC_FALSE;    /* Failed to seek past the tag. */
7141             }
7142             pInit->runningFilePos += headerSize;
7143         } else {
7144             break;
7145         }
7146     }
7147
7148     if (id[0] == 'f' && id[1] == 'L' && id[2] == 'a' && id[3] == 'C') {
7149         return drflac__init_private__native(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
7150     }
7151 #ifndef DR_FLAC_NO_OGG
7152     if (id[0] == 'O' && id[1] == 'g' && id[2] == 'g' && id[3] == 'S') {
7153         return drflac__init_private__ogg(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
7154     }
7155 #endif
7156
7157     /* If we get here it means we likely don't have a header. Try opening in relaxed mode, if applicable. */
7158     if (relaxed) {
7159         if (container == drflac_container_native) {
7160             return drflac__init_private__native(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
7161         }
7162 #ifndef DR_FLAC_NO_OGG
7163         if (container == drflac_container_ogg) {
7164             return drflac__init_private__ogg(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
7165         }
7166 #endif
7167     }
7168
7169     /* Unsupported container. */
7170     return DRFLAC_FALSE;
7171 }
7172
7173 static void drflac__init_from_info(drflac* pFlac, const drflac_init_info* pInit)
7174 {
7175     DRFLAC_ASSERT(pFlac != NULL);
7176     DRFLAC_ASSERT(pInit != NULL);
7177
7178     DRFLAC_ZERO_MEMORY(pFlac, sizeof(*pFlac));
7179     pFlac->bs                      = pInit->bs;
7180     pFlac->onMeta                  = pInit->onMeta;
7181     pFlac->pUserDataMD             = pInit->pUserDataMD;
7182     pFlac->maxBlockSizeInPCMFrames = pInit->maxBlockSizeInPCMFrames;
7183     pFlac->sampleRate              = pInit->sampleRate;
7184     pFlac->channels                = (drflac_uint8)pInit->channels;
7185     pFlac->bitsPerSample           = (drflac_uint8)pInit->bitsPerSample;
7186     pFlac->totalPCMFrameCount      = pInit->totalPCMFrameCount;
7187     pFlac->container               = pInit->container;
7188 }
7189
7190
7191 static drflac* drflac_open_with_metadata_private(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, drflac_container container, void* pUserData, void* pUserDataMD, const drflac_allocation_callbacks* pAllocationCallbacks)
7192 {
7193     drflac_init_info init;
7194     drflac_uint32 allocationSize;
7195     drflac_uint32 wholeSIMDVectorCountPerChannel;
7196     drflac_uint32 decodedSamplesAllocationSize;
7197 #ifndef DR_FLAC_NO_OGG
7198     drflac_oggbs oggbs;
7199 #endif
7200     drflac_uint64 firstFramePos;
7201     drflac_uint64 seektablePos;
7202     drflac_uint32 seektableSize;
7203     drflac_allocation_callbacks allocationCallbacks;
7204     drflac* pFlac;
7205
7206     /* CPU support first. */
7207     drflac__init_cpu_caps();
7208
7209     if (!drflac__init_private(&init, onRead, onSeek, onMeta, container, pUserData, pUserDataMD)) {
7210         return NULL;
7211     }
7212
7213     if (pAllocationCallbacks != NULL) {
7214         allocationCallbacks = *pAllocationCallbacks;
7215         if (allocationCallbacks.onFree == NULL || (allocationCallbacks.onMalloc == NULL && allocationCallbacks.onRealloc == NULL)) {
7216             return NULL;    /* Invalid allocation callbacks. */
7217         }
7218     } else {
7219         allocationCallbacks.pUserData = NULL;
7220         allocationCallbacks.onMalloc  = drflac__malloc_default;
7221         allocationCallbacks.onRealloc = drflac__realloc_default;
7222         allocationCallbacks.onFree    = drflac__free_default;
7223     }
7224
7225
7226     /*
7227     The size of the allocation for the drflac object needs to be large enough to fit the following:
7228       1) The main members of the drflac structure
7229       2) A block of memory large enough to store the decoded samples of the largest frame in the stream
7230       3) If the container is Ogg, a drflac_oggbs object
7231
7232     The complicated part of the allocation is making sure there's enough room the decoded samples, taking into consideration
7233     the different SIMD instruction sets.
7234     */
7235     allocationSize = sizeof(drflac);
7236
7237     /*
7238     The allocation size for decoded frames depends on the number of 32-bit integers that fit inside the largest SIMD vector
7239     we are supporting.
7240     */
7241     if ((init.maxBlockSizeInPCMFrames % (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32))) == 0) {
7242         wholeSIMDVectorCountPerChannel = (init.maxBlockSizeInPCMFrames / (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32)));
7243     } else {
7244         wholeSIMDVectorCountPerChannel = (init.maxBlockSizeInPCMFrames / (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32))) + 1;
7245     }
7246
7247     decodedSamplesAllocationSize = wholeSIMDVectorCountPerChannel * DRFLAC_MAX_SIMD_VECTOR_SIZE * init.channels;
7248
7249     allocationSize += decodedSamplesAllocationSize;
7250     allocationSize += DRFLAC_MAX_SIMD_VECTOR_SIZE;  /* Allocate extra bytes to ensure we have enough for alignment. */
7251
7252 #ifndef DR_FLAC_NO_OGG
7253     /* There's additional data required for Ogg streams. */
7254     if (init.container == drflac_container_ogg) {
7255         allocationSize += sizeof(drflac_oggbs);
7256     }
7257
7258     DRFLAC_ZERO_MEMORY(&oggbs, sizeof(oggbs));
7259     if (init.container == drflac_container_ogg) {
7260         oggbs.onRead = onRead;
7261         oggbs.onSeek = onSeek;
7262         oggbs.pUserData = pUserData;
7263         oggbs.currentBytePos = init.oggFirstBytePos;
7264         oggbs.firstBytePos = init.oggFirstBytePos;
7265         oggbs.serialNumber = init.oggSerial;
7266         oggbs.bosPageHeader = init.oggBosHeader;
7267         oggbs.bytesRemainingInPage = 0;
7268     }
7269 #endif
7270
7271     /*
7272     This part is a bit awkward. We need to load the seektable so that it can be referenced in-memory, but I want the drflac object to
7273     consist of only a single heap allocation. To this, the size of the seek table needs to be known, which we determine when reading
7274     and decoding the metadata.
7275     */
7276     firstFramePos = 42;   /* <-- We know we are at byte 42 at this point. */
7277     seektablePos  = 0;
7278     seektableSize = 0;
7279     if (init.hasMetadataBlocks) {
7280         drflac_read_proc onReadOverride = onRead;
7281         drflac_seek_proc onSeekOverride = onSeek;
7282         void* pUserDataOverride = pUserData;
7283
7284 #ifndef DR_FLAC_NO_OGG
7285         if (init.container == drflac_container_ogg) {
7286             onReadOverride = drflac__on_read_ogg;
7287             onSeekOverride = drflac__on_seek_ogg;
7288             pUserDataOverride = (void*)&oggbs;
7289         }
7290 #endif
7291
7292         if (!drflac__read_and_decode_metadata(onReadOverride, onSeekOverride, onMeta, pUserDataOverride, pUserDataMD, &firstFramePos, &seektablePos, &seektableSize, &allocationCallbacks)) {
7293             return NULL;
7294         }
7295
7296         allocationSize += seektableSize;
7297     }
7298
7299
7300     pFlac = (drflac*)drflac__malloc_from_callbacks(allocationSize, &allocationCallbacks);
7301     if (pFlac == NULL) {
7302         return NULL;
7303     }
7304
7305     drflac__init_from_info(pFlac, &init);
7306     pFlac->allocationCallbacks = allocationCallbacks;
7307     pFlac->pDecodedSamples = (drflac_int32*)drflac_align((size_t)pFlac->pExtraData, DRFLAC_MAX_SIMD_VECTOR_SIZE);
7308
7309 #ifndef DR_FLAC_NO_OGG
7310     if (init.container == drflac_container_ogg) {
7311         drflac_oggbs* pInternalOggbs = (drflac_oggbs*)((drflac_uint8*)pFlac->pDecodedSamples + decodedSamplesAllocationSize + seektableSize);
7312         *pInternalOggbs = oggbs;
7313
7314         /* The Ogg bistream needs to be layered on top of the original bitstream. */
7315         pFlac->bs.onRead = drflac__on_read_ogg;
7316         pFlac->bs.onSeek = drflac__on_seek_ogg;
7317         pFlac->bs.pUserData = (void*)pInternalOggbs;
7318         pFlac->_oggbs = (void*)pInternalOggbs;
7319     }
7320 #endif
7321
7322     pFlac->firstFLACFramePosInBytes = firstFramePos;
7323
7324     /* NOTE: Seektables are not currently compatible with Ogg encapsulation (Ogg has its own accelerated seeking system). I may change this later, so I'm leaving this here for now. */
7325 #ifndef DR_FLAC_NO_OGG
7326     if (init.container == drflac_container_ogg)
7327     {
7328         pFlac->pSeekpoints = NULL;
7329         pFlac->seekpointCount = 0;
7330     }
7331     else
7332 #endif
7333     {
7334         /* If we have a seektable we need to load it now, making sure we move back to where we were previously. */
7335         if (seektablePos != 0) {
7336             pFlac->seekpointCount = seektableSize / sizeof(*pFlac->pSeekpoints);
7337             pFlac->pSeekpoints = (drflac_seekpoint*)((drflac_uint8*)pFlac->pDecodedSamples + decodedSamplesAllocationSize);
7338
7339             DRFLAC_ASSERT(pFlac->bs.onSeek != NULL);
7340             DRFLAC_ASSERT(pFlac->bs.onRead != NULL);
7341
7342             /* Seek to the seektable, then just read directly into our seektable buffer. */
7343             if (pFlac->bs.onSeek(pFlac->bs.pUserData, (int)seektablePos, drflac_seek_origin_start)) {
7344                 if (pFlac->bs.onRead(pFlac->bs.pUserData, pFlac->pSeekpoints, seektableSize) == seektableSize) {
7345                     /* Endian swap. */
7346                     drflac_uint32 iSeekpoint;
7347                     for (iSeekpoint = 0; iSeekpoint < pFlac->seekpointCount; ++iSeekpoint) {
7348                         pFlac->pSeekpoints[iSeekpoint].firstPCMFrame   = drflac__be2host_64(pFlac->pSeekpoints[iSeekpoint].firstPCMFrame);
7349                         pFlac->pSeekpoints[iSeekpoint].flacFrameOffset = drflac__be2host_64(pFlac->pSeekpoints[iSeekpoint].flacFrameOffset);
7350                         pFlac->pSeekpoints[iSeekpoint].pcmFrameCount   = drflac__be2host_16(pFlac->pSeekpoints[iSeekpoint].pcmFrameCount);
7351                     }
7352                 } else {
7353                     /* Failed to read the seektable. Pretend we don't have one. */
7354                     pFlac->pSeekpoints = NULL;
7355                     pFlac->seekpointCount = 0;
7356                 }
7357
7358                 /* We need to seek back to where we were. If this fails it's a critical error. */
7359                 if (!pFlac->bs.onSeek(pFlac->bs.pUserData, (int)pFlac->firstFLACFramePosInBytes, drflac_seek_origin_start)) {
7360                     drflac__free_from_callbacks(pFlac, &allocationCallbacks);
7361                     return NULL;
7362                 }
7363             } else {
7364                 /* Failed to seek to the seektable. Ominous sign, but for now we can just pretend we don't have one. */
7365                 pFlac->pSeekpoints = NULL;
7366                 pFlac->seekpointCount = 0;
7367             }
7368         }
7369     }
7370
7371
7372     /*
7373     If we get here, but don't have a STREAMINFO block, it means we've opened the stream in relaxed mode and need to decode
7374     the first frame.
7375     */
7376     if (!init.hasStreamInfoBlock) {
7377         pFlac->currentFLACFrame.header = init.firstFrameHeader;
7378         for (;;) {
7379             drflac_result result = drflac__decode_flac_frame(pFlac);
7380             if (result == DRFLAC_SUCCESS) {
7381                 break;
7382             } else {
7383                 if (result == DRFLAC_CRC_MISMATCH) {
7384                     if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
7385                         drflac__free_from_callbacks(pFlac, &allocationCallbacks);
7386                         return NULL;
7387                     }
7388                     continue;
7389                 } else {
7390                     drflac__free_from_callbacks(pFlac, &allocationCallbacks);
7391                     return NULL;
7392                 }
7393             }
7394         }
7395     }
7396
7397     return pFlac;
7398 }
7399
7400 static size_t drflac__on_read_memory(void* pUserData, void* bufferOut, size_t bytesToRead)
7401 {
7402     drflac__memory_stream* memoryStream = (drflac__memory_stream*)pUserData;
7403     size_t bytesRemaining;
7404
7405     DRFLAC_ASSERT(memoryStream != NULL);
7406     DRFLAC_ASSERT(memoryStream->dataSize >= memoryStream->currentReadPos);
7407
7408     bytesRemaining = memoryStream->dataSize - memoryStream->currentReadPos;
7409     if (bytesToRead > bytesRemaining) {
7410         bytesToRead = bytesRemaining;
7411     }
7412
7413     if (bytesToRead > 0) {
7414         DRFLAC_COPY_MEMORY(bufferOut, memoryStream->data + memoryStream->currentReadPos, bytesToRead);
7415         memoryStream->currentReadPos += bytesToRead;
7416     }
7417
7418     return bytesToRead;
7419 }
7420
7421 static drflac_bool32 drflac__on_seek_memory(void* pUserData, int offset, drflac_seek_origin origin)
7422 {
7423     drflac__memory_stream* memoryStream = (drflac__memory_stream*)pUserData;
7424
7425     DRFLAC_ASSERT(memoryStream != NULL);
7426     DRFLAC_ASSERT(offset >= 0); /* <-- Never seek backwards. */
7427
7428     if (offset > (drflac_int64)memoryStream->dataSize) {
7429         return DRFLAC_FALSE;
7430     }
7431
7432     if (origin == drflac_seek_origin_current) {
7433         if (memoryStream->currentReadPos + offset <= memoryStream->dataSize) {
7434             memoryStream->currentReadPos += offset;
7435         } else {
7436             return DRFLAC_FALSE;  /* Trying to seek too far forward. */
7437         }
7438     } else {
7439         if ((drflac_uint32)offset <= memoryStream->dataSize) {
7440             memoryStream->currentReadPos = offset;
7441         } else {
7442             return DRFLAC_FALSE;  /* Trying to seek too far forward. */
7443         }
7444     }
7445
7446     return DRFLAC_TRUE;
7447 }
7448
7449 DRFLAC_API drflac* drflac_open_memory(const void* pData, size_t dataSize, const drflac_allocation_callbacks* pAllocationCallbacks)
7450 {
7451     drflac__memory_stream memoryStream;
7452     drflac* pFlac;
7453
7454     memoryStream.data = (const drflac_uint8*)pData;
7455     memoryStream.dataSize = dataSize;
7456     memoryStream.currentReadPos = 0;
7457     pFlac = drflac_open(drflac__on_read_memory, drflac__on_seek_memory, &memoryStream, pAllocationCallbacks);
7458     if (pFlac == NULL) {
7459         return NULL;
7460     }
7461
7462     pFlac->memoryStream = memoryStream;
7463
7464     /* This is an awful hack... */
7465 #ifndef DR_FLAC_NO_OGG
7466     if (pFlac->container == drflac_container_ogg)
7467     {
7468         drflac_oggbs* oggbs = (drflac_oggbs*)pFlac->_oggbs;
7469         oggbs->pUserData = &pFlac->memoryStream;
7470     }
7471     else
7472 #endif
7473     {
7474         pFlac->bs.pUserData = &pFlac->memoryStream;
7475     }
7476
7477     return pFlac;
7478 }
7479
7480 DRFLAC_API drflac* drflac_open_memory_with_metadata(const void* pData, size_t dataSize, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
7481 {
7482     drflac__memory_stream memoryStream;
7483     drflac* pFlac;
7484
7485     memoryStream.data = (const drflac_uint8*)pData;
7486     memoryStream.dataSize = dataSize;
7487     memoryStream.currentReadPos = 0;
7488     pFlac = drflac_open_with_metadata_private(drflac__on_read_memory, drflac__on_seek_memory, onMeta, drflac_container_unknown, &memoryStream, pUserData, pAllocationCallbacks);
7489     if (pFlac == NULL) {
7490         return NULL;
7491     }
7492
7493     pFlac->memoryStream = memoryStream;
7494
7495     /* This is an awful hack... */
7496 #ifndef DR_FLAC_NO_OGG
7497     if (pFlac->container == drflac_container_ogg)
7498     {
7499         drflac_oggbs* oggbs = (drflac_oggbs*)pFlac->_oggbs;
7500         oggbs->pUserData = &pFlac->memoryStream;
7501     }
7502     else
7503 #endif
7504     {
7505         pFlac->bs.pUserData = &pFlac->memoryStream;
7506     }
7507
7508     return pFlac;
7509 }
7510
7511
7512
7513 DRFLAC_API drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
7514 {
7515     return drflac_open_with_metadata_private(onRead, onSeek, NULL, drflac_container_unknown, pUserData, pUserData, pAllocationCallbacks);
7516 }
7517 DRFLAC_API drflac* drflac_open_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
7518 {
7519     return drflac_open_with_metadata_private(onRead, onSeek, NULL, container, pUserData, pUserData, pAllocationCallbacks);
7520 }
7521
7522 DRFLAC_API drflac* drflac_open_with_metadata(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
7523 {
7524     return drflac_open_with_metadata_private(onRead, onSeek, onMeta, drflac_container_unknown, pUserData, pUserData, pAllocationCallbacks);
7525 }
7526 DRFLAC_API drflac* drflac_open_with_metadata_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
7527 {
7528     return drflac_open_with_metadata_private(onRead, onSeek, onMeta, container, pUserData, pUserData, pAllocationCallbacks);
7529 }
7530
7531 DRFLAC_API void drflac_close(drflac* pFlac)
7532 {
7533     if (pFlac == NULL) {
7534         return;
7535     }
7536
7537     drflac__free_from_callbacks(pFlac, &pFlac->allocationCallbacks);
7538 }
7539
7540 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
7541 {
7542     drflac_uint64 i;
7543     drflac_uint64 frameCount4 = frameCount >> 2;
7544     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
7545     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
7546     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7547     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7548
7549     for (i = 0; i < frameCount4; ++i) {
7550         drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
7551         drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
7552         drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
7553         drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
7554
7555         drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
7556         drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
7557         drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
7558         drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
7559
7560         drflac_uint32 right0 = left0 - side0;
7561         drflac_uint32 right1 = left1 - side1;
7562         drflac_uint32 right2 = left2 - side2;
7563         drflac_uint32 right3 = left3 - side3;
7564
7565         pOutputSamples[i*8+0] = (drflac_int32)left0;
7566         pOutputSamples[i*8+1] = (drflac_int32)right0;
7567         pOutputSamples[i*8+2] = (drflac_int32)left1;
7568         pOutputSamples[i*8+3] = (drflac_int32)right1;
7569         pOutputSamples[i*8+4] = (drflac_int32)left2;
7570         pOutputSamples[i*8+5] = (drflac_int32)right2;
7571         pOutputSamples[i*8+6] = (drflac_int32)left3;
7572         pOutputSamples[i*8+7] = (drflac_int32)right3;
7573     }
7574
7575     for (i = (frameCount4 << 2); i < frameCount; ++i) {
7576         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
7577         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
7578         drflac_uint32 right = left - side;
7579
7580         pOutputSamples[i*2+0] = (drflac_int32)left;
7581         pOutputSamples[i*2+1] = (drflac_int32)right;
7582     }
7583 }
7584
7585 #if defined(DRFLAC_SUPPORT_SSE2)
7586 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
7587 {
7588     drflac_uint64 i;
7589     drflac_uint64 frameCount4 = frameCount >> 2;
7590     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
7591     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
7592     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7593     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7594
7595     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
7596
7597     for (i = 0; i < frameCount4; ++i) {
7598         __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
7599         __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
7600         __m128i right = _mm_sub_epi32(left, side);
7601
7602         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
7603         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
7604     }
7605
7606     for (i = (frameCount4 << 2); i < frameCount; ++i) {
7607         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
7608         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
7609         drflac_uint32 right = left - side;
7610
7611         pOutputSamples[i*2+0] = (drflac_int32)left;
7612         pOutputSamples[i*2+1] = (drflac_int32)right;
7613     }
7614 }
7615 #endif
7616
7617 #if defined(DRFLAC_SUPPORT_NEON)
/*
NEON variant of the left/side stereo decode to interleaved signed 32-bit output.

Input 0 is treated as the left channel and input 1 as the side channel. Both are shifted left by
unusedBitsPerSample plus the wasted bit count of their own subframe, the right channel is rebuilt as
(left - side), and the frames are stored to pOutputSamples as left/right pairs. Frames are processed four at a
time with vector instructions; the remaining (fewer than four) frames are processed one at a time.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    const drflac_uint32* pLeftIn = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSideIn = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 leftShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 sideShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    const drflac_uint64 vectorizedFrameCount = (frameCount >> 2) << 2;  /* frameCount rounded down to a multiple of 4. */
    int32x4_t leftShift4;
    int32x4_t sideShift4;
    drflac_uint64 iFrame;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    leftShift4 = vdupq_n_s32(leftShift);
    sideShift4 = vdupq_n_s32(sideShift);

    /* Vector loop: four frames in, eight interleaved samples out. */
    for (iFrame = 0; iFrame < vectorizedFrameCount; iFrame += 4) {
        uint32x4_t left4  = vshlq_u32(vld1q_u32(pLeftIn + iFrame), leftShift4);
        uint32x4_t side4  = vshlq_u32(vld1q_u32(pSideIn + iFrame), sideShift4);
        uint32x4_t right4 = vsubq_u32(left4, side4);

        drflac__vst2q_u32((drflac_uint32*)pOutputSamples + iFrame*2, vzipq_u32(left4, right4));
    }

    /* Scalar loop for the trailing frames. */
    for (; iFrame < frameCount; iFrame += 1) {
        drflac_uint32 leftSample  = pLeftIn[iFrame] << leftShift;
        drflac_uint32 sideSample  = pSideIn[iFrame] << sideShift;
        drflac_uint32 rightSample = leftSample - sideSample;

        pOutputSamples[iFrame*2+0] = (drflac_int32)leftSample;
        pOutputSamples[iFrame*2+1] = (drflac_int32)rightSample;
    }
}
7655 #endif
7656
7657 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
7658 {
7659 #if defined(DRFLAC_SUPPORT_SSE2)
7660     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
7661         drflac_read_pcm_frames_s32__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
7662     } else
7663 #elif defined(DRFLAC_SUPPORT_NEON)
7664     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
7665         drflac_read_pcm_frames_s32__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
7666     } else
7667 #endif
7668     {
7669         /* Scalar fallback. */
7670         drflac_read_pcm_frames_s32__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
7671     }
7672 }
7673
7674
7675 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
7676 {
7677     drflac_uint64 i;
7678     drflac_uint64 frameCount4 = frameCount >> 2;
7679     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
7680     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
7681     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7682     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7683
7684     for (i = 0; i < frameCount4; ++i) {
7685         drflac_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
7686         drflac_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
7687         drflac_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
7688         drflac_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
7689
7690         drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
7691         drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
7692         drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
7693         drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
7694
7695         drflac_uint32 left0 = right0 + side0;
7696         drflac_uint32 left1 = right1 + side1;
7697         drflac_uint32 left2 = right2 + side2;
7698         drflac_uint32 left3 = right3 + side3;
7699
7700         pOutputSamples[i*8+0] = (drflac_int32)left0;
7701         pOutputSamples[i*8+1] = (drflac_int32)right0;
7702         pOutputSamples[i*8+2] = (drflac_int32)left1;
7703         pOutputSamples[i*8+3] = (drflac_int32)right1;
7704         pOutputSamples[i*8+4] = (drflac_int32)left2;
7705         pOutputSamples[i*8+5] = (drflac_int32)right2;
7706         pOutputSamples[i*8+6] = (drflac_int32)left3;
7707         pOutputSamples[i*8+7] = (drflac_int32)right3;
7708     }
7709
7710     for (i = (frameCount4 << 2); i < frameCount; ++i) {
7711         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
7712         drflac_uint32 right = pInputSamples1U32[i] << shift1;
7713         drflac_uint32 left  = right + side;
7714
7715         pOutputSamples[i*2+0] = (drflac_int32)left;
7716         pOutputSamples[i*2+1] = (drflac_int32)right;
7717     }
7718 }
7719
7720 #if defined(DRFLAC_SUPPORT_SSE2)
7721 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
7722 {
7723     drflac_uint64 i;
7724     drflac_uint64 frameCount4 = frameCount >> 2;
7725     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
7726     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
7727     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7728     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7729
7730     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
7731
7732     for (i = 0; i < frameCount4; ++i) {
7733         __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
7734         __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
7735         __m128i left  = _mm_add_epi32(right, side);
7736
7737         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
7738         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
7739     }
7740
7741     for (i = (frameCount4 << 2); i < frameCount; ++i) {
7742         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
7743         drflac_uint32 right = pInputSamples1U32[i] << shift1;
7744         drflac_uint32 left  = right + side;
7745
7746         pOutputSamples[i*2+0] = (drflac_int32)left;
7747         pOutputSamples[i*2+1] = (drflac_int32)right;
7748     }
7749 }
7750 #endif
7751
7752 #if defined(DRFLAC_SUPPORT_NEON)
/*
NEON variant of the right/side stereo decode to interleaved signed 32-bit output.

Input 0 is treated as the side channel and input 1 as the right channel. Both are shifted left by
unusedBitsPerSample plus the wasted bit count of their own subframe, the left channel is rebuilt as
(right + side), and the frames are stored to pOutputSamples as left/right pairs. Frames are processed four at a
time with vector instructions; the remaining frames are processed one at a time.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    const drflac_uint32* pSideIn  = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pRightIn = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 sideShift  = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 rightShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    const drflac_uint64 vectorizedFrameCount = (frameCount >> 2) << 2;  /* frameCount rounded down to a multiple of 4. */
    int32x4_t sideShift4;
    int32x4_t rightShift4;
    drflac_uint64 iFrame;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    sideShift4  = vdupq_n_s32(sideShift);
    rightShift4 = vdupq_n_s32(rightShift);

    /* Vector loop: four frames in, eight interleaved samples out. */
    for (iFrame = 0; iFrame < vectorizedFrameCount; iFrame += 4) {
        uint32x4_t side4  = vshlq_u32(vld1q_u32(pSideIn  + iFrame), sideShift4);
        uint32x4_t right4 = vshlq_u32(vld1q_u32(pRightIn + iFrame), rightShift4);
        uint32x4_t left4  = vaddq_u32(right4, side4);

        drflac__vst2q_u32((drflac_uint32*)pOutputSamples + iFrame*2, vzipq_u32(left4, right4));
    }

    /* Scalar loop for the trailing frames. */
    for (; iFrame < frameCount; iFrame += 1) {
        drflac_uint32 sideSample  = pSideIn[iFrame]  << sideShift;
        drflac_uint32 rightSample = pRightIn[iFrame] << rightShift;
        drflac_uint32 leftSample  = rightSample + sideSample;

        pOutputSamples[iFrame*2+0] = (drflac_int32)leftSample;
        pOutputSamples[iFrame*2+1] = (drflac_int32)rightSample;
    }
}
7790 #endif
7791
7792 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
7793 {
7794 #if defined(DRFLAC_SUPPORT_SSE2)
7795     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
7796         drflac_read_pcm_frames_s32__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
7797     } else
7798 #elif defined(DRFLAC_SUPPORT_NEON)
7799     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
7800         drflac_read_pcm_frames_s32__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
7801     } else
7802 #endif
7803     {
7804         /* Scalar fallback. */
7805         drflac_read_pcm_frames_s32__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
7806     }
7807 }
7808
7809
7810 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
7811 {
7812     drflac_uint64 i;
7813     drflac_uint64 frameCount4 = frameCount >> 2;
7814     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
7815     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
7816     drflac_int32 shift = unusedBitsPerSample;
7817
7818     if (shift > 0) {
7819         shift -= 1;
7820         for (i = 0; i < frameCount4; ++i) {
7821             drflac_uint32 temp0L;
7822             drflac_uint32 temp1L;
7823             drflac_uint32 temp2L;
7824             drflac_uint32 temp3L;
7825             drflac_uint32 temp0R;
7826             drflac_uint32 temp1R;
7827             drflac_uint32 temp2R;
7828             drflac_uint32 temp3R;
7829
7830             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7831             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7832             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7833             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7834
7835             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7836             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7837             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7838             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7839
7840             mid0 = (mid0 << 1) | (side0 & 0x01);
7841             mid1 = (mid1 << 1) | (side1 & 0x01);
7842             mid2 = (mid2 << 1) | (side2 & 0x01);
7843             mid3 = (mid3 << 1) | (side3 & 0x01);
7844
7845             temp0L = (mid0 + side0) << shift;
7846             temp1L = (mid1 + side1) << shift;
7847             temp2L = (mid2 + side2) << shift;
7848             temp3L = (mid3 + side3) << shift;
7849
7850             temp0R = (mid0 - side0) << shift;
7851             temp1R = (mid1 - side1) << shift;
7852             temp2R = (mid2 - side2) << shift;
7853             temp3R = (mid3 - side3) << shift;
7854
7855             pOutputSamples[i*8+0] = (drflac_int32)temp0L;
7856             pOutputSamples[i*8+1] = (drflac_int32)temp0R;
7857             pOutputSamples[i*8+2] = (drflac_int32)temp1L;
7858             pOutputSamples[i*8+3] = (drflac_int32)temp1R;
7859             pOutputSamples[i*8+4] = (drflac_int32)temp2L;
7860             pOutputSamples[i*8+5] = (drflac_int32)temp2R;
7861             pOutputSamples[i*8+6] = (drflac_int32)temp3L;
7862             pOutputSamples[i*8+7] = (drflac_int32)temp3R;
7863         }
7864     } else {
7865         for (i = 0; i < frameCount4; ++i) {
7866             drflac_uint32 temp0L;
7867             drflac_uint32 temp1L;
7868             drflac_uint32 temp2L;
7869             drflac_uint32 temp3L;
7870             drflac_uint32 temp0R;
7871             drflac_uint32 temp1R;
7872             drflac_uint32 temp2R;
7873             drflac_uint32 temp3R;
7874
7875             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7876             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7877             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7878             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7879
7880             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7881             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7882             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7883             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7884
7885             mid0 = (mid0 << 1) | (side0 & 0x01);
7886             mid1 = (mid1 << 1) | (side1 & 0x01);
7887             mid2 = (mid2 << 1) | (side2 & 0x01);
7888             mid3 = (mid3 << 1) | (side3 & 0x01);
7889
7890             temp0L = (drflac_uint32)((drflac_int32)(mid0 + side0) >> 1);
7891             temp1L = (drflac_uint32)((drflac_int32)(mid1 + side1) >> 1);
7892             temp2L = (drflac_uint32)((drflac_int32)(mid2 + side2) >> 1);
7893             temp3L = (drflac_uint32)((drflac_int32)(mid3 + side3) >> 1);
7894
7895             temp0R = (drflac_uint32)((drflac_int32)(mid0 - side0) >> 1);
7896             temp1R = (drflac_uint32)((drflac_int32)(mid1 - side1) >> 1);
7897             temp2R = (drflac_uint32)((drflac_int32)(mid2 - side2) >> 1);
7898             temp3R = (drflac_uint32)((drflac_int32)(mid3 - side3) >> 1);
7899
7900             pOutputSamples[i*8+0] = (drflac_int32)temp0L;
7901             pOutputSamples[i*8+1] = (drflac_int32)temp0R;
7902             pOutputSamples[i*8+2] = (drflac_int32)temp1L;
7903             pOutputSamples[i*8+3] = (drflac_int32)temp1R;
7904             pOutputSamples[i*8+4] = (drflac_int32)temp2L;
7905             pOutputSamples[i*8+5] = (drflac_int32)temp2R;
7906             pOutputSamples[i*8+6] = (drflac_int32)temp3L;
7907             pOutputSamples[i*8+7] = (drflac_int32)temp3R;
7908         }
7909     }
7910
7911     for (i = (frameCount4 << 2); i < frameCount; ++i) {
7912         drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7913         drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7914
7915         mid = (mid << 1) | (side & 0x01);
7916
7917         pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample);
7918         pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample);
7919     }
7920 }
7921
7922 #if defined(DRFLAC_SUPPORT_SSE2)
7923 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
7924 {
7925     drflac_uint64 i;
7926     drflac_uint64 frameCount4 = frameCount >> 2;
7927     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
7928     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
7929     drflac_int32 shift = unusedBitsPerSample;
7930
7931     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
7932
7933     if (shift == 0) {
7934         for (i = 0; i < frameCount4; ++i) {
7935             __m128i mid;
7936             __m128i side;
7937             __m128i left;
7938             __m128i right;
7939
7940             mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
7941             side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
7942
7943             mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
7944
7945             left  = _mm_srai_epi32(_mm_add_epi32(mid, side), 1);
7946             right = _mm_srai_epi32(_mm_sub_epi32(mid, side), 1);
7947
7948             _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
7949             _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
7950         }
7951
7952         for (i = (frameCount4 << 2); i < frameCount; ++i) {
7953             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7954             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7955
7956             mid = (mid << 1) | (side & 0x01);
7957
7958             pOutputSamples[i*2+0] = (drflac_int32)(mid + side) >> 1;
7959             pOutputSamples[i*2+1] = (drflac_int32)(mid - side) >> 1;
7960         }
7961     } else {
7962         shift -= 1;
7963         for (i = 0; i < frameCount4; ++i) {
7964             __m128i mid;
7965             __m128i side;
7966             __m128i left;
7967             __m128i right;
7968
7969             mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
7970             side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
7971
7972             mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
7973
7974             left  = _mm_slli_epi32(_mm_add_epi32(mid, side), shift);
7975             right = _mm_slli_epi32(_mm_sub_epi32(mid, side), shift);
7976
7977             _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
7978             _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
7979         }
7980
7981         for (i = (frameCount4 << 2); i < frameCount; ++i) {
7982             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
7983             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
7984
7985             mid = (mid << 1) | (side & 0x01);
7986
7987             pOutputSamples[i*2+0] = (drflac_int32)((mid + side) << shift);
7988             pOutputSamples[i*2+1] = (drflac_int32)((mid - side) << shift);
7989         }
7990     }
7991 }
7992 #endif
7993
7994 #if defined(DRFLAC_SUPPORT_NEON)
/*
NEON variant of the mid/side stereo decode to interleaved signed 32-bit output.

Input 0 is treated as the mid channel and input 1 as the side channel. Each is first shifted left by the wasted
bit count of its own subframe. Mid is then doubled and the low bit of side is OR'd into it. With a zero
unusedBitsPerSample the outputs are (mid + side) >> 1 and (mid - side) >> 1 using a signed shift. Otherwise the
sum and difference are shifted left by (unusedBitsPerSample - 1).

Frames are processed four at a time with vector instructions; the remaining frames are processed one at a time
with the matching scalar formula.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    const drflac_uint32* pMidIn  = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSideIn = (const drflac_uint32*)pInputSamples1;
    const drflac_uint64 vectorizedFrameCount = (frameCount >> 2) << 2;  /* frameCount rounded down to a multiple of 4. */
    drflac_int32 outputShift = unusedBitsPerSample;
    drflac_uint32 midWastedBits;
    drflac_uint32 sideWastedBits;
    int32x4_t  midWastedShift4;
    int32x4_t  sideWastedShift4;
    uint32x4_t lowBitMask4;
    drflac_uint64 iFrame;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    midWastedBits    = pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    sideWastedBits   = pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    midWastedShift4  = vdupq_n_s32(midWastedBits);
    sideWastedShift4 = vdupq_n_s32(sideWastedBits);
    lowBitMask4      = vdupq_n_u32(1);

    if (outputShift != 0) {
        /* Scaling required: shift the sum and difference left by one bit less than unusedBitsPerSample. */
        int32x4_t outputShift4;

        outputShift -= 1;
        outputShift4 = vdupq_n_s32(outputShift);

        for (iFrame = 0; iFrame < vectorizedFrameCount; iFrame += 4) {
            uint32x4_t mid4  = vshlq_u32(vld1q_u32(pMidIn  + iFrame), midWastedShift4);
            uint32x4_t side4 = vshlq_u32(vld1q_u32(pSideIn + iFrame), sideWastedShift4);
            int32x4_t left4;
            int32x4_t right4;

            mid4   = vorrq_u32(vshlq_n_u32(mid4, 1), vandq_u32(side4, lowBitMask4));

            left4  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid4, side4), outputShift4));
            right4 = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid4, side4), outputShift4));

            drflac__vst2q_s32(pOutputSamples + iFrame*2, vzipq_s32(left4, right4));
        }

        for (; iFrame < frameCount; iFrame += 1) {
            drflac_uint32 midSample  = pMidIn[iFrame]  << midWastedBits;
            drflac_uint32 sideSample = pSideIn[iFrame] << sideWastedBits;

            midSample = (midSample << 1) | (sideSample & 0x01);

            pOutputSamples[iFrame*2+0] = (drflac_int32)((midSample + sideSample) << outputShift);
            pOutputSamples[iFrame*2+1] = (drflac_int32)((midSample - sideSample) << outputShift);
        }
    } else {
        /* No scaling: halve with a signed right shift. */
        for (iFrame = 0; iFrame < vectorizedFrameCount; iFrame += 4) {
            uint32x4_t mid4  = vshlq_u32(vld1q_u32(pMidIn  + iFrame), midWastedShift4);
            uint32x4_t side4 = vshlq_u32(vld1q_u32(pSideIn + iFrame), sideWastedShift4);
            int32x4_t left4;
            int32x4_t right4;

            mid4   = vorrq_u32(vshlq_n_u32(mid4, 1), vandq_u32(side4, lowBitMask4));

            left4  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid4, side4)), 1);
            right4 = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid4, side4)), 1);

            drflac__vst2q_s32(pOutputSamples + iFrame*2, vzipq_s32(left4, right4));
        }

        for (; iFrame < frameCount; iFrame += 1) {
            drflac_uint32 midSample  = pMidIn[iFrame]  << midWastedBits;
            drflac_uint32 sideSample = pSideIn[iFrame] << sideWastedBits;

            midSample = (midSample << 1) | (sideSample & 0x01);

            pOutputSamples[iFrame*2+0] = (drflac_int32)(midSample + sideSample) >> 1;
            pOutputSamples[iFrame*2+1] = (drflac_int32)(midSample - sideSample) >> 1;
        }
    }
}
8073 #endif
8074
8075 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
8076 {
8077 #if defined(DRFLAC_SUPPORT_SSE2)
8078     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
8079         drflac_read_pcm_frames_s32__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8080     } else
8081 #elif defined(DRFLAC_SUPPORT_NEON)
8082     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
8083         drflac_read_pcm_frames_s32__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8084     } else
8085 #endif
8086     {
8087         /* Scalar fallback. */
8088         drflac_read_pcm_frames_s32__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8089     }
8090 }
8091
8092
8093 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
8094 {
8095     drflac_uint64 i;
8096     drflac_uint64 frameCount4 = frameCount >> 2;
8097     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8098     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8099     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8100     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8101
8102     for (i = 0; i < frameCount4; ++i) {
8103         drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
8104         drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
8105         drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
8106         drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
8107
8108         drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
8109         drflac_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
8110         drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
8111         drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
8112
8113         pOutputSamples[i*8+0] = (drflac_int32)tempL0;
8114         pOutputSamples[i*8+1] = (drflac_int32)tempR0;
8115         pOutputSamples[i*8+2] = (drflac_int32)tempL1;
8116         pOutputSamples[i*8+3] = (drflac_int32)tempR1;
8117         pOutputSamples[i*8+4] = (drflac_int32)tempL2;
8118         pOutputSamples[i*8+5] = (drflac_int32)tempR2;
8119         pOutputSamples[i*8+6] = (drflac_int32)tempL3;
8120         pOutputSamples[i*8+7] = (drflac_int32)tempR3;
8121     }
8122
8123     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8124         pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0);
8125         pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1);
8126     }
8127 }
8128
8129 #if defined(DRFLAC_SUPPORT_SSE2)
8130 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
8131 {
8132     drflac_uint64 i;
8133     drflac_uint64 frameCount4 = frameCount >> 2;
8134     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8135     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8136     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8137     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8138
8139     for (i = 0; i < frameCount4; ++i) {
8140         __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
8141         __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
8142
8143         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
8144         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
8145     }
8146
8147     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8148         pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0);
8149         pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1);
8150     }
8151 }
8152 #endif
8153
8154 #if defined(DRFLAC_SUPPORT_NEON)
/*
NEON decode of two independently coded channels to interleaved signed 32-bit output.

Each channel is shifted left by unusedBitsPerSample plus the wasted bit count of its own subframe. Input 0 is
written to the even output slots and input 1 to the odd output slots. Frames are processed four at a time with
vector instructions; the remaining frames are processed one at a time.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    const drflac_uint32* pLeftIn  = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pRightIn = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 leftShift  = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 rightShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    const drflac_uint64 vectorizedFrameCount = (frameCount >> 2) << 2;  /* frameCount rounded down to a multiple of 4. */
    const int32x4_t leftShift4  = vdupq_n_s32(leftShift);
    const int32x4_t rightShift4 = vdupq_n_s32(rightShift);
    drflac_uint64 iFrame;

    /* Vector loop: four frames in, eight interleaved samples out. */
    for (iFrame = 0; iFrame < vectorizedFrameCount; iFrame += 4) {
        int32x4_t left4  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pLeftIn  + iFrame), leftShift4));
        int32x4_t right4 = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pRightIn + iFrame), rightShift4));

        drflac__vst2q_s32(pOutputSamples + iFrame*2, vzipq_s32(left4, right4));
    }

    /* Scalar loop for the trailing frames. */
    for (; iFrame < frameCount; iFrame += 1) {
        pOutputSamples[iFrame*2+0] = (drflac_int32)(pLeftIn[iFrame]  << leftShift);
        pOutputSamples[iFrame*2+1] = (drflac_int32)(pRightIn[iFrame] << rightShift);
    }
}
8182 #endif
8183
8184 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
8185 {
8186 #if defined(DRFLAC_SUPPORT_SSE2)
8187     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
8188         drflac_read_pcm_frames_s32__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8189     } else
8190 #elif defined(DRFLAC_SUPPORT_NEON)
8191     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
8192         drflac_read_pcm_frames_s32__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8193     } else
8194 #endif
8195     {
8196         /* Scalar fallback. */
8197         drflac_read_pcm_frames_s32__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8198     }
8199 }
8200
8201
8202 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s32(drflac* pFlac, drflac_uint64 framesToRead, drflac_int32* pBufferOut)
8203 {
8204     drflac_uint64 framesRead;
8205     drflac_uint32 unusedBitsPerSample;
8206
8207     if (pFlac == NULL || framesToRead == 0) {
8208         return 0;
8209     }
8210
8211     if (pBufferOut == NULL) {
8212         return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead);
8213     }
8214
8215     DRFLAC_ASSERT(pFlac->bitsPerSample <= 32);
8216     unusedBitsPerSample = 32 - pFlac->bitsPerSample;
8217
8218     framesRead = 0;
8219     while (framesToRead > 0) {
8220         /* If we've run out of samples in this frame, go to the next. */
8221         if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
8222             if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
8223                 break;  /* Couldn't read the next frame, so just break from the loop and return. */
8224             }
8225         } else {
8226             unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
8227             drflac_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
8228             drflac_uint64 frameCountThisIteration = framesToRead;
8229
8230             if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
8231                 frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
8232             }
8233
8234             if (channelCount == 2) {
8235                 const drflac_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
8236                 const drflac_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
8237
8238                 switch (pFlac->currentFLACFrame.header.channelAssignment)
8239                 {
8240                     case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
8241                     {
8242                         drflac_read_pcm_frames_s32__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
8243                     } break;
8244
8245                     case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
8246                     {
8247                         drflac_read_pcm_frames_s32__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
8248                     } break;
8249
8250                     case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
8251                     {
8252                         drflac_read_pcm_frames_s32__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
8253                     } break;
8254
8255                     case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
8256                     default:
8257                     {
8258                         drflac_read_pcm_frames_s32__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
8259                     } break;
8260                 }
8261             } else {
8262                 /* Generic interleaving. */
8263                 drflac_uint64 i;
8264                 for (i = 0; i < frameCountThisIteration; ++i) {
8265                     unsigned int j;
8266                     for (j = 0; j < channelCount; ++j) {
8267                         pBufferOut[(i*channelCount)+j] = (drflac_int32)((drflac_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
8268                     }
8269                 }
8270             }
8271
8272             framesRead                += frameCountThisIteration;
8273             pBufferOut                += frameCountThisIteration * channelCount;
8274             framesToRead              -= frameCountThisIteration;
8275             pFlac->currentPCMFrame    += frameCountThisIteration;
8276             pFlac->currentFLACFrame.pcmFramesRemaining -= (drflac_uint32)frameCountThisIteration;
8277         }
8278     }
8279
8280     return framesRead;
8281 }
8282
8283 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8284 {
8285     drflac_uint64 i;
8286     drflac_uint64 frameCount4 = frameCount >> 2;
8287     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8288     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8289     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8290     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8291
8292     for (i = 0; i < frameCount4; ++i) {
8293         drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
8294         drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
8295         drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
8296         drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
8297
8298         drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
8299         drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
8300         drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
8301         drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
8302
8303         drflac_uint32 right0 = left0 - side0;
8304         drflac_uint32 right1 = left1 - side1;
8305         drflac_uint32 right2 = left2 - side2;
8306         drflac_uint32 right3 = left3 - side3;
8307
8308         left0  >>= 16;
8309         left1  >>= 16;
8310         left2  >>= 16;
8311         left3  >>= 16;
8312
8313         right0 >>= 16;
8314         right1 >>= 16;
8315         right2 >>= 16;
8316         right3 >>= 16;
8317
8318         pOutputSamples[i*8+0] = (drflac_int16)left0;
8319         pOutputSamples[i*8+1] = (drflac_int16)right0;
8320         pOutputSamples[i*8+2] = (drflac_int16)left1;
8321         pOutputSamples[i*8+3] = (drflac_int16)right1;
8322         pOutputSamples[i*8+4] = (drflac_int16)left2;
8323         pOutputSamples[i*8+5] = (drflac_int16)right2;
8324         pOutputSamples[i*8+6] = (drflac_int16)left3;
8325         pOutputSamples[i*8+7] = (drflac_int16)right3;
8326     }
8327
8328     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8329         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
8330         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
8331         drflac_uint32 right = left - side;
8332
8333         left  >>= 16;
8334         right >>= 16;
8335
8336         pOutputSamples[i*2+0] = (drflac_int16)left;
8337         pOutputSamples[i*2+1] = (drflac_int16)right;
8338     }
8339 }
8340
8341 #if defined(DRFLAC_SUPPORT_SSE2)
8342 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8343 {
8344     drflac_uint64 i;
8345     drflac_uint64 frameCount4 = frameCount >> 2;
8346     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8347     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8348     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8349     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8350
8351     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
8352
8353     for (i = 0; i < frameCount4; ++i) {
8354         __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
8355         __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
8356         __m128i right = _mm_sub_epi32(left, side);
8357
8358         left  = _mm_srai_epi32(left,  16);
8359         right = _mm_srai_epi32(right, 16);
8360
8361         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
8362     }
8363
8364     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8365         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
8366         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
8367         drflac_uint32 right = left - side;
8368
8369         left  >>= 16;
8370         right >>= 16;
8371
8372         pOutputSamples[i*2+0] = (drflac_int16)left;
8373         pOutputSamples[i*2+1] = (drflac_int16)right;
8374     }
8375 }
8376 #endif
8377
8378 #if defined(DRFLAC_SUPPORT_NEON)
8379 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8380 {
8381     drflac_uint64 i;
8382     drflac_uint64 frameCount4 = frameCount >> 2;
8383     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8384     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8385     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8386     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8387     int32x4_t shift0_4;
8388     int32x4_t shift1_4;
8389
8390     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
8391
8392     shift0_4 = vdupq_n_s32(shift0);
8393     shift1_4 = vdupq_n_s32(shift1);
8394
8395     for (i = 0; i < frameCount4; ++i) {
8396         uint32x4_t left;
8397         uint32x4_t side;
8398         uint32x4_t right;
8399
8400         left  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
8401         side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
8402         right = vsubq_u32(left, side);
8403
8404         left  = vshrq_n_u32(left,  16);
8405         right = vshrq_n_u32(right, 16);
8406
8407         drflac__vst2q_u16((drflac_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right)));
8408     }
8409
8410     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8411         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
8412         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
8413         drflac_uint32 right = left - side;
8414
8415         left  >>= 16;
8416         right >>= 16;
8417
8418         pOutputSamples[i*2+0] = (drflac_int16)left;
8419         pOutputSamples[i*2+1] = (drflac_int16)right;
8420     }
8421 }
8422 #endif
8423
8424 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8425 {
8426 #if defined(DRFLAC_SUPPORT_SSE2)
8427     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
8428         drflac_read_pcm_frames_s16__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8429     } else
8430 #elif defined(DRFLAC_SUPPORT_NEON)
8431     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
8432         drflac_read_pcm_frames_s16__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8433     } else
8434 #endif
8435     {
8436         /* Scalar fallback. */
8437         drflac_read_pcm_frames_s16__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8438     }
8439 }
8440
8441
8442 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8443 {
8444     drflac_uint64 i;
8445     drflac_uint64 frameCount4 = frameCount >> 2;
8446     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8447     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8448     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8449     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8450
8451     for (i = 0; i < frameCount4; ++i) {
8452         drflac_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
8453         drflac_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
8454         drflac_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
8455         drflac_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
8456
8457         drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
8458         drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
8459         drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
8460         drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
8461
8462         drflac_uint32 left0 = right0 + side0;
8463         drflac_uint32 left1 = right1 + side1;
8464         drflac_uint32 left2 = right2 + side2;
8465         drflac_uint32 left3 = right3 + side3;
8466
8467         left0  >>= 16;
8468         left1  >>= 16;
8469         left2  >>= 16;
8470         left3  >>= 16;
8471
8472         right0 >>= 16;
8473         right1 >>= 16;
8474         right2 >>= 16;
8475         right3 >>= 16;
8476
8477         pOutputSamples[i*8+0] = (drflac_int16)left0;
8478         pOutputSamples[i*8+1] = (drflac_int16)right0;
8479         pOutputSamples[i*8+2] = (drflac_int16)left1;
8480         pOutputSamples[i*8+3] = (drflac_int16)right1;
8481         pOutputSamples[i*8+4] = (drflac_int16)left2;
8482         pOutputSamples[i*8+5] = (drflac_int16)right2;
8483         pOutputSamples[i*8+6] = (drflac_int16)left3;
8484         pOutputSamples[i*8+7] = (drflac_int16)right3;
8485     }
8486
8487     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8488         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
8489         drflac_uint32 right = pInputSamples1U32[i] << shift1;
8490         drflac_uint32 left  = right + side;
8491
8492         left  >>= 16;
8493         right >>= 16;
8494
8495         pOutputSamples[i*2+0] = (drflac_int16)left;
8496         pOutputSamples[i*2+1] = (drflac_int16)right;
8497     }
8498 }
8499
8500 #if defined(DRFLAC_SUPPORT_SSE2)
8501 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8502 {
8503     drflac_uint64 i;
8504     drflac_uint64 frameCount4 = frameCount >> 2;
8505     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8506     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8507     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8508     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8509
8510     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
8511
8512     for (i = 0; i < frameCount4; ++i) {
8513         __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
8514         __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
8515         __m128i left  = _mm_add_epi32(right, side);
8516
8517         left  = _mm_srai_epi32(left,  16);
8518         right = _mm_srai_epi32(right, 16);
8519
8520         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
8521     }
8522
8523     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8524         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
8525         drflac_uint32 right = pInputSamples1U32[i] << shift1;
8526         drflac_uint32 left  = right + side;
8527
8528         left  >>= 16;
8529         right >>= 16;
8530
8531         pOutputSamples[i*2+0] = (drflac_int16)left;
8532         pOutputSamples[i*2+1] = (drflac_int16)right;
8533     }
8534 }
8535 #endif
8536
8537 #if defined(DRFLAC_SUPPORT_NEON)
8538 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8539 {
8540     drflac_uint64 i;
8541     drflac_uint64 frameCount4 = frameCount >> 2;
8542     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8543     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8544     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8545     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8546     int32x4_t shift0_4;
8547     int32x4_t shift1_4;
8548
8549     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
8550
8551     shift0_4 = vdupq_n_s32(shift0);
8552     shift1_4 = vdupq_n_s32(shift1);
8553
8554     for (i = 0; i < frameCount4; ++i) {
8555         uint32x4_t side;
8556         uint32x4_t right;
8557         uint32x4_t left;
8558
8559         side  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
8560         right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
8561         left  = vaddq_u32(right, side);
8562
8563         left  = vshrq_n_u32(left,  16);
8564         right = vshrq_n_u32(right, 16);
8565
8566         drflac__vst2q_u16((drflac_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right)));
8567     }
8568
8569     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8570         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
8571         drflac_uint32 right = pInputSamples1U32[i] << shift1;
8572         drflac_uint32 left  = right + side;
8573
8574         left  >>= 16;
8575         right >>= 16;
8576
8577         pOutputSamples[i*2+0] = (drflac_int16)left;
8578         pOutputSamples[i*2+1] = (drflac_int16)right;
8579     }
8580 }
8581 #endif
8582
8583 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8584 {
8585 #if defined(DRFLAC_SUPPORT_SSE2)
8586     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
8587         drflac_read_pcm_frames_s16__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8588     } else
8589 #elif defined(DRFLAC_SUPPORT_NEON)
8590     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
8591         drflac_read_pcm_frames_s16__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8592     } else
8593 #endif
8594     {
8595         /* Scalar fallback. */
8596         drflac_read_pcm_frames_s16__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8597     }
8598 }
8599
8600
8601
8602 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8603 {
8604     drflac_uint64 i;
8605     drflac_uint64 frameCount4 = frameCount >> 2;
8606     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8607     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8608     drflac_uint32 shift = unusedBitsPerSample;
8609
8610     if (shift > 0) {
8611         shift -= 1;
8612         for (i = 0; i < frameCount4; ++i) {
8613             drflac_uint32 temp0L;
8614             drflac_uint32 temp1L;
8615             drflac_uint32 temp2L;
8616             drflac_uint32 temp3L;
8617             drflac_uint32 temp0R;
8618             drflac_uint32 temp1R;
8619             drflac_uint32 temp2R;
8620             drflac_uint32 temp3R;
8621
8622             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8623             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8624             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8625             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8626
8627             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8628             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8629             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8630             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8631
8632             mid0 = (mid0 << 1) | (side0 & 0x01);
8633             mid1 = (mid1 << 1) | (side1 & 0x01);
8634             mid2 = (mid2 << 1) | (side2 & 0x01);
8635             mid3 = (mid3 << 1) | (side3 & 0x01);
8636
8637             temp0L = (mid0 + side0) << shift;
8638             temp1L = (mid1 + side1) << shift;
8639             temp2L = (mid2 + side2) << shift;
8640             temp3L = (mid3 + side3) << shift;
8641
8642             temp0R = (mid0 - side0) << shift;
8643             temp1R = (mid1 - side1) << shift;
8644             temp2R = (mid2 - side2) << shift;
8645             temp3R = (mid3 - side3) << shift;
8646
8647             temp0L >>= 16;
8648             temp1L >>= 16;
8649             temp2L >>= 16;
8650             temp3L >>= 16;
8651
8652             temp0R >>= 16;
8653             temp1R >>= 16;
8654             temp2R >>= 16;
8655             temp3R >>= 16;
8656
8657             pOutputSamples[i*8+0] = (drflac_int16)temp0L;
8658             pOutputSamples[i*8+1] = (drflac_int16)temp0R;
8659             pOutputSamples[i*8+2] = (drflac_int16)temp1L;
8660             pOutputSamples[i*8+3] = (drflac_int16)temp1R;
8661             pOutputSamples[i*8+4] = (drflac_int16)temp2L;
8662             pOutputSamples[i*8+5] = (drflac_int16)temp2R;
8663             pOutputSamples[i*8+6] = (drflac_int16)temp3L;
8664             pOutputSamples[i*8+7] = (drflac_int16)temp3R;
8665         }
8666     } else {
8667         for (i = 0; i < frameCount4; ++i) {
8668             drflac_uint32 temp0L;
8669             drflac_uint32 temp1L;
8670             drflac_uint32 temp2L;
8671             drflac_uint32 temp3L;
8672             drflac_uint32 temp0R;
8673             drflac_uint32 temp1R;
8674             drflac_uint32 temp2R;
8675             drflac_uint32 temp3R;
8676
8677             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8678             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8679             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8680             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8681
8682             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8683             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8684             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8685             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8686
8687             mid0 = (mid0 << 1) | (side0 & 0x01);
8688             mid1 = (mid1 << 1) | (side1 & 0x01);
8689             mid2 = (mid2 << 1) | (side2 & 0x01);
8690             mid3 = (mid3 << 1) | (side3 & 0x01);
8691
8692             temp0L = ((drflac_int32)(mid0 + side0) >> 1);
8693             temp1L = ((drflac_int32)(mid1 + side1) >> 1);
8694             temp2L = ((drflac_int32)(mid2 + side2) >> 1);
8695             temp3L = ((drflac_int32)(mid3 + side3) >> 1);
8696
8697             temp0R = ((drflac_int32)(mid0 - side0) >> 1);
8698             temp1R = ((drflac_int32)(mid1 - side1) >> 1);
8699             temp2R = ((drflac_int32)(mid2 - side2) >> 1);
8700             temp3R = ((drflac_int32)(mid3 - side3) >> 1);
8701
8702             temp0L >>= 16;
8703             temp1L >>= 16;
8704             temp2L >>= 16;
8705             temp3L >>= 16;
8706
8707             temp0R >>= 16;
8708             temp1R >>= 16;
8709             temp2R >>= 16;
8710             temp3R >>= 16;
8711
8712             pOutputSamples[i*8+0] = (drflac_int16)temp0L;
8713             pOutputSamples[i*8+1] = (drflac_int16)temp0R;
8714             pOutputSamples[i*8+2] = (drflac_int16)temp1L;
8715             pOutputSamples[i*8+3] = (drflac_int16)temp1R;
8716             pOutputSamples[i*8+4] = (drflac_int16)temp2L;
8717             pOutputSamples[i*8+5] = (drflac_int16)temp2R;
8718             pOutputSamples[i*8+6] = (drflac_int16)temp3L;
8719             pOutputSamples[i*8+7] = (drflac_int16)temp3R;
8720         }
8721     }
8722
8723     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8724         drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8725         drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8726
8727         mid = (mid << 1) | (side & 0x01);
8728
8729         pOutputSamples[i*2+0] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) >> 16);
8730         pOutputSamples[i*2+1] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) >> 16);
8731     }
8732 }
8733
8734 #if defined(DRFLAC_SUPPORT_SSE2)
8735 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8736 {
8737     drflac_uint64 i;
8738     drflac_uint64 frameCount4 = frameCount >> 2;
8739     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8740     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8741     drflac_uint32 shift = unusedBitsPerSample;
8742
8743     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
8744
8745     if (shift == 0) {
8746         for (i = 0; i < frameCount4; ++i) {
8747             __m128i mid;
8748             __m128i side;
8749             __m128i left;
8750             __m128i right;
8751
8752             mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
8753             side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
8754
8755             mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
8756
8757             left  = _mm_srai_epi32(_mm_add_epi32(mid, side), 1);
8758             right = _mm_srai_epi32(_mm_sub_epi32(mid, side), 1);
8759
8760             left  = _mm_srai_epi32(left,  16);
8761             right = _mm_srai_epi32(right, 16);
8762
8763             _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
8764         }
8765
8766         for (i = (frameCount4 << 2); i < frameCount; ++i) {
8767             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8768             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8769
8770             mid = (mid << 1) | (side & 0x01);
8771
8772             pOutputSamples[i*2+0] = (drflac_int16)(((drflac_int32)(mid + side) >> 1) >> 16);
8773             pOutputSamples[i*2+1] = (drflac_int16)(((drflac_int32)(mid - side) >> 1) >> 16);
8774         }
8775     } else {
8776         shift -= 1;
8777         for (i = 0; i < frameCount4; ++i) {
8778             __m128i mid;
8779             __m128i side;
8780             __m128i left;
8781             __m128i right;
8782
8783             mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
8784             side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
8785
8786             mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
8787
8788             left  = _mm_slli_epi32(_mm_add_epi32(mid, side), shift);
8789             right = _mm_slli_epi32(_mm_sub_epi32(mid, side), shift);
8790
8791             left  = _mm_srai_epi32(left,  16);
8792             right = _mm_srai_epi32(right, 16);
8793
8794             _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
8795         }
8796
8797         for (i = (frameCount4 << 2); i < frameCount; ++i) {
8798             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8799             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8800
8801             mid = (mid << 1) | (side & 0x01);
8802
8803             pOutputSamples[i*2+0] = (drflac_int16)(((mid + side) << shift) >> 16);
8804             pOutputSamples[i*2+1] = (drflac_int16)(((mid - side) << shift) >> 16);
8805         }
8806     }
8807 }
8808 #endif
8809
8810 #if defined(DRFLAC_SUPPORT_NEON)
8811 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8812 {
8813     drflac_uint64 i;
8814     drflac_uint64 frameCount4 = frameCount >> 2;
8815     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8816     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8817     drflac_uint32 shift = unusedBitsPerSample;
8818     int32x4_t wbpsShift0_4; /* wbps = Wasted Bits Per Sample */
8819     int32x4_t wbpsShift1_4; /* wbps = Wasted Bits Per Sample */
8820
8821     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
8822
8823     wbpsShift0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
8824     wbpsShift1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
8825
8826     if (shift == 0) {
8827         for (i = 0; i < frameCount4; ++i) {
8828             uint32x4_t mid;
8829             uint32x4_t side;
8830             int32x4_t left;
8831             int32x4_t right;
8832
8833             mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
8834             side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
8835
8836             mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
8837
8838             left  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1);
8839             right = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1);
8840
8841             left  = vshrq_n_s32(left,  16);
8842             right = vshrq_n_s32(right, 16);
8843
8844             drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
8845         }
8846
8847         for (i = (frameCount4 << 2); i < frameCount; ++i) {
8848             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8849             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8850
8851             mid = (mid << 1) | (side & 0x01);
8852
8853             pOutputSamples[i*2+0] = (drflac_int16)(((drflac_int32)(mid + side) >> 1) >> 16);
8854             pOutputSamples[i*2+1] = (drflac_int16)(((drflac_int32)(mid - side) >> 1) >> 16);
8855         }
8856     } else {
8857         int32x4_t shift4;
8858
8859         shift -= 1;
8860         shift4 = vdupq_n_s32(shift);
8861
8862         for (i = 0; i < frameCount4; ++i) {
8863             uint32x4_t mid;
8864             uint32x4_t side;
8865             int32x4_t left;
8866             int32x4_t right;
8867
8868             mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
8869             side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
8870
8871             mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
8872
8873             left  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4));
8874             right = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4));
8875
8876             left  = vshrq_n_s32(left,  16);
8877             right = vshrq_n_s32(right, 16);
8878
8879             drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
8880         }
8881
8882         for (i = (frameCount4 << 2); i < frameCount; ++i) {
8883             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8884             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8885
8886             mid = (mid << 1) | (side & 0x01);
8887
8888             pOutputSamples[i*2+0] = (drflac_int16)(((mid + side) << shift) >> 16);
8889             pOutputSamples[i*2+1] = (drflac_int16)(((mid - side) << shift) >> 16);
8890         }
8891     }
8892 }
8893 #endif
8894
8895 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8896 {
8897 #if defined(DRFLAC_SUPPORT_SSE2)
8898     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
8899         drflac_read_pcm_frames_s16__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8900     } else
8901 #elif defined(DRFLAC_SUPPORT_NEON)
8902     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
8903         drflac_read_pcm_frames_s16__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8904     } else
8905 #endif
8906     {
8907         /* Scalar fallback. */
8908         drflac_read_pcm_frames_s16__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8909     }
8910 }
8911
8912 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8913 {
8914     drflac_uint64 i;
8915     drflac_uint64 frameCount4 = frameCount >> 2;
8916     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8917     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8918     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8919     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8920
8921     for (i = 0; i < frameCount4; ++i) {
8922         drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
8923         drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
8924         drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
8925         drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
8926
8927         drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
8928         drflac_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
8929         drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
8930         drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
8931
8932         tempL0 >>= 16;
8933         tempL1 >>= 16;
8934         tempL2 >>= 16;
8935         tempL3 >>= 16;
8936
8937         tempR0 >>= 16;
8938         tempR1 >>= 16;
8939         tempR2 >>= 16;
8940         tempR3 >>= 16;
8941
8942         pOutputSamples[i*8+0] = (drflac_int16)tempL0;
8943         pOutputSamples[i*8+1] = (drflac_int16)tempR0;
8944         pOutputSamples[i*8+2] = (drflac_int16)tempL1;
8945         pOutputSamples[i*8+3] = (drflac_int16)tempR1;
8946         pOutputSamples[i*8+4] = (drflac_int16)tempL2;
8947         pOutputSamples[i*8+5] = (drflac_int16)tempR2;
8948         pOutputSamples[i*8+6] = (drflac_int16)tempL3;
8949         pOutputSamples[i*8+7] = (drflac_int16)tempR3;
8950     }
8951
8952     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8953         pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16);
8954         pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16);
8955     }
8956 }
8957
8958 #if defined(DRFLAC_SUPPORT_SSE2)
8959 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8960 {
8961     drflac_uint64 i;
8962     drflac_uint64 frameCount4 = frameCount >> 2;
8963     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8964     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8965     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8966     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8967
8968     for (i = 0; i < frameCount4; ++i) {
8969         __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
8970         __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
8971
8972         left  = _mm_srai_epi32(left,  16);
8973         right = _mm_srai_epi32(right, 16);
8974
8975         /* At this point we have results. We can now pack and interleave these into a single __m128i object and then store the in the output buffer. */
8976         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
8977     }
8978
8979     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8980         pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16);
8981         pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16);
8982     }
8983 }
8984 #endif
8985
8986 #if defined(DRFLAC_SUPPORT_NEON)
8987 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
8988 {
8989     drflac_uint64 i;
8990     drflac_uint64 frameCount4 = frameCount >> 2;
8991     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8992     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8993     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8994     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8995
8996     int32x4_t shift0_4 = vdupq_n_s32(shift0);
8997     int32x4_t shift1_4 = vdupq_n_s32(shift1);
8998
8999     for (i = 0; i < frameCount4; ++i) {
9000         int32x4_t left;
9001         int32x4_t right;
9002
9003         left  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4));
9004         right = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4));
9005
9006         left  = vshrq_n_s32(left,  16);
9007         right = vshrq_n_s32(right, 16);
9008
9009         drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
9010     }
9011
9012     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9013         pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16);
9014         pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16);
9015     }
9016 }
9017 #endif
9018
9019 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9020 {
9021 #if defined(DRFLAC_SUPPORT_SSE2)
9022     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
9023         drflac_read_pcm_frames_s16__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9024     } else
9025 #elif defined(DRFLAC_SUPPORT_NEON)
9026     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
9027         drflac_read_pcm_frames_s16__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9028     } else
9029 #endif
9030     {
9031         /* Scalar fallback. */
9032         drflac_read_pcm_frames_s16__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9033     }
9034 }
9035
9036 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s16(drflac* pFlac, drflac_uint64 framesToRead, drflac_int16* pBufferOut)
9037 {
9038     drflac_uint64 framesRead;
9039     drflac_uint32 unusedBitsPerSample;
9040
9041     if (pFlac == NULL || framesToRead == 0) {
9042         return 0;
9043     }
9044
9045     if (pBufferOut == NULL) {
9046         return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead);
9047     }
9048
9049     DRFLAC_ASSERT(pFlac->bitsPerSample <= 32);
9050     unusedBitsPerSample = 32 - pFlac->bitsPerSample;
9051
9052     framesRead = 0;
9053     while (framesToRead > 0) {
9054         /* If we've run out of samples in this frame, go to the next. */
9055         if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
9056             if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
9057                 break;  /* Couldn't read the next frame, so just break from the loop and return. */
9058             }
9059         } else {
9060             unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
9061             drflac_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
9062             drflac_uint64 frameCountThisIteration = framesToRead;
9063
9064             if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
9065                 frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
9066             }
9067
9068             if (channelCount == 2) {
9069                 const drflac_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
9070                 const drflac_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
9071
9072                 switch (pFlac->currentFLACFrame.header.channelAssignment)
9073                 {
9074                     case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
9075                     {
9076                         drflac_read_pcm_frames_s16__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9077                     } break;
9078
9079                     case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
9080                     {
9081                         drflac_read_pcm_frames_s16__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9082                     } break;
9083
9084                     case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
9085                     {
9086                         drflac_read_pcm_frames_s16__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9087                     } break;
9088
9089                     case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
9090                     default:
9091                     {
9092                         drflac_read_pcm_frames_s16__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9093                     } break;
9094                 }
9095             } else {
9096                 /* Generic interleaving. */
9097                 drflac_uint64 i;
9098                 for (i = 0; i < frameCountThisIteration; ++i) {
9099                     unsigned int j;
9100                     for (j = 0; j < channelCount; ++j) {
9101                         drflac_int32 sampleS32 = (drflac_int32)((drflac_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
9102                         pBufferOut[(i*channelCount)+j] = (drflac_int16)(sampleS32 >> 16);
9103                     }
9104                 }
9105             }
9106
9107             framesRead                += frameCountThisIteration;
9108             pBufferOut                += frameCountThisIteration * channelCount;
9109             framesToRead              -= frameCountThisIteration;
9110             pFlac->currentPCMFrame    += frameCountThisIteration;
9111             pFlac->currentFLACFrame.pcmFramesRemaining -= (drflac_uint32)frameCountThisIteration;
9112         }
9113     }
9114
9115     return framesRead;
9116 }
9117
9118 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9119 {
9120     drflac_uint64 i;
9121     drflac_uint64 frameCount4 = frameCount >> 2;
9122     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9123     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9124     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9125     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9126
9127     float factor = 1 / 2147483648.0;
9128
9129     for (i = 0; i < frameCount4; ++i) {
9130         drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
9131         drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
9132         drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
9133         drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
9134
9135         drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
9136         drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
9137         drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
9138         drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
9139
9140         drflac_uint32 right0 = left0 - side0;
9141         drflac_uint32 right1 = left1 - side1;
9142         drflac_uint32 right2 = left2 - side2;
9143         drflac_uint32 right3 = left3 - side3;
9144
9145         pOutputSamples[i*8+0] = (drflac_int32)left0  * factor;
9146         pOutputSamples[i*8+1] = (drflac_int32)right0 * factor;
9147         pOutputSamples[i*8+2] = (drflac_int32)left1  * factor;
9148         pOutputSamples[i*8+3] = (drflac_int32)right1 * factor;
9149         pOutputSamples[i*8+4] = (drflac_int32)left2  * factor;
9150         pOutputSamples[i*8+5] = (drflac_int32)right2 * factor;
9151         pOutputSamples[i*8+6] = (drflac_int32)left3  * factor;
9152         pOutputSamples[i*8+7] = (drflac_int32)right3 * factor;
9153     }
9154
9155     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9156         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
9157         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
9158         drflac_uint32 right = left - side;
9159
9160         pOutputSamples[i*2+0] = (drflac_int32)left  * factor;
9161         pOutputSamples[i*2+1] = (drflac_int32)right * factor;
9162     }
9163 }
9164
9165 #if defined(DRFLAC_SUPPORT_SSE2)
9166 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9167 {
9168     drflac_uint64 i;
9169     drflac_uint64 frameCount4 = frameCount >> 2;
9170     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9171     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9172     drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
9173     drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
9174     __m128 factor;
9175
9176     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
9177
9178     factor = _mm_set1_ps(1.0f / 8388608.0f);
9179
9180     for (i = 0; i < frameCount4; ++i) {
9181         __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
9182         __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
9183         __m128i right = _mm_sub_epi32(left, side);
9184         __m128 leftf  = _mm_mul_ps(_mm_cvtepi32_ps(left),  factor);
9185         __m128 rightf = _mm_mul_ps(_mm_cvtepi32_ps(right), factor);
9186
9187         _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
9188         _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
9189     }
9190
9191     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9192         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
9193         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
9194         drflac_uint32 right = left - side;
9195
9196         pOutputSamples[i*2+0] = (drflac_int32)left  / 8388608.0f;
9197         pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f;
9198     }
9199 }
9200 #endif
9201
9202 #if defined(DRFLAC_SUPPORT_NEON)
9203 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9204 {
9205     drflac_uint64 i;
9206     drflac_uint64 frameCount4 = frameCount >> 2;
9207     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9208     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9209     drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
9210     drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
9211     float32x4_t factor4;
9212     int32x4_t shift0_4;
9213     int32x4_t shift1_4;
9214
9215     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
9216
9217     factor4  = vdupq_n_f32(1.0f / 8388608.0f);
9218     shift0_4 = vdupq_n_s32(shift0);
9219     shift1_4 = vdupq_n_s32(shift1);
9220
9221     for (i = 0; i < frameCount4; ++i) {
9222         uint32x4_t left;
9223         uint32x4_t side;
9224         uint32x4_t right;
9225         float32x4_t leftf;
9226         float32x4_t rightf;
9227
9228         left   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
9229         side   = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
9230         right  = vsubq_u32(left, side);
9231         leftf  = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)),  factor4);
9232         rightf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), factor4);
9233
9234         drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
9235     }
9236
9237     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9238         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
9239         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
9240         drflac_uint32 right = left - side;
9241
9242         pOutputSamples[i*2+0] = (drflac_int32)left  / 8388608.0f;
9243         pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f;
9244     }
9245 }
9246 #endif
9247
9248 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9249 {
9250 #if defined(DRFLAC_SUPPORT_SSE2)
9251     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
9252         drflac_read_pcm_frames_f32__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9253     } else
9254 #elif defined(DRFLAC_SUPPORT_NEON)
9255     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
9256         drflac_read_pcm_frames_f32__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9257     } else
9258 #endif
9259     {
9260         /* Scalar fallback. */
9261         drflac_read_pcm_frames_f32__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9262     }
9263 }
9264
9265
9266 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9267 {
9268     drflac_uint64 i;
9269     drflac_uint64 frameCount4 = frameCount >> 2;
9270     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9271     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9272     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9273     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9274     float factor = 1 / 2147483648.0;
9275
9276     for (i = 0; i < frameCount4; ++i) {
9277         drflac_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
9278         drflac_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
9279         drflac_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
9280         drflac_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
9281
9282         drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
9283         drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
9284         drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
9285         drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
9286
9287         drflac_uint32 left0 = right0 + side0;
9288         drflac_uint32 left1 = right1 + side1;
9289         drflac_uint32 left2 = right2 + side2;
9290         drflac_uint32 left3 = right3 + side3;
9291
9292         pOutputSamples[i*8+0] = (drflac_int32)left0  * factor;
9293         pOutputSamples[i*8+1] = (drflac_int32)right0 * factor;
9294         pOutputSamples[i*8+2] = (drflac_int32)left1  * factor;
9295         pOutputSamples[i*8+3] = (drflac_int32)right1 * factor;
9296         pOutputSamples[i*8+4] = (drflac_int32)left2  * factor;
9297         pOutputSamples[i*8+5] = (drflac_int32)right2 * factor;
9298         pOutputSamples[i*8+6] = (drflac_int32)left3  * factor;
9299         pOutputSamples[i*8+7] = (drflac_int32)right3 * factor;
9300     }
9301
9302     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9303         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
9304         drflac_uint32 right = pInputSamples1U32[i] << shift1;
9305         drflac_uint32 left  = right + side;
9306
9307         pOutputSamples[i*2+0] = (drflac_int32)left  * factor;
9308         pOutputSamples[i*2+1] = (drflac_int32)right * factor;
9309     }
9310 }
9311
9312 #if defined(DRFLAC_SUPPORT_SSE2)
9313 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9314 {
9315     drflac_uint64 i;
9316     drflac_uint64 frameCount4 = frameCount >> 2;
9317     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9318     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9319     drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
9320     drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
9321     __m128 factor;
9322
9323     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
9324
9325     factor = _mm_set1_ps(1.0f / 8388608.0f);
9326
9327     for (i = 0; i < frameCount4; ++i) {
9328         __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
9329         __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
9330         __m128i left  = _mm_add_epi32(right, side);
9331         __m128 leftf  = _mm_mul_ps(_mm_cvtepi32_ps(left),  factor);
9332         __m128 rightf = _mm_mul_ps(_mm_cvtepi32_ps(right), factor);
9333
9334         _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
9335         _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
9336     }
9337
9338     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9339         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
9340         drflac_uint32 right = pInputSamples1U32[i] << shift1;
9341         drflac_uint32 left  = right + side;
9342
9343         pOutputSamples[i*2+0] = (drflac_int32)left  / 8388608.0f;
9344         pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f;
9345     }
9346 }
9347 #endif
9348
9349 #if defined(DRFLAC_SUPPORT_NEON)
9350 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9351 {
9352     drflac_uint64 i;
9353     drflac_uint64 frameCount4 = frameCount >> 2;
9354     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9355     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9356     drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
9357     drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
9358     float32x4_t factor4;
9359     int32x4_t shift0_4;
9360     int32x4_t shift1_4;
9361
9362     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
9363
9364     factor4  = vdupq_n_f32(1.0f / 8388608.0f);
9365     shift0_4 = vdupq_n_s32(shift0);
9366     shift1_4 = vdupq_n_s32(shift1);
9367
9368     for (i = 0; i < frameCount4; ++i) {
9369         uint32x4_t side;
9370         uint32x4_t right;
9371         uint32x4_t left;
9372         float32x4_t leftf;
9373         float32x4_t rightf;
9374
9375         side   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
9376         right  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
9377         left   = vaddq_u32(right, side);
9378         leftf  = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)),  factor4);
9379         rightf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), factor4);
9380
9381         drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
9382     }
9383
9384     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9385         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
9386         drflac_uint32 right = pInputSamples1U32[i] << shift1;
9387         drflac_uint32 left  = right + side;
9388
9389         pOutputSamples[i*2+0] = (drflac_int32)left  / 8388608.0f;
9390         pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f;
9391     }
9392 }
9393 #endif
9394
9395 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9396 {
9397 #if defined(DRFLAC_SUPPORT_SSE2)
9398     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
9399         drflac_read_pcm_frames_f32__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9400     } else
9401 #elif defined(DRFLAC_SUPPORT_NEON)
9402     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
9403         drflac_read_pcm_frames_f32__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9404     } else
9405 #endif
9406     {
9407         /* Scalar fallback. */
9408         drflac_read_pcm_frames_f32__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9409     }
9410 }
9411
9412 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9413 {
9414     drflac_uint64 i;
9415     drflac_uint64 frameCount4 = frameCount >> 2;
9416     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9417     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9418     drflac_uint32 shift = unusedBitsPerSample;
9419     float factor = 1 / 2147483648.0;
9420
9421     if (shift > 0) {
9422         shift -= 1;
9423         for (i = 0; i < frameCount4; ++i) {
9424             drflac_uint32 temp0L;
9425             drflac_uint32 temp1L;
9426             drflac_uint32 temp2L;
9427             drflac_uint32 temp3L;
9428             drflac_uint32 temp0R;
9429             drflac_uint32 temp1R;
9430             drflac_uint32 temp2R;
9431             drflac_uint32 temp3R;
9432
9433             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9434             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9435             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9436             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9437
9438             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9439             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9440             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9441             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9442
9443             mid0 = (mid0 << 1) | (side0 & 0x01);
9444             mid1 = (mid1 << 1) | (side1 & 0x01);
9445             mid2 = (mid2 << 1) | (side2 & 0x01);
9446             mid3 = (mid3 << 1) | (side3 & 0x01);
9447
9448             temp0L = (mid0 + side0) << shift;
9449             temp1L = (mid1 + side1) << shift;
9450             temp2L = (mid2 + side2) << shift;
9451             temp3L = (mid3 + side3) << shift;
9452
9453             temp0R = (mid0 - side0) << shift;
9454             temp1R = (mid1 - side1) << shift;
9455             temp2R = (mid2 - side2) << shift;
9456             temp3R = (mid3 - side3) << shift;
9457
9458             pOutputSamples[i*8+0] = (drflac_int32)temp0L * factor;
9459             pOutputSamples[i*8+1] = (drflac_int32)temp0R * factor;
9460             pOutputSamples[i*8+2] = (drflac_int32)temp1L * factor;
9461             pOutputSamples[i*8+3] = (drflac_int32)temp1R * factor;
9462             pOutputSamples[i*8+4] = (drflac_int32)temp2L * factor;
9463             pOutputSamples[i*8+5] = (drflac_int32)temp2R * factor;
9464             pOutputSamples[i*8+6] = (drflac_int32)temp3L * factor;
9465             pOutputSamples[i*8+7] = (drflac_int32)temp3R * factor;
9466         }
9467     } else {
9468         for (i = 0; i < frameCount4; ++i) {
9469             drflac_uint32 temp0L;
9470             drflac_uint32 temp1L;
9471             drflac_uint32 temp2L;
9472             drflac_uint32 temp3L;
9473             drflac_uint32 temp0R;
9474             drflac_uint32 temp1R;
9475             drflac_uint32 temp2R;
9476             drflac_uint32 temp3R;
9477
9478             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9479             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9480             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9481             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9482
9483             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9484             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9485             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9486             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9487
9488             mid0 = (mid0 << 1) | (side0 & 0x01);
9489             mid1 = (mid1 << 1) | (side1 & 0x01);
9490             mid2 = (mid2 << 1) | (side2 & 0x01);
9491             mid3 = (mid3 << 1) | (side3 & 0x01);
9492
9493             temp0L = (drflac_uint32)((drflac_int32)(mid0 + side0) >> 1);
9494             temp1L = (drflac_uint32)((drflac_int32)(mid1 + side1) >> 1);
9495             temp2L = (drflac_uint32)((drflac_int32)(mid2 + side2) >> 1);
9496             temp3L = (drflac_uint32)((drflac_int32)(mid3 + side3) >> 1);
9497
9498             temp0R = (drflac_uint32)((drflac_int32)(mid0 - side0) >> 1);
9499             temp1R = (drflac_uint32)((drflac_int32)(mid1 - side1) >> 1);
9500             temp2R = (drflac_uint32)((drflac_int32)(mid2 - side2) >> 1);
9501             temp3R = (drflac_uint32)((drflac_int32)(mid3 - side3) >> 1);
9502
9503             pOutputSamples[i*8+0] = (drflac_int32)temp0L * factor;
9504             pOutputSamples[i*8+1] = (drflac_int32)temp0R * factor;
9505             pOutputSamples[i*8+2] = (drflac_int32)temp1L * factor;
9506             pOutputSamples[i*8+3] = (drflac_int32)temp1R * factor;
9507             pOutputSamples[i*8+4] = (drflac_int32)temp2L * factor;
9508             pOutputSamples[i*8+5] = (drflac_int32)temp2R * factor;
9509             pOutputSamples[i*8+6] = (drflac_int32)temp3L * factor;
9510             pOutputSamples[i*8+7] = (drflac_int32)temp3R * factor;
9511         }
9512     }
9513
9514     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9515         drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9516         drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9517
9518         mid = (mid << 1) | (side & 0x01);
9519
9520         pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) * factor;
9521         pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) * factor;
9522     }
9523 }
9524
#if defined(DRFLAC_SUPPORT_SSE2)
/*
SSE2 conversion of a mid/side coded stereo block into interleaved left/right f32 samples.

pInputSamples0 supplies the mid channel and pInputSamples1 the side channel. Every sample is shifted left by the
wastedBitsPerSample of its subframe, the low bit of side is appended to mid, and left/right are rebuilt from
(mid + side) and (mid - side). The integer result is multiplied by 1/8388608 (1/2^23), which is why this path
asserts bitsPerSample <= 24.

Four frames are converted per vector iteration. The trailing (frameCount & 3) frames are converted one by one.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    const drflac_uint32* pMidU32    = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSideU32   = (const drflac_uint32*)pInputSamples1;
    const __m128i*       pMid128    = (const __m128i*)pInputSamples0;
    const __m128i*       pSide128   = (const __m128i*)pInputSamples1;
    const drflac_uint64  groupCount = frameCount >> 2;
    const float          scale      = 1.0f / 8388608.0f;
    const int            wastedMid  = pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const int            wastedSide = pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    drflac_uint32 shift = unusedBitsPerSample - 8;
    drflac_uint64 iGroup;
    drflac_uint64 iFrame;
    __m128  scale128;
    __m128i lowBit128;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    scale128  = _mm_set1_ps(scale);
    lowBit128 = _mm_set1_epi32(0x01);

    if (shift == 0) {
        /* No upscaling required: only the halving of (mid +/- side) remains, done as an arithmetic shift right. */
        for (iGroup = 0; iGroup < groupCount; ++iGroup) {
            float*  pDst    = pOutputSamples + iGroup*8;
            __m128i mid128  = _mm_slli_epi32(_mm_loadu_si128(pMid128  + iGroup), wastedMid);
            __m128i side128 = _mm_slli_epi32(_mm_loadu_si128(pSide128 + iGroup), wastedSide);
            __m128i left128;
            __m128i right128;
            __m128  leftF;
            __m128  rightF;

            mid128   = _mm_or_si128(_mm_slli_epi32(mid128, 1), _mm_and_si128(side128, lowBit128));

            left128  = _mm_srai_epi32(_mm_add_epi32(mid128, side128), 1);
            right128 = _mm_srai_epi32(_mm_sub_epi32(mid128, side128), 1);

            leftF    = _mm_mul_ps(_mm_cvtepi32_ps(left128),  scale128);
            rightF   = _mm_mul_ps(_mm_cvtepi32_ps(right128), scale128);

            /* Interleave as L0 R0 L1 R1 | L2 R2 L3 R3. */
            _mm_storeu_ps(pDst + 0, _mm_unpacklo_ps(leftF, rightF));
            _mm_storeu_ps(pDst + 4, _mm_unpackhi_ps(leftF, rightF));
        }

        for (iFrame = (groupCount << 2); iFrame < frameCount; ++iFrame) {
            drflac_uint32 side = pSideU32[iFrame] << wastedSide;
            drflac_uint32 mid  = pMidU32[iFrame]  << wastedMid;

            mid = (mid << 1) | (side & 0x01);

            pOutputSamples[iFrame*2+0] = ((drflac_int32)(mid + side) >> 1) * scale;
            pOutputSamples[iFrame*2+1] = ((drflac_int32)(mid - side) >> 1) * scale;
        }
    } else {
        /* The halving is folded into the upscale: shift left by one bit less instead of shifting right afterwards. */
        shift -= 1;

        for (iGroup = 0; iGroup < groupCount; ++iGroup) {
            float*  pDst    = pOutputSamples + iGroup*8;
            __m128i mid128  = _mm_slli_epi32(_mm_loadu_si128(pMid128  + iGroup), wastedMid);
            __m128i side128 = _mm_slli_epi32(_mm_loadu_si128(pSide128 + iGroup), wastedSide);
            __m128i left128;
            __m128i right128;
            __m128  leftF;
            __m128  rightF;

            mid128   = _mm_or_si128(_mm_slli_epi32(mid128, 1), _mm_and_si128(side128, lowBit128));

            left128  = _mm_slli_epi32(_mm_add_epi32(mid128, side128), shift);
            right128 = _mm_slli_epi32(_mm_sub_epi32(mid128, side128), shift);

            leftF    = _mm_mul_ps(_mm_cvtepi32_ps(left128),  scale128);
            rightF   = _mm_mul_ps(_mm_cvtepi32_ps(right128), scale128);

            _mm_storeu_ps(pDst + 0, _mm_unpacklo_ps(leftF, rightF));
            _mm_storeu_ps(pDst + 4, _mm_unpackhi_ps(leftF, rightF));
        }

        for (iFrame = (groupCount << 2); iFrame < frameCount; ++iFrame) {
            drflac_uint32 side = pSideU32[iFrame] << wastedSide;
            drflac_uint32 mid  = pMidU32[iFrame]  << wastedMid;

            mid = (mid << 1) | (side & 0x01);

            pOutputSamples[iFrame*2+0] = (drflac_int32)((mid + side) << shift) * scale;
            pOutputSamples[iFrame*2+1] = (drflac_int32)((mid - side) << shift) * scale;
        }
    }
}
#endif
9611
#if defined(DRFLAC_SUPPORT_NEON)
/*
NEON conversion of a mid/side coded stereo block into interleaved left/right f32 samples.

pInputSamples0 supplies the mid channel and pInputSamples1 the side channel. Every sample is shifted left by the
wastedBitsPerSample of its subframe, the low bit of side is appended to mid, and left/right are rebuilt from
(mid + side) and (mid - side). The integer result is multiplied by 1/8388608 (1/2^23), which is why this path
asserts bitsPerSample <= 24.

Four frames are converted per vector iteration. The trailing (frameCount & 3) frames are converted one by one.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    const drflac_uint32* pMidU32    = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSideU32   = (const drflac_uint32*)pInputSamples1;
    const drflac_uint64  groupCount = frameCount >> 2;
    const float          scale      = 1.0f / 8388608.0f;
    drflac_uint32 shift = unusedBitsPerSample - 8;
    drflac_uint64 iGroup;
    drflac_uint64 iFrame;
    float32x4_t scale4;
    uint32x4_t  lowBit4;
    int32x4_t   wastedMid4;     /* Wasted bits per sample of subframe 0, broadcast to all lanes. */
    int32x4_t   wastedSide4;    /* Wasted bits per sample of subframe 1, broadcast to all lanes. */

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    scale4      = vdupq_n_f32(scale);
    lowBit4     = vdupq_n_u32(1);
    wastedMid4  = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
    wastedSide4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);

    if (shift == 0) {
        /* No upscaling required: only the halving of (mid +/- side) remains, done as an arithmetic shift right. */
        for (iGroup = 0; iGroup < groupCount; ++iGroup) {
            uint32x4_t  mid4  = vshlq_u32(vld1q_u32(pMidU32  + iGroup*4), wastedMid4);
            uint32x4_t  side4 = vshlq_u32(vld1q_u32(pSideU32 + iGroup*4), wastedSide4);
            int32x4_t   left4;
            int32x4_t   right4;
            float32x4_t leftF;
            float32x4_t rightF;

            mid4   = vorrq_u32(vshlq_n_u32(mid4, 1), vandq_u32(side4, lowBit4));

            left4  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid4, side4)), 1);
            right4 = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid4, side4)), 1);

            leftF  = vmulq_f32(vcvtq_f32_s32(left4),  scale4);
            rightF = vmulq_f32(vcvtq_f32_s32(right4), scale4);

            drflac__vst2q_f32(pOutputSamples + iGroup*8, vzipq_f32(leftF, rightF));
        }

        for (iFrame = (groupCount << 2); iFrame < frameCount; ++iFrame) {
            drflac_uint32 side = pSideU32[iFrame] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
            drflac_uint32 mid  = pMidU32[iFrame]  << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;

            mid = (mid << 1) | (side & 0x01);

            pOutputSamples[iFrame*2+0] = ((drflac_int32)(mid + side) >> 1) * scale;
            pOutputSamples[iFrame*2+1] = ((drflac_int32)(mid - side) >> 1) * scale;
        }
    } else {
        int32x4_t shift4;

        /* The halving is folded into the upscale: shift left by one bit less instead of shifting right afterwards. */
        shift -= 1;
        shift4 = vdupq_n_s32(shift);

        for (iGroup = 0; iGroup < groupCount; ++iGroup) {
            uint32x4_t  mid4  = vshlq_u32(vld1q_u32(pMidU32  + iGroup*4), wastedMid4);
            uint32x4_t  side4 = vshlq_u32(vld1q_u32(pSideU32 + iGroup*4), wastedSide4);
            int32x4_t   left4;
            int32x4_t   right4;
            float32x4_t leftF;
            float32x4_t rightF;

            mid4   = vorrq_u32(vshlq_n_u32(mid4, 1), vandq_u32(side4, lowBit4));

            left4  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid4, side4), shift4));
            right4 = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid4, side4), shift4));

            leftF  = vmulq_f32(vcvtq_f32_s32(left4),  scale4);
            rightF = vmulq_f32(vcvtq_f32_s32(right4), scale4);

            drflac__vst2q_f32(pOutputSamples + iGroup*8, vzipq_f32(leftF, rightF));
        }

        for (iFrame = (groupCount << 2); iFrame < frameCount; ++iFrame) {
            drflac_uint32 side = pSideU32[iFrame] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
            drflac_uint32 mid  = pMidU32[iFrame]  << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;

            mid = (mid << 1) | (side & 0x01);

            pOutputSamples[iFrame*2+0] = (drflac_int32)((mid + side) << shift) * scale;
            pOutputSamples[iFrame*2+1] = (drflac_int32)((mid - side) << shift) * scale;
        }
    }
}
#endif
9700
9701 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9702 {
9703 #if defined(DRFLAC_SUPPORT_SSE2)
9704     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
9705         drflac_read_pcm_frames_f32__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9706     } else
9707 #elif defined(DRFLAC_SUPPORT_NEON)
9708     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
9709         drflac_read_pcm_frames_f32__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9710     } else
9711 #endif
9712     {
9713         /* Scalar fallback. */
9714         drflac_read_pcm_frames_f32__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9715     }
9716 }
9717
9718 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9719 {
9720     drflac_uint64 i;
9721     drflac_uint64 frameCount4 = frameCount >> 2;
9722     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9723     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9724     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9725     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9726     float factor = 1 / 2147483648.0;
9727
9728     for (i = 0; i < frameCount4; ++i) {
9729         drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
9730         drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
9731         drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
9732         drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
9733
9734         drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
9735         drflac_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
9736         drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
9737         drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
9738
9739         pOutputSamples[i*8+0] = (drflac_int32)tempL0 * factor;
9740         pOutputSamples[i*8+1] = (drflac_int32)tempR0 * factor;
9741         pOutputSamples[i*8+2] = (drflac_int32)tempL1 * factor;
9742         pOutputSamples[i*8+3] = (drflac_int32)tempR1 * factor;
9743         pOutputSamples[i*8+4] = (drflac_int32)tempL2 * factor;
9744         pOutputSamples[i*8+5] = (drflac_int32)tempR2 * factor;
9745         pOutputSamples[i*8+6] = (drflac_int32)tempL3 * factor;
9746         pOutputSamples[i*8+7] = (drflac_int32)tempR3 * factor;
9747     }
9748
9749     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9750         pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0) * factor;
9751         pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1) * factor;
9752     }
9753 }
9754
#if defined(DRFLAC_SUPPORT_SSE2)
/*
SSE2 conversion of two independently coded channels into interleaved f32 samples.

Each sample is shifted left by (unusedBitsPerSample + wastedBitsPerSample - 8) and then multiplied by 1/8388608
(1/2^23). Four frames are converted per vector iteration. The trailing (frameCount & 3) frames use scalar code.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    const drflac_uint32* pSrcL      = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSrcR      = (const drflac_uint32*)pInputSamples1;
    const __m128i*       pSrcL128   = (const __m128i*)pInputSamples0;
    const __m128i*       pSrcR128   = (const __m128i*)pInputSamples1;
    const drflac_uint64  groupCount = frameCount >> 2;
    const drflac_uint32  shiftL     = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
    const drflac_uint32  shiftR     = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
    const float          scale      = 1.0f / 8388608.0f;
    drflac_uint64 iGroup;
    drflac_uint64 iFrame;
    __m128 scale128;

    scale128 = _mm_set1_ps(scale);

    for (iGroup = 0; iGroup < groupCount; ++iGroup) {
        float*  pDst     = pOutputSamples + iGroup*8;
        __m128i left128  = _mm_slli_epi32(_mm_loadu_si128(pSrcL128 + iGroup), shiftL);
        __m128i right128 = _mm_slli_epi32(_mm_loadu_si128(pSrcR128 + iGroup), shiftR);
        __m128  leftF    = _mm_mul_ps(_mm_cvtepi32_ps(left128),  scale128);
        __m128  rightF   = _mm_mul_ps(_mm_cvtepi32_ps(right128), scale128);

        /* Interleave as L0 R0 L1 R1 | L2 R2 L3 R3. */
        _mm_storeu_ps(pDst + 0, _mm_unpacklo_ps(leftF, rightF));
        _mm_storeu_ps(pDst + 4, _mm_unpackhi_ps(leftF, rightF));
    }

    for (iFrame = (groupCount << 2); iFrame < frameCount; ++iFrame) {
        pOutputSamples[iFrame*2+0] = (drflac_int32)(pSrcL[iFrame] << shiftL) * scale;
        pOutputSamples[iFrame*2+1] = (drflac_int32)(pSrcR[iFrame] << shiftR) * scale;
    }
}
#endif
9790
#if defined(DRFLAC_SUPPORT_NEON)
/*
NEON conversion of two independently coded channels into interleaved f32 samples.

Each sample is shifted left by (unusedBitsPerSample + wastedBitsPerSample - 8) and then multiplied by 1/8388608
(1/2^23). Four frames are converted per vector iteration. The trailing (frameCount & 3) frames use scalar code.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    const drflac_uint32* pSrcL      = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSrcR      = (const drflac_uint32*)pInputSamples1;
    const drflac_uint64  groupCount = frameCount >> 2;
    const drflac_uint32  shiftL     = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
    const drflac_uint32  shiftR     = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
    const float          scale      = 1.0f / 8388608.0f;
    drflac_uint64 iGroup;
    drflac_uint64 iFrame;
    float32x4_t scale4;
    int32x4_t   shiftL4;
    int32x4_t   shiftR4;

    scale4  = vdupq_n_f32(scale);
    shiftL4 = vdupq_n_s32(shiftL);
    shiftR4 = vdupq_n_s32(shiftR);

    for (iGroup = 0; iGroup < groupCount; ++iGroup) {
        int32x4_t   left4  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pSrcL + iGroup*4), shiftL4));
        int32x4_t   right4 = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pSrcR + iGroup*4), shiftR4));
        float32x4_t leftF  = vmulq_f32(vcvtq_f32_s32(left4),  scale4);
        float32x4_t rightF = vmulq_f32(vcvtq_f32_s32(right4), scale4);

        drflac__vst2q_f32(pOutputSamples + iGroup*8, vzipq_f32(leftF, rightF));
    }

    for (iFrame = (groupCount << 2); iFrame < frameCount; ++iFrame) {
        pOutputSamples[iFrame*2+0] = (drflac_int32)(pSrcL[iFrame] << shiftL) * scale;
        pOutputSamples[iFrame*2+1] = (drflac_int32)(pSrcR[iFrame] << shiftR) * scale;
    }
}
#endif
9827
9828 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
9829 {
9830 #if defined(DRFLAC_SUPPORT_SSE2)
9831     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
9832         drflac_read_pcm_frames_f32__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9833     } else
9834 #elif defined(DRFLAC_SUPPORT_NEON)
9835     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
9836         drflac_read_pcm_frames_f32__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9837     } else
9838 #endif
9839     {
9840         /* Scalar fallback. */
9841         drflac_read_pcm_frames_f32__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9842     }
9843 }
9844
9845 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_f32(drflac* pFlac, drflac_uint64 framesToRead, float* pBufferOut)
9846 {
9847     drflac_uint64 framesRead;
9848     drflac_uint32 unusedBitsPerSample;
9849
9850     if (pFlac == NULL || framesToRead == 0) {
9851         return 0;
9852     }
9853
9854     if (pBufferOut == NULL) {
9855         return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead);
9856     }
9857
9858     DRFLAC_ASSERT(pFlac->bitsPerSample <= 32);
9859     unusedBitsPerSample = 32 - pFlac->bitsPerSample;
9860
9861     framesRead = 0;
9862     while (framesToRead > 0) {
9863         /* If we've run out of samples in this frame, go to the next. */
9864         if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
9865             if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
9866                 break;  /* Couldn't read the next frame, so just break from the loop and return. */
9867             }
9868         } else {
9869             unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
9870             drflac_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
9871             drflac_uint64 frameCountThisIteration = framesToRead;
9872
9873             if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
9874                 frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
9875             }
9876
9877             if (channelCount == 2) {
9878                 const drflac_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
9879                 const drflac_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
9880
9881                 switch (pFlac->currentFLACFrame.header.channelAssignment)
9882                 {
9883                     case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
9884                     {
9885                         drflac_read_pcm_frames_f32__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9886                     } break;
9887
9888                     case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
9889                     {
9890                         drflac_read_pcm_frames_f32__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9891                     } break;
9892
9893                     case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
9894                     {
9895                         drflac_read_pcm_frames_f32__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9896                     } break;
9897
9898                     case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
9899                     default:
9900                     {
9901                         drflac_read_pcm_frames_f32__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9902                     } break;
9903                 }
9904             } else {
9905                 /* Generic interleaving. */
9906                 drflac_uint64 i;
9907                 for (i = 0; i < frameCountThisIteration; ++i) {
9908                     unsigned int j;
9909                     for (j = 0; j < channelCount; ++j) {
9910                         drflac_int32 sampleS32 = (drflac_int32)((drflac_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
9911                         pBufferOut[(i*channelCount)+j] = (float)(sampleS32 / 2147483648.0);
9912                     }
9913                 }
9914             }
9915
9916             framesRead                += frameCountThisIteration;
9917             pBufferOut                += frameCountThisIteration * channelCount;
9918             framesToRead              -= frameCountThisIteration;
9919             pFlac->currentPCMFrame    += frameCountThisIteration;
9920             pFlac->currentFLACFrame.pcmFramesRemaining -= (unsigned int)frameCountThisIteration;
9921         }
9922     }
9923
9924     return framesRead;
9925 }
9926
9927
9928 DRFLAC_API drflac_bool32 drflac_seek_to_pcm_frame(drflac* pFlac, drflac_uint64 pcmFrameIndex)
9929 {
9930     if (pFlac == NULL) {
9931         return DRFLAC_FALSE;
9932     }
9933
9934     /* Don't do anything if we're already on the seek point. */
9935     if (pFlac->currentPCMFrame == pcmFrameIndex) {
9936         return DRFLAC_TRUE;
9937     }
9938
9939     /*
9940     If we don't know where the first frame begins then we can't seek. This will happen when the STREAMINFO block was not present
9941     when the decoder was opened.
9942     */
9943     if (pFlac->firstFLACFramePosInBytes == 0) {
9944         return DRFLAC_FALSE;
9945     }
9946
9947     if (pcmFrameIndex == 0) {
9948         pFlac->currentPCMFrame = 0;
9949         return drflac__seek_to_first_frame(pFlac);
9950     } else {
9951         drflac_bool32 wasSuccessful = DRFLAC_FALSE;
9952
9953         /* Clamp the sample to the end. */
9954         if (pcmFrameIndex > pFlac->totalPCMFrameCount) {
9955             pcmFrameIndex = pFlac->totalPCMFrameCount;
9956         }
9957
9958         /* If the target sample and the current sample are in the same frame we just move the position forward. */
9959         if (pcmFrameIndex > pFlac->currentPCMFrame) {
9960             /* Forward. */
9961             drflac_uint32 offset = (drflac_uint32)(pcmFrameIndex - pFlac->currentPCMFrame);
9962             if (pFlac->currentFLACFrame.pcmFramesRemaining >  offset) {
9963                 pFlac->currentFLACFrame.pcmFramesRemaining -= offset;
9964                 pFlac->currentPCMFrame = pcmFrameIndex;
9965                 return DRFLAC_TRUE;
9966             }
9967         } else {
9968             /* Backward. */
9969             drflac_uint32 offsetAbs = (drflac_uint32)(pFlac->currentPCMFrame - pcmFrameIndex);
9970             drflac_uint32 currentFLACFramePCMFrameCount = pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
9971             drflac_uint32 currentFLACFramePCMFramesConsumed = currentFLACFramePCMFrameCount - pFlac->currentFLACFrame.pcmFramesRemaining;
9972             if (currentFLACFramePCMFramesConsumed > offsetAbs) {
9973                 pFlac->currentFLACFrame.pcmFramesRemaining += offsetAbs;
9974                 pFlac->currentPCMFrame = pcmFrameIndex;
9975                 return DRFLAC_TRUE;
9976             }
9977         }
9978
9979         /*
9980         Different techniques depending on encapsulation. Using the native FLAC seektable with Ogg encapsulation is a bit awkward so
9981         we'll instead use Ogg's natural seeking facility.
9982         */
9983 #ifndef DR_FLAC_NO_OGG
9984         if (pFlac->container == drflac_container_ogg)
9985         {
9986             wasSuccessful = drflac_ogg__seek_to_pcm_frame(pFlac, pcmFrameIndex);
9987         }
9988         else
9989 #endif
9990         {
9991             /* First try seeking via the seek table. If this fails, fall back to a brute force seek which is much slower. */
9992             if (/*!wasSuccessful && */!pFlac->_noSeekTableSeek) {
9993                 wasSuccessful = drflac__seek_to_pcm_frame__seek_table(pFlac, pcmFrameIndex);
9994             }
9995
9996 #if !defined(DR_FLAC_NO_CRC)
9997             /* Fall back to binary search if seek table seeking fails. This requires the length of the stream to be known. */
9998             if (!wasSuccessful && !pFlac->_noBinarySearchSeek && pFlac->totalPCMFrameCount > 0) {
9999                 wasSuccessful = drflac__seek_to_pcm_frame__binary_search(pFlac, pcmFrameIndex);
10000             }
10001 #endif
10002
10003             /* Fall back to brute force if all else fails. */
10004             if (!wasSuccessful && !pFlac->_noBruteForceSeek) {
10005                 wasSuccessful = drflac__seek_to_pcm_frame__brute_force(pFlac, pcmFrameIndex);
10006             }
10007         }
10008
10009         pFlac->currentPCMFrame = pcmFrameIndex;
10010         return wasSuccessful;
10011     }
10012 }
10013
10014
10015
10016 /* High Level APIs */
10017
/*
DRFLAC_SIZE_MAX mirrors SIZE_MAX when the standard macro is available. Otherwise it falls back to an all-ones 64-bit
value when DRFLAC_64BIT is defined, or an all-ones 32-bit value when it is not.
*/
#ifdef SIZE_MAX
    #define DRFLAC_SIZE_MAX  SIZE_MAX
#elif defined(DRFLAC_64BIT)
    #define DRFLAC_SIZE_MAX  ((drflac_uint64)0xFFFFFFFFFFFFFFFF)
#else
    #define DRFLAC_SIZE_MAX  0xFFFFFFFF
#endif
10027
10028
10029 /* Using a macro as the definition of the drflac__full_decode_and_close_*() API family. Sue me. */
10030 #define DRFLAC_DEFINE_FULL_READ_AND_CLOSE(extension, type) \
10031 static type* drflac__full_read_and_close_ ## extension (drflac* pFlac, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut)\
10032 {                                                                                                                                                                   \
10033     type* pSampleData = NULL;                                                                                                                                       \
10034     drflac_uint64 totalPCMFrameCount;                                                                                                                               \
10035                                                                                                                                                                     \
10036     DRFLAC_ASSERT(pFlac != NULL);                                                                                                                                   \
10037                                                                                                                                                                     \
10038     totalPCMFrameCount = pFlac->totalPCMFrameCount;                                                                                                                 \
10039                                                                                                                                                                     \
10040     if (totalPCMFrameCount == 0) {                                                                                                                                  \
10041         type buffer[4096];                                                                                                                                          \
10042         drflac_uint64 pcmFramesRead;                                                                                                                                \
10043         size_t sampleDataBufferSize = sizeof(buffer);                                                                                                               \
10044                                                                                                                                                                     \
10045         pSampleData = (type*)drflac__malloc_from_callbacks(sampleDataBufferSize, &pFlac->allocationCallbacks);                                                      \
10046         if (pSampleData == NULL) {                                                                                                                                  \
10047             goto on_error;                                                                                                                                          \
10048         }                                                                                                                                                           \
10049                                                                                                                                                                     \
10050         while ((pcmFramesRead = (drflac_uint64)drflac_read_pcm_frames_##extension(pFlac, sizeof(buffer)/sizeof(buffer[0])/pFlac->channels, buffer)) > 0) {          \
10051             if (((totalPCMFrameCount + pcmFramesRead) * pFlac->channels * sizeof(type)) > sampleDataBufferSize) {                                                   \
10052                 type* pNewSampleData;                                                                                                                               \
10053                 size_t newSampleDataBufferSize;                                                                                                                     \
10054                                                                                                                                                                     \
10055                 newSampleDataBufferSize = sampleDataBufferSize * 2;                                                                                                 \
10056                 pNewSampleData = (type*)drflac__realloc_from_callbacks(pSampleData, newSampleDataBufferSize, sampleDataBufferSize, &pFlac->allocationCallbacks);    \
10057                 if (pNewSampleData == NULL) {                                                                                                                       \
10058                     drflac__free_from_callbacks(pSampleData, &pFlac->allocationCallbacks);                                                                          \
10059                     goto on_error;                                                                                                                                  \
10060                 }                                                                                                                                                   \
10061                                                                                                                                                                     \
10062                 sampleDataBufferSize = newSampleDataBufferSize;                                                                                                     \
10063                 pSampleData = pNewSampleData;                                                                                                                       \
10064             }                                                                                                                                                       \
10065                                                                                                                                                                     \
10066             DRFLAC_COPY_MEMORY(pSampleData + (totalPCMFrameCount*pFlac->channels), buffer, (size_t)(pcmFramesRead*pFlac->channels*sizeof(type)));                   \
10067             totalPCMFrameCount += pcmFramesRead;                                                                                                                    \
10068         }                                                                                                                                                           \
10069                                                                                                                                                                     \
10070         /* At this point everything should be decoded, but we just want to fill the unused part buffer with silence - need to                                       \
10071            protect those ears from random noise! */                                                                                                                 \
10072         DRFLAC_ZERO_MEMORY(pSampleData + (totalPCMFrameCount*pFlac->channels), (size_t)(sampleDataBufferSize - totalPCMFrameCount*pFlac->channels*sizeof(type)));   \
10073     } else {                                                                                                                                                        \
10074         drflac_uint64 dataSize = totalPCMFrameCount*pFlac->channels*sizeof(type);                                                                                   \
10075         if (dataSize > DRFLAC_SIZE_MAX) {                                                                                                                           \
10076             goto on_error;  /* The decoded data is too big. */                                                                                                      \
10077         }                                                                                                                                                           \
10078                                                                                                                                                                     \
10079         pSampleData = (type*)drflac__malloc_from_callbacks((size_t)dataSize, &pFlac->allocationCallbacks);    /* <-- Safe cast as per the check above. */           \
10080         if (pSampleData == NULL) {                                                                                                                                  \
10081             goto on_error;                                                                                                                                          \
10082         }                                                                                                                                                           \
10083                                                                                                                                                                     \
10084         totalPCMFrameCount = drflac_read_pcm_frames_##extension(pFlac, pFlac->totalPCMFrameCount, pSampleData);                                                     \
10085     }                                                                                                                                                               \
10086                                                                                                                                                                     \
10087     if (sampleRateOut) *sampleRateOut = pFlac->sampleRate;                                                                                                          \
10088     if (channelsOut) *channelsOut = pFlac->channels;                                                                                                                \
10089     if (totalPCMFrameCountOut) *totalPCMFrameCountOut = totalPCMFrameCount;                                                                                         \
10090                                                                                                                                                                     \
10091     drflac_close(pFlac);                                                                                                                                            \
10092     return pSampleData;                                                                                                                                             \
10093                                                                                                                                                                     \
10094 on_error:                                                                                                                                                           \
10095     drflac_close(pFlac);                                                                                                                                            \
10096     return NULL;                                                                                                                                                    \
10097 }
10098
10099 DRFLAC_DEFINE_FULL_READ_AND_CLOSE(s32, drflac_int32)
10100 DRFLAC_DEFINE_FULL_READ_AND_CLOSE(s16, drflac_int16)
10101 DRFLAC_DEFINE_FULL_READ_AND_CLOSE(f32, float)
10102
10103 DRFLAC_API drflac_int32* drflac_open_and_read_pcm_frames_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut, const drflac_allocation_callbacks* pAllocationCallbacks)
10104 {
10105     drflac* pFlac;
10106
10107     if (channelsOut) {
10108         *channelsOut = 0;
10109     }
10110     if (sampleRateOut) {
10111         *sampleRateOut = 0;
10112     }
10113     if (totalPCMFrameCountOut) {
10114         *totalPCMFrameCountOut = 0;
10115     }
10116
10117     pFlac = drflac_open(onRead, onSeek, pUserData, pAllocationCallbacks);
10118     if (pFlac == NULL) {
10119         return NULL;
10120     }
10121
10122     return drflac__full_read_and_close_s32(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
10123 }
10124
10125 DRFLAC_API drflac_int16* drflac_open_and_read_pcm_frames_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut, const drflac_allocation_callbacks* pAllocationCallbacks)
10126 {
10127     drflac* pFlac;
10128
10129     if (channelsOut) {
10130         *channelsOut = 0;
10131     }
10132     if (sampleRateOut) {
10133         *sampleRateOut = 0;
10134     }
10135     if (totalPCMFrameCountOut) {
10136         *totalPCMFrameCountOut = 0;
10137     }
10138
10139     pFlac = drflac_open(onRead, onSeek, pUserData, pAllocationCallbacks);
10140     if (pFlac == NULL) {
10141         return NULL;
10142     }
10143
10144     return drflac__full_read_and_close_s16(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
10145 }
10146
10147 DRFLAC_API float* drflac_open_and_read_pcm_frames_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut, const drflac_allocation_callbacks* pAllocationCallbacks)
10148 {
10149     drflac* pFlac;
10150
10151     if (channelsOut) {
10152         *channelsOut = 0;
10153     }
10154     if (sampleRateOut) {
10155         *sampleRateOut = 0;
10156     }
10157     if (totalPCMFrameCountOut) {
10158         *totalPCMFrameCountOut = 0;
10159     }
10160
10161     pFlac = drflac_open(onRead, onSeek, pUserData, pAllocationCallbacks);
10162     if (pFlac == NULL) {
10163         return NULL;
10164     }
10165
10166     return drflac__full_read_and_close_f32(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
10167 }
10168
10169 DRFLAC_API drflac_int32* drflac_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
10170 {
10171     drflac* pFlac;
10172
10173     if (sampleRate) {
10174         *sampleRate = 0;
10175     }
10176     if (channels) {
10177         *channels = 0;
10178     }
10179     if (totalPCMFrameCount) {
10180         *totalPCMFrameCount = 0;
10181     }
10182
10183     pFlac = drflac_open_memory(data, dataSize, pAllocationCallbacks);
10184     if (pFlac == NULL) {
10185         return NULL;
10186     }
10187
10188     return drflac__full_read_and_close_s32(pFlac, channels, sampleRate, totalPCMFrameCount);
10189 }
10190
10191 DRFLAC_API drflac_int16* drflac_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
10192 {
10193     drflac* pFlac;
10194
10195     if (sampleRate) {
10196         *sampleRate = 0;
10197     }
10198     if (channels) {
10199         *channels = 0;
10200     }
10201     if (totalPCMFrameCount) {
10202         *totalPCMFrameCount = 0;
10203     }
10204
10205     pFlac = drflac_open_memory(data, dataSize, pAllocationCallbacks);
10206     if (pFlac == NULL) {
10207         return NULL;
10208     }
10209
10210     return drflac__full_read_and_close_s16(pFlac, channels, sampleRate, totalPCMFrameCount);
10211 }
10212
10213 DRFLAC_API float* drflac_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
10214 {
10215     drflac* pFlac;
10216
10217     if (sampleRate) {
10218         *sampleRate = 0;
10219     }
10220     if (channels) {
10221         *channels = 0;
10222     }
10223     if (totalPCMFrameCount) {
10224         *totalPCMFrameCount = 0;
10225     }
10226
10227     pFlac = drflac_open_memory(data, dataSize, pAllocationCallbacks);
10228     if (pFlac == NULL) {
10229         return NULL;
10230     }
10231
10232     return drflac__full_read_and_close_f32(pFlac, channels, sampleRate, totalPCMFrameCount);
10233 }
10234
10235
10236 DRFLAC_API void drflac_free(void* p, const drflac_allocation_callbacks* pAllocationCallbacks)
10237 {
10238     if (pAllocationCallbacks != NULL) {
10239         drflac__free_from_callbacks(p, pAllocationCallbacks);
10240     } else {
10241         drflac__free_default(p, NULL);
10242     }
10243 }
10244
10245
10246
10247
10248 DRFLAC_API void drflac_init_vorbis_comment_iterator(drflac_vorbis_comment_iterator* pIter, drflac_uint32 commentCount, const void* pComments)
10249 {
10250     if (pIter == NULL) {
10251         return;
10252     }
10253
10254     pIter->countRemaining = commentCount;
10255     pIter->pRunningData   = (const char*)pComments;
10256 }
10257
10258 DRFLAC_API const char* drflac_next_vorbis_comment(drflac_vorbis_comment_iterator* pIter, drflac_uint32* pCommentLengthOut)
10259 {
10260     drflac_int32 length;
10261     const char* pComment;
10262
10263     /* Safety. */
10264     if (pCommentLengthOut) {
10265         *pCommentLengthOut = 0;
10266     }
10267
10268     if (pIter == NULL || pIter->countRemaining == 0 || pIter->pRunningData == NULL) {
10269         return NULL;
10270     }
10271
10272     length = drflac__le2host_32(*(const drflac_uint32*)pIter->pRunningData);
10273     pIter->pRunningData += 4;
10274
10275     pComment = pIter->pRunningData;
10276     pIter->pRunningData += length;
10277     pIter->countRemaining -= 1;
10278
10279     if (pCommentLengthOut) {
10280         *pCommentLengthOut = length;
10281     }
10282
10283     return pComment;
10284 }
10285
10286
10287
10288
10289 DRFLAC_API void drflac_init_cuesheet_track_iterator(drflac_cuesheet_track_iterator* pIter, drflac_uint32 trackCount, const void* pTrackData)
10290 {
10291     if (pIter == NULL) {
10292         return;
10293     }
10294
10295     pIter->countRemaining = trackCount;
10296     pIter->pRunningData   = (const char*)pTrackData;
10297 }
10298
10299 DRFLAC_API drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter, drflac_cuesheet_track* pCuesheetTrack)
10300 {
10301     drflac_cuesheet_track cuesheetTrack;
10302     const char* pRunningData;
10303     drflac_uint64 offsetHi;
10304     drflac_uint64 offsetLo;
10305
10306     if (pIter == NULL || pIter->countRemaining == 0 || pIter->pRunningData == NULL) {
10307         return DRFLAC_FALSE;
10308     }
10309
10310     pRunningData = pIter->pRunningData;
10311
10312     offsetHi                   = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
10313     offsetLo                   = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
10314     cuesheetTrack.offset       = offsetLo | (offsetHi << 32);
10315     cuesheetTrack.trackNumber  = pRunningData[0];                                         pRunningData += 1;
10316     DRFLAC_COPY_MEMORY(cuesheetTrack.ISRC, pRunningData, sizeof(cuesheetTrack.ISRC));     pRunningData += 12;
10317     cuesheetTrack.isAudio      = (pRunningData[0] & 0x80) != 0;
10318     cuesheetTrack.preEmphasis  = (pRunningData[0] & 0x40) != 0;                           pRunningData += 14;
10319     cuesheetTrack.indexCount   = pRunningData[0];                                         pRunningData += 1;
10320     cuesheetTrack.pIndexPoints = (const drflac_cuesheet_track_index*)pRunningData;        pRunningData += cuesheetTrack.indexCount * sizeof(drflac_cuesheet_track_index);
10321
10322     pIter->pRunningData = pRunningData;
10323     pIter->countRemaining -= 1;
10324
10325     if (pCuesheetTrack) {
10326         *pCuesheetTrack = cuesheetTrack;
10327     }
10328
10329     return DRFLAC_TRUE;
10330 }
10331
10332 #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
10333     #pragma GCC diagnostic pop
10334 #endif
10335 #endif  /* dr_flac_c */
10336 #endif  /* DR_FLAC_IMPLEMENTATION */
10337
10338
10339 /*
10340 REVISION HISTORY
10341 ================
10342 v0.12.28 - 2021-02-21
10343   - Fix a warning due to referencing _MSC_VER when it is undefined.
10344
10345 v0.12.27 - 2021-01-31
10346   - Fix a static analysis warning.
10347
10348 v0.12.26 - 2021-01-17
10349   - Fix a compilation warning due to _BSD_SOURCE being deprecated.
10350
10351 v0.12.25 - 2020-12-26
10352   - Update documentation.
10353
10354 v0.12.24 - 2020-11-29
10355   - Fix ARM64/NEON detection when compiling with MSVC.
10356
10357 v0.12.23 - 2020-11-21
10358   - Fix compilation with OpenWatcom.
10359
10360 v0.12.22 - 2020-11-01
10361   - Fix an error with the previous release.
10362
10363 v0.12.21 - 2020-11-01
10364   - Fix a possible deadlock when seeking.
10365   - Improve compiler support for older versions of GCC.
10366
10367 v0.12.20 - 2020-09-08
10368   - Fix a compilation error on older compilers.
10369
10370 v0.12.19 - 2020-08-30
10371   - Fix a bug due to an undefined 32-bit shift.
10372
10373 v0.12.18 - 2020-08-14
10374   - Fix a crash when compiling with clang-cl.
10375
10376 v0.12.17 - 2020-08-02
10377   - Simplify sized types.
10378
10379 v0.12.16 - 2020-07-25
10380   - Fix a compilation warning.
10381
10382 v0.12.15 - 2020-07-06
10383   - Check for negative LPC shifts and return an error.
10384
10385 v0.12.14 - 2020-06-23
10386   - Add include guard for the implementation section.
10387
10388 v0.12.13 - 2020-05-16
10389   - Add compile-time and run-time version querying.
10390     - DRFLAC_VERSION_MINOR
10391     - DRFLAC_VERSION_MAJOR
10392     - DRFLAC_VERSION_REVISION
10393     - DRFLAC_VERSION_STRING
10394     - drflac_version()
10395     - drflac_version_string()
10396
10397 v0.12.12 - 2020-04-30
10398   - Fix compilation errors with VC6.
10399
10400 v0.12.11 - 2020-04-19
10401   - Fix some pedantic warnings.
10402   - Fix some undefined behaviour warnings.
10403
10404 v0.12.10 - 2020-04-10
10405   - Fix some bugs when trying to seek with an invalid seek table.
10406
10407 v0.12.9 - 2020-04-05
10408   - Fix warnings.
10409
10410 v0.12.8 - 2020-04-04
10411   - Add drflac_open_file_w() and drflac_open_file_with_metadata_w().
10412   - Fix some static analysis warnings.
10413   - Minor documentation updates.
10414
10415 v0.12.7 - 2020-03-14
10416   - Fix compilation errors with VC6.
10417
10418 v0.12.6 - 2020-03-07
10419   - Fix compilation error with Visual Studio .NET 2003.
10420
10421 v0.12.5 - 2020-01-30
10422   - Silence some static analysis warnings.
10423
10424 v0.12.4 - 2020-01-29
10425   - Silence some static analysis warnings.
10426
10427 v0.12.3 - 2019-12-02
10428   - Fix some warnings when compiling with GCC and the -Og flag.
10429   - Fix a crash in out-of-memory situations.
10430   - Fix potential integer overflow bug.
10431   - Fix some static analysis warnings.
10432   - Fix a possible crash when using custom memory allocators without a custom realloc() implementation.
10433   - Fix a bug with binary search seeking where the bits per sample is not a multiple of 8.
10434
10435 v0.12.2 - 2019-10-07
10436   - Internal code clean up.
10437
10438 v0.12.1 - 2019-09-29
10439   - Fix some Clang Static Analyzer warnings.
10440   - Fix an unused variable warning.
10441
10442 v0.12.0 - 2019-09-23
10443   - API CHANGE: Add support for user defined memory allocation routines. This system allows the program to specify their own memory allocation
10444     routines with a user data pointer for client-specific contextual data. This adds an extra parameter to the end of the following APIs:
10445     - drflac_open()
10446     - drflac_open_relaxed()
10447     - drflac_open_with_metadata()
10448     - drflac_open_with_metadata_relaxed()
10449     - drflac_open_file()
10450     - drflac_open_file_with_metadata()
10451     - drflac_open_memory()
10452     - drflac_open_memory_with_metadata()
10453     - drflac_open_and_read_pcm_frames_s32()
10454     - drflac_open_and_read_pcm_frames_s16()
10455     - drflac_open_and_read_pcm_frames_f32()
10456     - drflac_open_file_and_read_pcm_frames_s32()
10457     - drflac_open_file_and_read_pcm_frames_s16()
10458     - drflac_open_file_and_read_pcm_frames_f32()
10459     - drflac_open_memory_and_read_pcm_frames_s32()
10460     - drflac_open_memory_and_read_pcm_frames_s16()
10461     - drflac_open_memory_and_read_pcm_frames_f32()
10462     Set this extra parameter to NULL to use defaults which is the same as the previous behaviour. Setting this to NULL will use
10463     DRFLAC_MALLOC, DRFLAC_REALLOC and DRFLAC_FREE.
10464   - Remove deprecated APIs:
10465     - drflac_read_s32()
10466     - drflac_read_s16()
10467     - drflac_read_f32()
10468     - drflac_seek_to_sample()
10469     - drflac_open_and_decode_s32()
10470     - drflac_open_and_decode_s16()
10471     - drflac_open_and_decode_f32()
10472     - drflac_open_and_decode_file_s32()
10473     - drflac_open_and_decode_file_s16()
10474     - drflac_open_and_decode_file_f32()
10475     - drflac_open_and_decode_memory_s32()
10476     - drflac_open_and_decode_memory_s16()
10477     - drflac_open_and_decode_memory_f32()
10478   - Remove drflac.totalSampleCount which is now replaced with drflac.totalPCMFrameCount. You can emulate drflac.totalSampleCount
10479     by doing pFlac->totalPCMFrameCount*pFlac->channels.
10480   - Rename drflac.currentFrame to drflac.currentFLACFrame to remove ambiguity with PCM frames.
10481   - Fix errors when seeking to the end of a stream.
10482   - Optimizations to seeking.
10483   - SSE improvements and optimizations.
10484   - ARM NEON optimizations.
10485   - Optimizations to drflac_read_pcm_frames_s16().
10486   - Optimizations to drflac_read_pcm_frames_s32().
10487
10488 v0.11.10 - 2019-06-26
10489   - Fix a compiler error.
10490
10491 v0.11.9 - 2019-06-16
10492   - Silence some ThreadSanitizer warnings.
10493
10494 v0.11.8 - 2019-05-21
10495   - Fix warnings.
10496
10497 v0.11.7 - 2019-05-06
10498   - C89 fixes.
10499
10500 v0.11.6 - 2019-05-05
10501   - Add support for C89.
10502   - Fix a compiler warning when CRC is disabled.
10503   - Change license to choice of public domain or MIT-0.
10504
10505 v0.11.5 - 2019-04-19
10506   - Fix a compiler error with GCC.
10507
10508 v0.11.4 - 2019-04-17
10509   - Fix some warnings with GCC when compiling with -std=c99.
10510
10511 v0.11.3 - 2019-04-07
10512   - Silence warnings with GCC.
10513
10514 v0.11.2 - 2019-03-10
10515   - Fix a warning.
10516
10517 v0.11.1 - 2019-02-17
10518   - Fix a potential bug with seeking.
10519
10520 v0.11.0 - 2018-12-16
10521   - API CHANGE: Deprecated drflac_read_s32(), drflac_read_s16() and drflac_read_f32() and replaced them with
10522     drflac_read_pcm_frames_s32(), drflac_read_pcm_frames_s16() and drflac_read_pcm_frames_f32(). The new APIs take
10523     and return PCM frame counts instead of sample counts. To upgrade you will need to change the input count by
10524     dividing it by the channel count, and then do the same with the return value.
10525   - API CHANGE: Deprecated drflac_seek_to_sample() and replaced with drflac_seek_to_pcm_frame(). Same rules as
10526     the changes to drflac_read_*() apply.
10527   - API CHANGE: Deprecated drflac_open_and_decode_*() and replaced with drflac_open_*_and_read_*(). Same rules as
10528     the changes to drflac_read_*() apply.
10529   - Optimizations.
10530
10531 v0.10.0 - 2018-09-11
10532   - Remove the DR_FLAC_NO_WIN32_IO option and the Win32 file IO functionality. If you need to use Win32 file IO you
10533     need to do it yourself via the callback API.
10534   - Fix the clang build.
10535   - Fix undefined behavior.
10536   - Fix errors with CUESHEET metadata blocks.
10537   - Add an API for iterating over each cuesheet track in the CUESHEET metadata block. This works the same way as the
10538     Vorbis comment API.
10539   - Other miscellaneous bug fixes, mostly relating to invalid FLAC streams.
10540   - Minor optimizations.
10541
10542 v0.9.11 - 2018-08-29
10543   - Fix a bug with sample reconstruction.
10544
10545 v0.9.10 - 2018-08-07
10546   - Improve 64-bit detection.
10547
10548 v0.9.9 - 2018-08-05
10549   - Fix C++ build on older versions of GCC.
10550
10551 v0.9.8 - 2018-07-24
10552   - Fix compilation errors.
10553
10554 v0.9.7 - 2018-07-05
10555   - Fix a warning.
10556
10557 v0.9.6 - 2018-06-29
10558   - Fix some typos.
10559
10560 v0.9.5 - 2018-06-23
10561   - Fix some warnings.
10562
10563 v0.9.4 - 2018-06-14
10564   - Optimizations to seeking.
10565   - Clean up.
10566
10567 v0.9.3 - 2018-05-22
10568   - Bug fix.
10569
10570 v0.9.2 - 2018-05-12
10571   - Fix a compilation error due to a missing break statement.
10572
10573 v0.9.1 - 2018-04-29
10574   - Fix compilation error with Clang.
10575
10576 v0.9 - 2018-04-24
10577   - Fix Clang build.
10578   - Start using major.minor.revision versioning.
10579
10580 v0.8g - 2018-04-19
10581   - Fix build on non-x86/x64 architectures.
10582
10583 v0.8f - 2018-02-02
10584   - Stop pretending to support changing rate/channels mid stream.
10585
10586 v0.8e - 2018-02-01
10587   - Fix a crash when the block size of a frame is larger than the maximum block size defined by the FLAC stream.
10588   - Fix a crash when the Rice partition order is invalid.
10589
10590 v0.8d - 2017-09-22
10591   - Add support for decoding streams with ID3 tags. ID3 tags are just skipped.
10592
10593 v0.8c - 2017-09-07
10594   - Fix warning on non-x86/x64 architectures.
10595
10596 v0.8b - 2017-08-19
10597   - Fix build on non-x86/x64 architectures.
10598
10599 v0.8a - 2017-08-13
10600   - A small optimization for the Clang build.
10601
10602 v0.8 - 2017-08-12
10603   - API CHANGE: Rename dr_* types to drflac_*.
10604   - Optimizations. This brings dr_flac back to about the same class of efficiency as the reference implementation.
10605   - Add support for custom implementations of malloc(), realloc(), etc.
10606   - Add CRC checking to Ogg encapsulated streams.
10607   - Fix VC++ 6 build. This is only for the C++ compiler. The C compiler is not currently supported.
10608   - Bug fixes.
10609
10610 v0.7 - 2017-07-23
10611   - Add support for opening a stream without a header block. To do this, use drflac_open_relaxed() / drflac_open_with_metadata_relaxed().
10612
10613 v0.6 - 2017-07-22
10614   - Add support for recovering from invalid frames. With this change, dr_flac will simply skip over invalid frames as if they
10615     never existed. Frames are checked against their sync code, the CRC-8 of the frame header and the CRC-16 of the whole frame.
10616
10617 v0.5 - 2017-07-16
10618   - Fix typos.
10619   - Change drflac_bool* types to unsigned.
10620   - Add CRC checking. This makes dr_flac slower, but can be disabled with #define DR_FLAC_NO_CRC.
10621
10622 v0.4f - 2017-03-10
10623   - Fix a couple of bugs with the bitstreaming code.
10624
10625 v0.4e - 2017-02-17
10626   - Fix some warnings.
10627
10628 v0.4d - 2016-12-26
10629   - Add support for 32-bit floating-point PCM decoding.
10630   - Use drflac_int* and drflac_uint* sized types to improve compiler support.
10631   - Minor improvements to documentation.
10632
10633 v0.4c - 2016-12-26
10634   - Add support for signed 16-bit integer PCM decoding.
10635
10636 v0.4b - 2016-10-23
10637   - A minor change to drflac_bool8 and drflac_bool32 types.
10638
10639 v0.4a - 2016-10-11
10640   - Rename drBool32 to drflac_bool32 for styling consistency.
10641
10642 v0.4 - 2016-09-29
10643   - API/ABI CHANGE: Use fixed size 32-bit booleans instead of the built-in bool type.
10644   - API CHANGE: Rename drflac_open_and_decode*() to drflac_open_and_decode*_s32().
10645   - API CHANGE: Swap the order of "channels" and "sampleRate" parameters in drflac_open_and_decode*(). Rationale for this is to
10646     keep it consistent with drflac_audio.
10647
10648 v0.3f - 2016-09-21
10649   - Fix a warning with GCC.
10650
10651 v0.3e - 2016-09-18
10652   - Fixed a bug where GCC 4.3+ was not getting properly identified.
10653   - Fixed a few typos.
10654   - Changed date formats to ISO 8601 (YYYY-MM-DD).
10655
10656 v0.3d - 2016-06-11
10657   - Minor clean up.
10658
10659 v0.3c - 2016-05-28
10660   - Fixed compilation error.
10661
10662 v0.3b - 2016-05-16
10663   - Fixed Linux/GCC build.
10664   - Updated documentation.
10665
10666 v0.3a - 2016-05-15
10667   - Minor fixes to documentation.
10668
10669 v0.3 - 2016-05-11
10670   - Optimizations. Now at about parity with the reference implementation on 32-bit builds.
10671   - Lots of clean up.
10672
10673 v0.2b - 2016-05-10
10674   - Bug fixes.
10675
10676 v0.2a - 2016-05-10
10677   - Made drflac_open_and_decode() more robust.
10678   - Removed an unused debugging variable
10679
10680 v0.2 - 2016-05-09
10681   - Added support for Ogg encapsulation.
10682   - API CHANGE. Have the onSeek callback take a third argument which specifies whether or not the seek
10683     should be relative to the start or the current position. Also changes the seeking rules such that
10684     seeking offsets will never be negative.
10685   - Have drflac_open_and_decode() fail gracefully if the stream has an unknown total sample count.
10686
10687 v0.1b - 2016-05-07
10688   - Properly close the file handle in drflac_open_file() and family when the decoder fails to initialize.
10689   - Removed a stale comment.
10690
10691 v0.1a - 2016-05-05
10692   - Minor formatting changes.
10693   - Fixed a warning on the GCC build.
10694
10695 v0.1 - 2016-05-03
10696   - Initial versioned release.
10697 */
10698
10699 /*
10700 This software is available as a choice of the following licenses. Choose
10701 whichever you prefer.
10702
10703 ===============================================================================
10704 ALTERNATIVE 1 - Public Domain (www.unlicense.org)
10705 ===============================================================================
10706 This is free and unencumbered software released into the public domain.
10707
10708 Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
10709 software, either in source code form or as a compiled binary, for any purpose,
10710 commercial or non-commercial, and by any means.
10711
10712 In jurisdictions that recognize copyright laws, the author or authors of this
10713 software dedicate any and all copyright interest in the software to the public
10714 domain. We make this dedication for the benefit of the public at large and to
10715 the detriment of our heirs and successors. We intend this dedication to be an
10716 overt act of relinquishment in perpetuity of all present and future rights to
10717 this software under copyright law.
10718
10719 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10720 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
10721 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
10722 AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
10723 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
10724 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10725
10726 For more information, please refer to <http://unlicense.org/>
10727
10728 ===============================================================================
10729 ALTERNATIVE 2 - MIT No Attribution
10730 ===============================================================================
10731 Copyright 2020 David Reid
10732
10733 Permission is hereby granted, free of charge, to any person obtaining a copy of
10734 this software and associated documentation files (the "Software"), to deal in
10735 the Software without restriction, including without limitation the rights to
10736 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
10737 of the Software, and to permit persons to whom the Software is furnished to do
10738 so.
10739
10740 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10741 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
10742 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
10743 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
10744 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
10745 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
10746 SOFTWARE.
10747 */