328 lines
11 KiB
Diff
328 lines
11 KiB
Diff
diff --git a/src/libFLAC/bitreader.c b/src/libFLAC/bitreader.c
|
|
index 79cb5cc4..3ec9b90d 100644
|
|
--- a/src/libFLAC/bitreader.c
|
|
+++ b/src/libFLAC/bitreader.c
|
|
@@ -1098,3 +1098,238 @@ extern FLAC__bool FLAC__bitreader_is_consumed_byte_aligned(const FLAC__BitReader
|
|
extern uint32_t FLAC__bitreader_bits_left_for_byte_alignment(const FLAC__BitReader *br);
|
|
extern uint32_t FLAC__bitreader_get_input_bits_unconsumed(const FLAC__BitReader *br);
|
|
extern FLAC__bool FLAC__bitreader_read_uint32_little_endian(FLAC__BitReader *br, FLAC__uint32 *val);
|
|
+
|
|
+
|
|
+
|
|
+#ifdef FLAC__SUPPORT_LZCNT
|
|
+
|
|
+/*
|
|
+* === LZCNT ===
|
|
+*/
|
|
+#if defined __GNUC__ || defined __clang__
|
|
+#define __LZCNT__ /* NOTE(review): presumably defined by hand so that x86intrin.h exposes the LZCNT intrinsics without building with -mlzcnt; this is an implementation-reserved identifier and may clash with the compiler's own definition -- confirm */
|
|
+#include <x86intrin.h>
|
|
+#endif
|
|
+
|
|
+#undef COUNT_ZERO_MSBS /* drop any earlier definitions so the LZCNT-based ones below take over for the rest of this file */
|
|
+#undef COUNT_ZERO_MSBS2
|
|
+/* Both macros expand to the same leading-zero-count intrinsic; the readers below compare COUNT_ZERO_MSBS2(b) against FLAC__BITS_PER_WORD to detect b == 0, so the intrinsic must return the operand width for a zero input (documented LZCNT behaviour -- confirm for the compilers in use). */
|
|
+#if (ENABLE_64_BIT_WORDS == 0) /* presumably brword is 32 bits wide here -- TODO confirm against the brword typedef */
|
|
+#ifdef _MSC_VER
|
|
+#define COUNT_ZERO_MSBS(word) __lzcnt(word)
|
|
+#define COUNT_ZERO_MSBS2(word) __lzcnt(word)
|
|
+#else /* !_MSC_VER */
|
|
+#define COUNT_ZERO_MSBS(word) __lzcnt32(word)
|
|
+#define COUNT_ZERO_MSBS2(word) __lzcnt32(word)
|
|
+#endif /* _MSC_VER */
|
|
+#else /* ENABLE_64_BIT_WORDS != 0 */
|
|
+#define COUNT_ZERO_MSBS(word) __lzcnt64(word)
|
|
+#define COUNT_ZERO_MSBS2(word) __lzcnt64(word)
|
|
+#endif /* ENABLE_64_BIT_WORDS */
|
|
+
|
|
+
|
|
+#ifdef __clang__
|
|
+#pragma clang attribute push (__attribute__((target("lzcnt"))), apply_to=function)
|
|
+#endif
|
|
+FLAC__bool FLAC__bitreader_read_unary_unsigned__LZCNT(FLAC__BitReader* br, uint32_t* val)
|
|
+{
|
|
+ uint32_t i;
|
|
+ /* Reads one unary-coded value: counts the zero bits before the next 1 bit (the stop bit), stores that count in *val and consumes the zeros plus the stop bit. Returns true once a stop bit is found, false if bitreader_read_from_client_() fails while more input is needed. */
|
|
+ FLAC__ASSERT(0 != br);
|
|
+ FLAC__ASSERT(0 != br->buffer);
|
|
+
|
|
+ *val = 0;
|
|
+ while(1) {
|
|
+ while(br->consumed_words < br->words) { /* if we've not consumed up to a partial tail word... */
|
|
+ brword b = br->buffer[br->consumed_words] << br->consumed_bits;
|
|
+ if(b) {
|
|
+ i = COUNT_ZERO_MSBS(b); /* zeros ahead of the stop bit; b is left-aligned on the first unconsumed bit */
|
|
+ *val += i;
|
|
+ i++; /* also consume the stop bit itself */
|
|
+ br->consumed_bits += i;
|
|
+ if (br->consumed_bits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(br->consumed_bits == FLAC__BITS_PER_WORD) */
|
|
+ br->consumed_words++;
|
|
+ br->consumed_bits = 0;
|
|
+ }
|
|
+ return true;
|
|
+ } else {
|
|
+ *val += FLAC__BITS_PER_WORD - br->consumed_bits; /* every remaining bit of this word was zero */
|
|
+ br->consumed_words++;
|
|
+ br->consumed_bits = 0;
|
|
+ /* didn't find stop bit yet, have to keep going... */
|
|
+ }
|
|
+ }
|
|
+ /* at this point we've eaten up all the whole words; have to try
|
|
+ * reading through any tail bytes before calling the read callback.
|
|
+ * this is a repeat of the above logic adjusted for the fact we
|
|
+ * don't have a whole word. note though if the client is feeding
|
|
+ * us data a byte at a time (unlikely), br->consumed_bits may not
|
|
+ * be zero.
|
|
+ */
|
|
+ if(br->bytes * 8 > br->consumed_bits) {
|
|
+ const uint32_t end = br->bytes * 8;
|
|
+ brword b = (br->buffer[br->consumed_words] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD - end))) << br->consumed_bits; /* keep only the top `end` bits (the tail bytes), then left-align the unconsumed ones */
|
|
+ if(b) {
|
|
+ i = COUNT_ZERO_MSBS(b);
|
|
+ *val += i;
|
|
+ i++;
|
|
+ br->consumed_bits += i;
|
|
+ FLAC__ASSERT(br->consumed_bits < FLAC__BITS_PER_WORD);
|
|
+ return true;
|
|
+ } else {
|
|
+ *val += end - br->consumed_bits; /* all remaining tail bits were zero */
|
|
+ br->consumed_bits = end;
|
|
+ FLAC__ASSERT(br->consumed_bits < FLAC__BITS_PER_WORD);
|
|
+ /* didn't find stop bit yet, have to keep going... */
|
|
+ }
|
|
+ }
|
|
+ if(!bitreader_read_from_client_(br)) /* no stop bit in the buffered data; bitreader_read_from_client_() is defined outside this hunk and is presumably what fetches more input */
|
|
+ return false;
|
|
+ }
|
|
+}
|
|
+
|
|
+
|
|
+/* this is by far the most heavily used reader call. it ain't pretty but it's fast. Decodes nvals Rice-coded signed values into vals[], counting the unary part with COUNT_ZERO_MSBS2; returns false as soon as one of the nested bitreader reads fails, true otherwise. */
|
|
+FLAC__bool FLAC__bitreader_read_rice_signed_block__LZCNT(FLAC__BitReader* br, int vals[], uint32_t nvals, uint32_t parameter)
|
|
+{
|
|
+ /* try and get br->consumed_words and br->consumed_bits into register;
|
|
+ * must remember to flush them back to *br before calling other
|
|
+ * bitreader functions that use them, and before returning */
|
|
+ uint32_t cwords, words, lsbs, msbs, x, y;
|
|
+ uint32_t ucbits; /* keep track of the number of unconsumed bits in word */
|
|
+ brword b;
|
|
+ int* val, * end;
|
|
+ /* vals[] receives nvals decoded values; parameter is the number of binary LSBs carried by each value (asserted < 32 below). */
|
|
+ FLAC__ASSERT(0 != br);
|
|
+ FLAC__ASSERT(0 != br->buffer);
|
|
+ /* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
|
|
+ FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
|
|
+ FLAC__ASSERT(parameter < 32);
|
|
+ /* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
|
|
+
|
|
+ val = vals;
|
|
+ end = vals + nvals;
|
|
+
|
|
+ if(parameter == 0) { /* no binary LSBs: every value is just one unary code */
|
|
+ while(val < end) {
|
|
+ /* read the unary MSBs and end bit */
|
|
+ if(!FLAC__bitreader_read_unary_unsigned__LZCNT(br, &msbs))
|
|
+ return false;
|
|
+
|
|
+ *val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1); /* fold the unsigned code back to signed: even u -> u/2, odd u -> -(u/2)-1 */
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ FLAC__ASSERT(parameter > 0);
|
|
+
|
|
+ cwords = br->consumed_words;
|
|
+ words = br->words;
|
|
+
|
|
+ /* if every whole word is already consumed, skip the fast path and start in the tail handling */
|
|
+ if(cwords >= words) {
|
|
+ x = 0;
|
|
+ goto process_tail;
|
|
+ }
|
|
+
|
|
+ ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
|
|
+ b = br->buffer[cwords] << br->consumed_bits; /* keep unconsumed bits aligned to left */
|
|
+
|
|
+ while(val < end) {
|
|
+ /* read the unary MSBs and end bit */
|
|
+ x = y = COUNT_ZERO_MSBS2(b);
|
|
+ if (x == FLAC__BITS_PER_WORD) { /* count equals the word width, i.e. b holds no 1 bit: the stop bit is in a later word */
|
|
+ x = ucbits; /* only the ucbits unconsumed (all-zero) bits of the head word count */
|
|
+ do {
|
|
+ /* didn't find stop bit yet, have to keep going... */
|
|
+ cwords++;
|
|
+ if (cwords >= words)
|
|
+ goto incomplete_msbs;
|
|
+ b = br->buffer[cwords];
|
|
+ y = COUNT_ZERO_MSBS2(b);
|
|
+ x += y;
|
|
+ } while(y == FLAC__BITS_PER_WORD);
|
|
+ }
|
|
+ b <<= y;
|
|
+ b <<= 1; /* account for stop bit */
|
|
+ ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD; /* relies on unsigned wrap-around when x crossed into later words; assumes FLAC__BITS_PER_WORD is a power of two -- confirm */
|
|
+ msbs = x;
|
|
+
|
|
+ /* read the binary LSBs */
|
|
+ x = (FLAC__uint32)(b >> (FLAC__BITS_PER_WORD - parameter)); /* parameter < 32, so we can cast to 32-bit uint32_t */
|
|
+ if(parameter <= ucbits) { /* all LSBs came from the current word */
|
|
+ ucbits -= parameter;
|
|
+ b <<= parameter;
|
|
+ } else {
|
|
+ /* there are still bits left to read, they will all be in the next word */
|
|
+ cwords++;
|
|
+ if (cwords >= words)
|
|
+ goto incomplete_lsbs;
|
|
+ b = br->buffer[cwords];
|
|
+ ucbits += FLAC__BITS_PER_WORD - parameter;
|
|
+ x |= (FLAC__uint32)(b >> ucbits);
|
|
+ b <<= FLAC__BITS_PER_WORD - ucbits;
|
|
+ }
|
|
+ lsbs = x;
|
|
+
|
|
+ /* compose the value */
|
|
+ x = (msbs << parameter) | lsbs;
|
|
+ *val++ = (int)(x >> 1) ^ -(int)(x & 1);
|
|
+
|
|
+ continue;
|
|
+
|
|
+ /* at this point we've eaten up all the whole words */
|
|
+ process_tail:
|
|
+ do {
|
|
+ if(0) { /* never entered by falling through; only reachable via the goto label inside */
|
|
+ incomplete_msbs:
|
|
+ br->consumed_bits = 0;
|
|
+ br->consumed_words = cwords;
|
|
+ }
|
|
+
|
|
+ /* read the unary MSBs and end bit */
|
|
+ if(!FLAC__bitreader_read_unary_unsigned__LZCNT(br, &msbs))
|
|
+ return false;
|
|
+ msbs += x; /* add the zeros the fast path had already counted (x is 0 when entered via process_tail) */
|
|
+ x = ucbits = 0;
|
|
+
|
|
+ if(0) { /* same trick: body is reachable only through the incomplete_lsbs label */
|
|
+ incomplete_lsbs:
|
|
+ br->consumed_bits = 0;
|
|
+ br->consumed_words = cwords;
|
|
+ }
|
|
+
|
|
+ /* read the binary LSBs */
|
|
+ if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits)) /* only the LSBs not already taken from the last whole word */
|
|
+ return false;
|
|
+ lsbs = x | lsbs;
|
|
+
|
|
+ /* compose the value */
|
|
+ x = (msbs << parameter) | lsbs;
|
|
+ *val++ = (int)(x >> 1) ^ -(int)(x & 1);
|
|
+ x = 0;
|
|
+
|
|
+ cwords = br->consumed_words;
|
|
+ words = br->words;
|
|
+ ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
|
|
+ b = br->buffer[cwords] << br->consumed_bits;
|
|
+ } while(cwords >= words && val < end); /* stay on the slow path until whole words are available again or all values are read */
|
|
+ }
|
|
+
|
|
+ if(ucbits == 0 && cwords < words) {
|
|
+ /* don't leave the head word with no unconsumed bits */
|
|
+ cwords++;
|
|
+ ucbits = FLAC__BITS_PER_WORD;
|
|
+ }
|
|
+
|
|
+ br->consumed_bits = FLAC__BITS_PER_WORD - ucbits;
|
|
+ br->consumed_words = cwords;
|
|
+
|
|
+ return true;
|
|
+}
|
|
+#ifdef __clang__
|
|
+#pragma clang attribute pop
|
|
+#endif
|
|
+
|
|
+#endif // FLAC__SUPPORT_LZCNT
|
|
\ No newline at end of file
|
|
diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c
|
|
index 8b92f4c7..c0924512 100644
|
|
--- a/src/libFLAC/cpu.c
|
|
+++ b/src/libFLAC/cpu.c
|
|
@@ -195,6 +195,9 @@ x86_cpu_info (FLAC__CPUInfo *info)
|
|
info->x86.avx2 = (flags_ebx & FLAC__CPUINFO_X86_CPUID_AVX2 ) ? true : false;
|
|
}
|
|
|
|
+ cpuinfo_x86(0x80000001, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
|
|
+ info->x86.lzcnt = (flags_ecx & (1 << 5)) ? true : false;
|
|
+
|
|
#if defined FLAC__CPU_IA32
|
|
dfprintf(stderr, "CPU info (IA-32):\n");
|
|
#else
|
|
diff --git a/src/libFLAC/include/private/bitreader.h b/src/libFLAC/include/private/bitreader.h
|
|
index 585a5db2..af37eb69 100644
|
|
--- a/src/libFLAC/include/private/bitreader.h
|
|
+++ b/src/libFLAC/include/private/bitreader.h
|
|
@@ -89,3 +89,9 @@ FLAC__bool FLAC__bitreader_read_golomb_unsigned(FLAC__BitReader *br, uint32_t *v
|
|
FLAC__bool FLAC__bitreader_read_utf8_uint32(FLAC__BitReader *br, FLAC__uint32 *val, FLAC__byte *raw, uint32_t *rawlen);
|
|
FLAC__bool FLAC__bitreader_read_utf8_uint64(FLAC__BitReader *br, FLAC__uint64 *val, FLAC__byte *raw, uint32_t *rawlen);
|
|
#endif
|
|
+
|
|
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
|
|
+#define FLAC__SUPPORT_LZCNT /* compile-time switch: bitreader.c builds the __LZCNT readers under it and stream_decoder.c selects them when cpuinfo.x86.lzcnt is set */
|
|
+FLAC__bool FLAC__bitreader_read_unary_unsigned__LZCNT(FLAC__BitReader* br, uint32_t* val);
|
|
+FLAC__bool FLAC__bitreader_read_rice_signed_block__LZCNT(FLAC__BitReader* br, int vals[], uint32_t nvals, uint32_t parameter);
|
|
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
|
|
\ No newline at end of file
|
|
diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h
|
|
index 0b50839f..61b71a37 100644
|
|
--- a/src/libFLAC/include/private/cpu.h
|
|
+++ b/src/libFLAC/include/private/cpu.h
|
|
@@ -178,6 +178,7 @@ typedef struct {
|
|
FLAC__bool avx;
|
|
FLAC__bool avx2;
|
|
FLAC__bool fma;
|
|
+ FLAC__bool lzcnt;
|
|
} FLAC__CPUInfo_x86;
|
|
|
|
typedef struct {
|
|
diff --git a/src/libFLAC/stream_decoder.c b/src/libFLAC/stream_decoder.c
|
|
index 4380b486..b1c4ea5f 100644
|
|
--- a/src/libFLAC/stream_decoder.c
|
|
+++ b/src/libFLAC/stream_decoder.c
|
|
@@ -134,6 +134,9 @@ typedef struct FLAC__StreamDecoderPrivate {
|
|
void (*local_lpc_restore_signal_64bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
|
/* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit): */
|
|
void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
|
+
|
|
+ FLAC__bool (*local_bitreader_read_rice_signed_block)(FLAC__BitReader* br, int vals[], uint32_t nvals, uint32_t parameter);
|
|
+
|
|
void *client_data;
|
|
FILE *file; /* only used if FLAC__stream_decoder_init_file()/FLAC__stream_decoder_init_file() called, else NULL */
|
|
FLAC__BitReader *input;
|
|
@@ -377,9 +380,18 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
|
|
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal;
|
|
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide;
|
|
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal;
|
|
+ decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block;
|
|
+
|
|
/* now override with asm where appropriate */
|
|
#ifndef FLAC__NO_ASM
|
|
if(decoder->private_->cpuinfo.use_asm) {
|
|
+
|
|
+#ifdef FLAC__SUPPORT_LZCNT
|
|
+ if(decoder->private_->cpuinfo.x86.lzcnt) {
|
|
+ decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block__LZCNT;
|
|
+ }
|
|
+#endif
|
|
+
|
|
#ifdef FLAC__CPU_IA32
|
|
FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
|
|
#ifdef FLAC__HAS_NASM
|
|
@@ -2805,7 +2817,7 @@ FLAC__bool read_residual_partitioned_rice_(FLAC__StreamDecoder *decoder, uint32_
|
|
if(rice_parameter < pesc) {
|
|
partitioned_rice_contents->raw_bits[partition] = 0;
|
|
u = (partition == 0) ? partition_samples - predictor_order : partition_samples;
|
|
- if(!FLAC__bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter))
|
|
+ if(!decoder->private_->local_bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter))
|
|
return false; /* read_callback_ sets the state for us */
|
|
sample += u;
|
|
}
|