cog/ThirdParty/flac/patches/libflac-lzcnt.patch

328 lines
11 KiB
Diff

diff --git a/src/libFLAC/bitreader.c b/src/libFLAC/bitreader.c
index 79cb5cc4..3ec9b90d 100644
--- a/src/libFLAC/bitreader.c
+++ b/src/libFLAC/bitreader.c
@@ -1098,3 +1098,238 @@ extern FLAC__bool FLAC__bitreader_is_consumed_byte_aligned(const FLAC__BitReader
extern uint32_t FLAC__bitreader_bits_left_for_byte_alignment(const FLAC__BitReader *br);
extern uint32_t FLAC__bitreader_get_input_bits_unconsumed(const FLAC__BitReader *br);
extern FLAC__bool FLAC__bitreader_read_uint32_little_endian(FLAC__BitReader *br, FLAC__uint32 *val);
+
+
+
+#ifdef FLAC__SUPPORT_LZCNT
+
+/*
+* === LZCNT ===
+*/
+#if defined __GNUC__ || defined __clang__
+#define __LZCNT__
+#include <x86intrin.h>
+#endif
+/* NOTE(review): __LZCNT__ is a compiler-reserved name; presumably it is set so <x86intrin.h> exposes the lzcnt intrinsics without -mlzcnt -- confirm per compiler. */
+#undef COUNT_ZERO_MSBS
+#undef COUNT_ZERO_MSBS2
+/* Rebind both macros to an LZCNT intrinsic: a 32-bit form when ENABLE_64_BIT_WORDS is 0 (MSVC spells it __lzcnt), __lzcnt64 otherwise. */
+#if (ENABLE_64_BIT_WORDS == 0)
+#ifdef _MSC_VER
+#define COUNT_ZERO_MSBS(word) __lzcnt(word)
+#define COUNT_ZERO_MSBS2(word) __lzcnt(word)
+#else /* !_MSC_VER */
+#define COUNT_ZERO_MSBS(word) __lzcnt32(word)
+#define COUNT_ZERO_MSBS2(word) __lzcnt32(word)
+#endif /* _MSC_VER */
+#else /* ENABLE_64_BIT_WORDS != 0 */
+#define COUNT_ZERO_MSBS(word) __lzcnt64(word)
+#define COUNT_ZERO_MSBS2(word) __lzcnt64(word)
+#endif /* ENABLE_64_BIT_WORDS */
+
+
+#ifdef __clang__
+#pragma clang attribute push (__attribute__((target("lzcnt"))), apply_to=function)
+#endif
+FLAC__bool FLAC__bitreader_read_unary_unsigned__LZCNT(FLAC__BitReader* br, uint32_t* val)
+{
+ uint32_t i;
+ /* Counts the zero bits before the next set bit (the stop bit), consumes them together with the stop bit, and stores the count in *val; returns false only if bitreader_read_from_client_() fails before a stop bit is seen. */
+ FLAC__ASSERT(0 != br);
+ FLAC__ASSERT(0 != br->buffer);
+
+ *val = 0;
+ while(1) {
+ while(br->consumed_words < br->words) { /* if we've not consumed up to a partial tail word... */
+ brword b = br->buffer[br->consumed_words] << br->consumed_bits; /* shift out consumed bits so the next unread bit is the MSB */
+ if(b) { /* a set bit, i.e. the stop bit, is somewhere in the rest of this word */
+ i = COUNT_ZERO_MSBS(b);
+ *val += i;
+ i++; /* also consume the stop bit itself */
+ br->consumed_bits += i;
+ if (br->consumed_bits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(br->consumed_bits == FLAC__BITS_PER_WORD) */
+ br->consumed_words++;
+ br->consumed_bits = 0;
+ }
+ return true;
+ } else {
+ *val += FLAC__BITS_PER_WORD - br->consumed_bits; /* every unread bit of this word was zero */
+ br->consumed_words++;
+ br->consumed_bits = 0;
+ /* didn't find stop bit yet, have to keep going... */
+ }
+ }
+ /* at this point we've eaten up all the whole words; have to try
+ * reading through any tail bytes before calling the read callback.
+ * this is a repeat of the above logic adjusted for the fact we
+ * don't have a whole word. note though if the client is feeding
+ * us data a byte at a time (unlikely), br->consumed_bits may not
+ * be zero.
+ */
+ if(br->bytes * 8 > br->consumed_bits) { /* unread bits remain in the partial tail word */
+ const uint32_t end = br->bytes * 8;
+ brword b = (br->buffer[br->consumed_words] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD - end))) << br->consumed_bits; /* keep only the top `end` bits, then left-align the unread part */
+ if(b) {
+ i = COUNT_ZERO_MSBS(b);
+ *val += i;
+ i++; /* also consume the stop bit itself */
+ br->consumed_bits += i;
+ FLAC__ASSERT(br->consumed_bits < FLAC__BITS_PER_WORD);
+ return true;
+ } else {
+ *val += end - br->consumed_bits; /* all remaining tail bits were zero */
+ br->consumed_bits = end;
+ FLAC__ASSERT(br->consumed_bits < FLAC__BITS_PER_WORD);
+ /* didn't find stop bit yet, have to keep going... */
+ }
+ }
+ if(!bitreader_read_from_client_(br)) /* no stop bit in the buffered data: ask the client for more, then loop again */
+ return false;
+ }
+}
+
+
+/* this is by far the most heavily used reader call. it ain't pretty but it's fast. Reads nvals Rice-coded signed values (unary MSBs, a stop bit, then `parameter` binary LSBs each) into vals[]; returns false as soon as an underlying read fails. */
+FLAC__bool FLAC__bitreader_read_rice_signed_block__LZCNT(FLAC__BitReader* br, int vals[], uint32_t nvals, uint32_t parameter)
+{
+ /* try and get br->consumed_words and br->consumed_bits into register;
+ * must remember to flush them back to *br before calling other
+ * bitreader functions that use them, and before returning */
+ uint32_t cwords, words, lsbs, msbs, x, y;
+ uint32_t ucbits; /* keep track of the number of unconsumed bits in word */
+ brword b;
+ int* val, * end;
+
+ FLAC__ASSERT(0 != br);
+ FLAC__ASSERT(0 != br->buffer);
+ /* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
+ FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
+ FLAC__ASSERT(parameter < 32);
+ /* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
+
+ val = vals;
+ end = vals + nvals;
+ /* with parameter == 0 there are no LSBs: each value is just the unary count, sign-unfolded */
+ if(parameter == 0) {
+ while(val < end) {
+ /* read the unary MSBs and end bit */
+ if(!FLAC__bitreader_read_unary_unsigned__LZCNT(br, &msbs))
+ return false;
+
+ *val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1); /* undo sign folding: even u -> u/2, odd u -> -(u/2)-1 */
+ }
+
+ return true;
+ }
+
+ FLAC__ASSERT(parameter > 0);
+
+ cwords = br->consumed_words;
+ words = br->words;
+
+ /* no whole words left to read: start directly in the tail path (the main loop below needs cwords < words) */
+ if(cwords >= words) {
+ x = 0;
+ goto process_tail;
+ }
+
+ ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+ b = br->buffer[cwords] << br->consumed_bits; /* keep unconsumed bits aligned to left */
+ /* main loop: decodes from the cached copies (cwords, ucbits, b) without touching *br */
+ while(val < end) {
+ /* read the unary MSBs and end bit */
+ x = y = COUNT_ZERO_MSBS2(b);
+ if (x == FLAC__BITS_PER_WORD) { /* a full-width count means b has no set bit, so no stop bit in this word */
+ x = ucbits; /* only the ucbits unread bits of this word count as zeros */
+ do {
+ /* didn't find stop bit yet, have to keep going... */
+ cwords++;
+ if (cwords >= words)
+ goto incomplete_msbs;
+ b = br->buffer[cwords];
+ y = COUNT_ZERO_MSBS2(b);
+ x += y;
+ } while(y == FLAC__BITS_PER_WORD);
+ }
+ b <<= y;
+ b <<= 1; /* account for stop bit */
+ ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD; /* unsigned wrap-around plus the modulo leaves the bits still unread in the word that held the stop bit */
+ msbs = x;
+
+ /* read the binary LSBs */
+ x = (FLAC__uint32)(b >> (FLAC__BITS_PER_WORD - parameter)); /* parameter < 32, so we can cast to 32-bit uint32_t */
+ if(parameter <= ucbits) { /* all LSBs are available in the current word */
+ ucbits -= parameter;
+ b <<= parameter;
+ } else {
+ /* there are still bits left to read, they will all be in the next word */
+ cwords++;
+ if (cwords >= words)
+ goto incomplete_lsbs;
+ b = br->buffer[cwords];
+ ucbits += FLAC__BITS_PER_WORD - parameter;
+ x |= (FLAC__uint32)(b >> ucbits);
+ b <<= FLAC__BITS_PER_WORD - ucbits;
+ }
+ lsbs = x;
+
+ /* compose the value */
+ x = (msbs << parameter) | lsbs;
+ *val++ = (int)(x >> 1) ^ -(int)(x & 1); /* undo sign folding: even u -> u/2, odd u -> -(u/2)-1 */
+
+ continue;
+
+ /* at this point we've eaten up all the whole words */
+ process_tail:
+ do {
+ if(0) { /* never entered by fall-through; reached only via goto incomplete_msbs */
+ incomplete_msbs:
+ br->consumed_bits = 0; /* flush the cached position to *br before calling the br-based reader below */
+ br->consumed_words = cwords;
+ }
+
+ /* read the unary MSBs and end bit */
+ if(!FLAC__bitreader_read_unary_unsigned__LZCNT(br, &msbs))
+ return false;
+ msbs += x; /* add the zeros already counted by the main loop (x is 0 when entered via process_tail) */
+ x = ucbits = 0; /* no LSB bits have been collected yet */
+
+ if(0) { /* never entered by fall-through; reached only via goto incomplete_lsbs */
+ incomplete_lsbs:
+ br->consumed_bits = 0; /* flush the cached position to *br before calling the br-based reader below */
+ br->consumed_words = cwords;
+ }
+
+ /* read the binary LSBs */
+ if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits)) /* fetch only the LSBs still missing; x holds the ucbits already taken from the previous word */
+ return false;
+ lsbs = x | lsbs;
+
+ /* compose the value */
+ x = (msbs << parameter) | lsbs;
+ *val++ = (int)(x >> 1) ^ -(int)(x & 1); /* undo sign folding: even u -> u/2, odd u -> -(u/2)-1 */
+ x = 0;
+ /* reload the cached position from *br, which the reader calls above advanced */
+ cwords = br->consumed_words;
+ words = br->words;
+ ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+ b = br->buffer[cwords] << br->consumed_bits;
+ } while(cwords >= words && val < end); /* stay in the tail path until whole words are available again */
+ }
+
+ if(ucbits == 0 && cwords < words) {
+ /* don't leave the head word with no unconsumed bits */
+ cwords++;
+ ucbits = FLAC__BITS_PER_WORD;
+ }
+ /* write the cached position back to *br before returning */
+ br->consumed_bits = FLAC__BITS_PER_WORD - ucbits;
+ br->consumed_words = cwords;
+
+ return true;
+}
+#ifdef __clang__
+#pragma clang attribute pop
+#endif
+
+#endif // FLAC__SUPPORT_LZCNT
\ No newline at end of file
diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c
index 8b92f4c7..c0924512 100644
--- a/src/libFLAC/cpu.c
+++ b/src/libFLAC/cpu.c
@@ -195,6 +195,9 @@ x86_cpu_info (FLAC__CPUInfo *info)
info->x86.avx2 = (flags_ebx & FLAC__CPUINFO_X86_CPUID_AVX2 ) ? true : false;
}
+ cpuinfo_x86(0x80000001, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
+ info->x86.lzcnt = (flags_ecx & (1 << 5)) ? true : false;
+
#if defined FLAC__CPU_IA32
dfprintf(stderr, "CPU info (IA-32):\n");
#else
diff --git a/src/libFLAC/include/private/bitreader.h b/src/libFLAC/include/private/bitreader.h
index 585a5db2..af37eb69 100644
--- a/src/libFLAC/include/private/bitreader.h
+++ b/src/libFLAC/include/private/bitreader.h
@@ -89,3 +89,9 @@ FLAC__bool FLAC__bitreader_read_golomb_unsigned(FLAC__BitReader *br, uint32_t *v
FLAC__bool FLAC__bitreader_read_utf8_uint32(FLAC__BitReader *br, FLAC__uint32 *val, FLAC__byte *raw, uint32_t *rawlen);
FLAC__bool FLAC__bitreader_read_utf8_uint64(FLAC__BitReader *br, FLAC__uint64 *val, FLAC__byte *raw, uint32_t *rawlen);
#endif
+
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN /* LZCNT readers exist only on x86 builds with FLAC__HAS_X86INTRIN set */
+#define FLAC__SUPPORT_LZCNT
+FLAC__bool FLAC__bitreader_read_unary_unsigned__LZCNT(FLAC__BitReader* br, uint32_t* val);
+FLAC__bool FLAC__bitreader_read_rice_signed_block__LZCNT(FLAC__BitReader* br, int vals[], uint32_t nvals, uint32_t parameter);
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
\ No newline at end of file
diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h
index 0b50839f..61b71a37 100644
--- a/src/libFLAC/include/private/cpu.h
+++ b/src/libFLAC/include/private/cpu.h
@@ -178,6 +178,7 @@ typedef struct {
FLAC__bool avx;
FLAC__bool avx2;
FLAC__bool fma;
+ FLAC__bool lzcnt;
} FLAC__CPUInfo_x86;
typedef struct {
diff --git a/src/libFLAC/stream_decoder.c b/src/libFLAC/stream_decoder.c
index 4380b486..b1c4ea5f 100644
--- a/src/libFLAC/stream_decoder.c
+++ b/src/libFLAC/stream_decoder.c
@@ -134,6 +134,9 @@ typedef struct FLAC__StreamDecoderPrivate {
void (*local_lpc_restore_signal_64bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
/* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit): */
void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
+
+ FLAC__bool (*local_bitreader_read_rice_signed_block)(FLAC__BitReader* br, int vals[], uint32_t nvals, uint32_t parameter);
+
void *client_data;
FILE *file; /* only used if FLAC__stream_decoder_init_file()/FLAC__stream_decoder_init_file() called, else NULL */
FLAC__BitReader *input;
@@ -377,9 +380,18 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal;
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal;
+ decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block;
+
/* now override with asm where appropriate */
#ifndef FLAC__NO_ASM
if(decoder->private_->cpuinfo.use_asm) {
+
+#ifdef FLAC__SUPPORT_LZCNT
+ if(decoder->private_->cpuinfo.x86.lzcnt) {
+ decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block__LZCNT;
+ }
+#endif
+
#ifdef FLAC__CPU_IA32
FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
#ifdef FLAC__HAS_NASM
@@ -2805,7 +2817,7 @@ FLAC__bool read_residual_partitioned_rice_(FLAC__StreamDecoder *decoder, uint32_
if(rice_parameter < pesc) {
partitioned_rice_contents->raw_bits[partition] = 0;
u = (partition == 0) ? partition_samples - predictor_order : partition_samples;
- if(!FLAC__bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter))
+ if(!decoder->private_->local_bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter))
return false; /* read_callback_ sets the state for us */
sample += u;
}