// Path: cog/Frameworks/WavPack/Files/read_words.c (listing metadata: 611 lines, 18 KiB, C)

////////////////////////////////////////////////////////////////////////////
// **** WAVPACK **** //
// Hybrid Lossless Wavefile Compressor //
// Copyright (c) 1998 - 2013 Conifer Software. //
// All Rights Reserved. //
// Distributed under the BSD Software License (see license.txt) //
////////////////////////////////////////////////////////////////////////////
// read_words.c
// This module provides entropy word decoding functions using
// a variation on the Rice method. This was introduced in version 3.93
// because it allows splitting the data into a "lossy" stream and a
// "correction" stream in a very efficient manner and is therefore ideal
// for the "hybrid" mode. For 4.0, the efficiency of this method was
// significantly improved by moving away from the normal Rice restriction of
// using powers of two for the modulus divisions and now the method can be
// used for both hybrid and pure lossless encoding.
// Samples are divided by median probabilities at 5/7 (71.43%), 10/49 (20.41%),
// and 20/343 (5.83%). Each zone has 3.5 times fewer samples than the
// previous. Using standard Rice coding on this data would result in 1.4
// bits per sample average (not counting sign bit). However, there is a
// very simple encoding that is over 99% efficient with this data and
// results in about 1.22 bits per sample.
#include <stdlib.h>
#include <string.h>
#include "wavpack_local.h"
// Compile-time selection of how the decoders below count the run of 1 bits
// that prefixes each word. Exactly one of USE_CTZ_OPTIMIZATION or
// USE_NEXT8_OPTIMIZATION is defined here; the bit-at-a-time fallback that
// also appears in the decoders is only compiled if neither is defined.
#if defined (HAVE___BUILTIN_CTZ) || defined (_WIN64)
#define USE_CTZ_OPTIMIZATION // use ctz intrinsic (or Windows equivalent) to count trailing ones
#else
#define USE_NEXT8_OPTIMIZATION // optimization using a table to count trailing ones
#endif
// When defined, read_code() uses the bitset[] / bitmask[] lookup tables
// instead of computing the equivalent values with shifts.
#define USE_BITMASK_TABLES // use tables instead of shifting for certain masking operations
///////////////////////////// local table storage ////////////////////////////
#ifdef USE_NEXT8_OPTIMIZATION
// Lookup table indexed by a byte value: entry [b] is the number of
// consecutive 1 bits in b counting upward from bit 0 (so [0xff] is 8).
// Each row covers 16 consecutive indices; only the last column varies.
static const char ones_count_table [256] = {
    /* 0x00 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,
    /* 0x10 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
    /* 0x20 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,
    /* 0x30 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,
    /* 0x40 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,
    /* 0x50 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
    /* 0x60 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,
    /* 0x70 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,
    /* 0x80 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,
    /* 0x90 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
    /* 0xa0 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,
    /* 0xb0 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,
    /* 0xc0 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,
    /* 0xd0 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
    /* 0xe0 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,
    /* 0xf0 */ 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,8
};
#endif
///////////////////////////// executable code ////////////////////////////////
// Forward declaration: read_code() is defined at the end of this file but is
// called by the decoding functions that precede it.
static uint32_t __inline read_code (Bitstream *bs, uint32_t maxcode);
// Read the next word from the bitstream "wvbits" and return the value. This
// function can be used for hybrid or lossless streams, but since an
// optimized version is available for lossless this function would normally
// be used for hybrid only.
//
// Parameters:
//   wps        - stream state; the entropy state in wps->w and the bitstream
//                wps->wvbits (and wps->wvcbits, if open) are updated
//   chan       - index of the channel's entry in wps->w.c
//   correction - optional (may be NULL); set to 0 on entry and, when the
//                correction bitstream is open and the channel's error_limit
//                is non-zero, receives the "correction" offset
//
// Returns the decoded value (0 while inside a run of zeros), or WORD_EOF if
// the bitstream has no data, the end of the bitstream was reached (all 1s)
// or some other error occurred.

int32_t FASTCALL get_word (WavpackStream *wps, int chan, int32_t *correction)
{
    // NOTE: must stay named "c"; the GET_MED / INC_MEDn / DEC_MEDn macros come
    // from the project header and presumably refer to it by name.
    register struct entropy_data *c = wps->w.c + chan;
    uint32_t ones_count, low, mid, high;
    int32_t value;
    int sign;

    if (!wps->wvbits.ptr)
        return WORD_EOF;

    if (correction)
        *correction = 0;

    // Zero-run handling: only entered when the first median of both channels
    // is below 2 and no holding_zero / holding_one state is pending.

    if (!(wps->w.c [0].median [0] & ~1) && !wps->w.holding_zero && !wps->w.holding_one && !(wps->w.c [1].median [0] & ~1)) {
        uint32_t mask;
        int cbits;

        if (wps->w.zeros_acc) {
            // inside a run: emit another zero unless the counter just expired,
            // in which case fall through and decode a regular word

            if (--wps->w.zeros_acc) {
                c->slow_level -= (c->slow_level + SLO) >> SLS;
                return 0;
            }
        }
        else {
            // Read a new run length: a unary count of 1 bits (cbits), then for
            // cbits >= 2 the cbits-1 low-order bits of the value (LSB first)
            // with an implied top bit. 33 ones in a row means end of stream.

            for (cbits = 0; cbits < 33 && getbit (&wps->wvbits); ++cbits);

            if (cbits == 33)
                return WORD_EOF;

            if (cbits < 2)
                wps->w.zeros_acc = cbits;
            else {
                for (mask = 1, wps->w.zeros_acc = 0; --cbits; mask <<= 1)
                    if (getbit (&wps->wvbits))
                        wps->w.zeros_acc |= mask;

                wps->w.zeros_acc |= mask;
            }

            if (wps->w.zeros_acc) {
                c->slow_level -= (c->slow_level + SLO) >> SLS;
                CLEAR (wps->w.c [0].median);
                CLEAR (wps->w.c [1].median);
                return 0;
            }
        }
    }

    // Determine ones_count, which selects the interval the value lies in. A
    // pending holding_zero means the previous raw count already implied a
    // zero for this word; otherwise count the run of 1 bits in the stream.

    if (wps->w.holding_zero)
        ones_count = wps->w.holding_zero = 0;
    else {
#ifdef USE_CTZ_OPTIMIZATION
        // top up the shift register so at least LIMIT_ONES bits are available

        while (wps->wvbits.bc < LIMIT_ONES) {
            if (++(wps->wvbits.ptr) == wps->wvbits.end)
                wps->wvbits.wrap (&wps->wvbits);

            wps->wvbits.sr |= *(wps->wvbits.ptr) << wps->wvbits.bc;
            wps->wvbits.bc += sizeof (*(wps->wvbits.ptr)) * 8;
        }

        // trailing ones of sr == trailing zeros of ~sr

#ifdef _WIN32
        _BitScanForward (&ones_count, ~wps->wvbits.sr);
#else
        ones_count = __builtin_ctz (~wps->wvbits.sr);
#endif

        if (ones_count >= LIMIT_ONES) {
            // consume the ones seen so far, then keep counting bit by bit

            wps->wvbits.bc -= ones_count;
            wps->wvbits.sr >>= ones_count;

            for (; ones_count < (LIMIT_ONES + 1) && getbit (&wps->wvbits); ++ones_count);

            // The intrinsic can report more than LIMIT_ONES + 1 ones in one
            // step, so any run longer than LIMIT_ONES is treated as end of
            // stream (the other two variants stop at exactly LIMIT_ONES + 1).

            if (ones_count > LIMIT_ONES)
                return WORD_EOF;

            if (ones_count == LIMIT_ONES) {
                // escape: the real count is LIMIT_ONES plus a value stored
                // with the same length-prefixed scheme used for zero runs
                uint32_t mask;
                int cbits;

                for (cbits = 0; cbits < 33 && getbit (&wps->wvbits); ++cbits);

                if (cbits == 33)
                    return WORD_EOF;

                if (cbits < 2)
                    ones_count = cbits;
                else {
                    for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
                        if (getbit (&wps->wvbits))
                            ones_count |= mask;

                    ones_count |= mask;
                }

                ones_count += LIMIT_ONES;
            }
        }
        else {
            // consume the ones plus the terminating zero bit
            wps->wvbits.bc -= ones_count + 1;
            wps->wvbits.sr >>= ones_count + 1;
        }
#elif defined (USE_NEXT8_OPTIMIZATION)
        int next8;

        // look at the next 8 bits, refilling the shift register if needed

        if (wps->wvbits.bc < 8) {
            if (++(wps->wvbits.ptr) == wps->wvbits.end)
                wps->wvbits.wrap (&wps->wvbits);

            next8 = (wps->wvbits.sr |= *(wps->wvbits.ptr) << wps->wvbits.bc) & 0xff;
            wps->wvbits.bc += sizeof (*(wps->wvbits.ptr)) * 8;
        }
        else
            next8 = wps->wvbits.sr & 0xff;

        if (next8 == 0xff) {
            // eight ones: consume them and continue counting bit by bit

            wps->wvbits.bc -= 8;
            wps->wvbits.sr >>= 8;

            for (ones_count = 8; ones_count < (LIMIT_ONES + 1) && getbit (&wps->wvbits); ++ones_count);

            if (ones_count == (LIMIT_ONES + 1))
                return WORD_EOF;

            if (ones_count == LIMIT_ONES) {
                // escape: extended count follows (see zero-run encoding above)
                uint32_t mask;
                int cbits;

                for (cbits = 0; cbits < 33 && getbit (&wps->wvbits); ++cbits);

                if (cbits == 33)
                    return WORD_EOF;

                if (cbits < 2)
                    ones_count = cbits;
                else {
                    for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
                        if (getbit (&wps->wvbits))
                            ones_count |= mask;

                    ones_count |= mask;
                }

                ones_count += LIMIT_ONES;
            }
        }
        else {
            // table gives the run length; consume it plus the terminating zero
            wps->wvbits.bc -= (ones_count = ones_count_table [next8]) + 1;
            wps->wvbits.sr >>= ones_count + 1;
        }
#else
        // plain bit-at-a-time count (no optimization selected)

        for (ones_count = 0; ones_count < (LIMIT_ONES + 1) && getbit (&wps->wvbits); ++ones_count);

        if (ones_count >= LIMIT_ONES) {
            uint32_t mask;
            int cbits;

            if (ones_count == (LIMIT_ONES + 1))
                return WORD_EOF;

            for (cbits = 0; cbits < 33 && getbit (&wps->wvbits); ++cbits);

            if (cbits == 33)
                return WORD_EOF;

            if (cbits < 2)
                ones_count = cbits;
            else {
                for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
                    if (getbit (&wps->wvbits))
                        ones_count |= mask;

                ones_count |= mask;
            }

            ones_count += LIMIT_ONES;
        }
#endif

        // The raw count carries state for the following word in its low bit:
        // an odd count sets holding_one, an even count sets holding_zero. A
        // holding_one left by the previous word adds one to this word's count.

        if (wps->w.holding_one) {
            wps->w.holding_one = ones_count & 1;
            ones_count = (ones_count >> 1) + 1;
        }
        else {
            wps->w.holding_one = ones_count & 1;
            ones_count >>= 1;
        }

        wps->w.holding_zero = ~wps->w.holding_one & 1;
    }

    if ((wps->wphdr.flags & HYBRID_FLAG) && !chan)
        update_error_limit (wps);

    // Map ones_count to the interval [low, high] using the three medians,
    // adapting each median that is consulted along the way.

    if (ones_count == 0) {
        low = 0;
        high = GET_MED (0) - 1;
        DEC_MED0 ();
    }
    else {
        low = GET_MED (0);
        INC_MED0 ();

        if (ones_count == 1) {
            high = low + GET_MED (1) - 1;
            DEC_MED1 ();
        }
        else {
            low += GET_MED (1);
            INC_MED1 ();

            if (ones_count == 2) {
                high = low + GET_MED (2) - 1;
                DEC_MED2 ();
            }
            else {
                low += (ones_count - 2) * GET_MED (2);
                high = low + GET_MED (2) - 1;
                INC_MED2 ();
            }
        }
    }

    low &= 0x7fffffff;
    high &= 0x7fffffff;
    mid = (high + low + 1) >> 1;

    // With no error limit the exact offset inside the interval is read;
    // otherwise the interval is halved one bit at a time until its width is
    // no larger than the error limit, and the midpoint is used.

    if (!c->error_limit)
        mid = read_code (&wps->wvbits, high - low) + low;
    else while (high - low > c->error_limit) {
        if (getbit (&wps->wvbits))
            mid = (high + (low = mid) + 1) >> 1;
        else
            mid = ((high = mid - 1) + low + 1) >> 1;
    }

    sign = getbit (&wps->wvbits);

    // the correction stream holds the exact position inside the final interval

    if (bs_is_open (&wps->wvcbits) && c->error_limit) {
        value = read_code (&wps->wvcbits, high - low) + low;

        if (correction)
            *correction = sign ? (mid - value) : (value - mid);
    }

    if (wps->wphdr.flags & HYBRID_BITRATE) {
        c->slow_level -= (c->slow_level + SLO) >> SLS;
        c->slow_level += wp_log2 (mid);
    }

    // a set sign bit selects the one's complement of the magnitude

    return sign ? ~mid : mid;
}
// This is an optimized version of get_word() that is used for lossless only
// (error_limit == 0). Also, rather than obtaining a single sample, it can be
// used to obtain an entire buffer of either mono or stereo samples.
//
// Parameters:
//   wps      - stream state; wps->w and the bitstream wps->wvbits are updated
//   buffer   - receives the decoded values; for non-mono data two values are
//              stored per sample, alternating between wps->w.c [0] and [1]
//   nsamples - number of samples requested (doubled internally for non-mono)
//
// Returns the number of samples stored. This is less than nsamples if the
// end of the bitstream (or an invalid code) is encountered first. If the
// bitstream has no data the buffer is zero-filled and nsamples is returned.

int32_t get_words_lossless (WavpackStream *wps, int32_t *buffer, int32_t nsamples)
{
    // NOTE: must stay named "c"; the GET_MED / INC_MEDn / DEC_MEDn macros come
    // from the project header and presumably refer to it by name.
    struct entropy_data *c = wps->w.c;
    uint32_t ones_count, low, high;
    Bitstream *bs = &wps->wvbits;
    int32_t csamples;
#ifdef USE_NEXT8_OPTIMIZATION
    int32_t next8;
#endif

    if (nsamples && !bs->ptr) {
        memset (buffer, 0, (wps->wphdr.flags & MONO_DATA) ? nsamples * 4 : nsamples * 8);
        return nsamples;
    }

    // from here on nsamples / csamples count individual values, not frames

    if (!(wps->wphdr.flags & MONO_DATA))
        nsamples *= 2;

    for (csamples = 0; csamples < nsamples; ++csamples) {
        if (!(wps->wphdr.flags & MONO_DATA))
            c = wps->w.c + (csamples & 1);

        // A pending holding_zero means this value has an implied ones_count
        // of zero: decode it directly, then move on to the next value.

        if (wps->w.holding_zero) {
            wps->w.holding_zero = 0;
            low = read_code (bs, GET_MED (0) - 1);
            DEC_MED0 ();
            buffer [csamples] = (getbit (bs)) ? ~low : low;

            if (++csamples == nsamples)
                break;

            if (!(wps->wphdr.flags & MONO_DATA))
                c = wps->w.c + (csamples & 1);
        }

        // Zero-run handling: only entered when the first median of both
        // channels is below 2 and no holding_one is pending.

        if (wps->w.c [0].median [0] < 2 && !wps->w.holding_one && wps->w.c [1].median [0] < 2) {
            uint32_t mask;
            int cbits;

            if (wps->w.zeros_acc) {
                // inside a run: store another zero unless the counter just
                // expired, in which case decode a regular word below

                if (--wps->w.zeros_acc) {
                    buffer [csamples] = 0;
                    continue;
                }
            }
            else {
                // Read a new run length: a unary count of 1 bits (cbits), then
                // for cbits >= 2 the cbits-1 low-order bits of the value (LSB
                // first) with an implied top bit. 33 ones ends decoding.

                for (cbits = 0; cbits < 33 && getbit (bs); ++cbits);

                if (cbits == 33)
                    break;

                if (cbits < 2)
                    wps->w.zeros_acc = cbits;
                else {
                    for (mask = 1, wps->w.zeros_acc = 0; --cbits; mask <<= 1)
                        if (getbit (bs))
                            wps->w.zeros_acc |= mask;

                    wps->w.zeros_acc |= mask;
                }

                if (wps->w.zeros_acc) {
                    CLEAR (wps->w.c [0].median);
                    CLEAR (wps->w.c [1].median);
                    buffer [csamples] = 0;
                    continue;
                }
            }
        }

        // Count the run of 1 bits that prefixes this word.

#ifdef USE_CTZ_OPTIMIZATION
        // top up the shift register so at least LIMIT_ONES bits are available

        while (bs->bc < LIMIT_ONES) {
            if (++(bs->ptr) == bs->end)
                bs->wrap (bs);

            bs->sr |= *(bs->ptr) << bs->bc;
            bs->bc += sizeof (*(bs->ptr)) * 8;
        }

        // trailing ones of sr == trailing zeros of ~sr

#ifdef _WIN32
        _BitScanForward (&ones_count, ~bs->sr);
#else
        ones_count = __builtin_ctz (~bs->sr);
#endif

        if (ones_count >= LIMIT_ONES) {
            // consume the ones seen so far, then keep counting bit by bit

            bs->bc -= ones_count;
            bs->sr >>= ones_count;

            for (; ones_count < (LIMIT_ONES + 1) && getbit (bs); ++ones_count);

            // The intrinsic can report more than LIMIT_ONES + 1 ones in one
            // step, so any run longer than LIMIT_ONES is treated as end of
            // stream (the other two variants stop at exactly LIMIT_ONES + 1).

            if (ones_count > LIMIT_ONES)
                break;

            if (ones_count == LIMIT_ONES) {
                // escape: the real count is LIMIT_ONES plus a value stored
                // with the same length-prefixed scheme used for zero runs
                uint32_t mask;
                int cbits;

                for (cbits = 0; cbits < 33 && getbit (bs); ++cbits);

                if (cbits == 33)
                    break;

                if (cbits < 2)
                    ones_count = cbits;
                else {
                    for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
                        if (getbit (bs))
                            ones_count |= mask;

                    ones_count |= mask;
                }

                ones_count += LIMIT_ONES;
            }
        }
        else {
            // consume the ones plus the terminating zero bit
            bs->bc -= ones_count + 1;
            bs->sr >>= ones_count + 1;
        }
#elif defined (USE_NEXT8_OPTIMIZATION)
        // look at the next 8 bits, refilling the shift register if needed

        if (bs->bc < 8) {
            if (++(bs->ptr) == bs->end)
                bs->wrap (bs);

            next8 = (bs->sr |= *(bs->ptr) << bs->bc) & 0xff;
            bs->bc += sizeof (*(bs->ptr)) * 8;
        }
        else
            next8 = bs->sr & 0xff;

        if (next8 == 0xff) {
            // eight ones: consume them and continue counting bit by bit

            bs->bc -= 8;
            bs->sr >>= 8;

            for (ones_count = 8; ones_count < (LIMIT_ONES + 1) && getbit (bs); ++ones_count);

            if (ones_count == (LIMIT_ONES + 1))
                break;

            if (ones_count == LIMIT_ONES) {
                // escape: extended count follows (see zero-run encoding above)
                uint32_t mask;
                int cbits;

                for (cbits = 0; cbits < 33 && getbit (bs); ++cbits);

                if (cbits == 33)
                    break;

                if (cbits < 2)
                    ones_count = cbits;
                else {
                    for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
                        if (getbit (bs))
                            ones_count |= mask;

                    ones_count |= mask;
                }

                ones_count += LIMIT_ONES;
            }
        }
        else {
            // table gives the run length; consume it plus the terminating zero
            bs->bc -= (ones_count = ones_count_table [next8]) + 1;
            bs->sr >>= ones_count + 1;
        }
#else
        // plain bit-at-a-time count (no optimization selected)

        for (ones_count = 0; ones_count < (LIMIT_ONES + 1) && getbit (bs); ++ones_count);

        if (ones_count >= LIMIT_ONES) {
            uint32_t mask;
            int cbits;

            if (ones_count == (LIMIT_ONES + 1))
                break;

            for (cbits = 0; cbits < 33 && getbit (bs); ++cbits);

            if (cbits == 33)
                break;

            if (cbits < 2)
                ones_count = cbits;
            else {
                for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
                    if (getbit (bs))
                        ones_count |= mask;

                ones_count |= mask;
            }

            ones_count += LIMIT_ONES;
        }
#endif

        // The raw count carries state for the following word in its low bit
        // (odd -> holding_one, even -> holding_zero); a holding_one left by
        // the previous word (saved in "low") adds one to this word's count.

        low = wps->w.holding_one;
        wps->w.holding_one = ones_count & 1;
        wps->w.holding_zero = ~ones_count & 1;
        ones_count = (ones_count >> 1) + low;

        // Map ones_count to the interval [low, high] using the three medians,
        // adapting each median that is consulted along the way.

        if (ones_count == 0) {
            low = 0;
            high = GET_MED (0) - 1;
            DEC_MED0 ();
        }
        else {
            low = GET_MED (0);
            INC_MED0 ();

            if (ones_count == 1) {
                high = low + GET_MED (1) - 1;
                DEC_MED1 ();
            }
            else {
                low += GET_MED (1);
                INC_MED1 ();

                if (ones_count == 2) {
                    high = low + GET_MED (2) - 1;
                    DEC_MED2 ();
                }
                else {
                    low += (ones_count - 2) * GET_MED (2);
                    high = low + GET_MED (2) - 1;
                    INC_MED2 ();
                }
            }
        }

        // exact offset inside the interval, then the sign bit (a set sign bit
        // selects the one's complement of the magnitude)

        low += read_code (bs, high - low);
        buffer [csamples] = (getbit (bs)) ? ~low : low;
    }

    return (wps->wphdr.flags & MONO_DATA) ? csamples : (csamples / 2);
}
// Read a single unsigned value from the specified bitstream with a value
// from 0 to maxcode. If there are exactly a power of two number of possible
// codes then this will read a fixed number of bits; otherwise it reads the
// minimum number of bits and then determines whether another bit is needed
// to define the code.
//
// Parameters:
//   bs      - bitstream to read from; its ptr, bc and sr fields are advanced
//   maxcode - largest value that can be returned
//
// Returns the decoded value.

static uint32_t __inline read_code (Bitstream *bs, uint32_t maxcode)
{
    unsigned long local_sr;
    uint32_t extras, code;
    int bitcount;

    // zero or one possible extra value: nothing to read, or a single bit

    if (maxcode < 2)
        return maxcode ? getbit (bs) : 0;

    bitcount = count_bits (maxcode);

    // "extras" is the number of codes short enough to fit in bitcount-1 bits

#ifdef USE_BITMASK_TABLES
    extras = bitset [bitcount] - maxcode - 1;
#else
    extras = ((uint32_t) 1 << bitcount) - maxcode - 1;
#endif

    // Work on a local copy of the shift register, refilling it until at least
    // bitcount bits are available. The shift is done on an unsigned type so
    // that a set high bit cannot overflow a signed long.

    local_sr = bs->sr;

    while (bs->bc < bitcount) {
        if (++(bs->ptr) == bs->end)
            bs->wrap (bs);

        local_sr |= (unsigned long) *(bs->ptr) << bs->bc;
        bs->bc += sizeof (*(bs->ptr)) * 8;
    }

    // Take bitcount-1 bits; values below "extras" are complete, anything else
    // needs one more bit to be resolved.

#ifdef USE_BITMASK_TABLES
    if ((code = local_sr & bitmask [bitcount - 1]) >= extras)
#else
    if ((code = local_sr & (((uint32_t) 1 << (bitcount - 1)) - 1)) >= extras)
#endif
        code = (code << 1) - extras + ((local_sr >> (bitcount - 1)) & 1);
    else
        bitcount--;

    // Commit the consumed bits. If local_sr is narrower than 64 bits and more
    // bits were accumulated than it can hold, the top of the last unit read
    // was shifted out, so the remaining bits are taken from *ptr instead.

    if (sizeof (local_sr) < 8 && bs->bc > sizeof (local_sr) * 8) {
        bs->bc -= bitcount;
        bs->sr = *(bs->ptr) >> (sizeof (*(bs->ptr)) * 8 - bs->bc);
    }
    else {
        bs->bc -= bitcount;
        bs->sr = local_sr >> bitcount;
    }

    return code;
}