Updated ft2play, and implemented SSE optimizations in the Lanczos sinc resampler

CQTexperiment
Chris Moeller 2014-03-21 15:42:00 -07:00
parent a8b47ea0ed
commit 7ae9f88e2a
4 changed files with 313 additions and 46 deletions

View File

@ -1,11 +1,31 @@
#ifndef _LANCZOS_RESAMPLER_H_
#define _LANCZOS_RESAMPLER_H_
void lanczos_init();
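// Ugly, but when LANCZOS_DECORATE is defined, every public lanczos_* symbol below is renamed to <LANCZOS_DECORATE>_lanczos_*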
#ifdef LANCZOS_DECORATE
#define PASTE(a,b) a ## b
#define EVALUATE(a,b) PASTE(a,b)
#define lanczos_init EVALUATE(LANCZOS_DECORATE,_lanczos_init)
#define lanczos_resampler_create EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_create)
#define lanczos_resampler_delete EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_delete)
#define lanczos_resampler_dup EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_dup)
#define lanczos_resampler_dup_inplace EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_dup_inplace)
#define lanczos_resampler_get_free_count EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_get_free_count)
#define lanczos_resampler_write_sample EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_write_sample)
#define lanczos_resampler_set_rate EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_set_rate)
#define lanczos_resampler_ready EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_ready)
#define lanczos_resampler_clear EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_clear)
#define lanczos_resampler_get_sample_count EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_get_sample_count)
#define lanczos_resampler_get_sample EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_get_sample)
#define lanczos_resampler_remove_sample EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_remove_sample)
#endif
void * lanczos_resampler_create();
void lanczos_init(void);
void * lanczos_resampler_create(void);
void lanczos_resampler_delete(void *);
void * lanczos_resampler_dup(void *);
void * lanczos_resampler_dup(const void *);
void lanczos_resampler_dup_inplace(void *, const void *);
int lanczos_resampler_get_free_count(void *);
void lanczos_resampler_write_sample(void *, short sample);

View File

@ -2,6 +2,10 @@
#include <string.h>
#define _USE_MATH_DEFINES
#include <math.h>
#if (defined(_M_IX86) || defined(__i386__) || defined(_M_X64) || defined(__amd64__))
#include <xmmintrin.h>
#define LANCZOS_SSE
#endif
#ifndef M_PI
#define M_PI 3.14159265358979323846
@ -10,29 +14,60 @@
#include "internal/lanczos_resampler.h"
enum { LANCZOS_RESOLUTION = 8192 };
enum { LANCZOS_WIDTH = 8 };
enum { LANCZOS_WIDTH = 16 };
enum { LANCZOS_SAMPLES = LANCZOS_RESOLUTION * LANCZOS_WIDTH };
static double lanczos_lut[LANCZOS_SAMPLES + 1];
static float lanczos_lut[LANCZOS_SAMPLES + 1];
enum { lanczos_buffer_size = LANCZOS_WIDTH * 4 };
int fEqual(const double b, const double a)
static int fEqual(const float b, const float a)
{
return fabs(a - b) < 1.0e-6;
}
static double sinc(double x)
static float sinc(float x)
{
return fEqual(x, 0.0) ? 1.0 : sin(x * M_PI) / (x * M_PI);
}
void lanczos_init()
#ifdef LANCZOS_SSE
#ifdef _MSC_VER
#include <intrin.h>
#elif defined(__clang__) || defined(__GNUC__)
/*
 * Execute the x86 CPUID instruction for the leaf given in `selector`.
 *
 * data     - receives four ints: data[0]=EAX, data[1]=EBX, data[2]=ECX,
 *            data[3]=EDX as left by the instruction.
 * selector - value loaded into EAX before CPUID executes.
 *
 * The signature mirrors the two-argument __cpuid() that the MSVC branch
 * gets from <intrin.h>, so callers are identical on both compilers.
 */
static inline void
__cpuid(int *data, int selector)
{
    /*
     * `__asm__` instead of `asm`: the plain keyword is a GNU extension that
     * is disabled under -std=c99/-std=c11, while the double-underscore
     * spelling is accepted in every mode.
     * ECX is zeroed on entry so that leaves which take a sub-leaf index do
     * not read whatever happened to be in the register.
     */
    __asm__ __volatile__("cpuid"
        : "=a" (data[0]),
          "=b" (data[1]),
          "=c" (data[2]),
          "=d" (data[3])
        : "a" (selector), "c" (0));
}
#else
#define __cpuid(a,b) memset((a), 0, sizeof(int) * 4)
#endif
/*
 * Report whether the CPU advertises SSE.
 *
 * Issues CPUID leaf 1 through __cpuid() and tests bit 25 of the fourth
 * result word (EDX). Returns 1 when the bit is set and 0 otherwise.
 * On compilers that take the memset() fallback for __cpuid the result
 * words are all zero, so this returns 0.
 */
static int query_cpu_feature_sse(void)
{
    enum { CPUID_LEAF1_EDX_SSE_BIT = 25 };
    int regs[4];

    __cpuid(regs, 1);
    return (regs[3] & (1 << CPUID_LEAF1_EDX_SSE_BIT)) != 0;
}
static int lanczos_has_sse = 0;
#endif
void lanczos_init(void)
{
unsigned i;
double dx = (double)(LANCZOS_WIDTH) / LANCZOS_SAMPLES, x = 0.0;
float dx = (float)(LANCZOS_WIDTH) / LANCZOS_SAMPLES, x = 0.0;
for (i = 0; i < LANCZOS_SAMPLES + 1; ++i, x += dx)
lanczos_lut[i] = abs(x) < LANCZOS_WIDTH ? sinc(x) * sinc(x / LANCZOS_WIDTH) : 0.0;
lanczos_lut[i] = fabs(x) < LANCZOS_WIDTH ? sinc(x) * sinc(x / LANCZOS_WIDTH) : 0.0;
#ifdef LANCZOS_SSE
lanczos_has_sse = query_cpu_feature_sse();
#endif
}
typedef struct lanczos_resampler
@ -45,7 +80,7 @@ typedef struct lanczos_resampler
int buffer_out[lanczos_buffer_size];
} lanczos_resampler;
void * lanczos_resampler_create()
void * lanczos_resampler_create(void)
{
lanczos_resampler * r = ( lanczos_resampler * ) malloc( sizeof(lanczos_resampler) );
if ( !r ) return 0;
@ -67,9 +102,9 @@ void lanczos_resampler_delete(void * _r)
free( _r );
}
void * lanczos_resampler_dup(void * _r)
void * lanczos_resampler_dup(const void * _r)
{
lanczos_resampler * r_in = ( lanczos_resampler * ) _r;
const lanczos_resampler * r_in = ( const lanczos_resampler * ) _r;
lanczos_resampler * r_out = ( lanczos_resampler * ) malloc( sizeof(lanczos_resampler) );
if ( !r_out ) return 0;
@ -85,6 +120,21 @@ void * lanczos_resampler_dup(void * _r)
return r_out;
}
void lanczos_resampler_dup_inplace(void *_d, const void *_s)
{
const lanczos_resampler * r_in = ( const lanczos_resampler * ) _s;
lanczos_resampler * r_out = ( lanczos_resampler * ) _d;
r_out->write_pos = r_in->write_pos;
r_out->write_filled = r_in->write_filled;
r_out->read_pos = r_in->read_pos;
r_out->read_filled = r_in->read_filled;
r_out->phase = r_in->phase;
r_out->phase_inc = r_in->phase_inc;
memcpy( r_out->buffer_in, r_in->buffer_in, sizeof(r_in->buffer_in) );
memcpy( r_out->buffer_out, r_in->buffer_out, sizeof(r_in->buffer_out) );
}
int lanczos_resampler_get_free_count(void *_r)
{
lanczos_resampler * r = ( lanczos_resampler * ) _r;
@ -149,10 +199,10 @@ static int lanczos_resampler_run(lanczos_resampler * r, int ** out_, int * out_e
do
{
// accumulate in extended precision
double kernel[LANCZOS_WIDTH * 2], kernel_sum = 0.0;
float kernel[LANCZOS_WIDTH * 2], kernel_sum = 0.0;
int i = LANCZOS_WIDTH;
int phase_adj = phase * step / LANCZOS_RESOLUTION;
double sample;
float sample;
if ( out >= out_end )
break;
@ -164,7 +214,7 @@ static int lanczos_resampler_run(lanczos_resampler * r, int ** out_, int * out_e
}
for (sample = 0, i = 0; i < LANCZOS_WIDTH * 2; ++i)
sample += in[i] * kernel[i];
*out++ = (int) (sample / kernel_sum * 256.0);
*out++ = (int)(sample / kernel_sum * 256.0);
phase += phase_inc;
@ -174,10 +224,10 @@ static int lanczos_resampler_run(lanczos_resampler * r, int ** out_, int * out_e
}
while ( in < in_end );
r->phase = phase;
r->phase = (unsigned short) phase;
*out_ = out;
used = in - in_;
used = (int)(in - in_);
r->write_filled -= used;
}
@ -185,6 +235,79 @@ static int lanczos_resampler_run(lanczos_resampler * r, int ** out_, int * out_e
return used;
}
#ifdef LANCZOS_SSE
/*
 * SSE variant of the resampling inner loop (integer output).
 *
 * r       - resampler state; buffer_in, write_pos, write_filled, phase and
 *           phase_inc are read, phase and write_filled are updated.
 * out_    - in/out cursor into the output buffer; advanced past every
 *           sample written.
 * out_end - one past the last writable output slot.
 *
 * Returns the number of input samples consumed (0 if there was not more
 * than 2 * LANCZOS_WIDTH input buffered).
 */
static int lanczos_resampler_run_sse(lanczos_resampler * r, int ** out_, int * out_end)
{
int in_size = r->write_filled;
// Oldest unread input sample. NOTE(review): the "+ lanczos_buffer_size" offset
// presumably relies on buffer_in holding a second, mirrored copy of the ring
// so the window can be read linearly -- confirm against the struct definition.
float const* in_ = r->buffer_in + lanczos_buffer_size + r->write_pos - r->write_filled;
int used = 0;
// Each output needs a full 2 * LANCZOS_WIDTH tap window, so that many samples
// are held back and only the remainder may be consumed.
in_size -= LANCZOS_WIDTH * 2;
if ( in_size > 0 )
{
int* out = *out_;
float const* in = in_;
float const* const in_end = in + in_size;
int phase = r->phase;
int phase_inc = r->phase_inc;
// LUT stride between adjacent taps. It equals LANCZOS_RESOLUTION unless
// phase_inc exceeds it, in which case the stride shrinks in proportion.
int step = phase_inc > LANCZOS_RESOLUTION ? LANCZOS_RESOLUTION * LANCZOS_RESOLUTION / phase_inc : LANCZOS_RESOLUTION;
do
{
// All arithmetic in this loop body is single-precision float.
float kernel_sum = 0.0;
// 2 * LANCZOS_WIDTH taps stored as LANCZOS_WIDTH / 2 four-float vectors;
// kernelf is a scalar view of the same storage used while filling it.
__m128 kernel[LANCZOS_WIDTH / 2];
__m128 temp1, temp2;
__m128 samplex = _mm_setzero_ps();
float *kernelf = (float*)(&kernel);
int i = LANCZOS_WIDTH;
int phase_adj = phase * step / LANCZOS_RESOLUTION;
if ( out >= out_end )
break;
// Fill taps for i = LANCZOS_WIDTH down to -LANCZOS_WIDTH + 1 and
// accumulate their sum for normalisation.
for (; i >= -LANCZOS_WIDTH + 1; --i)
{
int pos = i * step;
kernel_sum += kernelf[i + LANCZOS_WIDTH - 1] = lanczos_lut[abs(phase_adj - pos)];
}
// Four-wide multiply-accumulate of input window against the kernel.
// The input pointer may be unaligned; the kernel array is __m128 and
// so is loaded with the aligned form.
for (i = 0; i < LANCZOS_WIDTH / 2; ++i)
{
temp1 = _mm_loadu_ps( (const float *)( in + i * 4 ) );
temp2 = _mm_load_ps( (const float *)( kernel + i ) );
temp1 = _mm_mul_ps( temp1, temp2 );
samplex = _mm_add_ps( samplex, temp1 );
}
// kernel_sum is reused from here on as the output scale factor:
// reciprocal of the tap sum times 1/32768.
kernel_sum = 1.0 / kernel_sum * (1.0 / 32768.0);
// Horizontal add: fold the upper two lanes onto the lower two ...
temp1 = _mm_movehl_ps( temp1, samplex );
samplex = _mm_add_ps( samplex, temp1 );
// ... then bring lane 1 down to lane 0 and add, leaving the full sum
// in lane 0.
temp1 = samplex;
temp1 = _mm_shuffle_ps( temp1, samplex, _MM_SHUFFLE(0, 0, 0, 1) );
samplex = _mm_add_ps( samplex, temp1 );
// Only lane 0 of the product is meaningful (other lanes of temp1 are 0).
temp1 = _mm_set_ss( kernel_sum );
samplex = _mm_mul_ps( samplex, temp1 );
// NOTE(review): _mm_cvtss_si32 converts using the current SSE rounding
// mode, whereas the scalar path's (int) cast truncates -- confirm the
// difference is acceptable.
*out++ = _mm_cvtss_si32( samplex );
phase += phase_inc;
// 13 and 8191 are log2(LANCZOS_RESOLUTION) and LANCZOS_RESOLUTION - 1:
// whole steps advance the input pointer, the fraction stays in phase.
in += phase >> 13;
phase &= 8191;
}
while ( in < in_end );
r->phase = (unsigned short) phase;
*out_ = out;
used = (int)(in - in_);
r->write_filled -= used;
}
return used;
}
#endif
static void lanczos_resampler_fill(lanczos_resampler * r)
{
while ( r->write_filled > (LANCZOS_WIDTH * 2) &&
@ -195,6 +318,11 @@ static void lanczos_resampler_fill(lanczos_resampler * r)
int * out = r->buffer_out + write_pos;
if ( write_size > ( lanczos_buffer_size - r->read_filled ) )
write_size = lanczos_buffer_size - r->read_filled;
#ifdef LANCZOS_SSE
if ( lanczos_has_sse )
lanczos_resampler_run_sse( r, &out, out + write_size );
else
#endif
lanczos_resampler_run( r, &out, out + write_size );
r->read_filled += out - r->buffer_out - write_pos;
}

View File

@ -1,5 +1,5 @@
/*
** FT2PLAY v0.34
** FT2PLAY v0.35
** =============
**
** C port of FastTracker II's replayer, by 8bitbubsy (Olav Sørensen)
@ -1467,6 +1467,8 @@ static int16_t RelocateTon(PLAYER *p, int16_t inPeriod, int8_t addNote, StmTyp *
outPeriod = (((oldPeriod + addPeriod) >> 1) & 0xFFE0) + fineTune;
if (outPeriod < fineTune) outPeriod += (1 << 8);
if (((outPeriod - 16) >> 1) < ((12 * 10 * 16) + 16)) // non-FT2 security fix
{
if (inPeriod >= p->Note2Period[(outPeriod - 16) >> 1]) // 16-bit look-up, shift it down
{
outPeriod -= fineTune;
@ -1480,13 +1482,13 @@ static int16_t RelocateTon(PLAYER *p, int16_t inPeriod, int8_t addNote, StmTyp *
oldPeriod = (int16_t)(outPeriod);
}
}
}
outPeriod = oldPeriod + fineTune;
if (outPeriod < fineTune) outPeriod += (1 << 8);
outPeriod += ((int16_t)(addNote) << 5);
if (outPeriod >= ((((8 * 12 * 16) + 15) * 2) - 1)) outPeriod = ((8 * 12 * 16) + 15) * 2;
return (p->Note2Period[outPeriod >> 1]); // 16-bit look-up, shift it down
}
@ -2260,6 +2262,10 @@ static int8_t LoadInstrHeader(PLAYER *p, MEM *buf, uint16_t i)
mread(&ih.Samp[j], 17, 1, buf);
mseek(buf, 1 + 22, SEEK_CUR); // skip junk + name
memcpy(&p->Instr[i]->Samp[j], &ih.Samp[j], 17);
// non-FT2 fix: Force loop flags off if loop length is 0
if (p->Instr[i]->Samp[j].RepL == 0)
p->Instr[i]->Samp[j].Typ &= 0xFC;
}
}
@ -2681,13 +2687,6 @@ void voiceSetSource(PLAYER *p, uint8_t i, const int8_t *sampleData,
p->voice[i].rampTerminates = 0;
#endif
// for 9xx set offset
if (p->voice[i].samplePosition >= p->voice[i].sampleLength)
{
p->voice[i].sampleData = NULL;
p->voice[i].samplePosition = 0;
}
lanczos_resampler_clear(p->resampler[i]);
#ifdef USE_VOL_RAMP
lanczos_resampler_clear(p->resampler[i+254]);
@ -2699,6 +2698,12 @@ void voiceSetSource(PLAYER *p, uint8_t i, const int8_t *sampleData,
void voiceSetSamplePosition(PLAYER *p, uint8_t i, uint16_t value)
{
p->voice[i].samplePosition = value;
if (p->voice[i].samplePosition >= p->voice[i].sampleLength)
{
p->voice[i].samplePosition = 0;
p->voice[i].sampleData = NULL;
}
p->voice[i].interpolating = 1;
p->voice[i].oversampleCount = 0;

View File

@ -2,6 +2,10 @@
#include <string.h>
#define _USE_MATH_DEFINES
#include <math.h>
#if (defined(_M_IX86) || defined(__i386__) || defined(_M_X64) || defined(__amd64__))
#include <xmmintrin.h>
#define LANCZOS_SSE
#endif
#ifndef M_PI
#define M_PI 3.14159265358979323846
@ -10,29 +14,60 @@
#include "lanczos_resampler.h"
enum { LANCZOS_RESOLUTION = 8192 };
enum { LANCZOS_WIDTH = 8 };
enum { LANCZOS_WIDTH = 16 };
enum { LANCZOS_SAMPLES = LANCZOS_RESOLUTION * LANCZOS_WIDTH };
static double lanczos_lut[LANCZOS_SAMPLES + 1];
static float lanczos_lut[LANCZOS_SAMPLES + 1];
enum { lanczos_buffer_size = LANCZOS_WIDTH * 4 };
static int fEqual(const double b, const double a)
static int fEqual(const float b, const float a)
{
return fabs(a - b) < 1.0e-6;
}
static double sinc(double x)
static float sinc(float x)
{
return fEqual(x, 0.0) ? 1.0 : sin(x * M_PI) / (x * M_PI);
}
#ifdef LANCZOS_SSE
#ifdef _MSC_VER
#include <intrin.h>
#elif defined(__clang__) || defined(__GNUC__)
/*
 * Execute the x86 CPUID instruction for the leaf given in `selector`.
 *
 * data     - receives four ints: data[0]=EAX, data[1]=EBX, data[2]=ECX,
 *            data[3]=EDX as left by the instruction.
 * selector - value loaded into EAX before CPUID executes.
 *
 * The signature mirrors the two-argument __cpuid() that the MSVC branch
 * gets from <intrin.h>, so callers are identical on both compilers.
 */
static inline void
__cpuid(int *data, int selector)
{
    /*
     * `__asm__` instead of `asm`: the plain keyword is a GNU extension that
     * is disabled under -std=c99/-std=c11, while the double-underscore
     * spelling is accepted in every mode.
     * ECX is zeroed on entry so that leaves which take a sub-leaf index do
     * not read whatever happened to be in the register.
     */
    __asm__ __volatile__("cpuid"
        : "=a" (data[0]),
          "=b" (data[1]),
          "=c" (data[2]),
          "=d" (data[3])
        : "a" (selector), "c" (0));
}
#else
#define __cpuid(a,b) memset((a), 0, sizeof(int) * 4)
#endif
/*
 * Report whether the CPU advertises SSE.
 *
 * Issues CPUID leaf 1 through __cpuid() and tests bit 25 of the fourth
 * result word (EDX). Returns 1 when the bit is set and 0 otherwise.
 * On compilers that take the memset() fallback for __cpuid the result
 * words are all zero, so this returns 0.
 */
static int query_cpu_feature_sse(void)
{
    enum { CPUID_LEAF1_EDX_SSE_BIT = 25 };
    int regs[4];

    __cpuid(regs, 1);
    return (regs[3] & (1 << CPUID_LEAF1_EDX_SSE_BIT)) != 0;
}
static int lanczos_has_sse = 0;
#endif
void lanczos_init(void)
{
unsigned i;
double dx = (double)(LANCZOS_WIDTH) / LANCZOS_SAMPLES, x = 0.0;
float dx = (float)(LANCZOS_WIDTH) / LANCZOS_SAMPLES, x = 0.0;
for (i = 0; i < LANCZOS_SAMPLES + 1; ++i, x += dx)
lanczos_lut[i] = fabs(x) < LANCZOS_WIDTH ? sinc(x) * sinc(x / LANCZOS_WIDTH) : 0.0;
#ifdef LANCZOS_SSE
lanczos_has_sse = query_cpu_feature_sse();
#endif
}
typedef struct lanczos_resampler
@ -164,10 +199,10 @@ static int lanczos_resampler_run(lanczos_resampler * r, float ** out_, float * o
do
{
// accumulate in extended precision
double kernel[LANCZOS_WIDTH * 2], kernel_sum = 0.0;
float kernel[LANCZOS_WIDTH * 2], kernel_sum = 0.0;
int i = LANCZOS_WIDTH;
int phase_adj = phase * step / LANCZOS_RESOLUTION;
double sample;
float sample;
if ( out >= out_end )
break;
@ -200,6 +235,80 @@ static int lanczos_resampler_run(lanczos_resampler * r, float ** out_, float * o
return used;
}
#ifdef LANCZOS_SSE
/*
 * SSE variant of the resampling inner loop (float output).
 *
 * r       - resampler state; buffer_in, write_pos, write_filled, phase and
 *           phase_inc are read, phase and write_filled are updated.
 * out_    - in/out cursor into the output buffer; advanced past every
 *           sample written.
 * out_end - one past the last writable output slot.
 *
 * Returns the number of input samples consumed (0 if there was not more
 * than 2 * LANCZOS_WIDTH input buffered).
 */
static int lanczos_resampler_run_sse(lanczos_resampler * r, float ** out_, float * out_end)
{
    int in_size = r->write_filled;
    /* Oldest unread input sample.
       NOTE(review): the "+ lanczos_buffer_size" offset presumably relies on
       buffer_in holding a mirrored second copy of the ring -- confirm. */
    float const* in_ = r->buffer_in + lanczos_buffer_size + r->write_pos - r->write_filled;
    int used = 0;

    /* Every output needs a full 2 * LANCZOS_WIDTH tap window, so that many
       samples are held back and only the remainder may be consumed. */
    in_size -= LANCZOS_WIDTH * 2;
    if ( in_size > 0 )
    {
        float* out = *out_;
        float const* in = in_;
        float const* const in_end = in + in_size;
        int phase = r->phase;
        int phase_inc = r->phase_inc;
        /* LUT stride between adjacent taps; shrinks in proportion once
           phase_inc exceeds LANCZOS_RESOLUTION. */
        int step = phase_inc > LANCZOS_RESOLUTION ? LANCZOS_RESOLUTION * LANCZOS_RESOLUTION / phase_inc : LANCZOS_RESOLUTION;

        do
        {
            /* Must start at zero: it is accumulated with += below. It was
               previously left uninitialised, so the normalisation factor
               (and every output sample) was computed from an indeterminate
               value. */
            float kernel_sum = 0.0f;
            /* 2 * LANCZOS_WIDTH taps stored as four-float vectors; kernelf
               is a scalar view of the same storage used while filling it. */
            __m128 kernel[LANCZOS_WIDTH / 2];
            __m128 temp1, temp2;
            __m128 samplex = _mm_setzero_ps();
            float *kernelf = (float*)(&kernel);
            int i = LANCZOS_WIDTH;
            int phase_adj = phase * step / LANCZOS_RESOLUTION;

            if ( out >= out_end )
                break;

            /* Fill taps for i = LANCZOS_WIDTH down to -LANCZOS_WIDTH + 1 and
               accumulate their sum for normalisation. */
            for (; i >= -LANCZOS_WIDTH + 1; --i)
            {
                int pos = i * step;
                kernel_sum += kernelf[i + LANCZOS_WIDTH - 1] = lanczos_lut[abs(phase_adj - pos)];
            }

            /* Four-wide multiply-accumulate of the input window against the
               kernel. Input may be unaligned; the __m128 kernel array is
               loaded with the aligned form. */
            for (i = 0; i < LANCZOS_WIDTH / 2; ++i)
            {
                temp1 = _mm_loadu_ps( (const float *)( in + i * 4 ) );
                temp2 = _mm_load_ps( (const float *)( kernel + i ) );
                temp1 = _mm_mul_ps( temp1, temp2 );
                samplex = _mm_add_ps( samplex, temp1 );
            }

            /* kernel_sum is reused as the output scale factor: reciprocal of
               the tap sum times 1/32768. */
            kernel_sum = 1.0 / kernel_sum * (1.0 / 32768.0);

            /* Horizontal add: fold the upper two lanes onto the lower two,
               then bring lane 1 down to lane 0 and add, leaving the full sum
               in lane 0. */
            temp1 = _mm_movehl_ps( temp1, samplex );
            samplex = _mm_add_ps( samplex, temp1 );
            temp1 = samplex;
            temp1 = _mm_shuffle_ps( temp1, samplex, _MM_SHUFFLE(0, 0, 0, 1) );
            samplex = _mm_add_ps( samplex, temp1 );

            /* Scale lane 0 and store it as the next output sample. */
            temp1 = _mm_set_ss( kernel_sum );
            samplex = _mm_mul_ps( samplex, temp1 );
            _mm_store_ss( out, samplex );
            ++out;

            phase += phase_inc;

            /* 13 and 8191 are log2(LANCZOS_RESOLUTION) and
               LANCZOS_RESOLUTION - 1: whole steps advance the input pointer,
               the fractional part stays in phase. */
            in += phase >> 13;
            phase &= 8191;
        }
        while ( in < in_end );

        r->phase = (unsigned short) phase;
        *out_ = out;

        used = (int)(in - in_);

        r->write_filled -= used;
    }

    return used;
}
#endif
static void lanczos_resampler_fill(lanczos_resampler * r)
{
while ( r->write_filled > (LANCZOS_WIDTH * 2) &&
@ -210,6 +319,11 @@ static void lanczos_resampler_fill(lanczos_resampler * r)
float * out = r->buffer_out + write_pos;
if ( write_size > ( lanczos_buffer_size - r->read_filled ) )
write_size = lanczos_buffer_size - r->read_filled;
#ifdef LANCZOS_SSE
if ( lanczos_has_sse )
lanczos_resampler_run_sse( r, &out, out + write_size );
else
#endif
lanczos_resampler_run( r, &out, out + write_size );
r->read_filled += out - r->buffer_out - write_pos;
}