From 7ae9f88e2ab3b00dc2a16e1eadd737444f041555 Mon Sep 17 00:00:00 2001 From: Chris Moeller Date: Fri, 21 Mar 2014 15:42:00 -0700 Subject: [PATCH] Updated ft2play, and implemented SSE optimizations in the Lanczos sinc resampler --- .../dumb/include/internal/lanczos_resampler.h | 26 ++- .../Dumb/dumb/src/helpers/lanczos_resampler.c | 160 ++++++++++++++++-- Frameworks/modplay/modplay/ft2play.c | 43 ++--- .../modplay/modplay/lanczos_resampler.c | 130 +++++++++++++- 4 files changed, 313 insertions(+), 46 deletions(-) diff --git a/Frameworks/Dumb/dumb/include/internal/lanczos_resampler.h b/Frameworks/Dumb/dumb/include/internal/lanczos_resampler.h index a691697f0..464f1fa39 100644 --- a/Frameworks/Dumb/dumb/include/internal/lanczos_resampler.h +++ b/Frameworks/Dumb/dumb/include/internal/lanczos_resampler.h @@ -1,11 +1,31 @@ #ifndef _LANCZOS_RESAMPLER_H_ #define _LANCZOS_RESAMPLER_H_ -void lanczos_init(); +// Ugglay +#ifdef LANCZOS_DECORATE +#define PASTE(a,b) a ## b +#define EVALUATE(a,b) PASTE(a,b) +#define lanczos_init EVALUATE(LANCZOS_DECORATE,_lanczos_init) +#define lanczos_resampler_create EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_create) +#define lanczos_resampler_delete EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_delete) +#define lanczos_resampler_dup EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_dup) +#define lanczos_resampler_dup_inplace EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_dup_inplace) +#define lanczos_resampler_get_free_count EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_get_free_count) +#define lanczos_resampler_write_sample EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_write_sample) +#define lanczos_resampler_set_rate EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_set_rate) +#define lanczos_resampler_ready EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_ready) +#define lanczos_resampler_clear EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_clear) +#define lanczos_resampler_get_sample_count EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_get_sample_count) +#define lanczos_resampler_get_sample EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_get_sample) +#define lanczos_resampler_remove_sample EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_remove_sample) +#endif -void * lanczos_resampler_create(); +void lanczos_init(void); + +void * lanczos_resampler_create(void); void lanczos_resampler_delete(void *); -void * lanczos_resampler_dup(void *); +void * lanczos_resampler_dup(const void *); +void lanczos_resampler_dup_inplace(void *, const void *); int lanczos_resampler_get_free_count(void *); void lanczos_resampler_write_sample(void *, short sample); diff --git a/Frameworks/Dumb/dumb/src/helpers/lanczos_resampler.c b/Frameworks/Dumb/dumb/src/helpers/lanczos_resampler.c index d99abf595..9eff64b0f 100644 --- a/Frameworks/Dumb/dumb/src/helpers/lanczos_resampler.c +++ b/Frameworks/Dumb/dumb/src/helpers/lanczos_resampler.c @@ -2,6 +2,10 @@ #include #define _USE_MATH_DEFINES #include +#if (defined(_M_IX86) || defined(__i386__) || defined(_M_X64) || defined(__amd64__)) +#include +#define LANCZOS_SSE +#endif #ifndef M_PI #define M_PI 3.14159265358979323846 @@ -10,29 +14,60 @@ #include "internal/lanczos_resampler.h" enum { LANCZOS_RESOLUTION = 8192 }; -enum { LANCZOS_WIDTH = 8 }; +enum { LANCZOS_WIDTH = 16 }; enum { LANCZOS_SAMPLES = LANCZOS_RESOLUTION * LANCZOS_WIDTH }; -static double lanczos_lut[LANCZOS_SAMPLES + 1]; +static float lanczos_lut[LANCZOS_SAMPLES + 1]; enum { lanczos_buffer_size = LANCZOS_WIDTH * 4 }; -int fEqual(const double b, const double a) +static int fEqual(const float b, const float a) { return fabs(a - b) < 1.0e-6; } -static double sinc(double x) +static float sinc(float x) { return fEqual(x, 0.0) ? 1.0 : sin(x * M_PI) / (x * M_PI); } -void lanczos_init() +#ifdef LANCZOS_SSE +#ifdef _MSC_VER +#include +#elif defined(__clang__) || defined(__GNUC__) +static inline void +__cpuid(int *data, int selector) +{ + asm("cpuid" + : "=a" (data[0]), + "=b" (data[1]), + "=c" (data[2]), + "=d" (data[3]) + : "a"(selector)); +} +#else +#define __cpuid(a,b) memset((a), 0, sizeof(int) * 4) +#endif + +static int query_cpu_feature_sse() { + int buffer[4]; + __cpuid(buffer,1); + if ((buffer[3]&(1<<25)) == 0) return 0; + return 1; +} + +static int lanczos_has_sse = 0; +#endif + +void lanczos_init(void) { unsigned i; - double dx = (double)(LANCZOS_WIDTH) / LANCZOS_SAMPLES, x = 0.0; + float dx = (float)(LANCZOS_WIDTH) / LANCZOS_SAMPLES, x = 0.0; for (i = 0; i < LANCZOS_SAMPLES + 1; ++i, x += dx) - lanczos_lut[i] = abs(x) < LANCZOS_WIDTH ? sinc(x) * sinc(x / LANCZOS_WIDTH) : 0.0; + lanczos_lut[i] = fabs(x) < LANCZOS_WIDTH ? sinc(x) * sinc(x / LANCZOS_WIDTH) : 0.0; +#ifdef LANCZOS_SSE + lanczos_has_sse = query_cpu_feature_sse(); +#endif } typedef struct lanczos_resampler @@ -45,7 +80,7 @@ typedef struct lanczos_resampler int buffer_out[lanczos_buffer_size]; } lanczos_resampler; -void * lanczos_resampler_create() +void * lanczos_resampler_create(void) { lanczos_resampler * r = ( lanczos_resampler * ) malloc( sizeof(lanczos_resampler) ); if ( !r ) return 0; @@ -67,9 +102,9 @@ void lanczos_resampler_delete(void * _r) free( _r ); } -void * lanczos_resampler_dup(void * _r) +void * lanczos_resampler_dup(const void * _r) { - lanczos_resampler * r_in = ( lanczos_resampler * ) _r; + const lanczos_resampler * r_in = ( const lanczos_resampler * ) _r; lanczos_resampler * r_out = ( lanczos_resampler * ) malloc( sizeof(lanczos_resampler) ); if ( !r_out ) return 0; @@ -85,6 +120,21 @@ void * lanczos_resampler_dup(void * _r) return r_out; } +void lanczos_resampler_dup_inplace(void *_d, const void *_s) +{ + const lanczos_resampler * r_in = ( const lanczos_resampler * ) _s; + lanczos_resampler * r_out = ( lanczos_resampler * ) _d; + + r_out->write_pos = r_in->write_pos; + r_out->write_filled = r_in->write_filled; + r_out->read_pos = r_in->read_pos; + r_out->read_filled = r_in->read_filled; + r_out->phase = r_in->phase; + r_out->phase_inc = r_in->phase_inc; + memcpy( r_out->buffer_in, r_in->buffer_in, sizeof(r_in->buffer_in) ); + memcpy( r_out->buffer_out, r_in->buffer_out, sizeof(r_in->buffer_out) ); +} + int lanczos_resampler_get_free_count(void *_r) { lanczos_resampler * r = ( lanczos_resampler * ) _r; @@ -149,10 +199,10 @@ static int lanczos_resampler_run(lanczos_resampler * r, int ** out_, int * out_e do { // accumulate in extended precision - double kernel[LANCZOS_WIDTH * 2], kernel_sum = 0.0; + float kernel[LANCZOS_WIDTH * 2], kernel_sum = 0.0; int i = LANCZOS_WIDTH; int phase_adj = phase * step / LANCZOS_RESOLUTION; - double sample; + float sample; if ( out >= out_end ) break; @@ -164,7 +214,7 @@ static int lanczos_resampler_run(lanczos_resampler * r, int ** out_, int * out_e } for (sample = 0, i = 0; i < LANCZOS_WIDTH * 2; ++i) sample += in[i] * kernel[i]; - *out++ = (int) (sample / kernel_sum * 256.0); + *out++ = (int)(sample / kernel_sum * 256.0); phase += phase_inc; @@ -174,10 +224,10 @@ static int lanczos_resampler_run(lanczos_resampler * r, int ** out_, int * out_e } while ( in < in_end ); - r->phase = phase; + r->phase = (unsigned short) phase; *out_ = out; - used = in - in_; + used = (int)(in - in_); r->write_filled -= used; } @@ -185,6 +235,79 @@ static int lanczos_resampler_run(lanczos_resampler * r, int ** out_, int * out_e return used; } +#ifdef LANCZOS_SSE +static int lanczos_resampler_run_sse(lanczos_resampler * r, int ** out_, int * out_end) +{ + int in_size = r->write_filled; + float const* in_ = r->buffer_in + lanczos_buffer_size + r->write_pos - r->write_filled; + int used = 0; + in_size -= LANCZOS_WIDTH * 2; + if ( in_size > 0 ) + { + int* out = *out_; + float const* in = in_; + float const* const in_end = in + in_size; + int phase = r->phase; + int phase_inc = r->phase_inc; + + int step = phase_inc > LANCZOS_RESOLUTION ? LANCZOS_RESOLUTION * LANCZOS_RESOLUTION / phase_inc : LANCZOS_RESOLUTION; + + do + { + // accumulate in extended precision + float kernel_sum = 0.0; + __m128 kernel[LANCZOS_WIDTH / 2]; + __m128 temp1, temp2; + __m128 samplex = _mm_setzero_ps(); + float *kernelf = (float*)(&kernel); + int i = LANCZOS_WIDTH; + int phase_adj = phase * step / LANCZOS_RESOLUTION; + + if ( out >= out_end ) + break; + + for (; i >= -LANCZOS_WIDTH + 1; --i) + { + int pos = i * step; + kernel_sum += kernelf[i + LANCZOS_WIDTH - 1] = lanczos_lut[abs(phase_adj - pos)]; + } + for (i = 0; i < LANCZOS_WIDTH / 2; ++i) + { + temp1 = _mm_loadu_ps( (const float *)( in + i * 4 ) ); + temp2 = _mm_load_ps( (const float *)( kernel + i ) ); + temp1 = _mm_mul_ps( temp1, temp2 ); + samplex = _mm_add_ps( samplex, temp1 ); + } + kernel_sum = 1.0 / kernel_sum * (1.0 / 32768.0); + temp1 = _mm_movehl_ps( temp1, samplex ); + samplex = _mm_add_ps( samplex, temp1 ); + temp1 = samplex; + temp1 = _mm_shuffle_ps( temp1, samplex, _MM_SHUFFLE(0, 0, 0, 1) ); + samplex = _mm_add_ps( samplex, temp1 ); + temp1 = _mm_set_ss( kernel_sum ); + samplex = _mm_mul_ps( samplex, temp1 ); + *out++ = _mm_cvtss_si32( samplex ); + + phase += phase_inc; + + in += phase >> 13; + + phase &= 8191; + } + while ( in < in_end ); + + r->phase = (unsigned short) phase; + *out_ = out; + + used = (int)(in - in_); + + r->write_filled -= used; + } + + return used; +} +#endif + static void lanczos_resampler_fill(lanczos_resampler * r) { while ( r->write_filled > (LANCZOS_WIDTH * 2) && @@ -195,7 +318,12 @@ static void lanczos_resampler_fill(lanczos_resampler * r) int * out = r->buffer_out + write_pos; if ( write_size > ( lanczos_buffer_size - r->read_filled ) ) write_size = lanczos_buffer_size - r->read_filled; - lanczos_resampler_run( r, &out, out + write_size ); +#ifdef LANCZOS_SSE + if ( lanczos_has_sse ) + lanczos_resampler_run_sse( r, &out, out + write_size ); + else +#endif + lanczos_resampler_run( r, &out, out + write_size ); r->read_filled += out - r->buffer_out - write_pos; } } diff --git a/Frameworks/modplay/modplay/ft2play.c b/Frameworks/modplay/modplay/ft2play.c index 1664cff4c..2ebf17b05 100644 --- a/Frameworks/modplay/modplay/ft2play.c +++ b/Frameworks/modplay/modplay/ft2play.c @@ -1,5 +1,5 @@ /* - ** FT2PLAY v0.34 + ** FT2PLAY v0.35 ** ============= ** ** C port of FastTracker II's replayer, by 8bitbubsy (Olav Sørensen) @@ -1467,17 +1467,20 @@ static int16_t RelocateTon(PLAYER *p, int16_t inPeriod, int8_t addNote, StmTyp * outPeriod = (((oldPeriod + addPeriod) >> 1) & 0xFFE0) + fineTune; if (outPeriod < fineTune) outPeriod += (1 << 8); - if (inPeriod >= p->Note2Period[(outPeriod - 16) >> 1]) // 16-bit look-up, shift it down + if (((outPeriod - 16) >> 1) < ((12 * 10 * 16) + 16)) // non-FT2 security fix { - outPeriod -= fineTune; - if (outPeriod & 0x00010000) outPeriod = (outPeriod - (1 << 8)) & 0x0000FFE0; - addPeriod = (int16_t)(outPeriod); - } - else - { - outPeriod -= fineTune; - if (outPeriod & 0x00010000) outPeriod = (outPeriod - (1 << 8)) & 0x0000FFE0; - oldPeriod = (int16_t)(outPeriod); + if (inPeriod >= p->Note2Period[(outPeriod - 16) >> 1]) // 16-bit look-up, shift it down + { + outPeriod -= fineTune; + if (outPeriod & 0x00010000) outPeriod = (outPeriod - (1 << 8)) & 0x0000FFE0; + addPeriod = (int16_t)(outPeriod); + } + else + { + outPeriod -= fineTune; + if (outPeriod & 0x00010000) outPeriod = (outPeriod - (1 << 8)) & 0x0000FFE0; + oldPeriod = (int16_t)(outPeriod); + } } } @@ -1486,7 +1489,6 @@ static int16_t RelocateTon(PLAYER *p, int16_t inPeriod, int8_t addNote, StmTyp * outPeriod += ((int16_t)(addNote) << 5); if (outPeriod >= ((((8 * 12 * 16) + 15) * 2) - 1)) outPeriod = ((8 * 12 * 16) + 15) * 2; - return (p->Note2Period[outPeriod >> 1]); // 16-bit look-up, shift it down } @@ -2260,6 +2262,10 @@ static int8_t LoadInstrHeader(PLAYER *p, MEM *buf, uint16_t i) mread(&ih.Samp[j], 17, 1, buf); mseek(buf, 1 + 22, SEEK_CUR); // skip junk + name memcpy(&p->Instr[i]->Samp[j], &ih.Samp[j], 17); + + // non-FT2 fix: Force loop flags off if loop length is 0 + if (p->Instr[i]->Samp[j].RepL == 0) + p->Instr[i]->Samp[j].Typ &= 0xFC; } } @@ -2681,13 +2687,6 @@ void voiceSetSource(PLAYER *p, uint8_t i, const int8_t *sampleData, p->voice[i].rampTerminates = 0; #endif - // for 9xx set offset - if (p->voice[i].samplePosition >= p->voice[i].sampleLength) - { - p->voice[i].sampleData = NULL; - p->voice[i].samplePosition = 0; - } - lanczos_resampler_clear(p->resampler[i]); #ifdef USE_VOL_RAMP lanczos_resampler_clear(p->resampler[i+254]); @@ -2699,6 +2698,12 @@ void voiceSetSource(PLAYER *p, uint8_t i, const int8_t *sampleData, void voiceSetSamplePosition(PLAYER *p, uint8_t i, uint16_t value) { p->voice[i].samplePosition = value; + if (p->voice[i].samplePosition >= p->voice[i].sampleLength) + { + p->voice[i].samplePosition = 0; + p->voice[i].sampleData = NULL; + } + p->voice[i].interpolating = 1; p->voice[i].oversampleCount = 0; diff --git a/Frameworks/modplay/modplay/lanczos_resampler.c b/Frameworks/modplay/modplay/lanczos_resampler.c index 559244f18..ef1c63c49 100644 --- a/Frameworks/modplay/modplay/lanczos_resampler.c +++ b/Frameworks/modplay/modplay/lanczos_resampler.c @@ -2,6 +2,10 @@ #include #define _USE_MATH_DEFINES #include +#if (defined(_M_IX86) || defined(__i386__) || defined(_M_X64) || defined(__amd64__)) +#include +#define LANCZOS_SSE +#endif #ifndef M_PI #define M_PI 3.14159265358979323846 @@ -10,29 +14,60 @@ #include "lanczos_resampler.h" enum { LANCZOS_RESOLUTION = 8192 }; -enum { LANCZOS_WIDTH = 8 }; +enum { LANCZOS_WIDTH = 16 }; enum { LANCZOS_SAMPLES = LANCZOS_RESOLUTION * LANCZOS_WIDTH }; -static double lanczos_lut[LANCZOS_SAMPLES + 1]; +static float lanczos_lut[LANCZOS_SAMPLES + 1]; enum { lanczos_buffer_size = LANCZOS_WIDTH * 4 }; -static int fEqual(const double b, const double a) +static int fEqual(const float b, const float a) { return fabs(a - b) < 1.0e-6; } -static double sinc(double x) +static float sinc(float x) { return fEqual(x, 0.0) ? 1.0 : sin(x * M_PI) / (x * M_PI); } +#ifdef LANCZOS_SSE +#ifdef _MSC_VER +#include +#elif defined(__clang__) || defined(__GNUC__) +static inline void +__cpuid(int *data, int selector) +{ + asm("cpuid" + : "=a" (data[0]), + "=b" (data[1]), + "=c" (data[2]), + "=d" (data[3]) + : "a"(selector)); +} +#else +#define __cpuid(a,b) memset((a), 0, sizeof(int) * 4) +#endif + +static int query_cpu_feature_sse() { + int buffer[4]; + __cpuid(buffer,1); + if ((buffer[3]&(1<<25)) == 0) return 0; + return 1; +} + +static int lanczos_has_sse = 0; +#endif + void lanczos_init(void) { unsigned i; - double dx = (double)(LANCZOS_WIDTH) / LANCZOS_SAMPLES, x = 0.0; + float dx = (float)(LANCZOS_WIDTH) / LANCZOS_SAMPLES, x = 0.0; for (i = 0; i < LANCZOS_SAMPLES + 1; ++i, x += dx) lanczos_lut[i] = fabs(x) < LANCZOS_WIDTH ? sinc(x) * sinc(x / LANCZOS_WIDTH) : 0.0; +#ifdef LANCZOS_SSE + lanczos_has_sse = query_cpu_feature_sse(); +#endif } typedef struct lanczos_resampler @@ -164,10 +199,10 @@ static int lanczos_resampler_run(lanczos_resampler * r, float ** out_, float * o do { // accumulate in extended precision - double kernel[LANCZOS_WIDTH * 2], kernel_sum = 0.0; + float kernel[LANCZOS_WIDTH * 2], kernel_sum = 0.0; int i = LANCZOS_WIDTH; int phase_adj = phase * step / LANCZOS_RESOLUTION; - double sample; + float sample; if ( out >= out_end ) break; @@ -200,6 +235,80 @@ static int lanczos_resampler_run(lanczos_resampler * r, float ** out_, float * o return used; } +#ifdef LANCZOS_SSE +static int lanczos_resampler_run_sse(lanczos_resampler * r, float ** out_, float * out_end) +{ + int in_size = r->write_filled; + float const* in_ = r->buffer_in + lanczos_buffer_size + r->write_pos - r->write_filled; + int used = 0; + in_size -= LANCZOS_WIDTH * 2; + if ( in_size > 0 ) + { + float* out = *out_; + float const* in = in_; + float const* const in_end = in + in_size; + int phase = r->phase; + int phase_inc = r->phase_inc; + + int step = phase_inc > LANCZOS_RESOLUTION ? LANCZOS_RESOLUTION * LANCZOS_RESOLUTION / phase_inc : LANCZOS_RESOLUTION; + + do + { + // accumulate in extended precision + float kernel_sum; + __m128 kernel[LANCZOS_WIDTH / 2]; + __m128 temp1, temp2; + __m128 samplex = _mm_setzero_ps(); + float *kernelf = (float*)(&kernel); + int i = LANCZOS_WIDTH; + int phase_adj = phase * step / LANCZOS_RESOLUTION; + + if ( out >= out_end ) + break; + + for (; i >= -LANCZOS_WIDTH + 1; --i) + { + int pos = i * step; + kernel_sum += kernelf[i + LANCZOS_WIDTH - 1] = lanczos_lut[abs(phase_adj - pos)]; + } + for (i = 0; i < LANCZOS_WIDTH / 2; ++i) + { + temp1 = _mm_loadu_ps( (const float *)( in + i * 4 ) ); + temp2 = _mm_load_ps( (const float *)( kernel + i ) ); + temp1 = _mm_mul_ps( temp1, temp2 ); + samplex = _mm_add_ps( samplex, temp1 ); + } + kernel_sum = 1.0 / kernel_sum * (1.0 / 32768.0); + temp1 = _mm_movehl_ps( temp1, samplex ); + samplex = _mm_add_ps( samplex, temp1 ); + temp1 = samplex; + temp1 = _mm_shuffle_ps( temp1, samplex, _MM_SHUFFLE(0, 0, 0, 1) ); + samplex = _mm_add_ps( samplex, temp1 ); + temp1 = _mm_set_ss( kernel_sum ); + samplex = _mm_mul_ps( samplex, temp1 ); + _mm_store_ss( out, samplex ); + ++out; + + phase += phase_inc; + + in += phase >> 13; + + phase &= 8191; + } + while ( in < in_end ); + + r->phase = (unsigned short) phase; + *out_ = out; + + used = (int)(in - in_); + + r->write_filled -= used; + } + + return used; +} +#endif + static void lanczos_resampler_fill(lanczos_resampler * r) { while ( r->write_filled > (LANCZOS_WIDTH * 2) && @@ -210,7 +319,12 @@ static void lanczos_resampler_fill(lanczos_resampler * r) float * out = r->buffer_out + write_pos; if ( write_size > ( lanczos_buffer_size - r->read_filled ) ) write_size = lanczos_buffer_size - r->read_filled; - lanczos_resampler_run( r, &out, out + write_size ); +#ifdef LANCZOS_SSE + if ( lanczos_has_sse ) + lanczos_resampler_run_sse( r, &out, out + write_size ); + else +#endif + lanczos_resampler_run( r, &out, out + write_size ); r->read_filled += out - r->buffer_out - write_pos; } }