Added automatic detection of SSE support for x86 and force on for x64

CQTexperiment
Chris Moeller 2013-09-29 17:27:13 -07:00
parent fa6b5d4290
commit a93ec7b95f
4 changed files with 63 additions and 10 deletions

View File

@ -924,4 +924,10 @@ long _dumb_it_read_sample_data_adpcm4(IT_SAMPLE *sample, DUMBFILE *f);
void _dumb_it_interleave_stereo_sample(IT_SAMPLE *sample); void _dumb_it_interleave_stereo_sample(IT_SAMPLE *sample);
/* Calling either of these is optional */
void _dumb_init_cubic();
#ifdef _USE_SSE
void _dumb_init_sse();
#endif
#endif /* INTERNAL_IT_H */ #endif /* INTERNAL_IT_H */

View File

@ -65,7 +65,7 @@ long dumb_resample(DUMB_RESAMPLER *resampler, sample_t *dst, long dst_size, VOLU
if (VOLUMES_ARE_ZERO) dst = NULL; if (VOLUMES_ARE_ZERO) dst = NULL;
init_cubic(); _dumb_init_cubic();
quality = resampler->quality; quality = resampler->quality;
@ -382,7 +382,7 @@ void dumb_resample_get_current_sample(DUMB_RESAMPLER *resampler, VOLUME_PARAMETE
if (VOLUMES_ARE_ZERO) { MIX_ZEROS(=); return; } if (VOLUMES_ARE_ZERO) { MIX_ZEROS(=); return; }
init_cubic(); _dumb_init_cubic();
quality = resampler->quality; quality = resampler->quality;

View File

@ -160,12 +160,11 @@ int dumb_resampling_quality = DUMB_RQ_CUBIC;
static short cubicA0[1025], cubicA1[1025]; static short cubicA0[1025], cubicA1[1025];
/*static*/ void init_cubic(void) void _dumb_init_cubic(void)
{ {
unsigned int t; /* 3*1024*1024*1024 is within range if it's unsigned */ unsigned int t; /* 3*1024*1024*1024 is within range if it's unsigned */
static int done = 0; static int done = 0;
if (done) return; if (done) return;
done = 1;
for (t = 0; t < 1025; t++) { for (t = 0; t < 1025; t++) {
/* int casts to pacify warnings about negating unsigned values */ /* int casts to pacify warnings about negating unsigned values */
cubicA0[t] = -(int)( t*t*t >> 17) + (int)( t*t >> 6) - (int)(t << 3); cubicA0[t] = -(int)( t*t*t >> 17) + (int)( t*t >> 6) - (int)(t << 3);
@ -173,6 +172,8 @@ static short cubicA0[1025], cubicA1[1025];
} }
lanczos_init(); lanczos_init();
done = 1;
} }

View File

@ -19,6 +19,7 @@
#include <math.h> #include <math.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h>
#include "dumb.h" #include "dumb.h"
#include "internal/dumb.h" #include "internal/dumb.h"
@ -786,11 +787,56 @@ static void it_filter_sse(DUMB_CLICK_REMOVER *cr, IT_FILTER_STATE *state, sample
#undef LOG10 #undef LOG10
int _dumb_it_use_sse = 0; #if defined(_M_IX86) || defined(__i386__)
#ifdef _MSC_VER
#include <intrin.h>
#else
static inline void
__cpuid(int *data, int selector)
{
asm("cpuid"
: "=a" (data[0]),
"=b" (data[1]),
"=c" (data[2]),
"=d" (data[3])
: "a"(selector));
}
#endif
static int query_cpu_feature_sse() {
int buffer[4];
__cpuid(buffer,1);
if ((buffer[3]&(1<<25)) == 0) return 0;
return 1;
}
static int _dumb_it_use_sse = 0;
void _dumb_init_sse()
{
static int initialized = 0;
if (!initialized)
{
_dumb_it_use_sse = query_cpu_feature_sse();
initialized = 1;
}
}
#elif defined(_M_X64) || defined(__amd64__)
static const int _dumb_it_use_sse = 1;
#else
static const int _dumb_it_use_sse = 0;
#endif
static void it_filter(DUMB_CLICK_REMOVER *cr, IT_FILTER_STATE *state, sample_t *dst, long pos, sample_t *src, long size, int step, int sampfreq, int cutoff, int resonance) static void it_filter(DUMB_CLICK_REMOVER *cr, IT_FILTER_STATE *state, sample_t *dst, long pos, sample_t *src, long size, int step, int sampfreq, int cutoff, int resonance)
{ {
#if defined(_USE_SSE) #if defined(_USE_SSE)
_dumb_init_sse();
if ( _dumb_it_use_sse ) it_filter_sse( cr, state, dst, pos, src, size, step, sampfreq, cutoff, resonance ); if ( _dumb_it_use_sse ) it_filter_sse( cr, state, dst, pos, src, size, step, sampfreq, cutoff, resonance );
else else
#endif #endif
@ -938,7 +984,7 @@ static void reset_channel_effects(IT_CHANNEL *channel)
channel->xm_volslide = 0; channel->xm_volslide = 0;
channel->panslide = 0; channel->panslide = 0;
channel->channelvolslide = 0; channel->channelvolslide = 0;
channel->arpeggio_table = &arpeggio_mod; channel->arpeggio_table = (const unsigned char *) &arpeggio_mod;
memset(channel->arpeggio_offsets, 0, sizeof(channel->arpeggio_offsets)); memset(channel->arpeggio_offsets, 0, sizeof(channel->arpeggio_offsets));
channel->retrig = 0; channel->retrig = 0;
if (channel->xm_retrig) { if (channel->xm_retrig) {
@ -2490,7 +2536,7 @@ Yxy This uses a table 4 times larger (hence 4 times slower) than
channel->arpeggio_offsets[0] = 0; channel->arpeggio_offsets[0] = 0;
channel->arpeggio_offsets[1] = (v & 0xF0) >> 4; channel->arpeggio_offsets[1] = (v & 0xF0) >> 4;
channel->arpeggio_offsets[2] = (v & 0x0F); channel->arpeggio_offsets[2] = (v & 0x0F);
channel->arpeggio_table = ((sigdata->flags & (IT_WAS_AN_XM|IT_WAS_A_MOD))==IT_WAS_AN_XM) ? &arpeggio_xm : &arpeggio_mod; channel->arpeggio_table = (const unsigned char *)(((sigdata->flags & (IT_WAS_AN_XM|IT_WAS_A_MOD))==IT_WAS_AN_XM) ? &arpeggio_xm : &arpeggio_mod);
} }
break; break;
case IT_SET_CHANNEL_VOLUME: case IT_SET_CHANNEL_VOLUME:
@ -3124,15 +3170,15 @@ Yxy This uses a table 4 times larger (hence 4 times slower) than
switch (entry->effect) switch (entry->effect)
{ {
case IT_OKT_ARPEGGIO_3: case IT_OKT_ARPEGGIO_3:
channel->arpeggio_table = &arpeggio_okt_3; channel->arpeggio_table = (const unsigned char *)&arpeggio_okt_3;
break; break;
case IT_OKT_ARPEGGIO_4: case IT_OKT_ARPEGGIO_4:
channel->arpeggio_table = &arpeggio_okt_4; channel->arpeggio_table = (const unsigned char *)&arpeggio_okt_4;
break; break;
case IT_OKT_ARPEGGIO_5: case IT_OKT_ARPEGGIO_5:
channel->arpeggio_table = &arpeggio_okt_5; channel->arpeggio_table = (const unsigned char *)&arpeggio_okt_5;
break; break;
} }
} }