diff --git a/Frameworks/Dumb/dumb/include/internal/barray.h b/Frameworks/Dumb/dumb/include/internal/barray.h index 53c9a6cf3..5570d58ec 100644 --- a/Frameworks/Dumb/dumb/include/internal/barray.h +++ b/Frameworks/Dumb/dumb/include/internal/barray.h @@ -3,6 +3,25 @@ #include +#ifdef BARRAY_DECORATE +#undef PASTE +#undef EVALUATE +#define PASTE(a,b) a ## b +#define EVALUATE(a,b) PASTE(a,b) +#define bit_array_create EVALUATE(BARRAY_DECORATE,_bit_array_create) +#define bit_array_destroy EVALUATE(BARRAY_DECORATE,_bit_array_destroy) +#define bit_array_dup EVALUATE(BARRAY_DECORATE,_bit_array_dup) +#define bit_array_reset EVALUATE(BARRAY_DECORATE,_bit_array_reset) +#define bit_array_set EVALUATE(BARRAY_DECORATE,_bit_array_set) +#define bit_array_set_range EVALUATE(BARRAY_DECORATE,_bit_array_set_range) +#define bit_array_test EVALUATE(BARRAY_DECORATE,_bit_array_test) +#define bit_array_test_range EVALUATE(BARRAY_DECORATE,_bit_array_test_range) +#define bit_array_clear EVALUATE(BARRAY_DECORATE,_bit_array_clear) +#define bit_array_clear_range EVALUATE(BARRAY_DECORATE,_bit_array_clear_range) +#define bit_array_merge EVALUATE(BARRAY_DECORATE,_bit_array_merge) +#define bit_array_mask EVALUATE(BARRAY_DECORATE,_bit_array_mask) +#endif + void * bit_array_create(size_t size); void bit_array_destroy(void * array); void * bit_array_dup(void * array); @@ -10,9 +29,13 @@ void * bit_array_dup(void * array); void bit_array_reset(void * array); void bit_array_set(void * array, size_t bit); +void bit_array_set_range(void * array, size_t bit, size_t count); + int bit_array_test(void * array, size_t bit); int bit_array_test_range(void * array, size_t bit, size_t count); + void bit_array_clear(void * array, size_t bit); +void bit_array_clear_range(void * array, size_t bit, size_t count); void bit_array_merge(void * array, void * source, size_t offset); void bit_array_mask(void * array, void * source, size_t offset); diff --git a/Frameworks/Dumb/dumb/include/internal/resampler.h b/Frameworks/Dumb/dumb/include/internal/resampler.h index 0050ebf1a..cea5e38d1 100644 --- a/Frameworks/Dumb/dumb/include/internal/resampler.h +++ b/Frameworks/Dumb/dumb/include/internal/resampler.h @@ -3,6 +3,8 @@ // Ugglay #ifdef RESAMPLER_DECORATE +#undef PASTE +#undef EVALUATE #define PASTE(a,b) a ## b #define EVALUATE(a,b) PASTE(a,b) #define resampler_init EVALUATE(RESAMPLER_DECORATE,_resampler_init) diff --git a/Frameworks/Dumb/dumb/src/helpers/resampler.c b/Frameworks/Dumb/dumb/src/helpers/resampler.c index b684c1c98..cc7a5fcf2 100644 --- a/Frameworks/Dumb/dumb/src/helpers/resampler.c +++ b/Frameworks/Dumb/dumb/src/helpers/resampler.c @@ -8,11 +8,13 @@ #endif #ifdef __APPLE__ #include -#if TARGET_CPU_ARM -#include +#if TARGET_CPU_ARM || TARGET_CPU_ARM64 #define RESAMPLER_NEON #endif #endif +#ifdef RESAMPLER_NEON +#include +#endif #ifdef _MSC_VER #define ALIGNED _declspec(align(16)) @@ -470,7 +472,7 @@ static int resampler_run_blep(resampler * r, float ** out_, float * out_end) } last_amp += sample; sample /= kernel_sum; - for (sample = 0, i = 0; i < SINC_WIDTH * 2; ++i) + for (i = 0; i < SINC_WIDTH * 2; ++i) out[i] += sample * kernel[i]; } @@ -626,8 +628,8 @@ static int resampler_run_blep(resampler * r, float ** out_, float * out_end) { temp1 = vld1q_f32( (const float32_t *)( kernel + i ) ); temp2 = vld1q_f32( (const float32_t *) out + i * 4 ); - temp1 = vmlaq_f32( temp2, temp1, samplex ); - vst1q_f32( (float32_t *) out + i * 4, temp1 ); + temp2 = vmlaq_f32( temp2, temp1, samplex ); + vst1q_f32( (float32_t *) out + i * 4, temp2 ); } } @@ -743,7 +745,7 @@ static int resampler_run_blam(resampler * r, float ** out_, float * out_end) } last_amp += sample; sample /= kernel_sum; - for (sample = 0, i = 0; i < SINC_WIDTH * 2; ++i) + for (i = 0; i < SINC_WIDTH * 2; ++i) out[i] += sample * kernel[i]; } @@ -908,7 +910,7 @@ static int resampler_run_blam(resampler * r, float ** out_, float * out_end) sample = in[0]; if (phase_inc < 1.0f) - sample += (in[1] - in[0]) * fphase; + sample += (in[1] - in[0]) * phase; sample -= last_amp; if (sample) @@ -935,8 +937,8 @@ static int resampler_run_blam(resampler * r, float ** out_, float * out_end) { temp1 = vld1q_f32( (const float32_t *)( kernel + i ) ); temp2 = vld1q_f32( (const float32_t *) out + i * 4 ); - temp1 = vmlaq_f32( temp2, temp1, samplex ); - vst1q_f32( (float32_t *) out + i * 4, temp1 ); + temp2 = vmlaq_f32( temp2, temp1, samplex ); + vst1q_f32( (float32_t *) out + i * 4, temp2 ); } } diff --git a/Frameworks/Dumb/dumb/src/it/itread.c b/Frameworks/Dumb/dumb/src/it/itread.c index 0ed0b1b9a..8f41d57bf 100644 --- a/Frameworks/Dumb/dumb/src/it/itread.c +++ b/Frameworks/Dumb/dumb/src/it/itread.c @@ -53,13 +53,13 @@ static int readblock(DUMBFILE *f, readblock_crap * crap) size = dumbfile_igetw(f); if (size < 0) - return size; + return (int)size; crap->sourcebuf = malloc(size); if (!crap->sourcebuf) return -1; - c = dumbfile_getnc((char *)crap->sourcebuf, size, f); + c = (int)dumbfile_getnc((char *)crap->sourcebuf, size, f); if (c < size) { free(crap->sourcebuf); crap->sourcebuf = NULL; @@ -114,7 +114,7 @@ static int decompress8(DUMBFILE *f, signed char *data, int len, int it215, int s int blocklen, blockpos; byte bitwidth; word val; - char d1, d2; + signed char d1, d2; readblock_crap crap; memset(&crap, 0, sizeof(crap)); @@ -166,14 +166,14 @@ static int decompress8(DUMBFILE *f, signed char *data, int len, int it215, int s //Expand the value to signed byte: { - char v; //The sample value: + signed char v; //The sample value: if (bitwidth < 8) { byte shift = 8 - bitwidth; v = (val << shift); v >>= shift; } else - v = (char)val; + v = (signed char)val; //And integrate the sample value //(It always has to end with integration doesn't it ? ;-) @@ -202,7 +202,7 @@ static int decompress16(DUMBFILE *f, short *data, int len, int it215, int stereo int blocklen, blockpos; byte bitwidth; long val; - short d1, d2; + signed short d1, d2; readblock_crap crap; memset(&crap, 0, sizeof(crap)); @@ -428,13 +428,15 @@ static int it_read_old_instrument(IT_INSTRUMENT *instrument, DUMBFILE *f) static int it_read_instrument(IT_INSTRUMENT *instrument, DUMBFILE *f, int maxlen) { - int n, len; + int n; + long len; /*if (dumbfile_mgetl(f) != IT_INSTRUMENT_SIGNATURE) return -1;*/ // XXX if (maxlen) len = dumbfile_pos(f); + else len = 0; dumbfile_skip(f, 4); @@ -660,17 +662,17 @@ static long it_read_sample_data(IT_SAMPLE *sample, unsigned char convert, DUMBFI if (sample->flags & IT_SAMPLE_STEREO) { if (sample->flags & IT_SAMPLE_16BIT) { - decompress16(f, (short *) sample->data, datasize >> 1, convert & 4, 1); - decompress16(f, (short *) sample->data + 1, datasize >> 1, convert & 4, 1); + decompress16(f, (short *) sample->data, (int)(datasize >> 1), convert & 4, 1); + decompress16(f, (short *) sample->data + 1, (int)(datasize >> 1), convert & 4, 1); } else { - decompress8(f, (signed char *) sample->data, datasize >> 1, convert & 4, 1); - decompress8(f, (signed char *) sample->data + 1, datasize >> 1, convert & 4, 1); + decompress8(f, (signed char *) sample->data, (int)(datasize >> 1), convert & 4, 1); + decompress8(f, (signed char *) sample->data + 1, (int)(datasize >> 1), convert & 4, 1); } } else { if (sample->flags & IT_SAMPLE_16BIT) - decompress16(f, (short *) sample->data, datasize, convert & 4, 0); + decompress16(f, (short *) sample->data, (int)datasize, convert & 4, 0); else - decompress8(f, (signed char *) sample->data, datasize, convert & 4, 0); + decompress8(f, (signed char *) sample->data, (int)datasize, convert & 4, 0); } } else if (sample->flags & IT_SAMPLE_16BIT) { if (sample->flags & IT_SAMPLE_STEREO) { @@ -923,8 +925,8 @@ IT_COMPONENT; static int it_component_compare(const void *e1, const void *e2) { - return ((const IT_COMPONENT *)e1)->offset - - ((const IT_COMPONENT *)e2)->offset; + return (int)(((const IT_COMPONENT *)e1)->offset - + ((const IT_COMPONENT *)e2)->offset); } @@ -994,7 +996,7 @@ static sigdata_t *it_load_sigdata(DUMBFILE *f) dumbfile_skip(f, 1); message_length = dumbfile_igetw(f); - message_offset = dumbfile_igetl(f); + message_offset = (int)dumbfile_igetl(f); /* Skip Reserved. */ dumbfile_skip(f, 4); @@ -1235,7 +1237,7 @@ static sigdata_t *it_load_sigdata(DUMBFILE *f) case IT_COMPONENT_SONG_MESSAGE: if ( n < n_components ) { - message_length = min( message_length, component[n+1].offset - component[n].offset ); + message_length = min( message_length, (int)(component[n+1].offset - component[n].offset) ); } sigdata->song_message = malloc(message_length + 1); if (sigdata->song_message) { @@ -1253,7 +1255,7 @@ static sigdata_t *it_load_sigdata(DUMBFILE *f) if (cmwt < 0x200) m = it_read_old_instrument(&sigdata->instrument[component[n].n], f); else - m = it_read_instrument(&sigdata->instrument[component[n].n], f, (n + 1 < n_components) ? (component[n+1].offset - component[n].offset) : 0); + m = it_read_instrument(&sigdata->instrument[component[n].n], f, (n + 1 < n_components) ? (int)(component[n+1].offset - component[n].offset) : 0); if (m) { free(buffer); @@ -1340,7 +1342,7 @@ static sigdata_t *it_load_sigdata(DUMBFILE *f) if ( !dumbfile_error( f ) && n < 10 ) { - unsigned int mptx_id = dumbfile_igetl( f ); + unsigned int mptx_id = (unsigned int)dumbfile_igetl( f ); while ( !dumbfile_error( f ) && mptx_id != DUMB_ID('M','P','T','S') ) { unsigned int size = dumbfile_igetw( f ); @@ -1353,10 +1355,10 @@ static sigdata_t *it_load_sigdata(DUMBFILE *f) break; } - mptx_id = dumbfile_igetl( f ); + mptx_id = (unsigned int)dumbfile_igetl( f ); } - mptx_id = dumbfile_igetl( f ); + mptx_id = (unsigned int)dumbfile_igetl( f ); while ( !dumbfile_error(f) && dumbfile_pos(f) < dumbfile_get_size(f) ) { unsigned int size = dumbfile_igetw( f ); @@ -1368,14 +1370,14 @@ static sigdata_t *it_load_sigdata(DUMBFILE *f) if ( size == 2 ) sigdata->tempo = dumbfile_igetw( f ); else if ( size == 4 ) - sigdata->tempo = dumbfile_igetl( f ); + sigdata->tempo = (int)dumbfile_igetl( f ); break; default: dumbfile_skip(f, size); break; } - mptx_id = dumbfile_igetl( f ); + mptx_id = (unsigned int)dumbfile_igetl( f ); } } diff --git a/Frameworks/Dumb/dumb/src/it/itrender.c b/Frameworks/Dumb/dumb/src/it/itrender.c index d5fe0b34f..15d956c4d 100644 --- a/Frameworks/Dumb/dumb/src/it/itrender.c +++ b/Frameworks/Dumb/dumb/src/it/itrender.c @@ -28,6 +28,16 @@ #include "internal/resampler.h" +// Keep this disabled, as it's actually slower than the original C/integer version +// +//#ifdef __APPLE__ +//#include +//#if TARGET_CPU_ARM || TARGET_CPU_ARM64 +//#include +//#define FILTER_NEON +//#endif +//#endif + // #define BIT_ARRAY_BULLSHIT static IT_PLAYING *new_playing() @@ -742,6 +752,91 @@ static void it_filter_sse(DUMB_CLICK_REMOVER *cr, IT_FILTER_STATE *state, sample } #endif +#ifdef FILTER_NEON +static void it_filter_neon(DUMB_CLICK_REMOVER *cr, IT_FILTER_STATE *state, sample_t *dst, long pos, sample_t *src, long size, int step, int sampfreq, int cutoff, int resonance) +{ + float32x4_t data, impulse; + float32x4_t temp1; + float32x2_t temp2; + float32_t temp3; + + sample_t currsample = state->currsample; + sample_t prevsample = state->prevsample; + + float imp[4]; + + //profiler( filter_sse ); On ClawHammer Athlon64 3200+, ~12000 cycles, ~500 for that x87 setup code (as opposed to ~25500 for the original integer code) + + long datasize; + + { + float inv_angle = (float)(sampfreq * pow(0.5, 0.25 + cutoff*(1.0/(24< 2.0f) d = 2.0f; + d = (loss - d) * inv_angle; + e = inv_angle * inv_angle; + imp[0] = 1.0f / (1.0f + d + e); + imp[2] = -e * imp[0]; + imp[1] = 1.0f - imp[0] - imp[2]; +#else + imp[0] = 1.0f / (inv_angle*inv_angle + inv_angle*loss + loss); + imp[2] = -(inv_angle*inv_angle) * imp[0]; + imp[1] = 1.0f - imp[0] - imp[2]; +#endif + imp[3] = 0.0f; + } + + dst += pos * step; + datasize = size * step; + + { + int ai, bi, ci, i; + + if (cr) { + sample_t startstep; + ai = (int)(imp[0] * (1 << (16+SCALEB))); + bi = (int)(imp[1] * (1 << (16+SCALEB))); + ci = (int)(imp[2] * (1 << (16+SCALEB))); + startstep = MULSCA(src[0], ai) + MULSCA(currsample, bi) + MULSCA(prevsample, ci); + dumb_record_click(cr, pos, startstep); + } + + data = vdupq_n_f32(0.0f); + data = vsetq_lane_f32( currsample, data, 1 ); + data = vsetq_lane_f32( prevsample, data, 2 ); + impulse = vld1q_f32( (const float32_t *) &imp ); + + for (i = 0; i < datasize; i += step) { + data = vsetq_lane_f32( src [i], data, 0 ); + temp1 = vmulq_f32(data, impulse); + temp2 = vadd_f32(vget_high_f32(temp1), vget_low_f32(temp1)); + temp3 = vget_lane_f32(vpadd_f32(temp2, temp2), 0); + data = vextq_f32(data, data, 3); + data = vsetq_lane_f32(temp3, data, 1); + dst [i] += temp3; + } + + currsample = temp3; + prevsample = vgetq_lane_f32(data, 2); + + if (cr) { + sample_t endstep = MULSCA(src[datasize], ai) + MULSCA(currsample, bi) + MULSCA(prevsample, ci); + dumb_record_click(cr, pos + size, -endstep); + } + } + + state->currsample = currsample; + state->prevsample = prevsample; +} +#endif + #undef LOG10 #ifdef _USE_SSE @@ -821,7 +916,11 @@ static void it_filter(DUMB_CLICK_REMOVER *cr, IT_FILTER_STATE *state, sample_t * if ( _dumb_it_use_sse ) it_filter_sse( cr, state, dst, pos, src, size, step, sampfreq, cutoff, resonance ); else #endif +#ifdef FILTER_NEON + it_filter_neon( cr, state, dst, pos, src, size, step, sampfreq, cutoff, resonance ); +#else it_filter_int( cr, state, dst, pos, src, size, step, sampfreq, cutoff, resonance ); +#endif } @@ -2076,7 +2175,7 @@ static void post_process_it_volpan(DUMB_IT_SIGRENDERER *sigrenderer, IT_ENTRY *e static void it_send_midi(DUMB_IT_SIGRENDERER *sigrenderer, IT_CHANNEL *channel, unsigned char midi_byte) { if (sigrenderer->callbacks->midi) - if ((*sigrenderer->callbacks->midi)(sigrenderer->callbacks->midi_data, channel - sigrenderer->channel, midi_byte)) + if ((*sigrenderer->callbacks->midi)(sigrenderer->callbacks->midi_data, (int)(channel - sigrenderer->channel), midi_byte)) return; switch (channel->midi_state) { @@ -2555,11 +2654,11 @@ Yxy This uses a table 4 times larger (hence 4 times slower) than IT_SAMPLE *sample = playing->sample; int end; if ((sample->flags & IT_SAMPLE_SUS_LOOP) && !(playing->flags & IT_PLAYING_SUSTAINOFF)) - end = sample->sus_loop_end; + end = (int)sample->sus_loop_end; else if (sample->flags & IT_SAMPLE_LOOP) - end = sample->loop_end; + end = (int)sample->loop_end; else { - end = sample->length; + end = (int)sample->length; if ( sigdata->flags & IT_WAS_PROCESSED && end > 64 ) // XXX bah damn LPC and edge case modules end -= 64; } @@ -3992,7 +4091,7 @@ static void process_playing(DUMB_IT_SIGRENDERER *sigrenderer, IT_PLAYING *playin playing->sample_vibrato_time += playing->sample->vibrato_speed; } -#if defined(_MSC_VER) && _MSC_VER < 1800 +#if (defined(_MSC_VER) && _MSC_VER < 1800) || defined(__ANDROID__) static float log2(float x) {return (float)log(x)/(float)log(2.0f);} #endif @@ -4006,6 +4105,7 @@ static int delta_to_note(float delta, int base) } // Period table for Protracker octaves 0-5: +#if 0 static const unsigned short ProTrackerPeriodTable[6*12] = { 1712,1616,1524,1440,1356,1280,1208,1140,1076,1016,960,907, @@ -4036,6 +4136,7 @@ static const unsigned short ProTrackerTunedPeriods[16*12] = 1736,1640,1548,1460,1378,1302,1228,1160,1094,1032,974,920, 1724,1628,1536,1450,1368,1292,1220,1150,1086,1026,968,914 }; +#endif static void process_all_playing(DUMB_IT_SIGRENDERER *sigrenderer) { @@ -4137,7 +4238,7 @@ static void process_all_playing(DUMB_IT_SIGRENDERER *sigrenderer) } if (playing->channel->glissando && playing->channel->toneporta && playing->channel->destnote < 120) { - playing->delta = (float)pow(DUMB_SEMITONE_BASE, delta_to_note(playing->delta, playing->sample->C5_speed) - 60) + playing->delta = (float)pow(DUMB_SEMITONE_BASE, delta_to_note(playing->delta, (int)playing->sample->C5_speed) - 60) * playing->sample->C5_speed * (1.f / 65536.f); } @@ -5613,9 +5714,9 @@ void _dumb_it_end_sigrenderer(sigrenderer_t *vsigrenderer) #ifdef BIT_ARRAY_BULLSHIT static long it_sigrenderer_get_position(sigrenderer_t *vsigrenderer) { - DUMB_IT_SIGRENDERER *sigrenderer = vsigrenderer; + DUMB_IT_SIGRENDERER *sigrenderer = (DUMB_IT_SIGRENDERER *) vsigrenderer; - return sigrenderer->time_played >> 16; + return (long)(sigrenderer->time_played >> 16); } #endif @@ -5671,7 +5772,7 @@ void dumb_it_sr_get_channel_state(DUMB_IT_SIGRENDERER *sr, int channel, DUMB_IT_ if (playing->flags & IT_PLAYING_DEAD) { state->sample = 0; return; } - state->channel = playing->channel - sr->channel; + state->channel = (int)(playing->channel - sr->channel); state->sample = playing->sampnum; state->volume = calculate_volume(sr, playing, 1.0f); diff --git a/Frameworks/Dumb/dumb/src/it/readamf.c b/Frameworks/Dumb/dumb/src/it/readamf.c index d79abccf0..82fa4fca3 100644 --- a/Frameworks/Dumb/dumb/src/it/readamf.c +++ b/Frameworks/Dumb/dumb/src/it/readamf.c @@ -210,7 +210,7 @@ static int it_amf_read_sample_header( IT_SAMPLE *sample, DUMBFILE *f, int * offs dumbfile_getnc( (char *) sample->filename, 13, f ); sample->filename[13] = 0; - *offset = dumbfile_igetl( f ); + *offset = (int)dumbfile_igetl( f ); sample->length = dumbfile_igetl( f ); sample->C5_speed = dumbfile_igetw( f ); sample->default_volume = dumbfile_getc( f ); @@ -259,14 +259,14 @@ static int it_amf_read_sample_data( IT_SAMPLE *sample, DUMBFILE *f ) return -1; if ( sample->length ) - read_length = dumbfile_getnc( sample->data, sample->length, f ); + read_length = (int)dumbfile_getnc( sample->data, sample->length, f ); for ( i = 0; i < read_length; i++ ) { - ( ( char * ) sample->data )[ i ] ^= 0x80; + ( ( signed char * ) sample->data )[ i ] ^= 0x80; } for ( i = read_length; i < sample->length; i++ ) { - ( ( char * ) sample->data )[ i ] = 0; + ( ( signed char * ) sample->data )[ i ] = 0; } return 0; /* Sometimes the last sample is truncated :( */ diff --git a/Frameworks/Dumb/dumb/src/it/readoldpsm.c b/Frameworks/Dumb/dumb/src/it/readoldpsm.c index 6000bdb7d..b3f81076d 100644 --- a/Frameworks/Dumb/dumb/src/it/readoldpsm.c +++ b/Frameworks/Dumb/dumb/src/it/readoldpsm.c @@ -169,8 +169,8 @@ static int it_old_psm_read_samples(IT_SAMPLE ** sample, DUMBFILE * f, int * num) } else { if (flags & 4) { for (o = 0; o < s->length; o++) { - delta += (short)(sdata[o * 2] | (sdata[(o * 2) + 1] << 8)); - ((short *)s->data)[o] = delta; + delta += (signed short)(sdata[o * 2] | (sdata[(o * 2) + 1] << 8)); + ((signed short *)s->data)[o] = delta; } } else { for (o = 0; o < s->length; o++) { @@ -464,7 +464,7 @@ static int it_old_psm_read_patterns(IT_PATTERN * pattern, DUMBFILE * f, int num, } } - p->n_entries = entry - p->entry; + p->n_entries = (int)(entry - p->entry); offset += psize; } @@ -493,8 +493,8 @@ PSM_COMPONENT; static int psm_component_compare(const void *e1, const void *e2) { - return ((const PSM_COMPONENT *)e1)->offset - - ((const PSM_COMPONENT *)e2)->offset; + return (int)(((const PSM_COMPONENT *)e1)->offset - + ((const PSM_COMPONENT *)e2)->offset); } static DUMB_IT_SIGDATA *it_old_psm_load_sigdata(DUMBFILE *f) @@ -590,7 +590,7 @@ static DUMB_IT_SIGDATA *it_old_psm_load_sigdata(DUMBFILE *f) if (!n_components) goto error_fc; - total_pattern_size = dumbfile_igetl(f); + total_pattern_size = (int)dumbfile_igetl(f); if (!total_pattern_size) goto error_fc; qsort(component, n_components, sizeof(PSM_COMPONENT), &psm_component_compare);