diff --git a/Frameworks/lazyusf/lazyusf/memory.c b/Frameworks/lazyusf/lazyusf/memory.c index 861c8bd42..ff5d96a07 100644 --- a/Frameworks/lazyusf/lazyusf/memory.c +++ b/Frameworks/lazyusf/lazyusf/memory.c @@ -191,7 +191,7 @@ int32_t r4300i_LB_NonMemory ( usf_state_t * state, uint32_t PAddr, uint32_t * Va } uint32_t r4300i_LB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t * Value ) { - uintptr_t address; + uint32_t address; address = state->TLB_Map[VAddr >> 12]; if (address == 0) { return 0; } *Value = *(uint8_t *)(address + (VAddr ^ 3)); @@ -199,7 +199,7 @@ uint32_t r4300i_LB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t * Value } uint32_t r4300i_LD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t * Value ) { - uintptr_t address; + uint32_t address; address = state->TLB_Map[VAddr >> 12]; if (address == 0) { return 0; } *((uint32_t *)(Value) + 1) = *(uint32_t *)(address + VAddr); @@ -220,7 +220,7 @@ int32_t r4300i_LH_NonMemory ( usf_state_t * state, uint32_t PAddr, uint32_t * Va } uint32_t r4300i_LH_VAddr ( usf_state_t * state, uint32_t VAddr, uint16_t * Value ) { - uintptr_t address; + uint32_t address; address = state->TLB_Map[VAddr >> 12]; if (address == 0) return 0; @@ -426,7 +426,7 @@ int32_t r4300i_SB_NonMemory ( usf_state_t * state, uint32_t PAddr, uint8_t Value } uint32_t r4300i_SB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t Value ) { - uintptr_t address; + uint32_t address; address = state->TLB_Map[VAddr >> 12]; if (address == 0) { return 0; } @@ -457,7 +457,7 @@ int32_t r4300i_SH_NonMemory ( usf_state_t * state, uint32_t PAddr, uint16_t Valu } uint32_t r4300i_SD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t Value ) { - uintptr_t address; + uint32_t address; address = state->TLB_Map[VAddr >> 12]; if (address == 0) { return 0; } *(uint32_t *)(address + VAddr) = *((uint32_t *)(&Value) + 1); @@ -466,7 +466,7 @@ uint32_t r4300i_SD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t Value ) } uint32_t r4300i_SH_VAddr ( usf_state_t * state, uint32_t VAddr, uint16_t Value ) { - uintptr_t address; + uint32_t address; address = state->TLB_Map[VAddr >> 12]; if (address == 0) { return 0; } diff --git a/Frameworks/lazyusf/lazyusf/rsp/execute.h b/Frameworks/lazyusf/lazyusf/rsp/execute.h index 8afbb7238..01801242b 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/execute.h +++ b/Frameworks/lazyusf/lazyusf/rsp/execute.h @@ -22,6 +22,9 @@ NOINLINE void run_task(usf_state_t * state) { register int PC; int wrap_count = 0; +#ifdef SP_EXECUTE_LOG + int last_PC; +#endif if (CFG_WAIT_FOR_CPU_HOST != 0) { @@ -36,6 +39,9 @@ NOINLINE void run_task(usf_state_t * state) register uint32_t inst; inst = *(uint32_t *)(state->IMEM + FIT_IMEM(PC)); +#ifdef SP_EXECUTE_LOG + last_PC = PC; +#endif #ifdef EMULATE_STATIC_PC PC = (PC + 0x004); if ( FIT_IMEM(PC) == 0 && ++wrap_count == 32 ) @@ -46,7 +52,7 @@ NOINLINE void run_task(usf_state_t * state) EX: #endif #ifdef SP_EXECUTE_LOG - step_SP_commands(inst); + step_SP_commands(state, last_PC, inst); #endif if (inst >> 25 == 0x25) /* is a VU instruction */ { @@ -463,6 +469,9 @@ EX: continue; BRANCH: inst = *(uint32_t *)(state->IMEM + FIT_IMEM(PC)); +#ifdef SP_EXECUTE_LOG + last_PC = PC; +#endif PC = state->temp_PC & 0x00000FFC; goto EX; #endif diff --git a/Frameworks/lazyusf/lazyusf/rsp/rsp.h b/Frameworks/lazyusf/lazyusf/rsp/rsp.h index 1ba9287af..0b40ccc91 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/rsp.h +++ b/Frameworks/lazyusf/lazyusf/rsp/rsp.h @@ -58,12 +58,10 @@ NOINLINE static void message(usf_state_t * state, const char* body, int 
priority */ #define CHARACTERS_PER_LINE (80) /* typical standard DOS text file limit per line */ -#if 0 NOINLINE static void update_conf(const char* source) { (void)source; } -#endif #ifdef SP_EXECUTE_LOG extern void step_SP_commands(usf_state_t * state, int PC, uint32_t inst); diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/cf.h b/Frameworks/lazyusf/lazyusf/rsp/vu/cf.h index 9e080c2f0..d4b31a910 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/cf.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/cf.h @@ -180,7 +180,6 @@ void set_VCC(usf_state_t * state, unsigned short VCC) state->clip[i] = (VCC >> (i + 0x8)) & 1; return; /* Little endian becomes big. */ } -#if 0 void set_VCE(usf_state_t * state, unsigned char VCE) { register int i; @@ -190,4 +189,3 @@ void set_VCE(usf_state_t * state, unsigned char VCE) return; /* Little endian becomes big. */ } #endif -#endif diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/clamp.h b/Frameworks/lazyusf/lazyusf/rsp/vu/clamp.h index e2b6d79d6..d238931af 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/clamp.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/clamp.h @@ -42,6 +42,22 @@ static INLINE void merge(short* VD, short* cmp, short* pass, short* fail) { register int i; + +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t p,f,d,c,vd,temp; + + p = vld1q_s16((const int16_t*)pass); + f = vld1q_s16((const int16_t*)fail); + c = vld1q_s16((const int16_t*)cmp); + + d = vsubq_s16(p,f); + vd = vmlaq_s16(f, c, d); //vd = f + (cmp * d) + vst1q_s16(VD, vd); + return; + +#else + #if (0) /* Do not use this version yet, as it still does not vectorize to SSE2. */ for (i = 0; i < N; i++) @@ -55,9 +71,92 @@ static INLINE void merge(short* VD, short* cmp, short* pass, short* fail) VD[i] = fail[i] + cmp[i]*diff[i]; /* actually `(cmp[i] != 0)*diff[i]` */ #endif return; + +#endif } -#ifndef ARCH_MIN_SSE2 +#ifdef ARCH_MIN_ARM_NEON +static INLINE void vector_copy(short * VD, short * VS) +{ + int16x8_t xmm; + xmm = vld1q_s16((const int16_t*)VS); + vst1q_s16(VD, xmm); + + return; +} + +static INLINE void SIGNED_CLAMP_ADD(usf_state_t * state, short* VD, short* VS, short* VT) +{ + int16x8_t dst, src, vco, max, min; + + src = vld1q_s16((const int16_t*)VS); + dst = vld1q_s16((const int16_t*)VT); + vco = vld1q_s16((const int16_t*)state->co); + + max = vmaxq_s16(dst, src); + min = vminq_s16(dst, src); + + min = vqaddq_s16(min, vco); + max = vqaddq_s16(max, min); + + vst1q_s16(VD, max); + return; + +} + +static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, short* VT) +{ + int16x8_t dst, src, vco, dif, res, xmm,vd; + + src = vld1q_s16((const int16_t*)VS); + vd = vld1q_s16((const int16_t*)VD); + dst = vld1q_s16((const int16_t*)VT); + vco = vld1q_s16((const int16_t*)state->co); + + res = vqsubq_s16(src, dst); + + dif = vaddq_s16(res, vco); + dif = veorq_s16(dif, res); + dif = vandq_s16(dif, dst); + xmm = vsubq_s16(src, dst); + src = vbicq_s16(dif, src); + xmm = vandq_s16(xmm, src); + xmm = vshrq_n_s16(xmm, 15); + + xmm = vbicq_s16(vco, xmm); + res = vqsubq_s16(res, xmm); + vst1q_s16(VD, res); + + return; + +} + +static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD) +{ + int16x8_t pvs, pvd; + int16x8x2_t packed; + int16x8_t result; + int16x4_t low, high; + + pvs = vld1q_s16((const int16_t*)VACC_H); + pvd = vld1q_s16((const int16_t*)VACC_M); + + packed = vzipq_s16(pvd,pvs); + + low = vqmovn_s32((int32x4_t)packed.val[0]); + high = vqmovn_s32((int32x4_t)packed.val[1]); + + result = vcombine_s16(low,high); + + vst1q_s16(VD,result); + + return; +} + +#endif + +#if !defined ARCH_MIN_SSE2 && 
!defined ARCH_MIN_ARM_NEON
+
 static INLINE void vector_copy(short* VD, short* VS)
 {
 #if (0)
@@ -92,6 +191,8 @@ static INLINE void SIGNED_CLAMP_ADD(usf_state_t * state, short* VD, short* VS, s
         VD[i] ^= 0x8000 & (hi[i] | lo[i]);
     return;
 }
+
+
 static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, short* VT)
 {
     ALIGNED int32_t dif[N];
@@ -113,8 +214,9 @@ static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, s
         VD[i] ^= 0x8000 & (hi[i] | lo[i]);
     return;
 }
+
 static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
-{ /* typical sign-clamp of accumulator-mid (bits 31:16) */
+{
     ALIGNED short hi[N], lo[N];
     register int i;
 
@@ -135,7 +237,9 @@ static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
         VD[i] ^= 0x8000 * (hi[i] | lo[i]);
     return;
 }
-#else
+#endif
+
+#ifdef ARCH_MIN_SSE2
 /*
  * We actually need to write explicit SSE2 code for this because GCC 4.8.1
  * (and possibly later versions) has a code generation bug with vectorizing
@@ -225,10 +329,29 @@ static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
 
 static INLINE void UNSIGNED_CLAMP(usf_state_t * state, short* VD)
 { /* sign-zero hybrid clamp of accumulator-mid (bits 31:16) */
+    ALIGNED short cond[N];
     ALIGNED short temp[N];
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+    SIGNED_CLAMP_AM(state, temp); /* run the clamp first so temp is initialized before the vector load */
+
+    uint16x8_t c;
+    int16x8_t t = vld1q_s16((const int16_t*)temp);
+    int16x8_t vaccm = vld1q_s16((const int16_t*)VACC_M);
+
+    c = vcgtq_s16(t,vaccm);
+    int16x8_t t_ = vshrq_n_s16(t,15);
+    int16x8_t vd = vbicq_s16(t,t_);
+    vd = vorrq_s16(vd,(int16x8_t)c);
+    vst1q_s16(VD, vd);
+
+    return;
+
+#else
+
     SIGNED_CLAMP_AM(state, temp); /* no direct map in SSE, but closely based on this */
     for (i = 0; i < N; i++)
         cond[i] = -(temp[i] > VACC_M[i]); /* VD |= -(ACC47..16 > +32767) */
@@ -237,12 +360,36 @@ static INLINE void UNSIGNED_CLAMP(usf_state_t * state, short* VD)
     for (i = 0; i < N; i++)
         VD[i] = VD[i] | cond[i];
     return;
+#endif
 }
+
 static INLINE void SIGNED_CLAMP_AL(usf_state_t * state, short* VD)
 { /* sign-clamp accumulator-low (bits 15:0) */
+    ALIGNED short cond[N];
     ALIGNED short temp[N];
     register int i;
+
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    SIGNED_CLAMP_AM(state, temp);
+
+    uint16x8_t c;
+    int16x8_t eightk = vdupq_n_s16((int16_t)0x8000);
+    uint16x8_t one = vdupq_n_u16(1);
+    int16x8_t t = vld1q_s16((const int16_t*)temp);
+    int16x8_t vaccm = vld1q_s16((const int16_t*)VACC_M);
+
+    c = vceqq_s16(t,vaccm);
+    c = vaddq_u16(c, one);
+    t = veorq_s16(t, eightk);
+    vst1q_u16((uint16_t*)cond,c);
+    vst1q_s16(temp,t);
+    merge(VD, cond, temp, VACC_L);
+
+    return;
+#else
     SIGNED_CLAMP_AM(state, temp); /* no direct map in SSE, but closely based on this */
     for (i = 0; i < N; i++)
@@ -251,5 +398,6 @@ static INLINE void SIGNED_CLAMP_AL(usf_state_t * state, short* VD)
         temp[i] ^= 0x8000; /* half-assed unsigned saturation mix in the clamp */
     merge(VD, cond, temp, VACC_L);
     return;
+#endif
 }
 #endif
diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/shuffle.h b/Frameworks/lazyusf/lazyusf/rsp/vu/shuffle.h
index 0898133fb..f82991c86 100644
--- a/Frameworks/lazyusf/lazyusf/rsp/vu/shuffle.h
+++ b/Frameworks/lazyusf/lazyusf/rsp/vu/shuffle.h
@@ -22,65 +22,42 @@
 #define INLINE
 #endif
 
-#ifndef ARCH_MIN_SSE2
-/*
- * vector-scalar element decoding
- * Obsolete.  Consider using at least the SSE2 algorithms instead.
- */ -static const int ei[16][8] = { - { 00, 01, 02, 03, 04, 05, 06, 07 }, /* none (vector-only operand) */ - { 00, 01, 02, 03, 04, 05, 06, 07 }, - { 00, 00, 02, 02, 04, 04, 06, 06 }, /* 0Q */ - { 01, 01, 03, 03, 05, 05, 07, 07 }, /* 1Q */ - { 00, 00, 00, 00, 04, 04, 04, 04 }, /* 0H */ - { 01, 01, 01, 01, 05, 05, 05, 05 }, /* 1H */ - { 02, 02, 02, 02, 06, 06, 06, 06 }, /* 2H */ - { 03, 03, 03, 03, 07, 07, 07, 07 }, /* 3H */ - { 00, 00, 00, 00, 00, 00, 00, 00 }, /* 0 */ - { 01, 01, 01, 01, 01, 01, 01, 01 }, /* 1 */ - { 02, 02, 02, 02, 02, 02, 02, 02 }, /* 2 */ - { 03, 03, 03, 03, 03, 03, 03, 03 }, /* 3 */ - { 04, 04, 04, 04, 04, 04, 04, 04 }, /* 4 */ - { 05, 05, 05, 05, 05, 05, 05, 05 }, /* 5 */ - { 06, 06, 06, 06, 06, 06, 06, 06 }, /* 6 */ - { 07, 07, 07, 07, 07, 07, 07, 07 } /* 7 */ +#ifdef ARCH_MIN_ARM_NEON +static const unsigned char smask[16][16] = { + {0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF}, + {0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF}, + {0x0,0x1,0x0,0x1,0x4,0x5,0x4,0x5,0x8,0x9,0x8,0x9,0xC,0xD,0xC,0xD}, + {0x2,0x3,0x2,0x3,0x6,0x7,0x6,0x7,0xA,0xB,0xA,0xB,0xE,0xF,0xE,0xF}, + {0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9}, + {0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB}, + {0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD}, + {0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF}, + {0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1}, + {0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3}, + {0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5}, + {0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7}, + {0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9}, + {0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB}, + {0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD}, + {0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF} }; -int sub_mask[16] = { - 0x0, - 0x0, - 0x1, 0x1, - 0x3, 0x3, 0x3, 0x3, - 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7 -}; +#define conv816_to_882(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) }) INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e) -{ - ALIGNED short SV[8]; - register int i, j; -#if (0 == 0) - j = sub_mask[e]; - for (i = 0; i < N; i++) - SV[i] = VT[(i & ~j) | (e & j)]; -#else - if (e & 0x8) - for (i = 0; i < N; i++) - SV[i] = VT[(i & 0x0) | (e & 0x7)]; - else if (e & 0x4) - for (i = 0; i < N; i++) - SV[i] = VT[(i & 0xC) | (e & 0x3)]; - else if (e & 0x2) - for (i = 0; i < N; i++) - SV[i] = VT[(i & 0xE) | (e & 0x1)]; - else /* if ((e == 0b0000) || (e == 0b0001)) */ - for (i = 0; i < N; i++) - SV[i] = VT[(i & 0x7) | (e & 0x0)]; -#endif - for (i = 0; i < N; i++) - *(VD + i) = *(SV + i); +{ + int8x16_t xmm; + int8x16_t key; + + xmm = vld1q_s8((const int8_t*)VT); + key = vld1q_s8(smask[e]); + xmm = vcombine_s8(vtbl2_s8(conv816_to_882(xmm), vget_low_s8(key)), vtbl2_s8(conv816_to_882(xmm), vget_high_s8(key))); + vst1q_s8((int8_t*)VD, xmm); + return; } -#else +#endif + #ifdef ARCH_MIN_SSSE3 static const unsigned char smask[16][16] = { {0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF}, @@ -112,7 +89,9 @@ INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e) _mm_store_si128((__m128i *)VD, xmm); return; } -#else +#endif + +#if defined ARCH_MIN_SSE2 && !defined ARCH_MIN_SSSE3 #define B(x) ((x) & 3) #define SHUFFLE(a,b,c,d) ((B(d)<<6) | (B(c)<<4) | (B(b)<<2) | (B(a)<<0)) @@ -274,5 +253,65 @@ INLINE static 
void SHUFFLE_VECTOR(short* VD, short* VT, const int e) return; } #endif + +#if !defined ARCH_MIN_ARM_NEON && !defined ARCH_MIN_SSE2 && !defined ARCH_MIN_SSSE3 +/* + * vector-scalar element decoding + * Obsolete. Consider using at least the SSE2 algorithms instead. + */ +static const int ei[16][8] = { + { 00, 01, 02, 03, 04, 05, 06, 07 }, /* none (vector-only operand) */ + { 00, 01, 02, 03, 04, 05, 06, 07 }, + { 00, 00, 02, 02, 04, 04, 06, 06 }, /* 0Q */ + { 01, 01, 03, 03, 05, 05, 07, 07 }, /* 1Q */ + { 00, 00, 00, 00, 04, 04, 04, 04 }, /* 0H */ + { 01, 01, 01, 01, 05, 05, 05, 05 }, /* 1H */ + { 02, 02, 02, 02, 06, 06, 06, 06 }, /* 2H */ + { 03, 03, 03, 03, 07, 07, 07, 07 }, /* 3H */ + { 00, 00, 00, 00, 00, 00, 00, 00 }, /* 0 */ + { 01, 01, 01, 01, 01, 01, 01, 01 }, /* 1 */ + { 02, 02, 02, 02, 02, 02, 02, 02 }, /* 2 */ + { 03, 03, 03, 03, 03, 03, 03, 03 }, /* 3 */ + { 04, 04, 04, 04, 04, 04, 04, 04 }, /* 4 */ + { 05, 05, 05, 05, 05, 05, 05, 05 }, /* 5 */ + { 06, 06, 06, 06, 06, 06, 06, 06 }, /* 6 */ + { 07, 07, 07, 07, 07, 07, 07, 07 } /* 7 */ +}; + +static const int sub_mask[16] = { + 0x0, + 0x0, + 0x1, 0x1, + 0x3, 0x3, 0x3, 0x3, + 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7 +}; + +INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e) +{ + ALIGNED short SV[8]; + register int i, j; +#if (0 == 0) + j = sub_mask[e]; + for (i = 0; i < N; i++) + SV[i] = VT[(i & ~j) | (e & j)]; +#else + if (e & 0x8) + for (i = 0; i < N; i++) + SV[i] = VT[(i & 0x0) | (e & 0x7)]; + else if (e & 0x4) + for (i = 0; i < N; i++) + SV[i] = VT[(i & 0xC) | (e & 0x3)]; + else if (e & 0x2) + for (i = 0; i < N; i++) + SV[i] = VT[(i & 0xE) | (e & 0x1)]; + else /* if ((e == 0b0000) || (e == 0b0001)) */ + for (i = 0; i < N; i++) + SV[i] = VT[(i & 0x7) | (e & 0x0)]; +#endif + for (i = 0; i < N; i++) + *(VD + i) = *(SV + i); + return; +} + #endif #endif diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vabs.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vabs.h index 5f36d3420..fc4e6c3d5 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vabs.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vabs.h @@ -27,6 +27,30 @@ INLINE static void do_abs(usf_state_t * state, short* VD, short* VS, short* VT) register int i; vector_copy(res, VT); + +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t vs = vld1q_s16((const int16_t*)VS); + int16x8_t resi = vld1q_s16((const int16_t*)VT); + int16x8_t zero = vdupq_n_s16(0); + int16x8_t eightk = vdupq_n_s16(0x8000); + int16x8_t one = vdupq_n_s16(1); + + uint16x8_t negi = vcltq_s16(vs,zero); + int16x8_t posi = vaddq_s16((int16x8_t)negi,one); + posi = vorrq_s16(posi,(int16x8_t)negi); + resi = veorq_s16(resi,posi); + uint16x8_t ccch = vcgeq_s16(resi,eightk); + int16x8_t ch = vnegq_s16((int16x8_t)ccch); + resi = vaddq_s16(resi, (int16x8_t)ch); + + vst1q_s16(VACC_L, resi); + vector_copy(VD, VACC_L); + return; + +#else + + #ifndef ARCH_MIN_SSE2 #define MASK_XOR #endif @@ -65,6 +89,7 @@ INLINE static void do_abs(usf_state_t * state, short* VD, short* VS, short* VT) vector_copy(VACC_L, res); vector_copy(VD, VACC_L); return; +#endif } static void VABS(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vadd.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vadd.h index d29fb80a0..9566e6122 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vadd.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vadd.h @@ -15,6 +15,50 @@ INLINE static void clr_ci(usf_state_t * state, short* VD, short* VS, short* VT) { /* clear CARRY and carry in to accumulators */ +#ifdef ARCH_MIN_SSE2 + + __m128i xmm,vs,vt,co; /*,ne;*/ + + xmm = 
_mm_setzero_si128(); + vs = _mm_load_si128((const __m128i*)VS); + vt = _mm_load_si128((const __m128i*)VT); + co = _mm_load_si128((const __m128i*)state->co); + + vs = _mm_add_epi16(vs,vt); + vs = _mm_add_epi16(vs,co); + + _mm_store_si128((__m128i*)VACC_L, vs); + + SIGNED_CLAMP_ADD(state, VD, VS, VT); + + _mm_storeu_si128((__m128i*)state->ne, xmm); + _mm_storeu_si128((__m128i*)state->co, xmm); + + return; +#endif + +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t vs, vt, zero1,co; + + zero1 = vdupq_n_s16(0); + vs = vld1q_s16((const int16_t*)VS); + vt = vld1q_s16((const int16_t*)VT); + co = vld1q_s16((const int16_t*)state->co); + + vs = vaddq_s16(vs,vt); + vs = vaddq_s16(vs,co); + + vst1q_s16(VACC_L, vs); + + SIGNED_CLAMP_ADD(state, VD, VS, VT); + vst1q_s16(state->ne, zero1); + vst1q_s16(state->co, zero1); + + return; +#endif + +#if !defined ARCH_MIN_ARM_NEON && !defined ARCH_MIN_SSE2 register int i; for (i = 0; i < N; i++) @@ -24,7 +68,10 @@ INLINE static void clr_ci(usf_state_t * state, short* VD, short* VS, short* VT) state->ne[i] = 0; for (i = 0; i < N; i++) state->co[i] = 0; + return; +#endif + } static void VADD(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vaddc.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vaddc.h index b352040c4..c6b8c4a96 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vaddc.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vaddc.h @@ -15,6 +15,30 @@ INLINE static void set_co(usf_state_t * state, short* VD, short* VS, short* VT) { /* set CARRY and carry out from sum */ + +#ifdef ARCH_MIN_ARM_NEON + + uint16x4_t vs_low = vld1_u16((const uint16_t*)VS); + uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4); + uint16x4_t vt_low = vld1_u16((const uint16_t*)VT); + uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4); + uint32x4_t zero = vdupq_n_u32(0); + + uint32x4_t v_l = vaddl_u16(vs_low, vt_low); + uint32x4_t v_h = vaddl_u16(vs_high, vt_high); + uint16x4_t vl16 = vaddhn_u32(v_l,zero); + uint16x4_t vh16 = vaddhn_u32(v_h,zero); + uint16x8_t vaccl = vcombine_u16(vmovn_u32(v_l),vmovn_u32(v_h)); + uint16x8_t co = vcombine_u16(vl16,vh16); + + vst1q_u16(VACC_L, vaccl); + vector_copy(VD, VACC_L); + vst1q_u16(state->ne, (uint16x8_t)zero); + vst1q_u16(state->co, co); + + return; +#else + ALIGNED int32_t sum[N]; register int i; @@ -28,6 +52,8 @@ INLINE static void set_co(usf_state_t * state, short* VD, short* VS, short* VT) for (i = 0; i < N; i++) state->co[i] = sum[i] >> 16; /* native: (sum[i] > +65535) */ return; + +#endif } static void VADDC(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vand.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vand.h index ebe9434c7..b5c00884e 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vand.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vand.h @@ -13,14 +13,25 @@ \******************************************************************************/ #include "vu.h" -static INLINE void do_and(usf_state_t * state, short* VD, short* VS, short* VT) +INLINE void do_and(usf_state_t * state, short* VD, short* VS, short* VT) { +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t vs = vld1q_s16((const int16_t *)VS); + int16x8_t vt = vld1q_s16((const int16_t *)VT); + int16x8_t vaccl = vandq_s16(vs,vt); + vst1q_s16(VACC_L, vaccl); + vector_copy(VD, VACC_L); + return; + +#else register int i; for (i = 0; i < N; i++) VACC_L[i] = VS[i] & VT[i]; vector_copy(VD, VACC_L); return; +#endif } static void VAND(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vch.h 
b/Frameworks/lazyusf/lazyusf/rsp/vu/vch.h index fb9eb9744..63345778e 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vch.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vch.h @@ -19,6 +19,58 @@ INLINE static void do_ch(usf_state_t * state, short* VD, short* VS, short* VT) ALIGNED short sn[N]; ALIGNED short VC[N]; ALIGNED short diff[N]; + +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t v_vc,neg_sn,vce,v_eq; + int16x8_t zero = vdupq_n_s16(0); + int16x8_t one = vdupq_n_s16(1); + + int16x8_t vs = vld1q_s16((const int16_t *)VS); + int16x8_t vt = vld1q_s16((const int16_t *)VT); + v_vc = vt; + int16x8_t v_sn = veorq_s16(vs,v_vc); + uint16x8_t sn_u = vcltq_s16(v_sn,zero); + v_vc = veorq_s16(v_vc, (int16x8_t)sn_u); + neg_sn = vnegq_s16((int16x8_t)sn_u); + + uint16x8_t vs_vc_eq = vceqq_s16(vs,v_vc); + vce = vandq_s16((int16x8_t)vs_vc_eq,v_sn); + + v_vc = vaddq_s16(v_vc,v_sn); + v_eq = vorrq_s16(vce,(int16x8_t)vs_vc_eq); + v_eq = vnegq_s16(v_eq); + + int16x8_t not_sn = vsubq_s16(neg_sn, one); + int16x8_t neg_vs = vmvnq_s16(vs); + int16x8_t v_diff = vorrq_s16(neg_vs, not_sn); + uint16x8_t ule = vcleq_s16(vt,v_diff); + int16x8_t v_le = vnegq_s16((int16x8_t)ule); + + v_diff = vorrq_s16(vs, (int16x8_t)sn_u); + uint16x8_t uge = vcgeq_s16(v_diff,vt); + int16x8_t v_ge = vnegq_s16((int16x8_t)uge); + + vst1q_s16(ge, v_ge); + vst1q_s16(le, v_le); + vst1q_s16(sn, v_sn); + vst1q_s16(VC, v_vc); + + merge(state->comp, sn, le, ge); + merge(VACC_L, state->comp, VC, VS); + vector_copy(VD, VACC_L); + + v_eq = veorq_s16(v_eq, one); + + vst1q_s16(state->clip, v_ge); + vst1q_s16(state->comp, v_le); + vst1q_s16(state->ne, v_eq); + vst1q_s16(state->co, v_sn); + + return; + +#else + register int i; for (i = 0; i < N; i++) @@ -72,6 +124,7 @@ INLINE static void do_ch(usf_state_t * state, short* VD, short* VS, short* VT) for (i = 0; i < N; i++) state->co[i] = sn[i]; return; +#endif } static void VCH(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vcl.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vcl.h index 328bb3ff6..9138955c1 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vcl.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vcl.h @@ -15,6 +15,7 @@ INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT) { + ALIGNED short eq[N], ge[N], le[N]; ALIGNED short gen[N], len[N], lz[N], uz[N], sn[N]; ALIGNED short diff[N]; @@ -22,6 +23,88 @@ INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT) ALIGNED unsigned short VB[N], VC[N]; register int i; +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t v_vc,neg_sn,v_eq,v_vb,v_sn,v_diff,v_uz,v_cmp; + + int16x8_t zero = vdupq_n_s16(0); + int16x8_t minus1 = vdupq_n_s16(-1); + int16x8_t one = vdupq_n_s16(1); + + int16x8_t vs = vld1q_s16((const int16_t *)VS); + int16x8_t vt = vld1q_s16((const int16_t *)VT); + int16x8_t ne = vld1q_s16((const int16_t *)state->ne); + int16x8_t co = vld1q_s16((const int16_t *)state->co); + int16x8_t vce = vld1q_s16((const int16_t *)state->vce); + + v_vb = vs; + v_vc = vt; + + v_eq = veorq_s16(ne, one); + v_sn = co; + + neg_sn = vnegq_s16((int16x8_t)v_sn); + v_vc = veorq_s16(v_vc, (int16x8_t)neg_sn); + v_vc = vaddq_s16(v_vc, v_sn); + + v_diff = vsubq_s16(v_vb,v_vc); + + uint16x8_t vb_cond = vceqq_s16(v_vb,minus1); + uint16x8_t vc_cond = vceqq_s16(v_vc,zero); + vb_cond = vmvnq_u16(vb_cond); + vc_cond = vmvnq_u16(vc_cond); + v_uz = vorrq_s16((int16x8_t)vb_cond, (int16x8_t)vc_cond); + + uint16x8_t v_lz = vceqq_s16(v_diff,zero); + int16x8_t lz_s = vnegq_s16((int16x8_t)v_lz); + + int16x8_t v_gen = 
vorrq_s16(lz_s,v_uz); + int16x8_t v_len = vandq_s16(lz_s,v_uz); + v_gen = vandq_s16(v_gen,vce); + + vce = veorq_s16(vce,one); + v_len = vandq_s16(v_len,vce); + + v_len = vorrq_s16(v_len,v_gen); + uint16x8_t gen_u = vcgeq_u16((uint16x8_t)v_vb,(uint16x8_t)v_vc); + v_gen = vnegq_s16((int16x8_t)gen_u); + + v_cmp = vandq_s16(v_eq,v_sn); + + vst1q_s16(cmp, v_cmp); + vst1q_s16(len, v_len); + + merge(le, cmp, len, state->comp); + int16x8_t sn_xor = veorq_s16(v_sn,one); + v_cmp = vandq_s16(v_eq,sn_xor); + + vst1q_s16(cmp, v_cmp); + vst1q_s16(gen, v_gen); + vst1q_s16(sn, v_sn); + vst1q_s16(VC, v_vc); + + merge(ge, cmp, gen, state->clip); + + merge(cmp, sn, le, ge); + merge(VACC_L, cmp, (short *)VC, VS); + vector_copy(VD, VACC_L); + + int16x8_t v_ge = vld1q_s16((const int16_t *)ge); + int16x8_t v_le = vld1q_s16((const int16_t *)le); + + vst1q_s16(state->clip,v_ge); + vst1q_s16(state->comp,v_le); + + vst1q_s16(state->ne,zero); + vst1q_s16(state->co,zero); + vst1q_s16(state->vce,zero); + + return; + +#else + + + for (i = 0; i < N; i++) VB[i] = VS[i]; for (i = 0; i < N; i++) @@ -88,6 +171,7 @@ INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT) for (i = 0; i < N; i++) state->vce[i] = 0; return; +#endif } static void VCL(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vcr.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vcr.h index 589cdbb8b..b39b7ac6f 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vcr.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vcr.h @@ -20,6 +20,48 @@ INLINE static void do_cr(usf_state_t * state, short* VD, short* VS, short* VT) ALIGNED short cmp[N]; register int i; +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t v_sn, v_cmp, v_vc; + + int16x8_t zero = vdupq_n_s16(0); + + int16x8_t vs = vld1q_s16((const int16_t *)VS); + int16x8_t vt = vld1q_s16((const int16_t *)VT); + + v_vc = vt; + v_sn = veorq_s16(vs,vt); + v_sn = vshrq_n_s16(v_sn,15); + + v_cmp = vandq_s16(vs, v_sn); + v_cmp = vmvnq_s16(v_cmp); + uint16x8_t v_le = vcleq_s16(vt,v_cmp); + int16x8_t v_le_ = vnegq_s16((int16x8_t)v_le); + + v_cmp = vorrq_s16(vs, v_sn); + uint16x8_t v_ge = vcgeq_s16(v_cmp, vt); + int16x8_t v_ge_ = vnegq_s16((int16x8_t)v_ge); + + v_vc = veorq_s16(v_vc,v_sn); + + vst1q_s16(VC, v_vc); + vst1q_s16(le, v_le_); + vst1q_s16(ge, v_ge_); + + merge(VACC_L, le, VC, VS); + vector_copy(VD, VACC_L); + + vst1q_s16(state->clip, v_ge_); + vst1q_s16(state->comp, v_le_); + vst1q_s16(state->ne,zero); + vst1q_s16(state->co,zero); + vst1q_s16(state->vce,zero); + + return; + +#else + + for (i = 0; i < N; i++) VC[i] = VT[i]; for (i = 0; i < N; i++) @@ -55,6 +97,7 @@ INLINE static void do_cr(usf_state_t * state, short* VD, short* VS, short* VT) for (i = 0; i < N; i++) state->vce[i] = 0; return; +#endif } static void VCR(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/veq.h b/Frameworks/lazyusf/lazyusf/rsp/vu/veq.h index e6002f6e6..870683960 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/veq.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/veq.h @@ -17,6 +17,30 @@ INLINE static void do_eq(usf_state_t * state, short* VD, short* VS, short* VT) { register int i; +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t one = vdupq_n_s16(1); + int16x8_t zero = vdupq_n_s16(0); + int16x8_t vs = vld1q_s16((const int16_t *)VS); + int16x8_t vt = vld1q_s16((const int16_t *)VT); + int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne); + + uint16x8_t v_comp = vceqq_s16(vs, vt); + int16x8_t v_comp_ = vnegq_s16((int16x8_t)v_comp); + v_ne = veorq_s16(v_ne, one); 
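+    /* v_comp_ and v_ne hold 0/1 flags at this point: vnegq_s16 folds the
+     * all-ones vceqq_s16 masks into 0/1, matching the scalar path's booleans. */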
+    v_comp_ = vandq_s16(v_comp_, v_ne);
+
+    vector_copy(VACC_L, VT);
+    vector_copy(VD, VACC_L);
+
+    vst1q_s16(state->clip,zero);
+    vst1q_s16(state->comp,v_comp_);
+    vst1q_s16(state->ne,zero);
+    vst1q_s16(state->co,zero);
+
+    return;
+
+#else
+
     for (i = 0; i < N; i++)
         state->clip[i] = 0;
     for (i = 0; i < N; i++)
@@ -35,6 +59,7 @@ INLINE static void do_eq(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = 0;
     return;
+#endif
 }
 
 static void VEQ(usf_state_t * state, int vd, int vs, int vt, int e)
diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vge.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vge.h
index 694632f02..512ec9246 100644
--- a/Frameworks/lazyusf/lazyusf/rsp/vu/vge.h
+++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vge.h
@@ -15,10 +15,44 @@
 
 INLINE static void do_ge(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+    ALIGNED short ce[N];
     ALIGNED short eq[N];
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t one = vdupq_n_s16(1);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne);
+    int16x8_t v_co = vld1q_s16((const int16_t *)state->co);
+
+    uint16x8_t v_eq_u = vceqq_s16(vs,vt);
+    int16x8_t v_eq_s = vnegq_s16((int16x8_t)v_eq_u);
+
+    int16x8_t v_ce = vandq_s16(v_ne,v_co);
+    v_ce = veorq_s16(v_ce,one);
+    v_eq_s = vandq_s16(v_eq_s, v_ce);
+    vst1q_s16(state->clip, zero);
+    uint16x8_t v_comp = vcgtq_s16(vs, vt);
+    int16x8_t v_comp_s = vnegq_s16((int16x8_t)v_comp);
+
+    v_comp_s = vorrq_s16(v_comp_s, v_eq_s);
+    vst1q_s16(state->comp, v_comp_s);
+
+    merge(VACC_L, state->comp, VS, VT);
+    vector_copy(VD, VACC_L);
+
+    vst1q_s16(state->ne,zero);
+    vst1q_s16(state->co,zero);
+
+    return;
+#else
+
     for (i = 0; i < N; i++)
         eq[i] = (VS[i] == VT[i]);
     for (i = 0; i < N; i++)
@@ -39,6 +73,7 @@ INLINE static void do_ge(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = 0;
     return;
+#endif
 }
 
 static void VGE(usf_state_t * state, int vd, int vs, int vt, int e)
diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vlt.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vlt.h
index 54469474f..11e4d9742 100644
--- a/Frameworks/lazyusf/lazyusf/rsp/vu/vlt.h
+++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vlt.h
@@ -15,6 +15,37 @@
 
 INLINE static void do_lt(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t zero = vdupq_n_s16(0);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne);
+    int16x8_t v_co = vld1q_s16((const int16_t *)state->co);
+
+    uint16x8_t v_eq_u = vceqq_s16(vs,vt);
+    int16x8_t v_cn = vandq_s16(v_ne,v_co);
+    v_eq_u = vandq_u16(v_eq_u,(uint16x8_t)v_cn);
+
+    vst1q_s16(state->clip, zero);
+
+    uint16x8_t v_comp = vcltq_s16(vs, vt);
+    int16x8_t v_comp_s = vnegq_s16((int16x8_t)v_comp);
+    v_comp_s = vorrq_s16(v_comp_s, (int16x8_t)v_eq_u);
+
+    vst1q_s16(state->comp, v_comp_s);
+
+    merge(VACC_L, state->comp, VS, VT);
+    vector_copy(VD, VACC_L);
+
+    vst1q_s16(state->ne, zero);
+    vst1q_s16(state->co, zero);
+    return;
+
+#else
+
     ALIGNED short cn[N];
     ALIGNED short eq[N];
     register int i;
@@ -39,6 +70,7 @@ INLINE static void do_lt(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = 0;
     return;
+#endif
 }
 
 static void VLT(usf_state_t * state, int vd, int vs, int vt, int e)
diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmacf.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmacf.h
index 
3a5564afb..8932996fe 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmacf.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmacf.h @@ -15,10 +15,81 @@ INLINE static void do_macf(usf_state_t * state, short* VD, short* VS, short* VT) { - ALIGNED int32_t product[N]; + +#ifdef ARCH_MIN_ARM_NEON + + uint16x8_t vs = vld1q_u16((const uint16_t*)VS); + uint16x8_t vt = vld1q_u16((const uint16_t*)VT); + uint16x4_t v_vaccl_low = vld1_u16((const uint16_t*)VACC_L); + uint16x4_t v_vaccl_high = vld1_u16((const uint16_t*)VACC_L+4); + uint16x4_t v_vaccm_low = vld1_u16((const uint16_t*)VACC_M); + uint16x4_t v_vaccm_high = vld1_u16((const uint16_t*)VACC_M+4); + uint16x4_t v_vacch_low = vld1_u16((const uint16_t*)VACC_H); + uint16x4_t v_vacch_high = vld1_u16((const uint16_t*)VACC_H+4); + int32x4_t zero = vdupq_n_s32(0); + uint32x4_t zero_u = vdupq_n_u32(0); + uint32x4_t highmask = vdupq_n_u32(0x0000ffff); + + uint16x4_t vs_low = vget_low_u16(vs); + uint16x4_t vs_high = vget_high_u16(vs); + uint16x4_t vt_low = vget_low_u16(vt); + uint16x4_t vt_high = vget_high_u16(vt); + + int32x4_t product_L = vqdmlal_s16(zero, (int16x4_t)vs_low, (int16x4_t)vt_low); + int32x4_t product_H = vqdmlal_s16(zero, (int16x4_t)vs_high, (int16x4_t)vt_high); + uint32x4_t addend_L = vandq_u32(highmask, (uint32x4_t)product_L); + uint32x4_t addend_H = vandq_u32(highmask, (uint32x4_t)product_H); + + addend_L = vaddw_u16((uint32x4_t)addend_L, v_vaccl_low); + addend_H = vaddw_u16((uint32x4_t)addend_H, v_vaccl_high); + + uint16x8_t v_vaccl = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H)); + + + addend_L = vaddl_u16( + vaddhn_u32((uint32x4_t)product_L, zero_u), + vaddhn_u32((uint32x4_t)addend_L, zero_u) + ); + + addend_H = vaddl_u16( + vaddhn_u32((uint32x4_t)product_H, zero_u), + vaddhn_u32((uint32x4_t)addend_H, zero_u) + ); + + addend_L = vaddw_u16(addend_L, v_vaccm_low); + addend_H = vaddw_u16(addend_H, v_vaccm_high); + + uint16x8_t v_vaccm = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H)); + + //product_L = vshrq_n_s32(product_L, 1); + //product_H = vshrq_n_s32(product_H, 1); + + uint32x4_t cond_L = vcltq_s32(product_L,zero); + int32x4_t cond_L_s = vnegq_s32((int32x4_t)cond_L); + uint32x4_t cond_H = vcltq_s32(product_H,zero); + int32x4_t cond_H_s = vnegq_s32((int32x4_t)cond_H); + + v_vacch_low = vsub_u16(v_vacch_low, vmovn_u32((uint32x4_t)cond_L_s)); + v_vacch_high = vsub_u16(v_vacch_high, vmovn_u32((uint32x4_t)cond_H_s)); + + v_vacch_low = vadd_u16(vshrn_n_u32(addend_L,16), v_vacch_low); + v_vacch_high = vadd_u16(vshrn_n_u32(addend_H,16), v_vacch_high); + + uint16x8_t v_vacch = vcombine_u16(v_vacch_low, v_vacch_high); + + vst1q_s16(VACC_L, (int16x8_t)v_vaccl); + vst1q_s16(VACC_M, (int16x8_t)v_vaccm); + vst1q_s16(VACC_H, (int16x8_t)v_vacch); + + SIGNED_CLAMP_AM(state, VD); + return; + +#else + + ALIGNED int32_t product[N]; ALIGNED uint32_t addend[N]; register int i; - + for (i = 0; i < N; i++) product[i] = VS[i] * VT[i]; for (i = 0; i < N; i++) @@ -38,7 +109,9 @@ INLINE static void do_macf(usf_state_t * state, short* VD, short* VS, short* VT) for (i = 0; i < N; i++) VACC_H[i] += addend[i] >> 16; SIGNED_CLAMP_AM(state, VD); + return; +#endif } static void VMACF(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmacu.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmacu.h index 0298dabbc..90c25bca1 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmacu.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmacu.h @@ -15,6 +15,78 @@ INLINE static void do_macu(usf_state_t * state, short* VD, short* VS, short* VT) { 
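+    /* The NEON block below follows the same scheme as do_macf above:
+     * vqdmlal_s16 forms the doubled 32-bit products, and vaddhn_u32 peels
+     * off the high halves/carries that ripple from VACC_L into VACC_M and
+     * VACC_H. */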
+ +#ifdef ARCH_MIN_ARM_NEON + + uint16x8_t vs = vld1q_u16((const uint16_t*)VS); + uint16x8_t vt = vld1q_u16((const uint16_t*)VT); + uint16x4_t v_vaccl_low = vld1_u16((const uint16_t*)VACC_L); + uint16x4_t v_vaccl_high = vld1_u16((const uint16_t*)VACC_L+4); + uint16x4_t v_vaccm_low = vld1_u16((const uint16_t*)VACC_M); + uint16x4_t v_vaccm_high = vld1_u16((const uint16_t*)VACC_M+4); + uint16x4_t v_vacch_low = vld1_u16((const uint16_t*)VACC_H); + uint16x4_t v_vacch_high = vld1_u16((const uint16_t*)VACC_H+4); + int32x4_t zero = vdupq_n_s32(0); + uint32x4_t zero_u = vdupq_n_u32(0); + uint32x4_t highmask = vdupq_n_u32(0x0000ffff); + + uint16x4_t vs_low = vget_low_u16(vs); + uint16x4_t vs_high = vget_high_u16(vs); + uint16x4_t vt_low = vget_low_u16(vt); + uint16x4_t vt_high = vget_high_u16(vt); + + int32x4_t product_L = vqdmlal_s16(zero, (int16x4_t)vs_low, (int16x4_t)vt_low); + int32x4_t product_H = vqdmlal_s16(zero, (int16x4_t)vs_high, (int16x4_t)vt_high); + uint32x4_t addend_L = vandq_u32(highmask, (uint32x4_t)product_L); + uint32x4_t addend_H = vandq_u32(highmask, (uint32x4_t)product_H); + + addend_L = vaddw_u16((uint32x4_t)addend_L, v_vaccl_low); + addend_H = vaddw_u16((uint32x4_t)addend_H, v_vaccl_high); + + uint16x8_t v_vaccl = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H)); + + + addend_L = vaddl_u16( + vaddhn_u32((uint32x4_t)product_L, zero_u), + vaddhn_u32((uint32x4_t)addend_L, zero_u) + ); + + addend_H = vaddl_u16( + vaddhn_u32((uint32x4_t)product_H, zero_u), + vaddhn_u32((uint32x4_t)addend_H, zero_u) + ); + + addend_L = vaddw_u16(addend_L, v_vaccm_low); + addend_H = vaddw_u16(addend_H, v_vaccm_high); + + uint16x8_t v_vaccm = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H)); + + //product_L = vshrq_n_s32(product_L, 1); + //product_H = vshrq_n_s32(product_H, 1); + + uint32x4_t cond_L = vcltq_s32(product_L,zero); + int32x4_t cond_L_s = vnegq_s32((int32x4_t)cond_L); + uint32x4_t cond_H = vcltq_s32(product_H,zero); + int32x4_t cond_H_s = vnegq_s32((int32x4_t)cond_H); + + v_vacch_low = vsub_u16(v_vacch_low, vmovn_u32((uint32x4_t)cond_L_s)); + v_vacch_high = vsub_u16(v_vacch_high, vmovn_u32((uint32x4_t)cond_H_s)); + + v_vacch_low = vadd_u16(vshrn_n_u32(addend_L,16), v_vacch_low); + v_vacch_high = vadd_u16(vshrn_n_u32(addend_H,16), v_vacch_high); + + uint16x8_t v_vacch = vcombine_u16(v_vacch_low, v_vacch_high); + + vst1q_s16(VACC_L, (int16x8_t)v_vaccl); + vst1q_s16(VACC_M, (int16x8_t)v_vaccm); + vst1q_s16(VACC_H, (int16x8_t)v_vacch); + + UNSIGNED_CLAMP(state, VD); + return; + +#else + + ALIGNED int32_t product[N]; ALIGNED uint32_t addend[N]; register int i; @@ -39,6 +111,7 @@ INLINE static void do_macu(usf_state_t * state, short* VD, short* VS, short* VT) VACC_H[i] += addend[i] >> 16; UNSIGNED_CLAMP(state, VD); return; +#endif } static void VMACU(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmadh.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmadh.h index e46d1023d..f6e822a7a 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmadh.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmadh.h @@ -15,6 +15,39 @@ INLINE static void do_madh(usf_state_t * state, short* VD, short* VS, short* VT) { + +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t vs,vt, vaccm, vacch, vacc_h, vacc_m,prod_low,prod_high,one,cond; + uint16x8_t cond_u,res; + + one = vdupq_n_s16(1); + vs = vld1q_s16((const int16_t *)VS); + vt = vld1q_s16((const int16_t *)VT); + vaccm = vld1q_s16((const int16_t *)VACC_M); + vacch = vld1q_s16((const int16_t *)VACC_H); + + prod_low = vmulq_s16(vs,vt); + 
vacc_m = vaddq_s16(vaccm,prod_low); + + prod_high = vqdmulhq_s16(vs,vt); + prod_high = vshrq_n_s16(prod_high, 1); + + res = vqaddq_u16((uint16x8_t)vaccm, (uint16x8_t)prod_low); + cond_u = vceqq_s16((int16x8_t)res,vacc_m); + cond_u = vaddq_u16(cond_u, (uint16x8_t)one); + + vacc_h = vaddq_s16(prod_high, vacch); + vacc_h = vaddq_s16((int16x8_t)cond_u, vacc_h); + + vst1q_s16(VACC_M, vacc_m); + vst1q_s16(VACC_H, vacc_h); + + SIGNED_CLAMP_AM(state, VD); + return; + +#else + ALIGNED int32_t product[N]; ALIGNED uint32_t addend[N]; register int i; @@ -29,6 +62,8 @@ INLINE static void do_madh(usf_state_t * state, short* VD, short* VS, short* VT) VACC_H[i] += (addend[i] >> 16) + (product[i] >> 16); SIGNED_CLAMP_AM(state, VD); return; + +#endif } static void VMADH(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmadl.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmadl.h index bc540da9f..57b2e542f 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmadl.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmadl.h @@ -15,6 +15,56 @@ INLINE static void do_madl(usf_state_t * state, short* VD, short* VS, short* VT) { + +#ifdef ARCH_MIN_ARM_NEON + + uint16x8_t vs = vld1q_u16((const uint16_t*)VS); + uint16x8_t vt = vld1q_u16((const uint16_t*)VT); + uint16x8_t v_vaccl = vld1q_u16((const uint16_t*)VACC_L); + uint16x8_t v_vaccm = vld1q_u16((const uint16_t*)VACC_M); + uint16x8_t v_vacch = vld1q_u16((const uint16_t*)VACC_H); + + uint32x4_t zero = vdupq_n_u32(0); + uint16x4_t vs_low = vget_low_u16(vs); + uint16x4_t vs_high = vget_high_u16(vs); + uint16x4_t vt_low = vget_low_u16(vt); + uint16x4_t vt_high = vget_high_u16(vt); + + uint32x4_t product_L = vmulq_u32( vmovl_u16(vs_low), vmovl_u16(vt_low) ); + uint32x4_t product_H = vmulq_u32( vmovl_u16(vs_high), vmovl_u16(vt_high) ); + + uint16x4_t addend_L = vaddhn_u32(product_L, zero); + uint16x4_t addend_H = vaddhn_u32(product_H, zero); + + uint32x4_t exceed1 = vaddl_u16(addend_L, vget_low_u16(v_vaccl)); + uint32x4_t exceed2 = vaddl_u16(addend_H, vget_high_u16(v_vaccl)); + + v_vaccl = vcombine_u16(vmovn_u32(exceed1), vmovn_u32(exceed2)); + + addend_L = vaddhn_u32(exceed1, zero); + addend_H = vaddhn_u32(exceed2, zero); + + exceed1 = vaddl_u16(addend_L, vget_low_u16(v_vaccm)); + exceed2 = vaddl_u16(addend_H, vget_high_u16(v_vaccm)); + + v_vaccm = vcombine_u16(vmovn_u32(exceed1), vmovn_u32(exceed2)); + + addend_L = vaddhn_u32(exceed1, zero); + addend_H = vaddhn_u32(exceed2, zero); + + uint16x8_t v_vacch2 = vcombine_u16(addend_L, addend_H); + v_vacch = vaddq_u16(v_vacch, v_vacch2); + + vst1q_s16(VACC_L, (int16x8_t)v_vaccl); + vst1q_s16(VACC_M, (int16x8_t)v_vaccm); + vst1q_s16(VACC_H, (int16x8_t)v_vacch); + + SIGNED_CLAMP_AL(state, VD); + return; + +#else + + ALIGNED int32_t product[N]; ALIGNED uint32_t addend[N]; register int i; @@ -37,6 +87,7 @@ INLINE static void do_madl(usf_state_t * state, short* VD, short* VS, short* VT) VACC_H[i] += addend[i] >> 16; SIGNED_CLAMP_AL(state, VD); return; +#endif } static void VMADL(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmadm.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmadm.h index 0cf54f7b8..15eb4ce39 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmadm.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmadm.h @@ -15,6 +15,43 @@ INLINE static void do_madm(usf_state_t * state, short* VD, short* VS, short* VT) { + +#ifdef ARCH_MIN_ARM_NEON + + uint32x4_t zero = vdupq_n_u32(0); + int16x4_t vs_low = vld1_s16((const int16_t*)VS); + int16x4_t vs_high = 
vld1_s16((const int16_t*)VS+4); + uint16x4_t vt_low = vld1_u16((const uint16_t*)VT); + uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4); + uint16x4_t vaccl_low = vld1_u16((const uint16_t*)VACC_L); + uint16x4_t vaccl_high = vld1_u16((const uint16_t*)VACC_L+4); + uint16x4_t vaccm_low = vld1_u16((const uint16_t*)VACC_M); + uint16x4_t vaccm_high = vld1_u16((const uint16_t*)VACC_M+4); + int16x4_t vacch_low = vld1_s16((const int16_t*)VACC_H); + int16x4_t vacch_high = vld1_s16((const int16_t*)VACC_H+4); + + int32x4_t vaccl_l = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_low),vmovl_s16(vs_low),(int32x4_t)vmovl_u16(vt_low)); + int32x4_t vaccl_h = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_high),vmovl_s16(vs_high),(int32x4_t)vmovl_u16(vt_high)); + uint32x4_t vaccm_l = vaddq_u32(vmovl_u16(vaccm_low), (uint32x4_t)vshrq_n_s32(vaccl_l,16)); + uint32x4_t vaccm_h = vaddq_u32(vmovl_u16(vaccm_high),(uint32x4_t)vshrq_n_s32(vaccl_h,16)); + uint16x4_t vacch_l = vaddhn_u32(vaccm_l, zero); + uint16x4_t vacch_h = vaddhn_u32(vaccm_h, zero); + int16x4_t vacch_low2 = vadd_s16(vacch_low,(int16x4_t)vacch_l); + int16x4_t vacch_high2 = vadd_s16(vacch_high,(int16x4_t)vacch_h); + + int16x8_t vaccl = vcombine_s16(vmovn_s32(vaccl_l),vmovn_s32(vaccl_h)); + uint16x8_t vaccm = vcombine_u16(vmovn_u32(vaccm_l),vmovn_u32(vaccm_h)); + int16x8_t vacch = vcombine_s16(vacch_low2,vacch_high2); + + vst1q_s16(VACC_L, vaccl); + vst1q_s16(VACC_M, (int16x8_t)vaccm); + vst1q_s16(VACC_H, vacch); + SIGNED_CLAMP_AM(state, VD); + + return; +#else + + ALIGNED uint32_t addend[N]; register int i; @@ -32,6 +69,7 @@ INLINE static void do_madm(usf_state_t * state, short* VD, short* VS, short* VT) VACC_H[i] += addend[i] >> 16; SIGNED_CLAMP_AM(state, VD); return; +#endif } static void VMADM(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmadn.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmadn.h index f60c73a29..329a5c1c1 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmadn.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmadn.h @@ -15,6 +15,43 @@ INLINE static void do_madn(usf_state_t * state, short* VD, short* VS, short* VT) { + +#ifdef ARCH_MIN_ARM_NEON + + uint32x4_t zero = vdupq_n_u32(0); + uint16x4_t vs_low = vld1_u16((const uint16_t*)VS); + uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4); + int16x4_t vt_low = vld1_s16((const int16_t*)VT); + int16x4_t vt_high = vld1_s16((const int16_t*)VT+4); + uint16x4_t vaccl_low = vld1_u16((const uint16_t*)VACC_L); + uint16x4_t vaccl_high = vld1_u16((const uint16_t*)VACC_L+4); + uint16x4_t vaccm_low = vld1_u16((const uint16_t*)VACC_M); + uint16x4_t vaccm_high = vld1_u16((const uint16_t*)VACC_M+4); + int16x4_t vacch_low = vld1_s16((const int16_t*)VACC_H); + int16x4_t vacch_high = vld1_s16((const int16_t*)VACC_H+4); + + int32x4_t vaccl_l = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_low),(int32x4_t)vmovl_u16(vs_low),vmovl_s16(vt_low)); + int32x4_t vaccl_h = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_high),(int32x4_t)vmovl_u16(vs_high),vmovl_s16(vt_high)); + uint32x4_t vaccm_l = vaddq_u32(vmovl_u16(vaccm_low), (uint32x4_t)vshrq_n_s32(vaccl_l,16)); + uint32x4_t vaccm_h = vaddq_u32(vmovl_u16(vaccm_high),(uint32x4_t)vshrq_n_s32(vaccl_h,16)); + uint16x4_t vacch_l = vaddhn_u32(vaccm_l, zero); + uint16x4_t vacch_h = vaddhn_u32(vaccm_h, zero); + int16x4_t vacch_low2 = vadd_s16(vacch_low,(int16x4_t)vacch_l); + int16x4_t vacch_high2 = vadd_s16(vacch_high,(int16x4_t)vacch_h); + + int16x8_t vaccl = vcombine_s16(vmovn_s32(vaccl_l),vmovn_s32(vaccl_h)); + uint16x8_t vaccm = 
vcombine_u16(vmovn_u32(vaccm_l),vmovn_u32(vaccm_h)); + int16x8_t vacch = vcombine_s16(vacch_low2,vacch_high2); + + vst1q_s16(VACC_L, vaccl); + vst1q_s16(VACC_M, (int16x8_t)vaccm); + vst1q_s16(VACC_H, vacch); + SIGNED_CLAMP_AL(state, VD); + return; + +#else + + ALIGNED uint32_t addend[N]; register int i; @@ -32,6 +69,7 @@ INLINE static void do_madn(usf_state_t * state, short* VD, short* VS, short* VT) VACC_H[i] += addend[i] >> 16; SIGNED_CLAMP_AL(state, VD); return; +#endif } static void VMADN(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmudh.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmudh.h index 86fcd14d3..ddd9b93ad 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmudh.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmudh.h @@ -15,6 +15,37 @@ INLINE static void do_mudh(usf_state_t * state, short* VD, short* VS, short* VT) { + +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t zero16 = vdupq_n_s16(0); + int32x4_t zero32 = vdupq_n_s32(0); + + int16x4_t vs_low = vld1_s16((const int16_t *)VS); + int16x4_t vs_high = vld1_s16((const int16_t *)VS+4); + int16x4_t vt_low = vld1_s16((const int16_t *)VT); + int16x4_t vt_high = vld1_s16((const int16_t *)VT+4); + + int32x4_t l1 = vmovl_s16(vs_low); + int32x4_t h1 = vmovl_s16(vs_high); + int32x4_t l2 = vmovl_s16(vt_low); + int32x4_t h2 = vmovl_s16(vt_high); + + int32x4_t l = vmulq_s32(l1,l2); + int32x4_t h = vmulq_s32(h1,h2); + + int16x8_t vaccm = vcombine_s16(vmovn_s32(l),vmovn_s32(h)); + int16x8_t vacch = vcombine_s16(vaddhn_s32(l,zero32),vaddhn_s32(h,zero32)); + + vst1q_s16(VACC_L, zero16); + vst1q_s16(VACC_M, vaccm); + vst1q_s16(VACC_H, vacch); + + SIGNED_CLAMP_AM(state, VD); + return; + +#else + register int i; for (i = 0; i < N; i++) @@ -25,6 +56,7 @@ INLINE static void do_mudh(usf_state_t * state, short* VD, short* VS, short* VT) VACC_H[i] = (short)(VS[i]*VT[i] >> 16); SIGNED_CLAMP_AM(state, VD); return; +#endif } static void VMUDH(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmudl.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmudl.h index a159abe5b..e2a6d8501 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmudl.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmudl.h @@ -15,6 +15,26 @@ INLINE static void do_mudl(usf_state_t * state, short* VD, short* VS, short* VT) { + +#ifdef ARCH_MIN_ARM_NEON + + uint16x4_t vs_low = vld1_u16((const uint16_t*)VS); + uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4); + uint16x4_t vt_low = vld1_u16((const uint16_t*)VT); + uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4); + int16x8_t zero = vdupq_n_s16(0); + + uint32x4_t lo = vmull_u16(vs_low, vt_low); + uint32x4_t high = vmull_u16(vs_high, vt_high); + uint16x8_t result = vcombine_u16(vshrn_n_u32(lo,16),vshrn_n_u32(high,16)); + vst1q_u16(VACC_L, result); + vst1q_s16(VACC_M, zero); + vst1q_s16(VACC_H, zero); + vector_copy(VD, VACC_L); + return; + +#else + register int i; for (i = 0; i < N; i++) @@ -25,6 +45,7 @@ INLINE static void do_mudl(usf_state_t * state, short* VD, short* VS, short* VT) VACC_H[i] = 0x0000; vector_copy(VD, VACC_L); /* no possibilities to clamp */ return; +#endif } static void VMUDL(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmudm.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmudm.h index 2af1a8caa..6eda2581c 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmudm.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmudm.h @@ -15,6 +15,39 @@ INLINE static void do_mudm(usf_state_t * state, short* VD, short* VS, short* VT) { +#ifdef 
ARCH_MIN_ARM_NEON
+
+    int16x8_t vd, vs,vt,res,four,vacc_l, vacc_m, vacc_h;
+
+    int32x4_t zero = vdupq_n_s32(0);
+    int16x8_t zero16 = vdupq_n_s16(0);
+
+    int16x4_t vs_low = vld1_s16((const int16_t *)VS);
+    int16x4_t vs_high = vld1_s16((const int16_t *)VS+4);
+    uint16x4_t vt_low = vld1_u16((const uint16_t *)VT);
+    uint16x4_t vt_high = vld1_u16((const uint16_t *)VT+4);
+
+    int32x4_t l1 = vmovl_s16(vs_low);
+    int32x4_t h1 = vmovl_s16(vs_high);
+    uint32x4_t l2 = vmovl_u16(vt_low);
+    uint32x4_t h2 = vmovl_u16(vt_high);
+
+    int32x4_t l = vmulq_s32(l1,(int32x4_t)l2);
+    int32x4_t h = vmulq_s32(h1,(int32x4_t)h2);
+
+    int16x8_t vaccl = vcombine_s16(vmovn_s32(l),vmovn_s32(h));
+    int16x8_t vaccm = vcombine_s16(vaddhn_s32(l,zero),vaddhn_s32(h,zero));
+    uint16x8_t uvacch = vcltq_s16(vaccm, zero16);
+
+    vst1q_s16(VACC_L, vaccl);
+    vst1q_s16(VACC_M, vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)uvacch);
+
+    vector_copy(VD, VACC_M); /* no possibilities to clamp */
+
+    return;
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -25,6 +58,7 @@ INLINE static void do_mudm(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] = -(VACC_M[i] < 0);
     vector_copy(VD, VACC_M); /* no possibilities to clamp */
     return;
+#endif
 }
 
 static void VMUDM(usf_state_t * state, int vd, int vs, int vt, int e)
diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmudn.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmudn.h
index 1a879cd4e..8b9edce23 100644
--- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmudn.h
+++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmudn.h
@@ -15,6 +15,40 @@
 
 INLINE static void do_mudn(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vd, vs,vt,res,four,vacc_l, vacc_m, vacc_h;
+
+    int32x4_t zero = vdupq_n_s32(0);
+    int16x8_t zero16 = vdupq_n_s16(0);
+
+    uint16x4_t vs_low = vld1_u16((const uint16_t *)VS);
+    uint16x4_t vs_high = vld1_u16((const uint16_t *)VS+4);
+    int16x4_t vt_low = vld1_s16((const int16_t *)VT);
+    int16x4_t vt_high = vld1_s16((const int16_t *)VT+4);
+
+    uint32x4_t l1 = vmovl_u16(vs_low);
+    uint32x4_t h1 = vmovl_u16(vs_high);
+    int32x4_t l2 = vmovl_s16(vt_low);
+    int32x4_t h2 = vmovl_s16(vt_high);
+
+    int32x4_t l = vmulq_s32((int32x4_t)l1,l2);
+    int32x4_t h = vmulq_s32((int32x4_t)h1,h2);
+
+    int16x8_t vaccl = vcombine_s16(vmovn_s32(l),vmovn_s32(h));
+    int16x8_t vaccm = vcombine_s16(vaddhn_s32(l,zero),vaddhn_s32(h,zero));
+    uint16x8_t uvacch = vcltq_s16(vaccm, zero16);
+    vst1q_s16(VACC_L, vaccl);
+    vst1q_s16(VACC_M, vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)uvacch);
+
+    vector_copy(VD, VACC_L); /* no possibilities to clamp */
+
+    return;
+
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -25,6 +59,7 @@ INLINE static void do_mudn(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] = -(VACC_M[i] < 0);
     vector_copy(VD, VACC_L); /* no possibilities to clamp */
     return;
+#endif
 }
 
 static void VMUDN(usf_state_t * state, int vd, int vs, int vt, int e)
diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmulf.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmulf.h
index 0464e6295..f6c522bdb 100644
--- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmulf.h
+++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmulf.h
@@ -22,11 +22,44 @@
  * Wrong:  ACC(HI) = -((INT32)(acc) < 0)
  * Right:  ACC(HI) = -(SEMIFRAC < 0)
  */
-#define SEMIFRAC    (VS[i]*VT[i]*2/2 + 0x8000/2)
+
+#define SEMIFRAC    (VS[i]*VT[i] + 0x4000)
 #endif
 
 INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vs,vt,res,four,zero,vacc_l, vacc_m, vacc_h;
+    uint16x8_t cond_u, vacc_m_cond_u,one;
+
+    
one = vdupq_n_u16(1); + four = vdupq_n_s16(0x4000); + zero = vdupq_n_s16(0); + + vs = vld1q_s16((const int16_t *)VS); + vt = vld1q_s16((const int16_t *)VT); + + vacc_m = vqrdmulhq_s16(vs, vt); + vacc_l = vmlaq_s16(four, vs,vt); + vacc_l = vshlq_n_s16(vacc_l,1); + + cond_u = vceqq_s16(vs,vt); + cond_u = vaddq_u16(cond_u, one); + vacc_m_cond_u = vcltq_s16(vacc_m, zero); + cond_u = vandq_u16(vacc_m_cond_u, cond_u); + vacc_h = vqnegq_s16((int16x8_t)cond_u); + + vst1q_s16(VACC_L,vacc_l); + vst1q_s16(VACC_M,vacc_m); + vst1q_s16(VACC_H,vacc_h); + + SIGNED_CLAMP_AM(state, VD); + return; + +#else + register int i; for (i = 0; i < N; i++) @@ -35,7 +68,7 @@ INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT) VACC_M[i] = (SEMIFRAC << 1) >> 16; for (i = 0; i < N; i++) VACC_H[i] = -((VACC_M[i] < 0) & (VS[i] != VT[i])); /* -32768 * -32768 */ -#ifndef ARCH_MIN_SSE2 +#if !defined ARCH_MIN_SSE2 vector_copy(VD, VACC_M); for (i = 0; i < N; i++) VD[i] -= (VACC_M[i] < 0) & (VS[i] == VT[i]); /* ACC b 31 set, min*min */ @@ -43,6 +76,7 @@ INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT) SIGNED_CLAMP_AM(state, VD); #endif return; +#endif } static void VMULF(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vmulu.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vmulu.h index 038294a33..a83f0b812 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vmulu.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vmulu.h @@ -22,11 +22,50 @@ * Wrong: ACC(HI) = -((INT32)(acc) < 0) * Right: ACC(HI) = -(SEMIFRAC < 0) */ -#define SEMIFRAC (VS[i]*VT[i]*2/2 + 0x8000/2) +#define SEMIFRAC (VS[i]*VT[i]*2/2 + 0x4000) #endif INLINE static void do_mulu(usf_state_t * state, short* VD, short* VS, short* VT) { + +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t vd, vs,vt,res,four,zero,vacc_l, vacc_m, vacc_h; + uint16x8_t cond_u, vacc_m_cond_u,one; + + one = vdupq_n_u16(1); + four = vdupq_n_s16(0x4000); + zero = vdupq_n_s16(0); + + vs = vld1q_s16((const int16_t *)VS); + vt = vld1q_s16((const int16_t *)VT); + + + vacc_m = vqrdmulhq_s16(vs, vt); + vacc_l = vmlaq_s16(four, vs,vt); + vacc_l = vshlq_n_s16(vacc_l,1); + + cond_u = vceqq_s16(vs,vt); + cond_u = vaddq_u16(cond_u, one); + vacc_m_cond_u = vcltq_s16(vacc_m, zero); + cond_u = vandq_u16(vacc_m_cond_u, cond_u); + vacc_h = vqnegq_s16((int16x8_t)cond_u); + + vst1q_s16(VACC_L,vacc_l); + vst1q_s16(VACC_M,vacc_m); + vst1q_s16(VACC_H,vacc_h); + + + vd = vacc_m; + + uint16x8_t vacc_m_u = vshrq_n_u16((uint16x8_t)vacc_m, 15); + vd = vorrq_s16(vd, (int16x8_t)vacc_m_u); + vd = vbicq_s16(vd, vacc_h); + vst1q_s16(VD, vd); + return; + +#else + register int i; for (i = 0; i < N; i++) @@ -45,6 +84,7 @@ INLINE static void do_mulu(usf_state_t * state, short* VD, short* VS, short* VT) VD[i] &= ~(VACC_H[i] >> 0); /* VD &= -(result >= 0x000000000000) */ #endif return; +#endif } static void VMULU(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vnand.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vnand.h index b71edf891..83a33ea72 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vnand.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vnand.h @@ -13,14 +13,27 @@ \******************************************************************************/ #include "vu.h" -static INLINE void do_nand(usf_state_t * state, short* VD, short* VS, short* VT) +INLINE void do_nand(usf_state_t * state, short* VD, short* VS, short* VT) { + +#ifdef ARCH_MIN_ARM_NEON + int16x8_t vs, vt,vaccl; + vs = vld1q_s16((const int16_t*)VS); + vt = 
vld1q_s16((const int16_t*)VT);
+    vaccl = vandq_s16(vs,vt);
+    vaccl = vmvnq_s16(vaccl);
+    vst1q_s16(VACC_L, vaccl);
+    vector_copy(VD, VACC_L);
+    return;
+
+#else
     register int i;
 
     for (i = 0; i < N; i++)
         VACC_L[i] = ~(VS[i] & VT[i]);
     vector_copy(VD, VACC_L);
     return;
+#endif
 }
 
 static void VNAND(usf_state_t * state, int vd, int vs, int vt, int e)
diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vne.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vne.h
index b84589fc2..c24b9a199 100644
--- a/Frameworks/lazyusf/lazyusf/rsp/vu/vne.h
+++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vne.h
@@ -15,6 +15,33 @@
 
 INLINE static void do_ne(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vs, vt,vaccl, ne;
+    int16x8_t zero = vdupq_n_s16(0);
+    uint16x8_t cond;
+
+    vs = vld1q_s16((const int16_t*)VS);
+    vt = vld1q_s16((const int16_t*)VT);
+    ne = vld1q_s16((const int16_t*)state->ne);
+
+    cond = vceqq_s16(vs,vt);
+    cond = vmvnq_u16(cond); /* invert the equality mask to get "not equal" */
+    cond = (uint16x8_t)vnegq_s16((int16x8_t)cond);
+    uint16x8_t comp = vorrq_u16(cond, (uint16x8_t)ne);
+
+    vst1q_s16(state->clip, zero);
+    vst1q_s16(state->comp, (int16x8_t)comp);
+
+    vector_copy(VACC_L, VS);
+    vector_copy(VD, VACC_L);
+    vst1q_s16(state->ne, zero);
+    vst1q_s16(state->co, zero);
+
+    return;
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -35,6 +62,7 @@ INLINE static void do_ne(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = 0;
     return;
+#endif
 }
 
 static void VNE(usf_state_t * state, int vd, int vs, int vt, int e)
diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vnor.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vnor.h
index de744b41b..0de3c56fa 100644
--- a/Frameworks/lazyusf/lazyusf/rsp/vu/vnor.h
+++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vnor.h
@@ -13,14 +13,28 @@
 \******************************************************************************/
 #include "vu.h"
 
-static INLINE void do_nor(usf_state_t * state, short* VD, short* VS, short* VT)
+INLINE void do_nor(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+    int16x8_t vs, vt, vaccl;
+    vs = vld1q_s16((const int16_t*)VS);
+    vt = vld1q_s16((const int16_t*)VT);
+    vaccl = vorrq_s16(vs,vt);
+    vaccl = vmvnq_s16(vaccl);
+
+    vst1q_s16(VACC_L, vaccl);
+    vector_copy(VD, VACC_L);
+    return;
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
         VACC_L[i] = ~(VS[i] | VT[i]);
     vector_copy(VD, VACC_L);
     return;
+#endif
 }
 
 static void VNOR(usf_state_t * state, int vd, int vs, int vt, int e)
diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vnxor.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vnxor.h
index ac0bdb7d6..54b6a8457 100644
--- a/Frameworks/lazyusf/lazyusf/rsp/vu/vnxor.h
+++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vnxor.h
@@ -13,14 +13,29 @@
 \******************************************************************************/
 #include "vu.h"
 
-static INLINE void do_nxor(usf_state_t * state, short* VD, short* VS, short* VT)
+INLINE void do_nxor(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vs, vt,vaccl;
+    vs = vld1q_s16((const int16_t*)VS);
+    vt = vld1q_s16((const int16_t*)VT);
+    vaccl = veorq_s16(vs,vt);
+    vaccl = vmvnq_s16(vaccl);
+
+    vst1q_s16(VACC_L, vaccl);
+    vector_copy(VD, VACC_L);
+    return;
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
         VACC_L[i] = ~(VS[i] ^ VT[i]);
     vector_copy(VD, VACC_L);
     return;
+#endif
 }
 
 static void VNXOR(usf_state_t * state, int vd, int vs, int vt, int e)
diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vor.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vor.h
index ce588c422..b93f8098c 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vor.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vor.h @@ -13,14 +13,28 @@ \******************************************************************************/ #include "vu.h" -static INLINE void do_or(usf_state_t * state, short* VD, short* VS, short* VT) +INLINE void do_or(usf_state_t * state, short* VD, short* VS, short* VT) { + +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t vs, vt,vaccl; + vs = vld1q_s16((const int16_t*)VS); + vt = vld1q_s16((const int16_t*)VT); + vaccl = vorrq_s16(vs,vt); + vst1q_s16(VACC_L, vaccl); + vector_copy(VD, VACC_L); + return; + +#else + register int i; for (i = 0; i < N; i++) VACC_L[i] = VS[i] | VT[i]; vector_copy(VD, VACC_L); return; +#endif } static void VOR(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vsaw.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vsaw.h index ae0e26e2b..1782d62d3 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vsaw.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vsaw.h @@ -34,13 +34,25 @@ static void VSAR(int vd, int vs, int vt, int e) if (e > 2) { message(state, "VSAR\nInvalid mask.", 2); + #ifdef ARCH_MIN_ARM_NEON + int16x8_t zero = vdupq_n_s16(0); + vst1q_s16(VR[vd], zero); + #else for (i = 0; i < N; i++) VR[vd][i] = 0x0000; /* override behavior (zilmar) */ + #endif } else + { + #ifdef ARCH_MIN_ARM_NEON + vector_copy(VR[vd], VACC[e]); + #else for (i = 0; i < N; i++) VR[vd][i] = VACC[e][i]; - for (i = 0; i < N; i++) + #endif + } + + for (i = 0; i < N; i++) VACC[e][i] = oldval[i]; /* ... = VS */ return; } @@ -59,9 +71,19 @@ static void VSAW(usf_state_t * state, int vd, int vs, int vt, int e) if (e > 0x2) { /* branch very unlikely...never seen a game do VSAW illegally */ message(state, "VSAW\nIllegal mask.", 2); - for (i = 0; i < N; i++) + + #ifdef ARCH_MIN_ARM_NEON + + int16x8_t zero = vdupq_n_s16(0); + vst1q_s16(state->VR[vd], zero); + return; + #else + + for (i = 0; i < N; i++) state->VR[vd][i] = 0x0000; /* override behavior (zilmar) */ return; + + #endif } vector_copy(state->VR[vd], state->VACC[e]); return; diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vsub.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vsub.h index f3c1b511a..d92f61952 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vsub.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vsub.h @@ -15,6 +15,25 @@ INLINE static void clr_bi(usf_state_t * state, short* VD, short* VS, short* VT) { /* clear CARRY and borrow in to accumulators */ + +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t vs, vt, zero, co; + zero = vdupq_n_s16(0); + vs = vld1q_s16((const int16_t*)VS); + vt = vld1q_s16((const int16_t*)VT); + co = vld1q_s16((const int16_t*)state->co); + + vs = vsubq_s16(vs,vt); + vs = vsubq_s16(vs,co); + vst1q_s16(VACC_L, vs); + + SIGNED_CLAMP_SUB(state, VD, VS, VT); + vst1q_s16(state->ne, zero); + vst1q_s16(state->co, zero); + return; +#else + register int i; for (i = 0; i < N; i++) @@ -25,6 +44,7 @@ INLINE static void clr_bi(usf_state_t * state, short* VD, short* VS, short* VT) for (i = 0; i < N; i++) state->co[i] = 0; return; +#endif } static void VSUB(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vsubc.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vsubc.h index b7b055170..7d46758ca 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vsubc.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vsubc.h @@ -15,6 +15,36 @@ INLINE static void set_bo(usf_state_t * state, short* VD, short* VS, short* VT) { /* set CARRY and borrow out from difference */ + +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t vs, 
vt,vaccl,ne, co2; + uint16x8_t cond; + + int16x8_t zero = vdupq_n_s16(0); + int16x8_t one = vdupq_n_s16(1); + vs = vld1q_s16((const int16_t*)VS); + vt = vld1q_s16((const int16_t*)VT); + + vaccl = vsubq_s16(vs, vt); + uint16x8_t vdif = vqsubq_u16((uint16x8_t)vs, (uint16x8_t)vt); + + vst1q_s16(VACC_L, vaccl); + vector_copy(VD, VACC_L); + + cond = vceqq_s16(vs, vt); + ne = vaddq_s16((int16x8_t)cond, one); + + vdif = vorrq_u16(vdif,cond); + cond = vceqq_u16(vdif, (uint16x8_t)zero); + co2 = vnegq_s16((int16x8_t)cond); + + vst1q_s16(state->ne, ne); + vst1q_s16(state->co, co2); + return; + +#else + ALIGNED int32_t dif[N]; register int i; @@ -28,6 +58,7 @@ INLINE static void set_bo(usf_state_t * state, short* VD, short* VS, short* VT) for (i = 0; i < N; i++) state->co[i] = (dif[i] < 0); return; +#endif } static void VSUBC(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vu.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vu.h index 7981bdc67..b7c282d52 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vu.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vu.h @@ -54,9 +54,18 @@ static void res_V(usf_state_t * state, int vd, int vs, int vt, int e) if (vs != vt || vt != e) return; message(state, "C2\nRESERVED", 2); /* uncertain how to handle reserved, untested */ + +#ifdef ARCH_MIN_ARM_NEON + int16x8_t zero = vdupq_n_s16(0); + vst1q_s16(state->VR[vd], zero); + return; +#else + + for (i = 0; i < N; i++) state->VR[vd][i] = 0x0000; /* override behavior (bpoint) */ return; +#endif } static void res_M(usf_state_t * state, int vd, int vs, int vt, int e) { diff --git a/Frameworks/lazyusf/lazyusf/rsp/vu/vxor.h b/Frameworks/lazyusf/lazyusf/rsp/vu/vxor.h index 364e0594d..a7a9b56b9 100644 --- a/Frameworks/lazyusf/lazyusf/rsp/vu/vxor.h +++ b/Frameworks/lazyusf/lazyusf/rsp/vu/vxor.h @@ -13,14 +13,28 @@ \******************************************************************************/ #include "vu.h" -static INLINE void do_xor(usf_state_t * state, short* VD, short* VS, short* VT) +INLINE void do_xor(usf_state_t * state, short* VD, short* VS, short* VT) { + +#ifdef ARCH_MIN_ARM_NEON + + int16x8_t vs, vt; + vs = vld1q_s16((const int16_t*)VS); + vt = vld1q_s16((const int16_t*)VT); + vs = veorq_s16(vs, vt); + vst1q_s16(VACC_L, vs); + vector_copy(VD, VACC_L); + + return; +#else + register int i; for (i = 0; i < N; i++) VACC_L[i] = VS[i] ^ VT[i]; vector_copy(VD, VACC_L); return; +#endif } static void VXOR(usf_state_t * state, int vd, int vs, int vt, int e) diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/alist.c b/Frameworks/lazyusf/lazyusf/rsp_hle/alist.c index ec91d0ffc..87633764a 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/alist.c +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/alist.c @@ -29,6 +29,8 @@ #include #include +#include "common.h" + #include "alist.h" #include "arithmetics.h" #include "audio.h" @@ -58,12 +60,12 @@ static int16_t* sample(struct hle_t* hle, unsigned pos) static uint8_t* alist_u8(struct hle_t* hle, uint16_t dmem) { - return &hle->alist_buffer[dmem ^ S8]; + return u8(hle->alist_buffer, dmem); } static int16_t* alist_s16(struct hle_t* hle, uint16_t dmem) { - return (int16_t*)(&hle->alist_buffer[dmem ^ S16]); + return (int16_t*)u16(hle->alist_buffer, dmem); } @@ -82,13 +84,13 @@ static void alist_envmix_mix(size_t n, int16_t** dst, const int16_t* gains, int1 static int16_t ramp_step(struct ramp_t* ramp) { - bool target_reached; - + bool target_reached; + ramp->value += ramp->step; target_reached = (ramp->step <= 0) - ? 
(ramp->value <= ramp->target) - : (ramp->value >= ramp->target); + ? (ramp->value <= ramp->target) + : (ramp->value >= ramp->target); if (target_reached) { @@ -184,7 +186,7 @@ void alist_move(struct hle_t* hle, uint16_t dmemo, uint16_t dmemi, uint16_t coun void alist_copy_every_other_sample(struct hle_t* hle, uint16_t dmemo, uint16_t dmemi, uint16_t count) { while (count != 0) { - *(uint16_t*)(alist_u8(hle, dmemo)) = *(uint16_t*)(alist_u8(hle, dmemi)); + *alist_s16(hle, dmemo) = *alist_s16(hle, dmemi); dmemo += 2; dmemi += 4; --count; @@ -373,7 +375,7 @@ void alist_envmix_ge( const int32_t *rate, uint32_t address) { - unsigned k, i, ptr; + unsigned k; size_t n = (aux) ? 4 : 2; const int16_t* const in = (int16_t*)(hle->alist_buffer + dmemi); @@ -390,8 +392,8 @@ void alist_envmix_ge( ramps[1].value = (vol[1] << 16); ramps[0].target = (target[0] << 16); ramps[1].target = (target[1] << 16); - ramps[0].step = rate[0]; - ramps[1].step = rate[1]; + ramps[0].step = rate[0] / 8; + ramps[1].step = rate[1] / 8; } else { memcpy((uint8_t *)save_buffer, (hle->dram + address), 80); wet = *(int16_t *)(save_buffer + 0); /* 0-1 */ @@ -407,26 +409,23 @@ void alist_envmix_ge( } count >>= 1; - for (ptr = 0, k = 0; k < count; k += 8) { + for (k = 0; k < count; k++) { int16_t gains[4]; int16_t* buffers[4]; int16_t l_vol = ramp_step(&ramps[0]); int16_t r_vol = ramp_step(&ramps[1]); - gains[0] = clamp_s16((l_vol * dry + 0x4000) >> 15); - gains[1] = clamp_s16((r_vol * dry + 0x4000) >> 15); - gains[2] = clamp_s16((l_vol * wet + 0x4000) >> 15); - gains[3] = clamp_s16((r_vol * wet + 0x4000) >> 15); + buffers[0] = dl + (k^S); + buffers[1] = dr + (k^S); + buffers[2] = wl + (k^S); + buffers[3] = wr + (k^S); - for (i = 0; i < 8; i++) { - buffers[0] = dl + (ptr^S); - buffers[1] = dr + (ptr^S); - buffers[2] = wl + (ptr^S); - buffers[3] = wr + (ptr^S); + gains[0] = clamp_s16((l_vol * dry + 0x4000) >> 15); + gains[1] = clamp_s16((r_vol * dry + 0x4000) >> 15); + gains[2] = clamp_s16((l_vol * wet + 0x4000) >> 15); + gains[3] = clamp_s16((r_vol * wet + 0x4000) >> 15); - alist_envmix_mix(n, buffers, gains, in[ptr^S]); - ptr++; - } + alist_envmix_mix(n, buffers, gains, in[k^S]); } *(int16_t *)(save_buffer + 0) = wet; /* 0-1 */ diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/alist_audio.c b/Frameworks/lazyusf/lazyusf/rsp_hle/alist_audio.c index c3d85da3a..76a544a06 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/alist_audio.c +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/alist_audio.c @@ -29,6 +29,8 @@ #include #include +#include "common.h" + #include "alist.h" #include "hle_internal.h" #include "memory.h" @@ -52,7 +54,7 @@ static void clear_segments(struct hle_t* hle) } /* audio commands definition */ -static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2)) { } @@ -142,7 +144,7 @@ static void SETVOL(struct hle_t* hle, uint32_t w1, uint32_t w2) } } -static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2) { hle->alist_audio.loop = get_address(hle, w2); } @@ -165,7 +167,7 @@ static void ADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2) address); } -static void LOADBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void LOADBUFF(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2) { uint32_t address = get_address(hle, w2); @@ -175,7 +177,7 @@ static void LOADBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2) alist_load(hle, hle->alist_audio.in, address, 
hle->alist_audio.count); } -static void SAVEBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void SAVEBUFF(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2) { uint32_t address = get_address(hle, w2); @@ -220,7 +222,7 @@ static void LOADADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2) dram_load_u16(hle, (uint16_t*)hle->alist_audio.table, address, align(count, 8) >> 1); } -static void INTERLEAVE(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void INTERLEAVE(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2) { uint16_t left = (w2 >> 16) + DMEM_BASE; uint16_t right = w2 + DMEM_BASE; @@ -243,7 +245,7 @@ static void MIXER(struct hle_t* hle, uint32_t w1, uint32_t w2) alist_mix(hle, dmemo, dmemi, align(hle->alist_audio.count, 32), gain); } -static void SEGMENT(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void SEGMENT(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2) { set_address(hle, w2); } diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/alist_naudio.c b/Frameworks/lazyusf/lazyusf/rsp_hle/alist_naudio.c index 5dd404025..6fd0ab20d 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/alist_naudio.c +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/alist_naudio.c @@ -28,12 +28,13 @@ #endif #include +#include "common.h" + #include "alist.h" #include "hle_external.h" #include "hle_internal.h" #include "memory.h" - -static void MP3(struct hle_t* hle, uint32_t w1, uint32_t w2); +#include "ucodes.h" enum { NAUDIO_COUNT = 0x170 }; /* ie 184 samples */ enum { @@ -57,7 +58,7 @@ static void UNKNOWN(struct hle_t* hle, uint32_t w1, uint32_t w2) } -static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2)) { } @@ -67,10 +68,11 @@ static void NAUDIO_0000(struct hle_t* hle, uint32_t w1, uint32_t w2) UNKNOWN(hle, w1, w2); } -static void NAUDIO_02B0(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void NAUDIO_02B0(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2) { - uint32_t rate = (hle->alist_naudio.rate[1] & 0xffff0000) | (w2 & 0xffff); - hle->alist_naudio.rate[1] = rate; + /* emulate code at 0x12b0 (inside SETVOL), because the PC always executes in IMEM */ + hle->alist_naudio.rate[1] &= ~0xffff; + hle->alist_naudio.rate[1] |= (w2 & 0xffff); } static void NAUDIO_14(struct hle_t* hle, uint32_t w1, uint32_t w2) @@ -196,7 +198,7 @@ static void DMEMMOVE(struct hle_t* hle, uint32_t w1, uint32_t w2) alist_move(hle, dmemo, dmemi, (count + 3) & ~3); } -static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2) { hle->alist_naudio.loop = (w2 & 0xffffff); } @@ -241,12 +243,12 @@ static void RESAMPLE(struct hle_t* hle, uint32_t w1, uint32_t w2) address); } -static void INTERLEAVE(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void INTERLEAVE(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t UNUSED(w2)) { alist_interleave(hle, NAUDIO_MAIN, NAUDIO_DRY_LEFT, NAUDIO_DRY_RIGHT, NAUDIO_COUNT); } -static void MP3ADDY(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void MP3ADDY(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2)) { } diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/alist_nead.c b/Frameworks/lazyusf/lazyusf/rsp_hle/alist_nead.c index efe6d753e..4e3f28c28 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/alist_nead.c +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/alist_nead.c @@ -28,6 +28,8 @@ #endif #include +#include "common.h" + #include "alist.h" #include "hle_external.h" #include "hle_internal.h" @@ 
-49,7 +51,7 @@ static void UNKNOWN(struct hle_t* hle, uint32_t w1, uint32_t w2) } -static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2)) { } @@ -61,7 +63,7 @@ static void LOADADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2) dram_load_u16(hle, (uint16_t*)hle->alist_nead.table, address, count >> 1); } -static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2) { hle->alist_nead.loop = w2 & 0xffffff; } @@ -190,7 +192,7 @@ static void ENVSETUP1(struct hle_t* hle, uint32_t w1, uint32_t w2) hle->alist_nead.env_steps[1] = w2; } -static void ENVSETUP2(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void ENVSETUP2(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2) { hle->alist_nead.env_values[0] = (w2 >> 16); hle->alist_nead.env_values[1] = w2; @@ -206,6 +208,7 @@ static void ENVMIXER_MK(struct hle_t* hle, uint32_t w1, uint32_t w2) uint16_t dmem_dr = (w2 >> 12) & 0xff0; uint16_t dmem_wl = (w2 >> 4) & 0xff0; uint16_t dmem_wr = (w2 << 4) & 0xff0; + xors[2] = 0; /* unsupported by this ucode */ xors[3] = 0; /* unsupported by this ucode */ xors[0] = 0 - (int16_t)((w1 & 0x2) >> 1); @@ -233,6 +236,7 @@ static void ENVMIXER(struct hle_t* hle, uint32_t w1, uint32_t w2) uint16_t dmem_dr = (w2 >> 12) & 0xff0; uint16_t dmem_wl = (w2 >> 4) & 0xff0; uint16_t dmem_wr = (w2 << 4) & 0xff0; + xors[2] = 0 - (int16_t)((w1 & 0x8) >> 1); xors[3] = 0 - (int16_t)((w1 & 0x4) >> 1); xors[0] = 0 - (int16_t)((w1 & 0x2) >> 1); @@ -267,7 +271,7 @@ static void INTERL(struct hle_t* hle, uint32_t w1, uint32_t w2) alist_copy_every_other_sample(hle, dmemo, dmemi, count); } -static void INTERLEAVE_MK(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void INTERLEAVE_MK(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2) { uint16_t left = (w2 >> 16); uint16_t right = w2; @@ -323,7 +327,7 @@ static void FILTER(struct hle_t* hle, uint32_t w1, uint32_t w2) } } -static void SEGMENT(struct hle_t* hle, uint32_t w1, uint32_t w2) +static void SEGMENT(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2)) { } diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/arithmetics.h b/Frameworks/lazyusf/lazyusf/rsp_hle/arithmetics.h index 817447265..3d0edf6fb 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/arithmetics.h +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/arithmetics.h @@ -24,13 +24,7 @@ #include -#ifdef _MSC_VER -#define INLINE __forceinline -#else -#define INLINE inline __attribute__((always_inline)) -#endif - -INLINE static int16_t clamp_s16(int_fast32_t x) +static inline int16_t clamp_s16(int_fast32_t x) { x = (x < INT16_MIN) ? INT16_MIN: x; x = (x > INT16_MAX) ? 
INT16_MAX: x; diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/audio.c b/Frameworks/lazyusf/lazyusf/rsp_hle/audio.c index 2f03afab8..e473d3460 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/audio.c +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/audio.c @@ -23,41 +23,75 @@ #include #include +#include "common.h" + #include "arithmetics.h" const int16_t RESAMPLE_LUT[64 * 4] = { - 0x0c39, 0x66ad, 0x0d46, 0xffdf, 0x0b39, 0x6696, 0x0e5f, 0xffd8, - 0x0a44, 0x6669, 0x0f83, 0xffd0, 0x095a, 0x6626, 0x10b4, 0xffc8, - 0x087d, 0x65cd, 0x11f0, 0xffbf, 0x07ab, 0x655e, 0x1338, 0xffb6, - 0x06e4, 0x64d9, 0x148c, 0xffac, 0x0628, 0x643f, 0x15eb, 0xffa1, - 0x0577, 0x638f, 0x1756, 0xff96, 0x04d1, 0x62cb, 0x18cb, 0xff8a, - 0x0435, 0x61f3, 0x1a4c, 0xff7e, 0x03a4, 0x6106, 0x1bd7, 0xff71, - 0x031c, 0x6007, 0x1d6c, 0xff64, 0x029f, 0x5ef5, 0x1f0b, 0xff56, - 0x022a, 0x5dd0, 0x20b3, 0xff48, 0x01be, 0x5c9a, 0x2264, 0xff3a, - 0x015b, 0x5b53, 0x241e, 0xff2c, 0x0101, 0x59fc, 0x25e0, 0xff1e, - 0x00ae, 0x5896, 0x27a9, 0xff10, 0x0063, 0x5720, 0x297a, 0xff02, - 0x001f, 0x559d, 0x2b50, 0xfef4, 0xffe2, 0x540d, 0x2d2c, 0xfee8, - 0xffac, 0x5270, 0x2f0d, 0xfedb, 0xff7c, 0x50c7, 0x30f3, 0xfed0, - 0xff53, 0x4f14, 0x32dc, 0xfec6, 0xff2e, 0x4d57, 0x34c8, 0xfebd, - 0xff0f, 0x4b91, 0x36b6, 0xfeb6, 0xfef5, 0x49c2, 0x38a5, 0xfeb0, - 0xfedf, 0x47ed, 0x3a95, 0xfeac, 0xfece, 0x4611, 0x3c85, 0xfeab, - 0xfec0, 0x4430, 0x3e74, 0xfeac, 0xfeb6, 0x424a, 0x4060, 0xfeaf, - 0xfeaf, 0x4060, 0x424a, 0xfeb6, 0xfeac, 0x3e74, 0x4430, 0xfec0, - 0xfeab, 0x3c85, 0x4611, 0xfece, 0xfeac, 0x3a95, 0x47ed, 0xfedf, - 0xfeb0, 0x38a5, 0x49c2, 0xfef5, 0xfeb6, 0x36b6, 0x4b91, 0xff0f, - 0xfebd, 0x34c8, 0x4d57, 0xff2e, 0xfec6, 0x32dc, 0x4f14, 0xff53, - 0xfed0, 0x30f3, 0x50c7, 0xff7c, 0xfedb, 0x2f0d, 0x5270, 0xffac, - 0xfee8, 0x2d2c, 0x540d, 0xffe2, 0xfef4, 0x2b50, 0x559d, 0x001f, - 0xff02, 0x297a, 0x5720, 0x0063, 0xff10, 0x27a9, 0x5896, 0x00ae, - 0xff1e, 0x25e0, 0x59fc, 0x0101, 0xff2c, 0x241e, 0x5b53, 0x015b, - 0xff3a, 0x2264, 0x5c9a, 0x01be, 0xff48, 0x20b3, 0x5dd0, 0x022a, - 0xff56, 0x1f0b, 0x5ef5, 0x029f, 0xff64, 0x1d6c, 0x6007, 0x031c, - 0xff71, 0x1bd7, 0x6106, 0x03a4, 0xff7e, 0x1a4c, 0x61f3, 0x0435, - 0xff8a, 0x18cb, 0x62cb, 0x04d1, 0xff96, 0x1756, 0x638f, 0x0577, - 0xffa1, 0x15eb, 0x643f, 0x0628, 0xffac, 0x148c, 0x64d9, 0x06e4, - 0xffb6, 0x1338, 0x655e, 0x07ab, 0xffbf, 0x11f0, 0x65cd, 0x087d, - 0xffc8, 0x10b4, 0x6626, 0x095a, 0xffd0, 0x0f83, 0x6669, 0x0a44, - 0xffd8, 0x0e5f, 0x6696, 0x0b39, 0xffdf, 0x0d46, 0x66ad, 0x0c39 + (int16_t)0x0c39, (int16_t)0x66ad, (int16_t)0x0d46, (int16_t)0xffdf, + (int16_t)0x0b39, (int16_t)0x6696, (int16_t)0x0e5f, (int16_t)0xffd8, + (int16_t)0x0a44, (int16_t)0x6669, (int16_t)0x0f83, (int16_t)0xffd0, + (int16_t)0x095a, (int16_t)0x6626, (int16_t)0x10b4, (int16_t)0xffc8, + (int16_t)0x087d, (int16_t)0x65cd, (int16_t)0x11f0, (int16_t)0xffbf, + (int16_t)0x07ab, (int16_t)0x655e, (int16_t)0x1338, (int16_t)0xffb6, + (int16_t)0x06e4, (int16_t)0x64d9, (int16_t)0x148c, (int16_t)0xffac, + (int16_t)0x0628, (int16_t)0x643f, (int16_t)0x15eb, (int16_t)0xffa1, + (int16_t)0x0577, (int16_t)0x638f, (int16_t)0x1756, (int16_t)0xff96, + (int16_t)0x04d1, (int16_t)0x62cb, (int16_t)0x18cb, (int16_t)0xff8a, + (int16_t)0x0435, (int16_t)0x61f3, (int16_t)0x1a4c, (int16_t)0xff7e, + (int16_t)0x03a4, (int16_t)0x6106, (int16_t)0x1bd7, (int16_t)0xff71, + (int16_t)0x031c, (int16_t)0x6007, (int16_t)0x1d6c, (int16_t)0xff64, + (int16_t)0x029f, (int16_t)0x5ef5, (int16_t)0x1f0b, (int16_t)0xff56, + (int16_t)0x022a, (int16_t)0x5dd0, (int16_t)0x20b3, (int16_t)0xff48, + (int16_t)0x01be, 
(int16_t)0x5c9a, (int16_t)0x2264, (int16_t)0xff3a, + (int16_t)0x015b, (int16_t)0x5b53, (int16_t)0x241e, (int16_t)0xff2c, + (int16_t)0x0101, (int16_t)0x59fc, (int16_t)0x25e0, (int16_t)0xff1e, + (int16_t)0x00ae, (int16_t)0x5896, (int16_t)0x27a9, (int16_t)0xff10, + (int16_t)0x0063, (int16_t)0x5720, (int16_t)0x297a, (int16_t)0xff02, + (int16_t)0x001f, (int16_t)0x559d, (int16_t)0x2b50, (int16_t)0xfef4, + (int16_t)0xffe2, (int16_t)0x540d, (int16_t)0x2d2c, (int16_t)0xfee8, + (int16_t)0xffac, (int16_t)0x5270, (int16_t)0x2f0d, (int16_t)0xfedb, + (int16_t)0xff7c, (int16_t)0x50c7, (int16_t)0x30f3, (int16_t)0xfed0, + (int16_t)0xff53, (int16_t)0x4f14, (int16_t)0x32dc, (int16_t)0xfec6, + (int16_t)0xff2e, (int16_t)0x4d57, (int16_t)0x34c8, (int16_t)0xfebd, + (int16_t)0xff0f, (int16_t)0x4b91, (int16_t)0x36b6, (int16_t)0xfeb6, + (int16_t)0xfef5, (int16_t)0x49c2, (int16_t)0x38a5, (int16_t)0xfeb0, + (int16_t)0xfedf, (int16_t)0x47ed, (int16_t)0x3a95, (int16_t)0xfeac, + (int16_t)0xfece, (int16_t)0x4611, (int16_t)0x3c85, (int16_t)0xfeab, + (int16_t)0xfec0, (int16_t)0x4430, (int16_t)0x3e74, (int16_t)0xfeac, + (int16_t)0xfeb6, (int16_t)0x424a, (int16_t)0x4060, (int16_t)0xfeaf, + (int16_t)0xfeaf, (int16_t)0x4060, (int16_t)0x424a, (int16_t)0xfeb6, + (int16_t)0xfeac, (int16_t)0x3e74, (int16_t)0x4430, (int16_t)0xfec0, + (int16_t)0xfeab, (int16_t)0x3c85, (int16_t)0x4611, (int16_t)0xfece, + (int16_t)0xfeac, (int16_t)0x3a95, (int16_t)0x47ed, (int16_t)0xfedf, + (int16_t)0xfeb0, (int16_t)0x38a5, (int16_t)0x49c2, (int16_t)0xfef5, + (int16_t)0xfeb6, (int16_t)0x36b6, (int16_t)0x4b91, (int16_t)0xff0f, + (int16_t)0xfebd, (int16_t)0x34c8, (int16_t)0x4d57, (int16_t)0xff2e, + (int16_t)0xfec6, (int16_t)0x32dc, (int16_t)0x4f14, (int16_t)0xff53, + (int16_t)0xfed0, (int16_t)0x30f3, (int16_t)0x50c7, (int16_t)0xff7c, + (int16_t)0xfedb, (int16_t)0x2f0d, (int16_t)0x5270, (int16_t)0xffac, + (int16_t)0xfee8, (int16_t)0x2d2c, (int16_t)0x540d, (int16_t)0xffe2, + (int16_t)0xfef4, (int16_t)0x2b50, (int16_t)0x559d, (int16_t)0x001f, + (int16_t)0xff02, (int16_t)0x297a, (int16_t)0x5720, (int16_t)0x0063, + (int16_t)0xff10, (int16_t)0x27a9, (int16_t)0x5896, (int16_t)0x00ae, + (int16_t)0xff1e, (int16_t)0x25e0, (int16_t)0x59fc, (int16_t)0x0101, + (int16_t)0xff2c, (int16_t)0x241e, (int16_t)0x5b53, (int16_t)0x015b, + (int16_t)0xff3a, (int16_t)0x2264, (int16_t)0x5c9a, (int16_t)0x01be, + (int16_t)0xff48, (int16_t)0x20b3, (int16_t)0x5dd0, (int16_t)0x022a, + (int16_t)0xff56, (int16_t)0x1f0b, (int16_t)0x5ef5, (int16_t)0x029f, + (int16_t)0xff64, (int16_t)0x1d6c, (int16_t)0x6007, (int16_t)0x031c, + (int16_t)0xff71, (int16_t)0x1bd7, (int16_t)0x6106, (int16_t)0x03a4, + (int16_t)0xff7e, (int16_t)0x1a4c, (int16_t)0x61f3, (int16_t)0x0435, + (int16_t)0xff8a, (int16_t)0x18cb, (int16_t)0x62cb, (int16_t)0x04d1, + (int16_t)0xff96, (int16_t)0x1756, (int16_t)0x638f, (int16_t)0x0577, + (int16_t)0xffa1, (int16_t)0x15eb, (int16_t)0x643f, (int16_t)0x0628, + (int16_t)0xffac, (int16_t)0x148c, (int16_t)0x64d9, (int16_t)0x06e4, + (int16_t)0xffb6, (int16_t)0x1338, (int16_t)0x655e, (int16_t)0x07ab, + (int16_t)0xffbf, (int16_t)0x11f0, (int16_t)0x65cd, (int16_t)0x087d, + (int16_t)0xffc8, (int16_t)0x10b4, (int16_t)0x6626, (int16_t)0x095a, + (int16_t)0xffd0, (int16_t)0x0f83, (int16_t)0x6669, (int16_t)0x0a44, + (int16_t)0xffd8, (int16_t)0x0e5f, (int16_t)0x6696, (int16_t)0x0b39, + (int16_t)0xffdf, (int16_t)0x0d46, (int16_t)0x66ad, (int16_t)0x0c39 }; int32_t rdot(size_t n, const int16_t *x, const int16_t *y) diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/audio.h 
b/Frameworks/lazyusf/lazyusf/rsp_hle/audio.h index 9c4449c66..e2c34cd08 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/audio.h +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/audio.h @@ -29,13 +29,7 @@ extern const int16_t RESAMPLE_LUT[64 * 4]; int32_t rdot(size_t n, const int16_t *x, const int16_t *y); -#ifdef _MSC_VER -#define INLINE __forceinline -#else -#define INLINE inline __attribute__((always_inline)) -#endif - -INLINE static int16_t adpcm_predict_sample(uint8_t byte, uint8_t mask, +static inline int16_t adpcm_predict_sample(uint8_t byte, uint8_t mask, unsigned lshift, unsigned rshift) { int16_t sample = (uint16_t)(byte & mask) << lshift; diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/common.h b/Frameworks/lazyusf/lazyusf/rsp_hle/common.h new file mode 100644 index 000000000..195b8641a --- /dev/null +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/common.h @@ -0,0 +1,39 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Mupen64plus-rsp-hle - common.h * + * Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ * + * Copyright (C) 2014 Bobby Smiles * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#ifndef COMMON_H +#define COMMON_H + +/* macro for unused variable warning suppression */ +#ifdef __GNUC__ +# define UNUSED(x) UNUSED_ ## x __attribute__((__unused__)) +#else +# define UNUSED(x) UNUSED_ ## x +#endif + +#ifdef _MSC_VER +# define inline __forceinline +#elif defined __GNUC__ +# define inline inline __attribute__((always_inline)) +#endif + +#endif + diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/hle.c b/Frameworks/lazyusf/lazyusf/rsp_hle/hle.c index b98bd1b7a..490274ddd 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/hle.c +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/hle.c @@ -32,6 +32,8 @@ #include #endif +#include "common.h" + #include "hle_external.h" #include "hle_internal.h" #include "memory.h" diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/hle_internal.h b/Frameworks/lazyusf/lazyusf/rsp_hle/hle_internal.h index 242766de4..352d956b5 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/hle_internal.h +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/hle_internal.h @@ -72,12 +72,6 @@ struct hle_t /* mp3.c */ uint8_t mp3_buffer[0x1000]; - /* FIXME: rewrite mp3 module to avoid these */ - uint32_t mp3_inPtr; - uint32_t mp3_outPtr; - uint32_t mp3_t6; - uint32_t mp3_t5; - uint32_t mp3_t4; }; #endif diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/jpeg.c b/Frameworks/lazyusf/lazyusf/rsp_hle/jpeg.c index 4365c05f1..bde1d6b69 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/jpeg.c +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/jpeg.c @@ -25,6 +25,8 @@ #include #include +#include "common.h" + #include "arithmetics.h" #include "hle_external.h" #include "hle_internal.h" diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/memory.c b/Frameworks/lazyusf/lazyusf/rsp_hle/memory.c index edf263e8c..0c1b82d51 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/memory.c +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/memory.c @@ -21,103 +21,56 @@ #include +#include "common.h" + #include "memory.h" /* Global functions */ -void dmem_load_u8(struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count) +void load_u8(uint8_t* dst, const unsigned char* buffer, unsigned address, size_t count) { while (count != 0) { - *(dst++) = *dmem_u8(hle, address); + *(dst++) = *u8(buffer, address); address += 1; --count; } } -void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count) +void load_u16(uint16_t* dst, const unsigned char* buffer, unsigned address, size_t count) { while (count != 0) { - *(dst++) = *dmem_u16(hle, address); + *(dst++) = *u16(buffer, address); address += 2; --count; } } -void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count) +void load_u32(uint32_t* dst, const unsigned char* buffer, unsigned address, size_t count) { /* Optimization for uint32_t */ - memcpy(dst, dmem_u32(hle, address), count * sizeof(uint32_t)); + memcpy(dst, u32(buffer, address), count * sizeof(uint32_t)); } -void dmem_store_u8(struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count) +void store_u8(unsigned char* buffer, unsigned address, const uint8_t* src, size_t count) { while (count != 0) { - *dmem_u8(hle, address) = *(src++); + *u8(buffer, address) = *(src++); address += 1; --count; } } -void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count) +void store_u16(unsigned char* buffer, unsigned address, const uint16_t* src, size_t count) { while (count != 0) { - *dmem_u16(hle, address) = *(src++); + *u16(buffer, address) = *(src++); address += 2; --count; } } -void 
dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count) +void store_u32(unsigned char* buffer, unsigned address, const uint32_t* src, size_t count) { /* Optimization for uint32_t */ - memcpy(dmem_u32(hle, address), src, count * sizeof(uint32_t)); -} - - -void dram_load_u8(struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count) -{ - while (count != 0) { - *(dst++) = *dram_u8(hle, address); - address += 1; - --count; - } -} - -void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count) -{ - while (count != 0) { - *(dst++) = *dram_u16(hle, address); - address += 2; - --count; - } -} - -void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count) -{ - /* Optimization for uint32_t */ - memcpy(dst, dram_u32(hle, address), count * sizeof(uint32_t)); -} - -void dram_store_u8(struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count) -{ - while (count != 0) { - *dram_u8(hle, address) = *(src++); - address += 1; - --count; - } -} - -void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count) -{ - while (count != 0) { - *dram_u16(hle, address) = *(src++); - address += 2; - --count; - } -} - -void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count) -{ - /* Optimization for uint32_t */ - memcpy(dram_u32(hle, address), src, count * sizeof(uint32_t)); + memcpy(u32(buffer, address), src, count * sizeof(uint32_t)); } diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/memory.h b/Frameworks/lazyusf/lazyusf/rsp_hle/memory.h index ff951824a..91a747b16 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/memory.h +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/memory.h @@ -57,65 +57,128 @@ enum { TASK_YIELD_DATA_SIZE = 0xffc }; -#ifdef _MSC_VER -#define INLINE __forceinline -#else -#define INLINE inline __attribute__((always_inline)) -#endif - -INLINE static unsigned int align(unsigned int x, unsigned amount) +static inline unsigned int align(unsigned int x, unsigned amount) { --amount; return (x + amount) & ~amount; } -INLINE static uint8_t* const dmem_u8(struct hle_t* hle, uint16_t address) +static inline uint8_t* u8(const unsigned char* buffer, unsigned address) { - return (uint8_t*)(&hle->dmem[(address & 0xfff) ^ S8]); + return (uint8_t*)(buffer + (address ^ S8)); } -INLINE static uint16_t* const dmem_u16(struct hle_t* hle, uint16_t address) +static inline uint16_t* u16(const unsigned char* buffer, unsigned address) { assert((address & 1) == 0); - return (uint16_t*)(&hle->dmem[(address & 0xfff) ^ S16]); + return (uint16_t*)(buffer + (address ^ S16)); } -INLINE static uint32_t* const dmem_u32(struct hle_t* hle, uint16_t address) +static inline uint32_t* u32(const unsigned char* buffer, unsigned address) { assert((address & 3) == 0); - return (uint32_t*)(&hle->dmem[(address & 0xfff)]); + return (uint32_t*)(buffer + address); } -INLINE static uint8_t* const dram_u8(struct hle_t* hle, uint32_t address) +void load_u8 (uint8_t* dst, const unsigned char* buffer, unsigned address, size_t count); +void load_u16(uint16_t* dst, const unsigned char* buffer, unsigned address, size_t count); +void load_u32(uint32_t* dst, const unsigned char* buffer, unsigned address, size_t count); +void store_u8 (unsigned char* buffer, unsigned address, const uint8_t* src, size_t count); +void store_u16(unsigned char* buffer, unsigned address, const uint16_t* src, size_t count); +void store_u32(unsigned char* buffer, unsigned address, const uint32_t* src, size_t count); + + +/* 
convenient functions for DMEM access */ +static inline uint8_t* dmem_u8(struct hle_t* hle, uint16_t address) { - return (uint8_t*)&hle->dram[(address & 0xffffff) ^ S8]; + return u8(hle->dmem, address & 0xfff); } -INLINE static uint16_t* const dram_u16(struct hle_t* hle, uint32_t address) +static inline uint16_t* dmem_u16(struct hle_t* hle, uint16_t address) { - assert((address & 1) == 0); - return (uint16_t*)&hle->dram[(address & 0xffffff) ^ S16]; + return u16(hle->dmem, address & 0xfff); } -INLINE static uint32_t* const dram_u32(struct hle_t* hle, uint32_t address) +static inline uint32_t* dmem_u32(struct hle_t* hle, uint16_t address) { - assert((address & 3) == 0); - return (uint32_t*)&hle->dram[address & 0xffffff]; + return u32(hle->dmem, address & 0xfff); } -void dmem_load_u8 (struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count); -void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count); -void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count); -void dmem_store_u8 (struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count); -void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count); -void dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count); +static inline void dmem_load_u8(struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count) +{ + load_u8(dst, hle->dmem, address & 0xfff, count); +} -void dram_load_u8 (struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count); -void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count); -void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count); -void dram_store_u8 (struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count); -void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count); -void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count); +static inline void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count) +{ + load_u16(dst, hle->dmem, address & 0xfff, count); +} + +static inline void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count) +{ + load_u32(dst, hle->dmem, address & 0xfff, count); +} + +static inline void dmem_store_u8(struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count) +{ + store_u8(hle->dmem, address & 0xfff, src, count); +} + +static inline void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count) +{ + store_u16(hle->dmem, address & 0xfff, src, count); +} + +static inline void dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count) +{ + store_u32(hle->dmem, address & 0xfff, src, count); +} + +/* convenient functions for DRAM access */ +static inline uint8_t* dram_u8(struct hle_t* hle, uint32_t address) +{ + return u8(hle->dram, address & 0xffffff); +} + +static inline uint16_t* dram_u16(struct hle_t* hle, uint32_t address) +{ + return u16(hle->dram, address & 0xffffff); +} + +static inline uint32_t* dram_u32(struct hle_t* hle, uint32_t address) +{ + return u32(hle->dram, address & 0xffffff); +} + +static inline void dram_load_u8(struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count) +{ + load_u8(dst, hle->dram, address & 0xffffff, count); +} + +static inline void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count) +{ + load_u16(dst, hle->dram, address & 0xffffff, 
count); +} + +static inline void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count) +{ + load_u32(dst, hle->dram, address & 0xffffff, count); +} + +static inline void dram_store_u8(struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count) +{ + store_u8(hle->dram, address & 0xffffff, src, count); +} + +static inline void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count) +{ + store_u16(hle->dram, address & 0xffffff, src, count); +} + +static inline void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count) +{ + store_u32(hle->dram, address & 0xffffff, src, count); +} #endif diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/mp3.c b/Frameworks/lazyusf/lazyusf/rsp_hle/mp3.c index afcbad8cf..20a42c4d6 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/mp3.c +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/mp3.c @@ -1,6 +1,7 @@ /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * - * Mupen64plus-rsp-hle - ucode3mp3.h * + * Mupen64plus-rsp-hle - mp3.c * * Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ * + * Copyright (C) 2014 Bobby Smiles * * Copyright (C) 2009 Richard Goedeken * * Copyright (C) 2002 Hacktarux * * * @@ -23,10 +24,16 @@ #include #include -#include "hle_external.h" +#include "common.h" + +#include "arithmetics.h" #include "hle_internal.h" #include "memory.h" +static void InnerLoop(struct hle_t* hle, + uint32_t outPtr, uint32_t inPtr, + uint32_t t6, uint32_t t5, uint32_t t4); + static const uint16_t DeWindowLUT [0x420] = { 0x0000, 0xFFF3, 0x005D, 0xFF38, 0x037A, 0xF736, 0x0B37, 0xC00E, 0x7FFF, 0x3FF2, 0x0B37, 0x08CA, 0x037A, 0x00C8, 0x005D, 0x000D, @@ -198,10 +205,13 @@ static void MP3AB0(int32_t* v) } } -static void InnerLoop(struct hle_t* hle); - void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address) { + uint32_t inPtr, outPtr; + uint32_t t6;/* = 0x08A0; - I think these are temporary storage buffers */ + uint32_t t5;/* = 0x0AC0; */ + uint32_t t4;/* = (w1 & 0x1E); */ + /* Initialization Code */ uint32_t readPtr; /* s5 */ uint32_t writePtr; /* s6 */ @@ -209,9 +219,9 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address) int cnt, cnt2; /* I think these are temporary storage buffers */ - hle->mp3_t6 = 0x08A0; - hle->mp3_t5 = 0x0AC0; - hle->mp3_t4 = index; + t6 = 0x08A0; + t5 = 0x0AC0; + t4 = index; writePtr = readPtr = address; /* Just do that for efficiency... 
may remove and use directly later anyway */ @@ -222,20 +232,21 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address) for (cnt = 0; cnt < 0x480; cnt += 0x180) { /* DMA: 0xCF0 <- RDRAM[s5] : 0x180 */ memcpy(hle->mp3_buffer + 0xCF0, hle->dram + readPtr, 0x180); - hle->mp3_inPtr = 0xCF0; /* s7 */ - hle->mp3_outPtr = 0xE70; /* s3 */ + inPtr = 0xCF0; /* s7 */ + outPtr = 0xE70; /* s3 */ /* --------------- Inner Loop Start -------------------- */ for (cnt2 = 0; cnt2 < 0x180; cnt2 += 0x40) { - hle->mp3_t6 &= 0xFFE0; - hle->mp3_t5 &= 0xFFE0; - hle->mp3_t6 |= hle->mp3_t4; - hle->mp3_t5 |= hle->mp3_t4; - InnerLoop(hle); - hle->mp3_t4 = (hle->mp3_t4 - 2) & 0x1E; - tmp = hle->mp3_t6; - hle->mp3_t6 = hle->mp3_t5; - hle->mp3_t5 = tmp; - hle->mp3_inPtr += 0x40; + t6 &= 0xFFE0; + t5 &= 0xFFE0; + t6 |= t4; + t5 |= t4; + InnerLoop(hle, outPtr, inPtr, t6, t5, t4); + t4 = (t4 - 2) & 0x1E; + tmp = t6; + t6 = t5; + t5 = tmp; + inPtr += 0x40; + outPtr += 0x40; } /* --------------- Inner Loop End -------------------- */ memcpy(hle->dram + writePtr, hle->mp3_buffer + 0xe70, 0x180); @@ -244,9 +255,9 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address) } } - - -static void InnerLoop(struct hle_t* hle) +static void InnerLoop(struct hle_t* hle, + uint32_t outPtr, uint32_t inPtr, + uint32_t t6, uint32_t t5, uint32_t t4) { /* Part 1: 100% Accurate */ @@ -274,56 +285,56 @@ static void InnerLoop(struct hle_t* hle) int32_t vt; int32_t v[32]; - v[0] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x00 ^ S16)); - v[31] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3E ^ S16)); + v[0] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x00 ^ S16)); + v[31] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3E ^ S16)); v[0] += v[31]; - v[1] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x02 ^ S16)); - v[30] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3C ^ S16)); + v[1] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x02 ^ S16)); + v[30] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3C ^ S16)); v[1] += v[30]; - v[2] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x06 ^ S16)); - v[28] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x38 ^ S16)); + v[2] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x06 ^ S16)); + v[28] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x38 ^ S16)); v[2] += v[28]; - v[3] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x04 ^ S16)); - v[29] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3A ^ S16)); + v[3] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x04 ^ S16)); + v[29] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3A ^ S16)); v[3] += v[29]; - v[4] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0E ^ S16)); - v[24] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x30 ^ S16)); + v[4] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0E ^ S16)); + v[24] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x30 ^ S16)); v[4] += v[24]; - v[5] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0C ^ S16)); - v[25] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x32 ^ S16)); + v[5] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0C ^ S16)); + v[25] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x32 ^ S16)); v[5] += v[25]; - v[6] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x08 ^ S16)); - v[27] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x36 ^ S16)); + v[6] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x08 ^ S16)); + v[27] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x36 ^ S16)); v[6] += v[27]; - v[7] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0A ^ S16)); - 
v[26] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x34 ^ S16)); + v[7] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0A ^ S16)); + v[26] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x34 ^ S16)); v[7] += v[26]; - v[8] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1E ^ S16)); - v[16] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x20 ^ S16)); + v[8] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1E ^ S16)); + v[16] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x20 ^ S16)); v[8] += v[16]; - v[9] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1C ^ S16)); - v[17] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x22 ^ S16)); + v[9] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1C ^ S16)); + v[17] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x22 ^ S16)); v[9] += v[17]; - v[10] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x18 ^ S16)); - v[19] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x26 ^ S16)); + v[10] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x18 ^ S16)); + v[19] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x26 ^ S16)); v[10] += v[19]; - v[11] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1A ^ S16)); - v[18] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x24 ^ S16)); + v[11] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1A ^ S16)); + v[18] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x24 ^ S16)); v[11] += v[18]; - v[12] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x10 ^ S16)); - v[23] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2E ^ S16)); + v[12] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x10 ^ S16)); + v[23] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2E ^ S16)); v[12] += v[23]; - v[13] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x12 ^ S16)); - v[22] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2C ^ S16)); + v[13] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x12 ^ S16)); + v[22] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2C ^ S16)); v[13] += v[22]; - v[14] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x16 ^ S16)); - v[20] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x28 ^ S16)); + v[14] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x16 ^ S16)); + v[20] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x28 ^ S16)); v[14] += v[20]; - v[15] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x14 ^ S16)); - v[21] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2A ^ S16)); + v[15] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x14 ^ S16)); + v[21] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2A ^ S16)); v[15] += v[21]; /* Part 2-4 */ @@ -332,10 +343,10 @@ static void InnerLoop(struct hle_t* hle) /* Part 5 - 1-Wide Butterflies - 100% Accurate but need SSVs!!! */ - t0 = hle->mp3_t6 + 0x100; - t1 = hle->mp3_t6 + 0x200; - t2 = hle->mp3_t5 + 0x100; - t3 = hle->mp3_t5 + 0x200; + t0 = t6 + 0x100; + t1 = t6 + 0x200; + t2 = t5 + 0x100; + t3 = t5 + 0x200; /* 0x13A8 */ v[1] = 0; @@ -344,14 +355,14 @@ static void InnerLoop(struct hle_t* hle) v[16] = -v[16] - v[17]; v[2] = v[18] + v[19]; /* ** Store v[11] -> (T6 + 0)** */ - *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x0))) = (short)v[11]; + *(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x0))) = (short)v[11]; v[11] = -v[11]; /* ** Store v[16] -> (T3 + 0)** */ *(int16_t *)(hle->mp3_buffer + ((t3 + (short)0x0))) = (short)v[16]; /* ** Store v[11] -> (T5 + 0)** */ - *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x0))) = (short)v[11]; + *(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x0))) = (short)v[11]; /* 0x13E8 - Verified.... 
*/ v[2] = -v[2]; /* ** Store v[2] -> (T2 + 0)** */ @@ -398,7 +409,7 @@ static void InnerLoop(struct hle_t* hle) v[17] = v[13] - v[10]; v[9] = v[9] + v[14]; /* ** Store v[9] -> (T6 + 0x40) */ - *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x40))) = (short)v[9]; + *(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x40))) = (short)v[9]; v[11] = v[11] - v[13]; /* ** Store v[17] -> (T0 + 0xFFC0) */ *(int16_t *)(hle->mp3_buffer + ((t0 + (short)0xFFC0))) = (short)v[17]; @@ -414,63 +425,63 @@ static void InnerLoop(struct hle_t* hle) /* ** Store v[8] -> (T3 + 0xFFC0) */ *(int16_t *)(hle->mp3_buffer + ((t3 + (short)0xFFC0))) = (short)v[8]; /* ** Store v[14] -> (T5 + 0x40) */ - *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x40))) = (short)v[14]; + *(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x40))) = (short)v[14]; /* ** Store v[10] -> (T2 + 0xFFC0) */ *(int16_t *)(hle->mp3_buffer + ((t2 + (short)0xFFC0))) = (short)v[10]; /* 0x14FC - Verified... */ /* Part 6 - 100% Accurate */ - v[0] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x00 ^ S16)); - v[31] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3E ^ S16)); + v[0] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x00 ^ S16)); + v[31] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3E ^ S16)); v[0] -= v[31]; - v[1] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x02 ^ S16)); - v[30] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3C ^ S16)); + v[1] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x02 ^ S16)); + v[30] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3C ^ S16)); v[1] -= v[30]; - v[2] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x06 ^ S16)); - v[28] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x38 ^ S16)); + v[2] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x06 ^ S16)); + v[28] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x38 ^ S16)); v[2] -= v[28]; - v[3] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x04 ^ S16)); - v[29] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3A ^ S16)); + v[3] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x04 ^ S16)); + v[29] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3A ^ S16)); v[3] -= v[29]; - v[4] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0E ^ S16)); - v[24] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x30 ^ S16)); + v[4] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0E ^ S16)); + v[24] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x30 ^ S16)); v[4] -= v[24]; - v[5] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0C ^ S16)); - v[25] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x32 ^ S16)); + v[5] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0C ^ S16)); + v[25] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x32 ^ S16)); v[5] -= v[25]; - v[6] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x08 ^ S16)); - v[27] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x36 ^ S16)); + v[6] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x08 ^ S16)); + v[27] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x36 ^ S16)); v[6] -= v[27]; - v[7] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0A ^ S16)); - v[26] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x34 ^ S16)); + v[7] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0A ^ S16)); + v[26] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x34 ^ S16)); v[7] -= v[26]; - v[8] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1E ^ S16)); - v[16] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x20 ^ S16)); + v[8] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1E ^ S16)); + v[16] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x20 ^ 
S16)); v[8] -= v[16]; - v[9] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1C ^ S16)); - v[17] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x22 ^ S16)); + v[9] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1C ^ S16)); + v[17] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x22 ^ S16)); v[9] -= v[17]; - v[10] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x18 ^ S16)); - v[19] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x26 ^ S16)); + v[10] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x18 ^ S16)); + v[19] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x26 ^ S16)); v[10] -= v[19]; - v[11] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1A ^ S16)); - v[18] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x24 ^ S16)); + v[11] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1A ^ S16)); + v[18] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x24 ^ S16)); v[11] -= v[18]; - v[12] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x10 ^ S16)); - v[23] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2E ^ S16)); + v[12] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x10 ^ S16)); + v[23] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2E ^ S16)); v[12] -= v[23]; - v[13] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x12 ^ S16)); - v[22] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2C ^ S16)); + v[13] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x12 ^ S16)); + v[22] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2C ^ S16)); v[13] -= v[22]; - v[14] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x16 ^ S16)); - v[20] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x28 ^ S16)); + v[14] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x16 ^ S16)); + v[20] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x28 ^ S16)); v[14] -= v[20]; - v[15] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x14 ^ S16)); - v[21] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2A ^ S16)); + v[15] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x14 ^ S16)); + v[21] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2A ^ S16)); v[15] -= v[21]; for (i = 0; i < 16; i++) @@ -516,10 +527,10 @@ static void InnerLoop(struct hle_t* hle) v[14] = v[6] - v[14]; v[15] = (((v[30] - v[31]) * 0x5A827) >> 0x10) - v[7]; /* Store v14 -> (T5 + 0x20) */ - *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x20))) = (short)v[14]; + *(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x20))) = (short)v[14]; v[14] = v[14] + v[1]; /* Store v[14] -> (T6 + 0x20) */ - *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x20))) = (short)v[14]; + *(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x20))) = (short)v[14]; /* Store v[15] -> (T1 + 0xFFE0) */ *(int16_t *)(hle->mp3_buffer + ((t1 + (short)0xFFE0))) = (short)v[15]; v[9] = v[9] + v[10]; @@ -527,7 +538,7 @@ static void InnerLoop(struct hle_t* hle) v[6] = v[10] - v[6]; v[1] = v[9] - v[1]; /* Store v[6] -> (T5 + 0x60) */ - *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x60))) = (short)v[6]; + *(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x60))) = (short)v[6]; v[10] = v[10] + v[2]; v[10] = v[4] - v[10]; /* Store v[10] -> (T2 + 0xFFA0) */ @@ -547,7 +558,7 @@ static void InnerLoop(struct hle_t* hle) *(int16_t *)(hle->mp3_buffer + ((t1 + (short)0xFFA0))) = (short)v[7]; v[11] = v[11] - v[3]; /* Store v[1] -> (T6 + 0x60) */ - *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x60))) = (short)v[1]; + *(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x60))) = (short)v[1]; v[11] = v[11] - v[5]; /* Store v[11] -> (T0 + 0x60) */ *(int16_t *)(hle->mp3_buffer + ((t0 + (short)0x60))) = (short)v[11]; @@ 
-564,9 +575,9 @@ static void InnerLoop(struct hle_t* hle) /* Step 8 - Dewindowing */ - addptr = hle->mp3_t6 & 0xFFE0; + addptr = t6 & 0xFFE0; - offset = 0x10 - (hle->mp3_t4 >> 1); + offset = 0x10 - (t4 >> 1); for (x = 0; x < 8; x++) { int32_t v0; int32_t v18; @@ -585,14 +596,14 @@ static void InnerLoop(struct hle_t* hle) /* Clamp(v0); */ /* Clamp(v18); */ /* clamp??? */ - *(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v0; - *(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 2)^S16)) = v18; - hle->mp3_outPtr += 4; + *(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v0; + *(int16_t *)(hle->mp3_buffer + ((outPtr + 2)^S16)) = v18; + outPtr += 4; addptr += 0x30; offset += 0x38; } - offset = 0x10 - (hle->mp3_t4 >> 1) + 8 * 0x40; + offset = 0x10 - (t4 >> 1) + 8 * 0x40; v2 = v4 = 0; for (i = 0; i < 4; i++) { v2 += ((int) * (int16_t *)(hle->mp3_buffer + (addptr) + 0x00) * (short)DeWindowLUT[offset + 0x00] + 0x4000) >> 0xF; @@ -606,12 +617,12 @@ static void InnerLoop(struct hle_t* hle) } mult6 = *(int32_t *)(hle->mp3_buffer + 0xCE8); mult4 = *(int32_t *)(hle->mp3_buffer + 0xCEC); - if (hle->mp3_t4 & 0x2) { + if (t4 & 0x2) { v2 = (v2 **(uint32_t *)(hle->mp3_buffer + 0xCE8)) >> 0x10; - *(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v2; + *(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v2; } else { v4 = (v4 **(uint32_t *)(hle->mp3_buffer + 0xCE8)) >> 0x10; - *(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v4; + *(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v4; mult4 = *(uint32_t *)(hle->mp3_buffer + 0xCE8); } addptr -= 0x50; @@ -621,7 +632,7 @@ static void InnerLoop(struct hle_t* hle) int32_t v18; v2 = v4 = v6 = v8 = 0; - offset = (0x22F - (hle->mp3_t4 >> 1) + x * 0x40); + offset = (0x22F - (t4 >> 1) + x * 0x40); for (i = 0; i < 4; i++) { v2 += ((int) * (int16_t *)(hle->mp3_buffer + (addptr) + 0x20) * (short)DeWindowLUT[offset + 0x00] + 0x4000) >> 0xF; @@ -640,13 +651,13 @@ static void InnerLoop(struct hle_t* hle) /* Clamp(v0); */ /* Clamp(v18); */ /* clamp??? 
*/ - *(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 2)^S16)) = v0; - *(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 4)^S16)) = v18; - hle->mp3_outPtr += 4; + *(int16_t *)(hle->mp3_buffer + ((outPtr + 2)^S16)) = v0; + *(int16_t *)(hle->mp3_buffer + ((outPtr + 4)^S16)) = v18; + outPtr += 4; addptr -= 0x50; } - tmp = hle->mp3_outPtr; + tmp = outPtr; hi0 = mult6; hi1 = mult4; @@ -655,43 +666,20 @@ static void InnerLoop(struct hle_t* hle) for (i = 0; i < 8; i++) { /* v0 */ vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x40)^S16)) * hi0); - if (vt > 32767) { - vt = 32767; - } else { - if (vt < -32767) - vt = -32767; - } - *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x40)^S16)) = (int16_t)vt; + *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x40)^S16)) = clamp_s16(vt); /* v17 */ vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x30)^S16)) * hi0); - if (vt > 32767) { - vt = 32767; - } else { - if (vt < -32767) - vt = -32767; - } - *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x30)^S16)) = vt; + *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x30)^S16)) = clamp_s16(vt); /* v2 */ vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x1E)^S16)) * hi1); - if (vt > 32767) { - vt = 32767; - } else { - if (vt < -32767) - vt = -32767; - } - *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x1E)^S16)) = vt; + *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x1E)^S16)) = clamp_s16(vt); /* v4 */ vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0xE)^S16)) * hi1); - if (vt > 32767) { - vt = 32767; - } else { - if (vt < -32767) - vt = -32767; - } - *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0xE)^S16)) = vt; + *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0xE)^S16)) = clamp_s16(vt); + tmp += 2; } } diff --git a/Frameworks/lazyusf/lazyusf/rsp_hle/musyx.c b/Frameworks/lazyusf/lazyusf/rsp_hle/musyx.c index 950c558fc..6d6fd1b08 100644 --- a/Frameworks/lazyusf/lazyusf/rsp_hle/musyx.c +++ b/Frameworks/lazyusf/lazyusf/rsp_hle/musyx.c @@ -28,6 +28,8 @@ #include #include +#include "common.h" + #include "arithmetics.h" #include "audio.h" #include "hle_external.h" @@ -436,20 +438,16 @@ static void init_subframes_v1(musyx_t *musyx) static void init_subframes_v2(musyx_t *musyx) { unsigned i,k; + int16_t values[4]; + int16_t* subframes[4]; - int16_t values[4] = { - clamp_s16(musyx->base_vol[0]), - clamp_s16(musyx->base_vol[1]), - clamp_s16(musyx->base_vol[2]), - clamp_s16(musyx->base_vol[3]) - }; + for(k = 0; k < 4; ++k) + values[k] = clamp_s16(musyx->base_vol[k]); - int16_t* subframes[4] = { - musyx->left, - musyx->right, - musyx->cc0, - musyx->e50 - }; + subframes[0] = musyx->left; + subframes[1] = musyx->right; + subframes[2] = musyx->cc0; + subframes[3] = musyx->e50; for (i = 0; i < SUBFRAME_SIZE; ++i) { @@ -855,7 +853,7 @@ static void sfx_stage(struct hle_t* hle, mix_sfx_with_main_subframes_t mix_sfx_w } static void mix_sfx_with_main_subframes_v1(musyx_t *musyx, const int16_t *subframe, - const uint16_t* gains) + const uint16_t* UNUSED(gains)) { unsigned i; diff --git a/Frameworks/lazyusf/lazyusf/usf.h b/Frameworks/lazyusf/lazyusf/usf.h index 654e66791..5e0076e41 100644 --- a/Frameworks/lazyusf/lazyusf/usf.h +++ b/Frameworks/lazyusf/lazyusf/usf.h @@ -47,6 +47,12 @@ int usf_upload_section(void * state, const uint8_t * data, size_t size); Requesting zero samples with a null pointer is an acceptable way to force at least one block of samples to render and return the current sample rate in the variable passed in. 
+ Requesting a non-zero number of samples with a null buffer pointer will + result in exactly count samples being rendered and discarded. + Emulation runs in whole blocks until just enough Audio Interface DMA + transfers have completed to fill at least count samples, at which + point the remainder is buffered in the emulator state until the next + usf_render() call. Returns 0 on success, or a pointer to the last error message on failure. */ const char * usf_render(void * state, int16_t * buffer, size_t count, int32_t * sample_rate); diff --git a/Frameworks/lazyusf/lazyusf/usf_internal.h b/Frameworks/lazyusf/lazyusf/usf_internal.h index e795dc63b..2b8cb6742 100644 --- a/Frameworks/lazyusf/lazyusf/usf_internal.h +++ b/Frameworks/lazyusf/lazyusf/usf_internal.h @@ -77,10 +77,20 @@ struct usf_state int16_t * sample_buffer; // audio.c + // SampleRate normally stays the same for the duration of a given + // track, and depends on the game. int32_t SampleRate; + // Audio is rendered in whole Audio Interface DMA transfers, which are + // then copied directly to the caller's buffer. Any leftover samples + // from the last DMA transfer that fills the caller's buffer will be + // stored here until the next call to usf_render(). int16_t samplebuf[16384]; size_t samples_in_buffer; + // This buffer does not really need to be that large, as it is likely + // to accumulate only a handful of error messages, at which point + // emulation is immediately halted and the messages are returned to + // the caller. const char * last_error; char error_message[1024]; diff --git a/Plugins/HighlyComplete/HighlyComplete/HCDecoder.mm b/Plugins/HighlyComplete/HighlyComplete/HCDecoder.mm index 788a8f4bd..9dc82aa2b 100644 --- a/Plugins/HighlyComplete/HighlyComplete/HCDecoder.mm +++ b/Plugins/HighlyComplete/HighlyComplete/HCDecoder.mm @@ -1030,6 +1030,8 @@ static int usf_info(void * context, const char * name, const char * value) sampleRate = samplerate; usfRemoveSilence = YES; + + silence_seconds = 10; } else if ( type == 0x22 ) {
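For context on the alist_envmix_ge() rewrite in rsp_hle/alist.c above: the old loop advanced each volume ramp once per group of eight samples using the full rate[] step, while the new loop advances the ramps on every sample, which is why the initialization now divides rate[0] and rate[1] by 8. Below is a minimal standalone check of that equivalence; the 0x2000 step value is made up for illustration.

#include <assert.h>
#include <stdint.h>

int main(void)
{
    int32_t step = 0x2000;     /* hypothetical ramp rate */
    int32_t coarse = 0, fine = 0;
    int i;

    coarse += step;            /* old scheme: one full step per 8-sample group */
    for (i = 0; i < 8; i++)
        fine += step / 8;      /* new scheme: one eighth of a step per sample */

    /* Totals agree exactly whenever step is a multiple of 8; the
       per-sample version just produces a smoother gain curve. */
    assert(coarse == fine);
    return 0;
}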
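The rsp_hle/memory.h refactor above keeps the XOR-addressing trick the old dmem_*/dram_* accessors used to present big-endian RSP memory on a little-endian host, but factors it into the generic u8()/u16()/u32() helpers. A standalone sketch of that addressing, assuming the little-endian values S8 == 3 and S16 == 2 that memory.h defines (both are 0 on big-endian builds, where the asserts below would not apply):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define S8  3 /* assumed little-endian definitions from memory.h */
#define S16 2

int main(void)
{
    /* The RSP sees big-endian bytes 0x11 0x22 0x33 0x44 at addresses
       0..3; the host stores them as one native 32-bit word. */
    unsigned char dmem[4];
    uint32_t word = 0x11223344;
    uint16_t half;
    memcpy(dmem, &word, sizeof word);

    /* Byte address 0 maps to host offset 0 ^ S8 == 3 on little endian. */
    assert(dmem[0 ^ S8] == 0x11);

    /* The big-endian halfword at address 0 lives at host offset
       0 ^ S16 == 2; memcpy stands in for the aligned cast in u16(). */
    memcpy(&half, dmem + (0 ^ S16), sizeof half);
    assert(half == 0x1122);
    return 0;
}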
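Finally, a usage sketch of the usf_render() contract documented by the new usf.h comments. It assumes the state has already been created and fed its reserved and ROM sections via usf_upload_section(), and it takes count to mean stereo sample frames (the int16_t buffer holds interleaved left/right pairs); neither assumption is spelled out in this patch.

#include <stdint.h>
#include "usf.h"

static const char * render_example(void * state)
{
    int32_t sample_rate = 0;
    int16_t buffer[1024 * 2]; /* 1024 stereo frames */
    const char * err;

    /* Null buffer, zero count: forces at least one block to render and
       reports the current sample rate without returning any samples. */
    err = usf_render(state, NULL, 0, &sample_rate);
    if (err != NULL)
        return err;

    /* Render exactly 1024 frames; whatever the last AI DMA transfer
       produced beyond that stays buffered in the emulator state for
       the next call. */
    return usf_render(state, buffer, 1024, &sample_rate);
}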