Update LazyUSF and increased silence detection threshold for USF files to 10 seconds, which fixes Majora's Mask - Staff Roll

CQTexperiment
Chris Moeller 2014-04-07 17:42:09 -07:00
parent ef2c8efdf9
commit 0fb8aa57bb
56 changed files with 1765 additions and 423 deletions

View File

@ -191,7 +191,7 @@ int32_t r4300i_LB_NonMemory ( usf_state_t * state, uint32_t PAddr, uint32_t * Va
}
uint32_t r4300i_LB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t * Value ) {
uintptr_t address;
uint32_t address;
address = state->TLB_Map[VAddr >> 12];
if (address == 0) { return 0; }
*Value = *(uint8_t *)(address + (VAddr ^ 3));
@ -199,7 +199,7 @@ uint32_t r4300i_LB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t * Value
}
uint32_t r4300i_LD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t * Value ) {
uintptr_t address;
uint32_t address;
address = state->TLB_Map[VAddr >> 12];
if (address == 0) { return 0; }
*((uint32_t *)(Value) + 1) = *(uint32_t *)(address + VAddr);
@ -220,7 +220,7 @@ int32_t r4300i_LH_NonMemory ( usf_state_t * state, uint32_t PAddr, uint32_t * Va
}
uint32_t r4300i_LH_VAddr ( usf_state_t * state, uint32_t VAddr, uint16_t * Value ) {
uintptr_t address;
uint32_t address;
address = state->TLB_Map[VAddr >> 12];
if (address == 0)
return 0;
@ -426,7 +426,7 @@ int32_t r4300i_SB_NonMemory ( usf_state_t * state, uint32_t PAddr, uint8_t Value
}
uint32_t r4300i_SB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t Value ) {
uintptr_t address;
uint32_t address;
address = state->TLB_Map[VAddr >> 12];
if (address == 0) { return 0; }
@ -457,7 +457,7 @@ int32_t r4300i_SH_NonMemory ( usf_state_t * state, uint32_t PAddr, uint16_t Valu
}
uint32_t r4300i_SD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t Value ) {
uintptr_t address;
uint32_t address;
address = state->TLB_Map[VAddr >> 12];
if (address == 0) { return 0; }
*(uint32_t *)(address + VAddr) = *((uint32_t *)(&Value) + 1);
@ -466,7 +466,7 @@ uint32_t r4300i_SD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t Value )
}
uint32_t r4300i_SH_VAddr ( usf_state_t * state, uint32_t VAddr, uint16_t Value ) {
uintptr_t address;
uint32_t address;
address = state->TLB_Map[VAddr >> 12];
if (address == 0) { return 0; }

View File

@ -22,6 +22,9 @@ NOINLINE void run_task(usf_state_t * state)
{
register int PC;
int wrap_count = 0;
#ifdef SP_EXECUTE_LOG
int last_PC;
#endif
if (CFG_WAIT_FOR_CPU_HOST != 0)
{
@ -36,6 +39,9 @@ NOINLINE void run_task(usf_state_t * state)
register uint32_t inst;
inst = *(uint32_t *)(state->IMEM + FIT_IMEM(PC));
#ifdef SP_EXECUTE_LOG
last_PC = PC;
#endif
#ifdef EMULATE_STATIC_PC
PC = (PC + 0x004);
if ( FIT_IMEM(PC) == 0 && ++wrap_count == 32 )
@ -46,7 +52,7 @@ NOINLINE void run_task(usf_state_t * state)
EX:
#endif
#ifdef SP_EXECUTE_LOG
step_SP_commands(inst);
step_SP_commands(state, last_PC, inst);
#endif
if (inst >> 25 == 0x25) /* is a VU instruction */
{
@ -463,6 +469,9 @@ EX:
continue;
BRANCH:
inst = *(uint32_t *)(state->IMEM + FIT_IMEM(PC));
#ifdef SP_EXECUTE_LOG
last_PC = PC;
#endif
PC = state->temp_PC & 0x00000FFC;
goto EX;
#endif

View File

@ -58,12 +58,10 @@ NOINLINE static void message(usf_state_t * state, const char* body, int priority
*/
#define CHARACTERS_PER_LINE (80)
/* typical standard DOS text file limit per line */
#if 0
NOINLINE static void update_conf(const char* source)
{
(void)source;
}
#endif
#ifdef SP_EXECUTE_LOG
extern void step_SP_commands(usf_state_t * state, int PC, uint32_t inst);

View File

@ -180,7 +180,6 @@ void set_VCC(usf_state_t * state, unsigned short VCC)
state->clip[i] = (VCC >> (i + 0x8)) & 1;
return; /* Little endian becomes big. */
}
#if 0
void set_VCE(usf_state_t * state, unsigned char VCE)
{
register int i;
@ -190,4 +189,3 @@ void set_VCE(usf_state_t * state, unsigned char VCE)
return; /* Little endian becomes big. */
}
#endif
#endif

View File

@ -42,6 +42,22 @@
static INLINE void merge(short* VD, short* cmp, short* pass, short* fail)
{
register int i;
#ifdef ARCH_MIN_ARM_NEON
int16x8_t p,f,d,c,vd,temp;
p = vld1q_s16((const int16_t*)pass);
f = vld1q_s16((const int16_t*)fail);
c = vld1q_s16((const int16_t*)cmp);
d = vsubq_s16(p,f);
vd = vmlaq_s16(f, c, d); //vd = f + (cmp * d)
vst1q_s16(VD, vd);
return;
#else
#if (0)
/* Do not use this version yet, as it still does not vectorize to SSE2. */
for (i = 0; i < N; i++)
@ -55,9 +71,92 @@ static INLINE void merge(short* VD, short* cmp, short* pass, short* fail)
VD[i] = fail[i] + cmp[i]*diff[i]; /* actually `(cmp[i] != 0)*diff[i]` */
#endif
return;
#endif
}
#ifndef ARCH_MIN_SSE2
#ifdef ARCH_MIN_ARM_NEON
static INLINE void vector_copy(short * VD, short * VS)
{
int16x8_t xmm;
xmm = vld1q_s16((const int16_t*)VS);
vst1q_s16(VD, xmm);
return;
}
static INLINE void SIGNED_CLAMP_ADD(usf_state_t * state, short* VD, short* VS, short* VT)
{
int16x8_t dst, src, vco, max, min;
src = vld1q_s16((const int16_t*)VS);
dst = vld1q_s16((const int16_t*)VT);
vco = vld1q_s16((const int16_t*)state->co);
max = vmaxq_s16(dst, src);
min = vminq_s16(dst, src);
min = vqaddq_s16(min, vco);
max = vqaddq_s16(max, min);
vst1q_s16(VD, max);
return;
}
static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, short* VT)
{
int16x8_t dst, src, vco, dif, res, xmm,vd;
src = vld1q_s16((const int16_t*)VS);
vd = vld1q_s16((const int16_t*)VD);
dst = vld1q_s16((const int16_t*)VT);
vco = vld1q_s16((const int16_t*)state->co);
res = vqsubq_s16(src, dst);
dif = vaddq_s16(res, vco);
dif = veorq_s16(dif, res);
dif = vandq_s16(dif, dst);
xmm = vsubq_s16(src, dst);
src = vbicq_s16(dif, src);
xmm = vandq_s16(xmm, src);
xmm = vshrq_n_s16(xmm, 15);
xmm = vbicq_s16(vco, xmm);
res = vqsubq_s16(res, xmm);
vst1q_s16(VD, res);
return;
}
static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
{
int16x8_t pvs, pvd;
int16x8x2_t packed;
int16x8_t result;
int16x4_t low, high;
pvs = vld1q_s16((const int16_t*)VACC_H);
pvd = vld1q_s16((const int16_t*)VACC_M);
packed = vzipq_s16(pvd,pvs);
low = vqmovn_s32((int32x4_t)packed.val[0]);
high = vqmovn_s32((int32x4_t)packed.val[1]);
result = vcombine_s16(low,high);
vst1q_s16(VD,result);
return;
}
#endif
#if !defined ARCH_MIN_SSE2 && !defined ARCH_MIN_ARM_NEON
static INLINE void vector_copy(short* VD, short* VS)
{
#if (0)
@ -92,6 +191,8 @@ static INLINE void SIGNED_CLAMP_ADD(usf_state_t * state, short* VD, short* VS, s
VD[i] ^= 0x8000 & (hi[i] | lo[i]);
return;
}
static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, short* VT)
{
ALIGNED int32_t dif[N];
@ -113,8 +214,9 @@ static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, s
VD[i] ^= 0x8000 & (hi[i] | lo[i]);
return;
}
static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
{ /* typical sign-clamp of accumulator-mid (bits 31:16) */
{
ALIGNED short hi[N], lo[N];
register int i;
@ -135,7 +237,9 @@ static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
VD[i] ^= 0x8000 * (hi[i] | lo[i]);
return;
}
#else
#endif
#ifdef ARCH_MIN_SSE2
/*
* We actually need to write explicit SSE2 code for this because GCC 4.8.1
* (and possibly later versions) has a code generation bug with vectorizing
@ -225,10 +329,29 @@ static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
static INLINE void UNSIGNED_CLAMP(usf_state_t * state, short* VD)
{ /* sign-zero hybrid clamp of accumulator-mid (bits 31:16) */
ALIGNED short cond[N];
ALIGNED short temp[N];
register int i;
#ifdef ARCH_MIN_ARM_NEON
uint16x8_t c;
int16x8_t t = vld1q_s16((const int16_t*)temp);
int16x8_t vaccm = vld1q_s16((const int16_t*)VACC_M);
SIGNED_CLAMP_AM(state, temp);
c = vcgtq_s16(t,vaccm);
int16x8_t t_ = vshrq_n_s16(t,15);
int16x8_t vd = vbicq_s16(t,t_);
vd = vorrq_s16(vd,(int16x8_t)c);
vst1q_s16(VD, vd);
return;
#else
SIGNED_CLAMP_AM(state, temp); /* no direct map in SSE, but closely based on this */
for (i = 0; i < N; i++)
cond[i] = -(temp[i] > VACC_M[i]); /* VD |= -(ACC47..16 > +32767) */
@ -237,12 +360,36 @@ static INLINE void UNSIGNED_CLAMP(usf_state_t * state, short* VD)
for (i = 0; i < N; i++)
VD[i] = VD[i] | cond[i];
return;
#endif
}
static INLINE void SIGNED_CLAMP_AL(usf_state_t * state, short* VD)
{ /* sign-clamp accumulator-low (bits 15:0) */
ALIGNED short cond[N];
ALIGNED short temp[N];
register int i;
#ifdef ARCH_MIN_ARM_NEON
SIGNED_CLAMP_AM(state, temp);
uint16x8_t c;
int16x8_t eightk = vdupq_n_s16(0x8000);
uint16x8_t one = vdupq_n_u16(1);
int16x8_t t = vld1q_s16((const int16_t*)temp);
int16x8_t vaccm = vld1q_s16((const int16_t*)VACC_M);
c = vceqq_s16(t,vaccm);
c = vaddq_u16(c, one);
t = veorq_s16(t, eightk);
vst1q_u16(cond,c);
vst1q_s16(temp,t);
merge(VD, cond, temp, VACC_L);
return;
#else
SIGNED_CLAMP_AM(state, temp); /* no direct map in SSE, but closely based on this */
for (i = 0; i < N; i++)
@ -251,5 +398,6 @@ static INLINE void SIGNED_CLAMP_AL(usf_state_t * state, short* VD)
temp[i] ^= 0x8000; /* half-assed unsigned saturation mix in the clamp */
merge(VD, cond, temp, VACC_L);
return;
#endif
}
#endif

View File

@ -22,65 +22,42 @@
#define INLINE
#endif
#ifndef ARCH_MIN_SSE2
/*
* vector-scalar element decoding
* Obsolete. Consider using at least the SSE2 algorithms instead.
*/
static const int ei[16][8] = {
{ 00, 01, 02, 03, 04, 05, 06, 07 }, /* none (vector-only operand) */
{ 00, 01, 02, 03, 04, 05, 06, 07 },
{ 00, 00, 02, 02, 04, 04, 06, 06 }, /* 0Q */
{ 01, 01, 03, 03, 05, 05, 07, 07 }, /* 1Q */
{ 00, 00, 00, 00, 04, 04, 04, 04 }, /* 0H */
{ 01, 01, 01, 01, 05, 05, 05, 05 }, /* 1H */
{ 02, 02, 02, 02, 06, 06, 06, 06 }, /* 2H */
{ 03, 03, 03, 03, 07, 07, 07, 07 }, /* 3H */
{ 00, 00, 00, 00, 00, 00, 00, 00 }, /* 0 */
{ 01, 01, 01, 01, 01, 01, 01, 01 }, /* 1 */
{ 02, 02, 02, 02, 02, 02, 02, 02 }, /* 2 */
{ 03, 03, 03, 03, 03, 03, 03, 03 }, /* 3 */
{ 04, 04, 04, 04, 04, 04, 04, 04 }, /* 4 */
{ 05, 05, 05, 05, 05, 05, 05, 05 }, /* 5 */
{ 06, 06, 06, 06, 06, 06, 06, 06 }, /* 6 */
{ 07, 07, 07, 07, 07, 07, 07, 07 } /* 7 */
#ifdef ARCH_MIN_ARM_NEON
static const unsigned char smask[16][16] = {
{0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF},
{0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF},
{0x0,0x1,0x0,0x1,0x4,0x5,0x4,0x5,0x8,0x9,0x8,0x9,0xC,0xD,0xC,0xD},
{0x2,0x3,0x2,0x3,0x6,0x7,0x6,0x7,0xA,0xB,0xA,0xB,0xE,0xF,0xE,0xF},
{0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9},
{0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB},
{0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD},
{0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF},
{0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1},
{0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3},
{0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5},
{0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7},
{0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9},
{0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB},
{0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD},
{0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF}
};
int sub_mask[16] = {
0x0,
0x0,
0x1, 0x1,
0x3, 0x3, 0x3, 0x3,
0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7
};
#define conv816_to_882(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })
INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
{
ALIGNED short SV[8];
register int i, j;
#if (0 == 0)
j = sub_mask[e];
for (i = 0; i < N; i++)
SV[i] = VT[(i & ~j) | (e & j)];
#else
if (e & 0x8)
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0x0) | (e & 0x7)];
else if (e & 0x4)
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0xC) | (e & 0x3)];
else if (e & 0x2)
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0xE) | (e & 0x1)];
else /* if ((e == 0b0000) || (e == 0b0001)) */
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0x7) | (e & 0x0)];
#endif
for (i = 0; i < N; i++)
*(VD + i) = *(SV + i);
{
int8x16_t xmm;
int8x16_t key;
xmm = vld1q_s8((const int8_t*)VT);
key = vld1q_s8(smask[e]);
xmm = vcombine_s8(vtbl2_s8(conv816_to_882(xmm), vget_low_s8(key)), vtbl2_s8(conv816_to_882(xmm), vget_high_s8(key)));
vst1q_s8((int8_t*)VD, xmm);
return;
}
#else
#endif
#ifdef ARCH_MIN_SSSE3
static const unsigned char smask[16][16] = {
{0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF},
@ -112,7 +89,9 @@ INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
_mm_store_si128((__m128i *)VD, xmm);
return;
}
#else
#endif
#if defined ARCH_MIN_SSE2 && !defined ARCH_MIN_SSSE3
#define B(x) ((x) & 3)
#define SHUFFLE(a,b,c,d) ((B(d)<<6) | (B(c)<<4) | (B(b)<<2) | (B(a)<<0))
@ -274,5 +253,65 @@ INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
return;
}
#endif
#if !defined ARCH_MIN_ARM_NEON && !defined ARCH_MIN_SSE2 && !defined ARCH_MIN_SSSE3
/*
* vector-scalar element decoding
* Obsolete. Consider using at least the SSE2 algorithms instead.
*/
static const int ei[16][8] = {
{ 00, 01, 02, 03, 04, 05, 06, 07 }, /* none (vector-only operand) */
{ 00, 01, 02, 03, 04, 05, 06, 07 },
{ 00, 00, 02, 02, 04, 04, 06, 06 }, /* 0Q */
{ 01, 01, 03, 03, 05, 05, 07, 07 }, /* 1Q */
{ 00, 00, 00, 00, 04, 04, 04, 04 }, /* 0H */
{ 01, 01, 01, 01, 05, 05, 05, 05 }, /* 1H */
{ 02, 02, 02, 02, 06, 06, 06, 06 }, /* 2H */
{ 03, 03, 03, 03, 07, 07, 07, 07 }, /* 3H */
{ 00, 00, 00, 00, 00, 00, 00, 00 }, /* 0 */
{ 01, 01, 01, 01, 01, 01, 01, 01 }, /* 1 */
{ 02, 02, 02, 02, 02, 02, 02, 02 }, /* 2 */
{ 03, 03, 03, 03, 03, 03, 03, 03 }, /* 3 */
{ 04, 04, 04, 04, 04, 04, 04, 04 }, /* 4 */
{ 05, 05, 05, 05, 05, 05, 05, 05 }, /* 5 */
{ 06, 06, 06, 06, 06, 06, 06, 06 }, /* 6 */
{ 07, 07, 07, 07, 07, 07, 07, 07 } /* 7 */
};
static const int sub_mask[16] = {
0x0,
0x0,
0x1, 0x1,
0x3, 0x3, 0x3, 0x3,
0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7
};
INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
{
ALIGNED short SV[8];
register int i, j;
#if (0 == 0)
j = sub_mask[e];
for (i = 0; i < N; i++)
SV[i] = VT[(i & ~j) | (e & j)];
#else
if (e & 0x8)
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0x0) | (e & 0x7)];
else if (e & 0x4)
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0xC) | (e & 0x3)];
else if (e & 0x2)
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0xE) | (e & 0x1)];
else /* if ((e == 0b0000) || (e == 0b0001)) */
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0x7) | (e & 0x0)];
#endif
for (i = 0; i < N; i++)
*(VD + i) = *(SV + i);
return;
}
#endif
#endif

View File

@ -27,6 +27,30 @@ INLINE static void do_abs(usf_state_t * state, short* VD, short* VS, short* VT)
register int i;
vector_copy(res, VT);
#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs = vld1q_s16((const int16_t*)VS);
int16x8_t resi = vld1q_s16((const int16_t*)VT);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t eightk = vdupq_n_s16(0x8000);
int16x8_t one = vdupq_n_s16(1);
uint16x8_t negi = vcltq_s16(vs,zero);
int16x8_t posi = vaddq_s16((int16x8_t)negi,one);
posi = vorrq_s16(posi,(int16x8_t)negi);
resi = veorq_s16(resi,posi);
uint16x8_t ccch = vcgeq_s16(resi,eightk);
int16x8_t ch = vnegq_s16((int16x8_t)ccch);
resi = vaddq_s16(resi, (int16x8_t)ch);
vst1q_s16(VACC_L, resi);
vector_copy(VD, VACC_L);
return;
#else
#ifndef ARCH_MIN_SSE2
#define MASK_XOR
#endif
@ -65,6 +89,7 @@ INLINE static void do_abs(usf_state_t * state, short* VD, short* VS, short* VT)
vector_copy(VACC_L, res);
vector_copy(VD, VACC_L);
return;
#endif
}
static void VABS(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,50 @@
INLINE static void clr_ci(usf_state_t * state, short* VD, short* VS, short* VT)
{ /* clear CARRY and carry in to accumulators */
#ifdef ARCH_MIN_SSE2
__m128i xmm,vs,vt,co; /*,ne;*/
xmm = _mm_setzero_si128();
vs = _mm_load_si128((const __m128i*)VS);
vt = _mm_load_si128((const __m128i*)VT);
co = _mm_load_si128((const __m128i*)state->co);
vs = _mm_add_epi16(vs,vt);
vs = _mm_add_epi16(vs,co);
_mm_store_si128((__m128i*)VACC_L, vs);
SIGNED_CLAMP_ADD(state, VD, VS, VT);
_mm_storeu_si128((__m128i*)state->ne, xmm);
_mm_storeu_si128((__m128i*)state->co, xmm);
return;
#endif
#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs, vt, zero1,co;
zero1 = vdupq_n_s16(0);
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
co = vld1q_s16((const int16_t*)state->co);
vs = vaddq_s16(vs,vt);
vs = vaddq_s16(vs,co);
vst1q_s16(VACC_L, vs);
SIGNED_CLAMP_ADD(state, VD, VS, VT);
vst1q_s16(state->ne, zero1);
vst1q_s16(state->co, zero1);
return;
#endif
#if !defined ARCH_MIN_ARM_NEON && !defined ARCH_MIN_SSE2
register int i;
for (i = 0; i < N; i++)
@ -24,7 +68,10 @@ INLINE static void clr_ci(usf_state_t * state, short* VD, short* VS, short* VT)
state->ne[i] = 0;
for (i = 0; i < N; i++)
state->co[i] = 0;
return;
#endif
}
static void VADD(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,30 @@
INLINE static void set_co(usf_state_t * state, short* VD, short* VS, short* VT)
{ /* set CARRY and carry out from sum */
#ifdef ARCH_MIN_ARM_NEON
uint16x4_t vs_low = vld1_u16((const uint16_t*)VS);
uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4);
uint16x4_t vt_low = vld1_u16((const uint16_t*)VT);
uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4);
uint32x4_t zero = vdupq_n_u32(0);
uint32x4_t v_l = vaddl_u16(vs_low, vt_low);
uint32x4_t v_h = vaddl_u16(vs_high, vt_high);
uint16x4_t vl16 = vaddhn_u32(v_l,zero);
uint16x4_t vh16 = vaddhn_u32(v_h,zero);
uint16x8_t vaccl = vcombine_u16(vmovn_u32(v_l),vmovn_u32(v_h));
uint16x8_t co = vcombine_u16(vl16,vh16);
vst1q_u16(VACC_L, vaccl);
vector_copy(VD, VACC_L);
vst1q_u16(state->ne, (uint16x8_t)zero);
vst1q_u16(state->co, co);
return;
#else
ALIGNED int32_t sum[N];
register int i;
@ -28,6 +52,8 @@ INLINE static void set_co(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = sum[i] >> 16; /* native: (sum[i] > +65535) */
return;
#endif
}
static void VADDC(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -13,14 +13,25 @@
\******************************************************************************/
#include "vu.h"
static INLINE void do_and(usf_state_t * state, short* VD, short* VS, short* VT)
INLINE void do_and(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs = vld1q_s16((const int16_t *)VS);
int16x8_t vt = vld1q_s16((const int16_t *)VT);
int16x8_t vaccl = vandq_s16(vs,vt);
vst1q_s16(VACC_L, vaccl);
vector_copy(VD, VACC_L);
return;
#else
register int i;
for (i = 0; i < N; i++)
VACC_L[i] = VS[i] & VT[i];
vector_copy(VD, VACC_L);
return;
#endif
}
static void VAND(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -19,6 +19,58 @@ INLINE static void do_ch(usf_state_t * state, short* VD, short* VS, short* VT)
ALIGNED short sn[N];
ALIGNED short VC[N];
ALIGNED short diff[N];
#ifdef ARCH_MIN_ARM_NEON
int16x8_t v_vc,neg_sn,vce,v_eq;
int16x8_t zero = vdupq_n_s16(0);
int16x8_t one = vdupq_n_s16(1);
int16x8_t vs = vld1q_s16((const int16_t *)VS);
int16x8_t vt = vld1q_s16((const int16_t *)VT);
v_vc = vt;
int16x8_t v_sn = veorq_s16(vs,v_vc);
uint16x8_t sn_u = vcltq_s16(v_sn,zero);
v_vc = veorq_s16(v_vc, (int16x8_t)sn_u);
neg_sn = vnegq_s16((int16x8_t)sn_u);
uint16x8_t vs_vc_eq = vceqq_s16(vs,v_vc);
vce = vandq_s16((int16x8_t)vs_vc_eq,v_sn);
v_vc = vaddq_s16(v_vc,v_sn);
v_eq = vorrq_s16(vce,(int16x8_t)vs_vc_eq);
v_eq = vnegq_s16(v_eq);
int16x8_t not_sn = vsubq_s16(neg_sn, one);
int16x8_t neg_vs = vmvnq_s16(vs);
int16x8_t v_diff = vorrq_s16(neg_vs, not_sn);
uint16x8_t ule = vcleq_s16(vt,v_diff);
int16x8_t v_le = vnegq_s16((int16x8_t)ule);
v_diff = vorrq_s16(vs, (int16x8_t)sn_u);
uint16x8_t uge = vcgeq_s16(v_diff,vt);
int16x8_t v_ge = vnegq_s16((int16x8_t)uge);
vst1q_s16(ge, v_ge);
vst1q_s16(le, v_le);
vst1q_s16(sn, v_sn);
vst1q_s16(VC, v_vc);
merge(state->comp, sn, le, ge);
merge(VACC_L, state->comp, VC, VS);
vector_copy(VD, VACC_L);
v_eq = veorq_s16(v_eq, one);
vst1q_s16(state->clip, v_ge);
vst1q_s16(state->comp, v_le);
vst1q_s16(state->ne, v_eq);
vst1q_s16(state->co, v_sn);
return;
#else
register int i;
for (i = 0; i < N; i++)
@ -72,6 +124,7 @@ INLINE static void do_ch(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = sn[i];
return;
#endif
}
static void VCH(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,7 @@
INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT)
{
ALIGNED short eq[N], ge[N], le[N];
ALIGNED short gen[N], len[N], lz[N], uz[N], sn[N];
ALIGNED short diff[N];
@ -22,6 +23,88 @@ INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT)
ALIGNED unsigned short VB[N], VC[N];
register int i;
#ifdef ARCH_MIN_ARM_NEON
int16x8_t v_vc,neg_sn,v_eq,v_vb,v_sn,v_diff,v_uz,v_cmp;
int16x8_t zero = vdupq_n_s16(0);
int16x8_t minus1 = vdupq_n_s16(-1);
int16x8_t one = vdupq_n_s16(1);
int16x8_t vs = vld1q_s16((const int16_t *)VS);
int16x8_t vt = vld1q_s16((const int16_t *)VT);
int16x8_t ne = vld1q_s16((const int16_t *)state->ne);
int16x8_t co = vld1q_s16((const int16_t *)state->co);
int16x8_t vce = vld1q_s16((const int16_t *)state->vce);
v_vb = vs;
v_vc = vt;
v_eq = veorq_s16(ne, one);
v_sn = co;
neg_sn = vnegq_s16((int16x8_t)v_sn);
v_vc = veorq_s16(v_vc, (int16x8_t)neg_sn);
v_vc = vaddq_s16(v_vc, v_sn);
v_diff = vsubq_s16(v_vb,v_vc);
uint16x8_t vb_cond = vceqq_s16(v_vb,minus1);
uint16x8_t vc_cond = vceqq_s16(v_vc,zero);
vb_cond = vmvnq_u16(vb_cond);
vc_cond = vmvnq_u16(vc_cond);
v_uz = vorrq_s16((int16x8_t)vb_cond, (int16x8_t)vc_cond);
uint16x8_t v_lz = vceqq_s16(v_diff,zero);
int16x8_t lz_s = vnegq_s16((int16x8_t)v_lz);
int16x8_t v_gen = vorrq_s16(lz_s,v_uz);
int16x8_t v_len = vandq_s16(lz_s,v_uz);
v_gen = vandq_s16(v_gen,vce);
vce = veorq_s16(vce,one);
v_len = vandq_s16(v_len,vce);
v_len = vorrq_s16(v_len,v_gen);
uint16x8_t gen_u = vcgeq_u16((uint16x8_t)v_vb,(uint16x8_t)v_vc);
v_gen = vnegq_s16((int16x8_t)gen_u);
v_cmp = vandq_s16(v_eq,v_sn);
vst1q_s16(cmp, v_cmp);
vst1q_s16(len, v_len);
merge(le, cmp, len, state->comp);
int16x8_t sn_xor = veorq_s16(v_sn,one);
v_cmp = vandq_s16(v_eq,sn_xor);
vst1q_s16(cmp, v_cmp);
vst1q_s16(gen, v_gen);
vst1q_s16(sn, v_sn);
vst1q_s16(VC, v_vc);
merge(ge, cmp, gen, state->clip);
merge(cmp, sn, le, ge);
merge(VACC_L, cmp, (short *)VC, VS);
vector_copy(VD, VACC_L);
int16x8_t v_ge = vld1q_s16((const int16_t *)ge);
int16x8_t v_le = vld1q_s16((const int16_t *)le);
vst1q_s16(state->clip,v_ge);
vst1q_s16(state->comp,v_le);
vst1q_s16(state->ne,zero);
vst1q_s16(state->co,zero);
vst1q_s16(state->vce,zero);
return;
#else
for (i = 0; i < N; i++)
VB[i] = VS[i];
for (i = 0; i < N; i++)
@ -88,6 +171,7 @@ INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->vce[i] = 0;
return;
#endif
}
static void VCL(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -20,6 +20,48 @@ INLINE static void do_cr(usf_state_t * state, short* VD, short* VS, short* VT)
ALIGNED short cmp[N];
register int i;
#ifdef ARCH_MIN_ARM_NEON
int16x8_t v_sn, v_cmp, v_vc;
int16x8_t zero = vdupq_n_s16(0);
int16x8_t vs = vld1q_s16((const int16_t *)VS);
int16x8_t vt = vld1q_s16((const int16_t *)VT);
v_vc = vt;
v_sn = veorq_s16(vs,vt);
v_sn = vshrq_n_s16(v_sn,15);
v_cmp = vandq_s16(vs, v_sn);
v_cmp = vmvnq_s16(v_cmp);
uint16x8_t v_le = vcleq_s16(vt,v_cmp);
int16x8_t v_le_ = vnegq_s16((int16x8_t)v_le);
v_cmp = vorrq_s16(vs, v_sn);
uint16x8_t v_ge = vcgeq_s16(v_cmp, vt);
int16x8_t v_ge_ = vnegq_s16((int16x8_t)v_ge);
v_vc = veorq_s16(v_vc,v_sn);
vst1q_s16(VC, v_vc);
vst1q_s16(le, v_le_);
vst1q_s16(ge, v_ge_);
merge(VACC_L, le, VC, VS);
vector_copy(VD, VACC_L);
vst1q_s16(state->clip, v_ge_);
vst1q_s16(state->comp, v_le_);
vst1q_s16(state->ne,zero);
vst1q_s16(state->co,zero);
vst1q_s16(state->vce,zero);
return;
#else
for (i = 0; i < N; i++)
VC[i] = VT[i];
for (i = 0; i < N; i++)
@ -55,6 +97,7 @@ INLINE static void do_cr(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->vce[i] = 0;
return;
#endif
}
static void VCR(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -17,6 +17,30 @@ INLINE static void do_eq(usf_state_t * state, short* VD, short* VS, short* VT)
{
register int i;
#ifdef ARCH_MIN_ARM_NEON
int16x8_t one = vdupq_n_s16(1);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t vs = vld1q_s16((const int16_t *)VS);
int16x8_t vt = vld1q_s16((const int16_t *)VT);
int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne);
uint16x8_t v_comp = vceqq_s16(vs, vt);
int16x8_t v_comp_ = vnegq_s16((int16x8_t)v_comp);
v_ne = veorq_s16(v_ne, one);
v_comp_ = vandq_s16(v_comp_, v_ne);
vector_copy(VACC_L, VT);
vector_copy(VD, VACC_L);
vst1q_s16(state->comp,v_comp_);
vst1q_s16(state->ne,zero);
vst1q_s16(state->co,zero);
return;
#else
for (i = 0; i < N; i++)
state->clip[i] = 0;
for (i = 0; i < N; i++)
@ -35,6 +59,7 @@ INLINE static void do_eq(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = 0;
return;
#endif
}
static void VEQ(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,10 +15,44 @@
INLINE static void do_ge(usf_state_t * state, short* VD, short* VS, short* VT)
{
ALIGNED short ce[N];
ALIGNED short eq[N];
register int i;
#ifdef ARCH_MIN_ARM_NEON
int16x8_t zero = vdupq_n_s16(0);
int16x8_t one = vdupq_n_s16(1);
int16x8_t vs = vld1q_s16((const int16_t *)VS);
int16x8_t vt = vld1q_s16((const int16_t *)VT);
int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne);
int16x8_t v_co = vld1q_s16((const int16_t *)state->co);
uint16x8_t v_eq_u = vceqq_s16(vs,vt);
int16x8_t v_eq_s = vnegq_s16((int16x8_t)v_eq_u);
int16x8_t v_ce = vandq_s16(v_ne,v_co);
v_ce = veorq_s16(v_ce,one);
v_eq_s = vandq_s16(v_eq_s, v_ce);
vst1q_s16(state->clip, zero);
uint16x8_t v_comp = vcgtq_s16(vs, vt);
int16x8_t v_comp_s = vnegq_s16((int16x8_t)v_comp);
v_comp_s = vorrq_s16(v_comp_s, v_eq_s);
vst1q_s16(state->comp, v_comp_s);
merge(VACC_L, state->comp, VS, VT);
vector_copy(VD, VACC_L);
vst1q_s16(state->ne,zero);
vst1q_s16(state->co,zero);
return;
#else
for (i = 0; i < N; i++)
eq[i] = (VS[i] == VT[i]);
for (i = 0; i < N; i++)
@ -39,6 +73,7 @@ INLINE static void do_ge(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = 0;
return;
#endif
}
static void VGE(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,37 @@
INLINE static void do_lt(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
int16x8_t zero = vdupq_n_s16(0);
int16x8_t vs = vld1q_s16((const int16_t *)VS);
int16x8_t vt = vld1q_s16((const int16_t *)VT);
int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne);
int16x8_t v_co = vld1q_s16((const int16_t *)state->co);
uint16x8_t v_eq_u = vceqq_s16(vs,vt);
int16x8_t v_cn = vandq_s16(v_ne,v_co);
v_eq_u = vandq_u16(v_eq_u,(uint16x8_t)v_cn);
vst1q_s16(state->clip, zero);
uint16x8_t v_comp = vcltq_s16(vs, vt);
int16x8_t v_comp_s = vnegq_s16((int16x8_t)v_comp);
v_comp_s = vorrq_s16(v_comp_s, (int16x8_t)v_eq_u);
vst1q_s16(state->comp, v_comp_s);
merge(VACC_L, state->comp, VS, VT);
vector_copy(VD, VACC_L);
vst1q_s16(state->ne, zero);
vst1q_s16(state->co, zero);
return;
#else
ALIGNED short cn[N];
ALIGNED short eq[N];
register int i;
@ -39,6 +70,7 @@ INLINE static void do_lt(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = 0;
return;
#endif
}
static void VLT(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,10 +15,81 @@
INLINE static void do_macf(usf_state_t * state, short* VD, short* VS, short* VT)
{
ALIGNED int32_t product[N];
#ifdef ARCH_MIN_ARM_NEON
uint16x8_t vs = vld1q_u16((const uint16_t*)VS);
uint16x8_t vt = vld1q_u16((const uint16_t*)VT);
uint16x4_t v_vaccl_low = vld1_u16((const uint16_t*)VACC_L);
uint16x4_t v_vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
uint16x4_t v_vaccm_low = vld1_u16((const uint16_t*)VACC_M);
uint16x4_t v_vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
uint16x4_t v_vacch_low = vld1_u16((const uint16_t*)VACC_H);
uint16x4_t v_vacch_high = vld1_u16((const uint16_t*)VACC_H+4);
int32x4_t zero = vdupq_n_s32(0);
uint32x4_t zero_u = vdupq_n_u32(0);
uint32x4_t highmask = vdupq_n_u32(0x0000ffff);
uint16x4_t vs_low = vget_low_u16(vs);
uint16x4_t vs_high = vget_high_u16(vs);
uint16x4_t vt_low = vget_low_u16(vt);
uint16x4_t vt_high = vget_high_u16(vt);
int32x4_t product_L = vqdmlal_s16(zero, (int16x4_t)vs_low, (int16x4_t)vt_low);
int32x4_t product_H = vqdmlal_s16(zero, (int16x4_t)vs_high, (int16x4_t)vt_high);
uint32x4_t addend_L = vandq_u32(highmask, (uint32x4_t)product_L);
uint32x4_t addend_H = vandq_u32(highmask, (uint32x4_t)product_H);
addend_L = vaddw_u16((uint32x4_t)addend_L, v_vaccl_low);
addend_H = vaddw_u16((uint32x4_t)addend_H, v_vaccl_high);
uint16x8_t v_vaccl = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
addend_L = vaddl_u16(
vaddhn_u32((uint32x4_t)product_L, zero_u),
vaddhn_u32((uint32x4_t)addend_L, zero_u)
);
addend_H = vaddl_u16(
vaddhn_u32((uint32x4_t)product_H, zero_u),
vaddhn_u32((uint32x4_t)addend_H, zero_u)
);
addend_L = vaddw_u16(addend_L, v_vaccm_low);
addend_H = vaddw_u16(addend_H, v_vaccm_high);
uint16x8_t v_vaccm = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
//product_L = vshrq_n_s32(product_L, 1);
//product_H = vshrq_n_s32(product_H, 1);
uint32x4_t cond_L = vcltq_s32(product_L,zero);
int32x4_t cond_L_s = vnegq_s32((int32x4_t)cond_L);
uint32x4_t cond_H = vcltq_s32(product_H,zero);
int32x4_t cond_H_s = vnegq_s32((int32x4_t)cond_H);
v_vacch_low = vsub_u16(v_vacch_low, vmovn_u32((uint32x4_t)cond_L_s));
v_vacch_high = vsub_u16(v_vacch_high, vmovn_u32((uint32x4_t)cond_H_s));
v_vacch_low = vadd_u16(vshrn_n_u32(addend_L,16), v_vacch_low);
v_vacch_high = vadd_u16(vshrn_n_u32(addend_H,16), v_vacch_high);
uint16x8_t v_vacch = vcombine_u16(v_vacch_low, v_vacch_high);
vst1q_s16(VACC_L, (int16x8_t)v_vaccl);
vst1q_s16(VACC_M, (int16x8_t)v_vaccm);
vst1q_s16(VACC_H, (int16x8_t)v_vacch);
SIGNED_CLAMP_AM(state, VD);
return;
#else
ALIGNED int32_t product[N];
ALIGNED uint32_t addend[N];
register int i;
for (i = 0; i < N; i++)
product[i] = VS[i] * VT[i];
for (i = 0; i < N; i++)
@ -38,7 +109,9 @@ INLINE static void do_macf(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
VACC_H[i] += addend[i] >> 16;
SIGNED_CLAMP_AM(state, VD);
return;
#endif
}
static void VMACF(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,78 @@
INLINE static void do_macu(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
uint16x8_t vs = vld1q_u16((const uint16_t*)VS);
uint16x8_t vt = vld1q_u16((const uint16_t*)VT);
uint16x4_t v_vaccl_low = vld1_u16((const uint16_t*)VACC_L);
uint16x4_t v_vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
uint16x4_t v_vaccm_low = vld1_u16((const uint16_t*)VACC_M);
uint16x4_t v_vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
uint16x4_t v_vacch_low = vld1_u16((const uint16_t*)VACC_H);
uint16x4_t v_vacch_high = vld1_u16((const uint16_t*)VACC_H+4);
int32x4_t zero = vdupq_n_s32(0);
uint32x4_t zero_u = vdupq_n_u32(0);
uint32x4_t highmask = vdupq_n_u32(0x0000ffff);
uint16x4_t vs_low = vget_low_u16(vs);
uint16x4_t vs_high = vget_high_u16(vs);
uint16x4_t vt_low = vget_low_u16(vt);
uint16x4_t vt_high = vget_high_u16(vt);
int32x4_t product_L = vqdmlal_s16(zero, (int16x4_t)vs_low, (int16x4_t)vt_low);
int32x4_t product_H = vqdmlal_s16(zero, (int16x4_t)vs_high, (int16x4_t)vt_high);
uint32x4_t addend_L = vandq_u32(highmask, (uint32x4_t)product_L);
uint32x4_t addend_H = vandq_u32(highmask, (uint32x4_t)product_H);
addend_L = vaddw_u16((uint32x4_t)addend_L, v_vaccl_low);
addend_H = vaddw_u16((uint32x4_t)addend_H, v_vaccl_high);
uint16x8_t v_vaccl = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
addend_L = vaddl_u16(
vaddhn_u32((uint32x4_t)product_L, zero_u),
vaddhn_u32((uint32x4_t)addend_L, zero_u)
);
addend_H = vaddl_u16(
vaddhn_u32((uint32x4_t)product_H, zero_u),
vaddhn_u32((uint32x4_t)addend_H, zero_u)
);
addend_L = vaddw_u16(addend_L, v_vaccm_low);
addend_H = vaddw_u16(addend_H, v_vaccm_high);
uint16x8_t v_vaccm = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
//product_L = vshrq_n_s32(product_L, 1);
//product_H = vshrq_n_s32(product_H, 1);
uint32x4_t cond_L = vcltq_s32(product_L,zero);
int32x4_t cond_L_s = vnegq_s32((int32x4_t)cond_L);
uint32x4_t cond_H = vcltq_s32(product_H,zero);
int32x4_t cond_H_s = vnegq_s32((int32x4_t)cond_H);
v_vacch_low = vsub_u16(v_vacch_low, vmovn_u32((uint32x4_t)cond_L_s));
v_vacch_high = vsub_u16(v_vacch_high, vmovn_u32((uint32x4_t)cond_H_s));
v_vacch_low = vadd_u16(vshrn_n_u32(addend_L,16), v_vacch_low);
v_vacch_high = vadd_u16(vshrn_n_u32(addend_H,16), v_vacch_high);
uint16x8_t v_vacch = vcombine_u16(v_vacch_low, v_vacch_high);
vst1q_s16(VACC_L, (int16x8_t)v_vaccl);
vst1q_s16(VACC_M, (int16x8_t)v_vaccm);
vst1q_s16(VACC_H, (int16x8_t)v_vacch);
UNSIGNED_CLAMP(state, VD);
return;
#else
ALIGNED int32_t product[N];
ALIGNED uint32_t addend[N];
register int i;
@ -39,6 +111,7 @@ INLINE static void do_macu(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_H[i] += addend[i] >> 16;
UNSIGNED_CLAMP(state, VD);
return;
#endif
}
static void VMACU(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,39 @@
INLINE static void do_madh(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs,vt, vaccm, vacch, vacc_h, vacc_m,prod_low,prod_high,one,cond;
uint16x8_t cond_u,res;
one = vdupq_n_s16(1);
vs = vld1q_s16((const int16_t *)VS);
vt = vld1q_s16((const int16_t *)VT);
vaccm = vld1q_s16((const int16_t *)VACC_M);
vacch = vld1q_s16((const int16_t *)VACC_H);
prod_low = vmulq_s16(vs,vt);
vacc_m = vaddq_s16(vaccm,prod_low);
prod_high = vqdmulhq_s16(vs,vt);
prod_high = vshrq_n_s16(prod_high, 1);
res = vqaddq_u16((uint16x8_t)vaccm, (uint16x8_t)prod_low);
cond_u = vceqq_s16((int16x8_t)res,vacc_m);
cond_u = vaddq_u16(cond_u, (uint16x8_t)one);
vacc_h = vaddq_s16(prod_high, vacch);
vacc_h = vaddq_s16((int16x8_t)cond_u, vacc_h);
vst1q_s16(VACC_M, vacc_m);
vst1q_s16(VACC_H, vacc_h);
SIGNED_CLAMP_AM(state, VD);
return;
#else
ALIGNED int32_t product[N];
ALIGNED uint32_t addend[N];
register int i;
@ -29,6 +62,8 @@ INLINE static void do_madh(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_H[i] += (addend[i] >> 16) + (product[i] >> 16);
SIGNED_CLAMP_AM(state, VD);
return;
#endif
}
static void VMADH(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,56 @@
INLINE static void do_madl(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
uint16x8_t vs = vld1q_u16((const uint16_t*)VS);
uint16x8_t vt = vld1q_u16((const uint16_t*)VT);
uint16x8_t v_vaccl = vld1q_u16((const uint16_t*)VACC_L);
uint16x8_t v_vaccm = vld1q_u16((const uint16_t*)VACC_M);
uint16x8_t v_vacch = vld1q_u16((const uint16_t*)VACC_H);
uint32x4_t zero = vdupq_n_u32(0);
uint16x4_t vs_low = vget_low_u16(vs);
uint16x4_t vs_high = vget_high_u16(vs);
uint16x4_t vt_low = vget_low_u16(vt);
uint16x4_t vt_high = vget_high_u16(vt);
uint32x4_t product_L = vmulq_u32( vmovl_u16(vs_low), vmovl_u16(vt_low) );
uint32x4_t product_H = vmulq_u32( vmovl_u16(vs_high), vmovl_u16(vt_high) );
uint16x4_t addend_L = vaddhn_u32(product_L, zero);
uint16x4_t addend_H = vaddhn_u32(product_H, zero);
uint32x4_t exceed1 = vaddl_u16(addend_L, vget_low_u16(v_vaccl));
uint32x4_t exceed2 = vaddl_u16(addend_H, vget_high_u16(v_vaccl));
v_vaccl = vcombine_u16(vmovn_u32(exceed1), vmovn_u32(exceed2));
addend_L = vaddhn_u32(exceed1, zero);
addend_H = vaddhn_u32(exceed2, zero);
exceed1 = vaddl_u16(addend_L, vget_low_u16(v_vaccm));
exceed2 = vaddl_u16(addend_H, vget_high_u16(v_vaccm));
v_vaccm = vcombine_u16(vmovn_u32(exceed1), vmovn_u32(exceed2));
addend_L = vaddhn_u32(exceed1, zero);
addend_H = vaddhn_u32(exceed2, zero);
uint16x8_t v_vacch2 = vcombine_u16(addend_L, addend_H);
v_vacch = vaddq_u16(v_vacch, v_vacch2);
vst1q_s16(VACC_L, (int16x8_t)v_vaccl);
vst1q_s16(VACC_M, (int16x8_t)v_vaccm);
vst1q_s16(VACC_H, (int16x8_t)v_vacch);
SIGNED_CLAMP_AL(state, VD);
return;
#else
ALIGNED int32_t product[N];
ALIGNED uint32_t addend[N];
register int i;
@ -37,6 +87,7 @@ INLINE static void do_madl(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_H[i] += addend[i] >> 16;
SIGNED_CLAMP_AL(state, VD);
return;
#endif
}
static void VMADL(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,43 @@
INLINE static void do_madm(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
uint32x4_t zero = vdupq_n_u32(0);
int16x4_t vs_low = vld1_s16((const int16_t*)VS);
int16x4_t vs_high = vld1_s16((const int16_t*)VS+4);
uint16x4_t vt_low = vld1_u16((const uint16_t*)VT);
uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4);
uint16x4_t vaccl_low = vld1_u16((const uint16_t*)VACC_L);
uint16x4_t vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
uint16x4_t vaccm_low = vld1_u16((const uint16_t*)VACC_M);
uint16x4_t vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
int16x4_t vacch_low = vld1_s16((const int16_t*)VACC_H);
int16x4_t vacch_high = vld1_s16((const int16_t*)VACC_H+4);
int32x4_t vaccl_l = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_low),vmovl_s16(vs_low),(int32x4_t)vmovl_u16(vt_low));
int32x4_t vaccl_h = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_high),vmovl_s16(vs_high),(int32x4_t)vmovl_u16(vt_high));
uint32x4_t vaccm_l = vaddq_u32(vmovl_u16(vaccm_low), (uint32x4_t)vshrq_n_s32(vaccl_l,16));
uint32x4_t vaccm_h = vaddq_u32(vmovl_u16(vaccm_high),(uint32x4_t)vshrq_n_s32(vaccl_h,16));
uint16x4_t vacch_l = vaddhn_u32(vaccm_l, zero);
uint16x4_t vacch_h = vaddhn_u32(vaccm_h, zero);
int16x4_t vacch_low2 = vadd_s16(vacch_low,(int16x4_t)vacch_l);
int16x4_t vacch_high2 = vadd_s16(vacch_high,(int16x4_t)vacch_h);
int16x8_t vaccl = vcombine_s16(vmovn_s32(vaccl_l),vmovn_s32(vaccl_h));
uint16x8_t vaccm = vcombine_u16(vmovn_u32(vaccm_l),vmovn_u32(vaccm_h));
int16x8_t vacch = vcombine_s16(vacch_low2,vacch_high2);
vst1q_s16(VACC_L, vaccl);
vst1q_s16(VACC_M, (int16x8_t)vaccm);
vst1q_s16(VACC_H, vacch);
SIGNED_CLAMP_AM(state, VD);
return;
#else
ALIGNED uint32_t addend[N];
register int i;
@ -32,6 +69,7 @@ INLINE static void do_madm(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_H[i] += addend[i] >> 16;
SIGNED_CLAMP_AM(state, VD);
return;
#endif
}
static void VMADM(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,43 @@
INLINE static void do_madn(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
uint32x4_t zero = vdupq_n_u32(0);
uint16x4_t vs_low = vld1_u16((const uint16_t*)VS);
uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4);
int16x4_t vt_low = vld1_s16((const int16_t*)VT);
int16x4_t vt_high = vld1_s16((const int16_t*)VT+4);
uint16x4_t vaccl_low = vld1_u16((const uint16_t*)VACC_L);
uint16x4_t vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
uint16x4_t vaccm_low = vld1_u16((const uint16_t*)VACC_M);
uint16x4_t vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
int16x4_t vacch_low = vld1_s16((const int16_t*)VACC_H);
int16x4_t vacch_high = vld1_s16((const int16_t*)VACC_H+4);
int32x4_t vaccl_l = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_low),(int32x4_t)vmovl_u16(vs_low),vmovl_s16(vt_low));
int32x4_t vaccl_h = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_high),(int32x4_t)vmovl_u16(vs_high),vmovl_s16(vt_high));
uint32x4_t vaccm_l = vaddq_u32(vmovl_u16(vaccm_low), (uint32x4_t)vshrq_n_s32(vaccl_l,16));
uint32x4_t vaccm_h = vaddq_u32(vmovl_u16(vaccm_high),(uint32x4_t)vshrq_n_s32(vaccl_h,16));
uint16x4_t vacch_l = vaddhn_u32(vaccm_l, zero);
uint16x4_t vacch_h = vaddhn_u32(vaccm_h, zero);
int16x4_t vacch_low2 = vadd_s16(vacch_low,(int16x4_t)vacch_l);
int16x4_t vacch_high2 = vadd_s16(vacch_high,(int16x4_t)vacch_h);
int16x8_t vaccl = vcombine_s16(vmovn_s32(vaccl_l),vmovn_s32(vaccl_h));
uint16x8_t vaccm = vcombine_u16(vmovn_u32(vaccm_l),vmovn_u32(vaccm_h));
int16x8_t vacch = vcombine_s16(vacch_low2,vacch_high2);
vst1q_s16(VACC_L, vaccl);
vst1q_s16(VACC_M, (int16x8_t)vaccm);
vst1q_s16(VACC_H, vacch);
SIGNED_CLAMP_AL(state, VD);
return;
#else
ALIGNED uint32_t addend[N];
register int i;
@ -32,6 +69,7 @@ INLINE static void do_madn(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_H[i] += addend[i] >> 16;
SIGNED_CLAMP_AL(state, VD);
return;
#endif
}
static void VMADN(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,37 @@
INLINE static void do_mudh(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
int16x8_t zero16 = vdupq_n_s16(0);
int32x4_t zero32 = vdupq_n_s32(0);
int16x4_t vs_low = vld1_s16((const int16_t *)VS);
int16x4_t vs_high = vld1_s16((const int16_t *)VS+4);
int16x4_t vt_low = vld1_s16((const int16_t *)VT);
int16x4_t vt_high = vld1_s16((const int16_t *)VT+4);
int32x4_t l1 = vmovl_s16(vs_low);
int32x4_t h1 = vmovl_s16(vs_high);
int32x4_t l2 = vmovl_s16(vt_low);
int32x4_t h2 = vmovl_s16(vt_high);
int32x4_t l = vmulq_s32(l1,l2);
int32x4_t h = vmulq_s32(h1,h2);
int16x8_t vaccm = vcombine_s16(vmovn_s32(l),vmovn_s32(h));
int16x8_t vacch = vcombine_s16(vaddhn_s32(l,zero32),vaddhn_s32(h,zero32));
vst1q_s16(VACC_L, zero16);
vst1q_s16(VACC_M, vaccm);
vst1q_s16(VACC_H, vacch);
SIGNED_CLAMP_AM(state, VD);
return;
#else
register int i;
for (i = 0; i < N; i++)
@ -25,6 +56,7 @@ INLINE static void do_mudh(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_H[i] = (short)(VS[i]*VT[i] >> 16);
SIGNED_CLAMP_AM(state, VD);
return;
#endif
}
static void VMUDH(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,26 @@
INLINE static void do_mudl(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
uint16x4_t vs_low = vld1_u16((const uint16_t*)VS);
uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4);
uint16x4_t vt_low = vld1_u16((const uint16_t*)VT);
uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4);
int16x8_t zero = vdupq_n_s16(0);
uint32x4_t lo = vmull_u16(vs_low, vt_low);
uint32x4_t high = vmull_u16(vs_high, vt_high);
uint16x8_t result = vcombine_u16(vshrn_n_u32(lo,16),vshrn_n_u32(high,16));
vst1q_u16(VACC_L, result);
vst1q_s16(VACC_M, zero);
vst1q_s16(VACC_H, zero);
vector_copy(VD, VACC_L);
return;
#else
register int i;
for (i = 0; i < N; i++)
@ -25,6 +45,7 @@ INLINE static void do_mudl(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_H[i] = 0x0000;
vector_copy(VD, VACC_L); /* no possibilities to clamp */
return;
#endif
}
static void VMUDL(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,39 @@
INLINE static void do_mudm(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
int16x8_t vd, vs,vt,res,four,vacc_l, vacc_m, vacc_h;
int32x4_t zero = vdupq_n_s32(0);
int16x8_t zero16 = vdupq_n_s16(0);
int16x4_t vs_low = vld1_s16((const uint16_t *)VS);
int16x4_t vs_high = vld1_s16((const uint16_t *)VS+4);
uint16x4_t vt_low = vld1_u16((const int16_t *)VT);
uint16x4_t vt_high = vld1_u16((const int16_t *)VT+4);
int32x4_t l1 = vmovl_s16(vs_low);
int32x4_t h1 = vmovl_s16(vs_high);
uint32x4_t l2 = vmovl_u16(vt_low);
uint32x4_t h2 = vmovl_u16(vt_high);
int32x4_t l = vmulq_s32(l1,(int32x4_t)l2);
int32x4_t h = vmulq_s32(h1,(int32x4_t)h2);
int16x8_t vaccl = vcombine_s16(vmovn_s32(l),vmovn_s32(h));
int16x8_t vaccm = vcombine_s16(vaddhn_s32(l,zero),vaddhn_s32(h,zero));
uint16x8_t uvacch = vcltq_s16(vaccm, zero16);
vst1q_s16(VACC_L, vaccl);
vst1q_s16(VACC_M, vaccm);
vst1q_s16(VACC_H, (int16x8_t)uvacch);
vector_copy(VD, VACC_M); /* no possibilities to clamp */
return;
#else
register int i;
for (i = 0; i < N; i++)
@ -25,6 +58,7 @@ INLINE static void do_mudm(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_H[i] = -(VACC_M[i] < 0);
vector_copy(VD, VACC_M); /* no possibilities to clamp */
return;
#endif
}
static void VMUDM(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,40 @@
INLINE static void do_mudn(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
int16x8_t vd, vs,vt,res,four,vacc_l, vacc_m, vacc_h;
int32x4_t zero = vdupq_n_s32(0);
int16x8_t zero16 = vdupq_n_s16(0);
uint16x4_t vs_low = vld1_u16((const uint16_t *)VS);
uint16x4_t vs_high = vld1_u16((const uint16_t *)VS+4);
int16x4_t vt_low = vld1_s16((const int16_t *)VT);
int16x4_t vt_high = vld1_s16((const int16_t *)VT+4);
uint32x4_t l1 = vmovl_u16(vs_low);
uint32x4_t h1 = vmovl_u16(vs_high);
int32x4_t l2 = vmovl_s16(vt_low);
int32x4_t h2 = vmovl_s16(vt_high);
int32x4_t l = vmulq_s32((int32x4_t)l1,l2);
int32x4_t h = vmulq_s32((int32x4_t)h1,h2);
int16x8_t vaccl = vcombine_s16(vmovn_s32(l),vmovn_s32(h));
int16x8_t vaccm = vcombine_s16(vaddhn_s32(l,zero),vaddhn_s32(h,zero));
uint16x8_t uvacch = vcltq_s16(vaccm, zero16);
vst1q_s16(VACC_L, vaccl);
vst1q_s16(VACC_M, vaccm);
vst1q_s16(VACC_H, (int16x8_t)uvacch);
vector_copy(VD, VACC_L); /* no possibilities to clamp */
return;
#else
register int i;
for (i = 0; i < N; i++)
@ -25,6 +59,7 @@ INLINE static void do_mudn(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_H[i] = -(VACC_M[i] < 0);
vector_copy(VD, VACC_L); /* no possibilities to clamp */
return;
#endif
}
static void VMUDN(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -22,11 +22,44 @@
* Wrong: ACC(HI) = -((INT32)(acc) < 0)
* Right: ACC(HI) = -(SEMIFRAC < 0)
*/
#define SEMIFRAC (VS[i]*VT[i]*2/2 + 0x8000/2)
#define SEMIFRAC (VS[i]*VT[i] + 0x4000)
#endif
INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs,vt,res,four,zero,vacc_l, vacc_m, vacc_h;
uint16x8_t cond_u, vacc_m_cond_u,one;
one = vdupq_n_u16(1);
four = vdupq_n_s16(0x4000);
zero = vdupq_n_s16(0);
vs = vld1q_s16((const int16_t *)VS);
vt = vld1q_s16((const int16_t *)VT);
vacc_m = vqrdmulhq_s16(vs, vt);
vacc_l = vmlaq_s16(four, vs,vt);
vacc_l = vshlq_n_s16(vacc_l,1);
cond_u = vceqq_s16(vs,vt);
cond_u = vaddq_u16(cond_u, one);
vacc_m_cond_u = vcltq_s16(vacc_m, zero);
cond_u = vandq_u16(vacc_m_cond_u, cond_u);
vacc_h = vqnegq_s16((int16x8_t)cond_u);
vst1q_s16(VACC_L,vacc_l);
vst1q_s16(VACC_M,vacc_m);
vst1q_s16(VACC_H,vacc_h);
SIGNED_CLAMP_AM(state, VD);
return;
#else
register int i;
for (i = 0; i < N; i++)
@ -35,7 +68,7 @@ INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_M[i] = (SEMIFRAC << 1) >> 16;
for (i = 0; i < N; i++)
VACC_H[i] = -((VACC_M[i] < 0) & (VS[i] != VT[i])); /* -32768 * -32768 */
#ifndef ARCH_MIN_SSE2
#if !defined ARCH_MIN_SSE2
vector_copy(VD, VACC_M);
for (i = 0; i < N; i++)
VD[i] -= (VACC_M[i] < 0) & (VS[i] == VT[i]); /* ACC b 31 set, min*min */
@ -43,6 +76,7 @@ INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT)
SIGNED_CLAMP_AM(state, VD);
#endif
return;
#endif
}
static void VMULF(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -22,11 +22,50 @@
* Wrong: ACC(HI) = -((INT32)(acc) < 0)
* Right: ACC(HI) = -(SEMIFRAC < 0)
*/
#define SEMIFRAC (VS[i]*VT[i]*2/2 + 0x8000/2)
#define SEMIFRAC (VS[i]*VT[i]*2/2 + 0x4000)
#endif
INLINE static void do_mulu(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
int16x8_t vd, vs,vt,res,four,zero,vacc_l, vacc_m, vacc_h;
uint16x8_t cond_u, vacc_m_cond_u,one;
one = vdupq_n_u16(1);
four = vdupq_n_s16(0x4000);
zero = vdupq_n_s16(0);
vs = vld1q_s16((const int16_t *)VS);
vt = vld1q_s16((const int16_t *)VT);
vacc_m = vqrdmulhq_s16(vs, vt);
vacc_l = vmlaq_s16(four, vs,vt);
vacc_l = vshlq_n_s16(vacc_l,1);
cond_u = vceqq_s16(vs,vt);
cond_u = vaddq_u16(cond_u, one);
vacc_m_cond_u = vcltq_s16(vacc_m, zero);
cond_u = vandq_u16(vacc_m_cond_u, cond_u);
vacc_h = vqnegq_s16((int16x8_t)cond_u);
vst1q_s16(VACC_L,vacc_l);
vst1q_s16(VACC_M,vacc_m);
vst1q_s16(VACC_H,vacc_h);
vd = vacc_m;
uint16x8_t vacc_m_u = vshrq_n_u16((uint16x8_t)vacc_m, 15);
vd = vorrq_s16(vd, (int16x8_t)vacc_m_u);
vd = vbicq_s16(vd, vacc_h);
vst1q_s16(VD, vd);
return;
#else
register int i;
for (i = 0; i < N; i++)
@ -45,6 +84,7 @@ INLINE static void do_mulu(usf_state_t * state, short* VD, short* VS, short* VT)
VD[i] &= ~(VACC_H[i] >> 0); /* VD &= -(result >= 0x000000000000) */
#endif
return;
#endif
}
static void VMULU(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -13,14 +13,27 @@
\******************************************************************************/
#include "vu.h"
/*
 * VNAND helper: VACC_L = ~(VS & VT) element-wise, then VD = VACC_L.
 * No clamping is possible for bitwise ops, so the accumulator low
 * slice is copied to the destination verbatim.
 * (Diff residue resolved: the removed `static INLINE` signature line
 * duplicated the added `INLINE` one; post-change form kept.)
 */
INLINE void do_nand(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
    int16x8_t vs, vt, vaccl;

    vs = vld1q_s16((const int16_t*)VS);
    vt = vld1q_s16((const int16_t*)VT);
    vaccl = vmvnq_s16(vandq_s16(vs, vt)); /* ~(VS & VT) */
    vst1q_s16(VACC_L, vaccl);
    vector_copy(VD, VACC_L);
    return;
#else
    register int i;

    for (i = 0; i < N; i++)
        VACC_L[i] = ~(VS[i] & VT[i]);
    vector_copy(VD, VACC_L);
    return;
#endif
}
static void VNAND(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,33 @@
INLINE static void do_ne(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs, vt,vaccl, ne;
int16x8_t zero = vdupq_n_s16(0);
uint16x8_t cond;
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
ne = vld1q_s16((const int16_t*)state->ne);
cond = vceqq_s16(vs,vt);
cond = vmvnq_u16(cond); // this is needed if you need to do "not-equal"
cond = (uint16x8_t)vnegq_s16((int16x8_t)cond);
uint16x8_t comp = vorrq_u16(cond, (uint16x8_t)ne);
vst1q_s16(state->clip, zero);
vst1q_s16(state->comp, (int16x8_t)cond);
vector_copy(VACC_L, VS);
vector_copy(VD, VACC_L);
vst1q_s16(state->ne, zero);
vst1q_s16(state->co, zero);
return;
#else
register int i;
for (i = 0; i < N; i++)
@ -35,6 +62,7 @@ INLINE static void do_ne(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = 0;
return;
#endif
}
static void VNE(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -13,14 +13,27 @@
\******************************************************************************/
#include "vu.h"
/*
 * VNOR helper: VACC_L = ~(VS | VT) element-wise, then VD = VACC_L.
 * Fix: the NEON path stored VACC_L but never copied it to VD, unlike
 * the scalar path below and every sibling op (do_nand/do_nxor/do_or/
 * do_xor) — VD was left stale on NEON builds. The copy is restored.
 * (Diff residue also resolved: duplicated old/new signature lines.)
 */
INLINE void do_nor(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
    int16x8_t vs, vt, vaccl;

    vs = vld1q_s16((const int16_t*)VS);
    vt = vld1q_s16((const int16_t*)VT);
    vaccl = vmvnq_s16(vorrq_s16(vs, vt)); /* ~(VS | VT) */
    vst1q_s16(VACC_L, vaccl);
    vector_copy(VD, VACC_L); /* was missing: scalar path writes VD */
    return;
#else
    register int i;

    for (i = 0; i < N; i++)
        VACC_L[i] = ~(VS[i] | VT[i]);
    vector_copy(VD, VACC_L);
    return;
#endif
}
static void VNOR(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -13,14 +13,29 @@
\******************************************************************************/
#include "vu.h"
/*
 * VNXOR helper: VACC_L = ~(VS ^ VT) element-wise, then VD = VACC_L.
 * Bitwise ops cannot overflow, so no clamp stage exists.
 * (Diff residue resolved: duplicated old/new signature lines collapsed
 * to the post-change `INLINE` form.)
 */
INLINE void do_nxor(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
    int16x8_t vs, vt, vaccl;

    vs = vld1q_s16((const int16_t*)VS);
    vt = vld1q_s16((const int16_t*)VT);
    vaccl = vmvnq_s16(veorq_s16(vs, vt)); /* ~(VS ^ VT) */
    vst1q_s16(VACC_L, vaccl);
    vector_copy(VD, VACC_L);
    return;
#else
    register int i;

    for (i = 0; i < N; i++)
        VACC_L[i] = ~(VS[i] ^ VT[i]);
    vector_copy(VD, VACC_L);
    return;
#endif
}
static void VNXOR(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -13,14 +13,28 @@
\******************************************************************************/
#include "vu.h"
/*
 * VOR helper: VACC_L = VS | VT element-wise, then VD = VACC_L.
 * (Diff residue resolved: duplicated old/new signature lines collapsed
 * to the post-change `INLINE` form.)
 */
INLINE void do_or(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
    int16x8_t vs, vt, vaccl;

    vs = vld1q_s16((const int16_t*)VS);
    vt = vld1q_s16((const int16_t*)VT);
    vaccl = vorrq_s16(vs, vt); /* VS | VT */
    vst1q_s16(VACC_L, vaccl);
    vector_copy(VD, VACC_L);
    return;
#else
    register int i;

    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] | VT[i];
    vector_copy(VD, VACC_L);
    return;
#endif
}
static void VOR(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -34,13 +34,25 @@ static void VSAR(int vd, int vs, int vt, int e)
if (e > 2)
{
message(state, "VSAR\nInvalid mask.", 2);
#if ARCH_MIN_ARM_NEON
int16x8_t zero = vdupq_n_s16(0);
vst1q_s16(VR[vd], zero);
#else
for (i = 0; i < N; i++)
VR[vd][i] = 0x0000; /* override behavior (zilmar) */
#endif
}
else
{
#if ARCH_MIN_ARM_NEON
vector_copy(VR[vd], VACC[e]);
#else
for (i = 0; i < N; i++)
VR[vd][i] = VACC[e][i];
for (i = 0; i < N; i++)
#endif
}
for (i = 0; i < N; i++)
VACC[e][i] = oldval[i]; /* ... = VS */
return;
}
@ -59,9 +71,19 @@ static void VSAW(usf_state_t * state, int vd, int vs, int vt, int e)
if (e > 0x2)
{ /* branch very unlikely...never seen a game do VSAW illegally */
message(state, "VSAW\nIllegal mask.", 2);
for (i = 0; i < N; i++)
#if ARCH_MIN_ARM_NEON
int16x8_t zero = vdupq_n_s16(0);
vst1q_s16(state->VR[vd], zero);
#else
for (i = 0; i < N; i++)
state->VR[vd][i] = 0x0000; /* override behavior (zilmar) */
return;
#endif
}
vector_copy(state->VR[vd], state->VACC[e]);
return;

View File

@ -15,6 +15,25 @@
INLINE static void clr_bi(usf_state_t * state, short* VD, short* VS, short* VT)
{ /* clear CARRY and borrow in to accumulators */
#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs, vt, zero, co;
zero = vdupq_n_s16(0);
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
co = vld1q_s16((const int16_t*)state->co);
vs = vsubq_s16(vs,vt);
vs = vsubq_s16(vs,co);
vst1q_s16(VACC_L, vs);
SIGNED_CLAMP_SUB(state, VD, VS, VT);
vst1q_s16(state->ne, zero);
vst1q_s16(state->co, zero);
return;
#else
register int i;
for (i = 0; i < N; i++)
@ -25,6 +44,7 @@ INLINE static void clr_bi(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = 0;
return;
#endif
}
static void VSUB(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -15,6 +15,36 @@
INLINE static void set_bo(usf_state_t * state, short* VD, short* VS, short* VT)
{ /* set CARRY and borrow out from difference */
#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs, vt,vaccl,ne, co2;
uint16x8_t cond;
int16x8_t zero = vdupq_n_s16(0);
int16x8_t one = vdupq_n_s16(1);
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
vaccl = vsubq_s16(vs, vt);
uint16x8_t vdif = vqsubq_u16((uint16x8_t)vs, (uint16x8_t)vt);
vst1q_s16(VACC_L, vaccl);
vector_copy(VD, VACC_L);
cond = vceqq_s16(vs, vt);
ne = vaddq_s16((int16x8_t)cond, one);
vdif = vorrq_u16(vdif,cond);
cond = vceqq_u16(vdif, (uint16x8_t)zero);
co2 = vnegq_s16((int16x8_t)cond);
vst1q_s16(state->ne, ne);
vst1q_s16(state->co, co2);
return;
#else
ALIGNED int32_t dif[N];
register int i;
@ -28,6 +58,7 @@ INLINE static void set_bo(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = (dif[i] < 0);
return;
#endif
}
static void VSUBC(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -54,9 +54,18 @@ static void res_V(usf_state_t * state, int vd, int vs, int vt, int e)
if (vs != vt || vt != e)
return;
message(state, "C2\nRESERVED", 2); /* uncertain how to handle reserved, untested */
#ifdef ARCH_MIN_ARM_NEON
int16x8_t zero = vdupq_n_s16(0);
vst1q_s16(state->VR[vd], zero);
return;
#else
for (i = 0; i < N; i++)
state->VR[vd][i] = 0x0000; /* override behavior (bpoint) */
return;
#endif
}
static void res_M(usf_state_t * state, int vd, int vs, int vt, int e)
{

View File

@ -13,14 +13,28 @@
\******************************************************************************/
#include "vu.h"
/*
 * VXOR helper: VACC_L = VS ^ VT element-wise, then VD = VACC_L.
 * (Diff residue resolved: duplicated old/new signature lines collapsed
 * to the post-change `INLINE` form.)
 */
INLINE void do_xor(usf_state_t * state, short* VD, short* VS, short* VT)
{
#ifdef ARCH_MIN_ARM_NEON
    int16x8_t vs, vt;

    vs = vld1q_s16((const int16_t*)VS);
    vt = vld1q_s16((const int16_t*)VT);
    vs = veorq_s16(vs, vt); /* VS ^ VT */
    vst1q_s16(VACC_L, vs);
    vector_copy(VD, VACC_L);
    return;
#else
    register int i;

    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] ^ VT[i];
    vector_copy(VD, VACC_L);
    return;
#endif
}
static void VXOR(usf_state_t * state, int vd, int vs, int vt, int e)

View File

@ -29,6 +29,8 @@
#include <stdint.h>
#include <string.h>
#include "common.h"
#include "alist.h"
#include "arithmetics.h"
#include "audio.h"
@ -58,12 +60,12 @@ static int16_t* sample(struct hle_t* hle, unsigned pos)
static uint8_t* alist_u8(struct hle_t* hle, uint16_t dmem)
{
return &hle->alist_buffer[dmem ^ S8];
return u8(hle->alist_buffer, dmem);
}
static int16_t* alist_s16(struct hle_t* hle, uint16_t dmem)
{
return (int16_t*)(&hle->alist_buffer[dmem ^ S16]);
return (int16_t*)u16(hle->alist_buffer, dmem);
}
@ -82,13 +84,13 @@ static void alist_envmix_mix(size_t n, int16_t** dst, const int16_t* gains, int1
static int16_t ramp_step(struct ramp_t* ramp)
{
bool target_reached;
bool target_reached;
ramp->value += ramp->step;
target_reached = (ramp->step <= 0)
? (ramp->value <= ramp->target)
: (ramp->value >= ramp->target);
? (ramp->value <= ramp->target)
: (ramp->value >= ramp->target);
if (target_reached)
{
@ -184,7 +186,7 @@ void alist_move(struct hle_t* hle, uint16_t dmemo, uint16_t dmemi, uint16_t coun
void alist_copy_every_other_sample(struct hle_t* hle, uint16_t dmemo, uint16_t dmemi, uint16_t count)
{
while (count != 0) {
*(uint16_t*)(alist_u8(hle, dmemo)) = *(uint16_t*)(alist_u8(hle, dmemi));
*alist_s16(hle, dmemo) = *alist_s16(hle, dmemi);
dmemo += 2;
dmemi += 4;
--count;
@ -373,7 +375,7 @@ void alist_envmix_ge(
const int32_t *rate,
uint32_t address)
{
unsigned k, i, ptr;
unsigned k;
size_t n = (aux) ? 4 : 2;
const int16_t* const in = (int16_t*)(hle->alist_buffer + dmemi);
@ -390,8 +392,8 @@ void alist_envmix_ge(
ramps[1].value = (vol[1] << 16);
ramps[0].target = (target[0] << 16);
ramps[1].target = (target[1] << 16);
ramps[0].step = rate[0];
ramps[1].step = rate[1];
ramps[0].step = rate[0] / 8;
ramps[1].step = rate[1] / 8;
} else {
memcpy((uint8_t *)save_buffer, (hle->dram + address), 80);
wet = *(int16_t *)(save_buffer + 0); /* 0-1 */
@ -407,26 +409,23 @@ void alist_envmix_ge(
}
count >>= 1;
for (ptr = 0, k = 0; k < count; k += 8) {
for (k = 0; k < count; k++) {
int16_t gains[4];
int16_t* buffers[4];
int16_t l_vol = ramp_step(&ramps[0]);
int16_t r_vol = ramp_step(&ramps[1]);
gains[0] = clamp_s16((l_vol * dry + 0x4000) >> 15);
gains[1] = clamp_s16((r_vol * dry + 0x4000) >> 15);
gains[2] = clamp_s16((l_vol * wet + 0x4000) >> 15);
gains[3] = clamp_s16((r_vol * wet + 0x4000) >> 15);
buffers[0] = dl + (k^S);
buffers[1] = dr + (k^S);
buffers[2] = wl + (k^S);
buffers[3] = wr + (k^S);
for (i = 0; i < 8; i++) {
buffers[0] = dl + (ptr^S);
buffers[1] = dr + (ptr^S);
buffers[2] = wl + (ptr^S);
buffers[3] = wr + (ptr^S);
gains[0] = clamp_s16((l_vol * dry + 0x4000) >> 15);
gains[1] = clamp_s16((r_vol * dry + 0x4000) >> 15);
gains[2] = clamp_s16((l_vol * wet + 0x4000) >> 15);
gains[3] = clamp_s16((r_vol * wet + 0x4000) >> 15);
alist_envmix_mix(n, buffers, gains, in[ptr^S]);
ptr++;
}
alist_envmix_mix(n, buffers, gains, in[k^S]);
}
*(int16_t *)(save_buffer + 0) = wet; /* 0-1 */

View File

@ -29,6 +29,8 @@
#include <stdint.h>
#include <string.h>
#include "common.h"
#include "alist.h"
#include "hle_internal.h"
#include "memory.h"
@ -52,7 +54,7 @@ static void clear_segments(struct hle_t* hle)
}
/* audio commands definition */
static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}
@ -142,7 +144,7 @@ static void SETVOL(struct hle_t* hle, uint32_t w1, uint32_t w2)
}
}
static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_audio.loop = get_address(hle, w2);
}
@ -165,7 +167,7 @@ static void ADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2)
address);
}
static void LOADBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void LOADBUFF(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint32_t address = get_address(hle, w2);
@ -175,7 +177,7 @@ static void LOADBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_load(hle, hle->alist_audio.in, address, hle->alist_audio.count);
}
static void SAVEBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SAVEBUFF(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint32_t address = get_address(hle, w2);
@ -220,7 +222,7 @@ static void LOADADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2)
dram_load_u16(hle, (uint16_t*)hle->alist_audio.table, address, align(count, 8) >> 1);
}
static void INTERLEAVE(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void INTERLEAVE(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint16_t left = (w2 >> 16) + DMEM_BASE;
uint16_t right = w2 + DMEM_BASE;
@ -243,7 +245,7 @@ static void MIXER(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_mix(hle, dmemo, dmemi, align(hle->alist_audio.count, 32), gain);
}
static void SEGMENT(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SEGMENT(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
set_address(hle, w2);
}

View File

@ -28,12 +28,13 @@
#endif
#include <stdint.h>
#include "common.h"
#include "alist.h"
#include "hle_external.h"
#include "hle_internal.h"
#include "memory.h"
static void MP3(struct hle_t* hle, uint32_t w1, uint32_t w2);
#include "ucodes.h"
enum { NAUDIO_COUNT = 0x170 }; /* ie 184 samples */
enum {
@ -57,7 +58,7 @@ static void UNKNOWN(struct hle_t* hle, uint32_t w1, uint32_t w2)
}
static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}
@ -67,10 +68,11 @@ static void NAUDIO_0000(struct hle_t* hle, uint32_t w1, uint32_t w2)
UNKNOWN(hle, w1, w2);
}
static void NAUDIO_02B0(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void NAUDIO_02B0(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint32_t rate = (hle->alist_naudio.rate[1] & 0xffff0000) | (w2 & 0xffff);
hle->alist_naudio.rate[1] = rate;
/* emulate code at 0x12b0 (inside SETVOL), because PC always execute in IMEM */
hle->alist_naudio.rate[1] &= ~0xffff;
hle->alist_naudio.rate[1] |= (w2 & 0xffff);
}
static void NAUDIO_14(struct hle_t* hle, uint32_t w1, uint32_t w2)
@ -196,7 +198,7 @@ static void DMEMMOVE(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_move(hle, dmemo, dmemi, (count + 3) & ~3);
}
static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_naudio.loop = (w2 & 0xffffff);
}
@ -241,12 +243,12 @@ static void RESAMPLE(struct hle_t* hle, uint32_t w1, uint32_t w2)
address);
}
static void INTERLEAVE(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void INTERLEAVE(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
alist_interleave(hle, NAUDIO_MAIN, NAUDIO_DRY_LEFT, NAUDIO_DRY_RIGHT, NAUDIO_COUNT);
}
static void MP3ADDY(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void MP3ADDY(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}

View File

@ -28,6 +28,8 @@
#endif
#include <stdint.h>
#include "common.h"
#include "alist.h"
#include "hle_external.h"
#include "hle_internal.h"
@ -49,7 +51,7 @@ static void UNKNOWN(struct hle_t* hle, uint32_t w1, uint32_t w2)
}
static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}
@ -61,7 +63,7 @@ static void LOADADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2)
dram_load_u16(hle, (uint16_t*)hle->alist_nead.table, address, count >> 1);
}
static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_nead.loop = w2 & 0xffffff;
}
@ -190,7 +192,7 @@ static void ENVSETUP1(struct hle_t* hle, uint32_t w1, uint32_t w2)
hle->alist_nead.env_steps[1] = w2;
}
static void ENVSETUP2(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void ENVSETUP2(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_nead.env_values[0] = (w2 >> 16);
hle->alist_nead.env_values[1] = w2;
@ -206,6 +208,7 @@ static void ENVMIXER_MK(struct hle_t* hle, uint32_t w1, uint32_t w2)
uint16_t dmem_dr = (w2 >> 12) & 0xff0;
uint16_t dmem_wl = (w2 >> 4) & 0xff0;
uint16_t dmem_wr = (w2 << 4) & 0xff0;
xors[2] = 0; /* unsupported by this ucode */
xors[3] = 0; /* unsupported by this ucode */
xors[0] = 0 - (int16_t)((w1 & 0x2) >> 1);
@ -233,6 +236,7 @@ static void ENVMIXER(struct hle_t* hle, uint32_t w1, uint32_t w2)
uint16_t dmem_dr = (w2 >> 12) & 0xff0;
uint16_t dmem_wl = (w2 >> 4) & 0xff0;
uint16_t dmem_wr = (w2 << 4) & 0xff0;
xors[2] = 0 - (int16_t)((w1 & 0x8) >> 1);
xors[3] = 0 - (int16_t)((w1 & 0x4) >> 1);
xors[0] = 0 - (int16_t)((w1 & 0x2) >> 1);
@ -267,7 +271,7 @@ static void INTERL(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_copy_every_other_sample(hle, dmemo, dmemi, count);
}
static void INTERLEAVE_MK(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void INTERLEAVE_MK(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint16_t left = (w2 >> 16);
uint16_t right = w2;
@ -323,7 +327,7 @@ static void FILTER(struct hle_t* hle, uint32_t w1, uint32_t w2)
}
}
static void SEGMENT(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SEGMENT(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}

View File

@ -24,13 +24,7 @@
#include <stdint.h>
#ifdef _MSC_VER
#define INLINE __forceinline
#else
#define INLINE inline __attribute__((always_inline))
#endif
INLINE static int16_t clamp_s16(int_fast32_t x)
static inline int16_t clamp_s16(int_fast32_t x)
{
x = (x < INT16_MIN) ? INT16_MIN: x;
x = (x > INT16_MAX) ? INT16_MAX: x;

View File

@ -23,41 +23,75 @@
#include <stddef.h>
#include <stdint.h>
#include "common.h"
#include "arithmetics.h"
const int16_t RESAMPLE_LUT[64 * 4] = {
0x0c39, 0x66ad, 0x0d46, 0xffdf, 0x0b39, 0x6696, 0x0e5f, 0xffd8,
0x0a44, 0x6669, 0x0f83, 0xffd0, 0x095a, 0x6626, 0x10b4, 0xffc8,
0x087d, 0x65cd, 0x11f0, 0xffbf, 0x07ab, 0x655e, 0x1338, 0xffb6,
0x06e4, 0x64d9, 0x148c, 0xffac, 0x0628, 0x643f, 0x15eb, 0xffa1,
0x0577, 0x638f, 0x1756, 0xff96, 0x04d1, 0x62cb, 0x18cb, 0xff8a,
0x0435, 0x61f3, 0x1a4c, 0xff7e, 0x03a4, 0x6106, 0x1bd7, 0xff71,
0x031c, 0x6007, 0x1d6c, 0xff64, 0x029f, 0x5ef5, 0x1f0b, 0xff56,
0x022a, 0x5dd0, 0x20b3, 0xff48, 0x01be, 0x5c9a, 0x2264, 0xff3a,
0x015b, 0x5b53, 0x241e, 0xff2c, 0x0101, 0x59fc, 0x25e0, 0xff1e,
0x00ae, 0x5896, 0x27a9, 0xff10, 0x0063, 0x5720, 0x297a, 0xff02,
0x001f, 0x559d, 0x2b50, 0xfef4, 0xffe2, 0x540d, 0x2d2c, 0xfee8,
0xffac, 0x5270, 0x2f0d, 0xfedb, 0xff7c, 0x50c7, 0x30f3, 0xfed0,
0xff53, 0x4f14, 0x32dc, 0xfec6, 0xff2e, 0x4d57, 0x34c8, 0xfebd,
0xff0f, 0x4b91, 0x36b6, 0xfeb6, 0xfef5, 0x49c2, 0x38a5, 0xfeb0,
0xfedf, 0x47ed, 0x3a95, 0xfeac, 0xfece, 0x4611, 0x3c85, 0xfeab,
0xfec0, 0x4430, 0x3e74, 0xfeac, 0xfeb6, 0x424a, 0x4060, 0xfeaf,
0xfeaf, 0x4060, 0x424a, 0xfeb6, 0xfeac, 0x3e74, 0x4430, 0xfec0,
0xfeab, 0x3c85, 0x4611, 0xfece, 0xfeac, 0x3a95, 0x47ed, 0xfedf,
0xfeb0, 0x38a5, 0x49c2, 0xfef5, 0xfeb6, 0x36b6, 0x4b91, 0xff0f,
0xfebd, 0x34c8, 0x4d57, 0xff2e, 0xfec6, 0x32dc, 0x4f14, 0xff53,
0xfed0, 0x30f3, 0x50c7, 0xff7c, 0xfedb, 0x2f0d, 0x5270, 0xffac,
0xfee8, 0x2d2c, 0x540d, 0xffe2, 0xfef4, 0x2b50, 0x559d, 0x001f,
0xff02, 0x297a, 0x5720, 0x0063, 0xff10, 0x27a9, 0x5896, 0x00ae,
0xff1e, 0x25e0, 0x59fc, 0x0101, 0xff2c, 0x241e, 0x5b53, 0x015b,
0xff3a, 0x2264, 0x5c9a, 0x01be, 0xff48, 0x20b3, 0x5dd0, 0x022a,
0xff56, 0x1f0b, 0x5ef5, 0x029f, 0xff64, 0x1d6c, 0x6007, 0x031c,
0xff71, 0x1bd7, 0x6106, 0x03a4, 0xff7e, 0x1a4c, 0x61f3, 0x0435,
0xff8a, 0x18cb, 0x62cb, 0x04d1, 0xff96, 0x1756, 0x638f, 0x0577,
0xffa1, 0x15eb, 0x643f, 0x0628, 0xffac, 0x148c, 0x64d9, 0x06e4,
0xffb6, 0x1338, 0x655e, 0x07ab, 0xffbf, 0x11f0, 0x65cd, 0x087d,
0xffc8, 0x10b4, 0x6626, 0x095a, 0xffd0, 0x0f83, 0x6669, 0x0a44,
0xffd8, 0x0e5f, 0x6696, 0x0b39, 0xffdf, 0x0d46, 0x66ad, 0x0c39
(int16_t)0x0c39, (int16_t)0x66ad, (int16_t)0x0d46, (int16_t)0xffdf,
(int16_t)0x0b39, (int16_t)0x6696, (int16_t)0x0e5f, (int16_t)0xffd8,
(int16_t)0x0a44, (int16_t)0x6669, (int16_t)0x0f83, (int16_t)0xffd0,
(int16_t)0x095a, (int16_t)0x6626, (int16_t)0x10b4, (int16_t)0xffc8,
(int16_t)0x087d, (int16_t)0x65cd, (int16_t)0x11f0, (int16_t)0xffbf,
(int16_t)0x07ab, (int16_t)0x655e, (int16_t)0x1338, (int16_t)0xffb6,
(int16_t)0x06e4, (int16_t)0x64d9, (int16_t)0x148c, (int16_t)0xffac,
(int16_t)0x0628, (int16_t)0x643f, (int16_t)0x15eb, (int16_t)0xffa1,
(int16_t)0x0577, (int16_t)0x638f, (int16_t)0x1756, (int16_t)0xff96,
(int16_t)0x04d1, (int16_t)0x62cb, (int16_t)0x18cb, (int16_t)0xff8a,
(int16_t)0x0435, (int16_t)0x61f3, (int16_t)0x1a4c, (int16_t)0xff7e,
(int16_t)0x03a4, (int16_t)0x6106, (int16_t)0x1bd7, (int16_t)0xff71,
(int16_t)0x031c, (int16_t)0x6007, (int16_t)0x1d6c, (int16_t)0xff64,
(int16_t)0x029f, (int16_t)0x5ef5, (int16_t)0x1f0b, (int16_t)0xff56,
(int16_t)0x022a, (int16_t)0x5dd0, (int16_t)0x20b3, (int16_t)0xff48,
(int16_t)0x01be, (int16_t)0x5c9a, (int16_t)0x2264, (int16_t)0xff3a,
(int16_t)0x015b, (int16_t)0x5b53, (int16_t)0x241e, (int16_t)0xff2c,
(int16_t)0x0101, (int16_t)0x59fc, (int16_t)0x25e0, (int16_t)0xff1e,
(int16_t)0x00ae, (int16_t)0x5896, (int16_t)0x27a9, (int16_t)0xff10,
(int16_t)0x0063, (int16_t)0x5720, (int16_t)0x297a, (int16_t)0xff02,
(int16_t)0x001f, (int16_t)0x559d, (int16_t)0x2b50, (int16_t)0xfef4,
(int16_t)0xffe2, (int16_t)0x540d, (int16_t)0x2d2c, (int16_t)0xfee8,
(int16_t)0xffac, (int16_t)0x5270, (int16_t)0x2f0d, (int16_t)0xfedb,
(int16_t)0xff7c, (int16_t)0x50c7, (int16_t)0x30f3, (int16_t)0xfed0,
(int16_t)0xff53, (int16_t)0x4f14, (int16_t)0x32dc, (int16_t)0xfec6,
(int16_t)0xff2e, (int16_t)0x4d57, (int16_t)0x34c8, (int16_t)0xfebd,
(int16_t)0xff0f, (int16_t)0x4b91, (int16_t)0x36b6, (int16_t)0xfeb6,
(int16_t)0xfef5, (int16_t)0x49c2, (int16_t)0x38a5, (int16_t)0xfeb0,
(int16_t)0xfedf, (int16_t)0x47ed, (int16_t)0x3a95, (int16_t)0xfeac,
(int16_t)0xfece, (int16_t)0x4611, (int16_t)0x3c85, (int16_t)0xfeab,
(int16_t)0xfec0, (int16_t)0x4430, (int16_t)0x3e74, (int16_t)0xfeac,
(int16_t)0xfeb6, (int16_t)0x424a, (int16_t)0x4060, (int16_t)0xfeaf,
(int16_t)0xfeaf, (int16_t)0x4060, (int16_t)0x424a, (int16_t)0xfeb6,
(int16_t)0xfeac, (int16_t)0x3e74, (int16_t)0x4430, (int16_t)0xfec0,
(int16_t)0xfeab, (int16_t)0x3c85, (int16_t)0x4611, (int16_t)0xfece,
(int16_t)0xfeac, (int16_t)0x3a95, (int16_t)0x47ed, (int16_t)0xfedf,
(int16_t)0xfeb0, (int16_t)0x38a5, (int16_t)0x49c2, (int16_t)0xfef5,
(int16_t)0xfeb6, (int16_t)0x36b6, (int16_t)0x4b91, (int16_t)0xff0f,
(int16_t)0xfebd, (int16_t)0x34c8, (int16_t)0x4d57, (int16_t)0xff2e,
(int16_t)0xfec6, (int16_t)0x32dc, (int16_t)0x4f14, (int16_t)0xff53,
(int16_t)0xfed0, (int16_t)0x30f3, (int16_t)0x50c7, (int16_t)0xff7c,
(int16_t)0xfedb, (int16_t)0x2f0d, (int16_t)0x5270, (int16_t)0xffac,
(int16_t)0xfee8, (int16_t)0x2d2c, (int16_t)0x540d, (int16_t)0xffe2,
(int16_t)0xfef4, (int16_t)0x2b50, (int16_t)0x559d, (int16_t)0x001f,
(int16_t)0xff02, (int16_t)0x297a, (int16_t)0x5720, (int16_t)0x0063,
(int16_t)0xff10, (int16_t)0x27a9, (int16_t)0x5896, (int16_t)0x00ae,
(int16_t)0xff1e, (int16_t)0x25e0, (int16_t)0x59fc, (int16_t)0x0101,
(int16_t)0xff2c, (int16_t)0x241e, (int16_t)0x5b53, (int16_t)0x015b,
(int16_t)0xff3a, (int16_t)0x2264, (int16_t)0x5c9a, (int16_t)0x01be,
(int16_t)0xff48, (int16_t)0x20b3, (int16_t)0x5dd0, (int16_t)0x022a,
(int16_t)0xff56, (int16_t)0x1f0b, (int16_t)0x5ef5, (int16_t)0x029f,
(int16_t)0xff64, (int16_t)0x1d6c, (int16_t)0x6007, (int16_t)0x031c,
(int16_t)0xff71, (int16_t)0x1bd7, (int16_t)0x6106, (int16_t)0x03a4,
(int16_t)0xff7e, (int16_t)0x1a4c, (int16_t)0x61f3, (int16_t)0x0435,
(int16_t)0xff8a, (int16_t)0x18cb, (int16_t)0x62cb, (int16_t)0x04d1,
(int16_t)0xff96, (int16_t)0x1756, (int16_t)0x638f, (int16_t)0x0577,
(int16_t)0xffa1, (int16_t)0x15eb, (int16_t)0x643f, (int16_t)0x0628,
(int16_t)0xffac, (int16_t)0x148c, (int16_t)0x64d9, (int16_t)0x06e4,
(int16_t)0xffb6, (int16_t)0x1338, (int16_t)0x655e, (int16_t)0x07ab,
(int16_t)0xffbf, (int16_t)0x11f0, (int16_t)0x65cd, (int16_t)0x087d,
(int16_t)0xffc8, (int16_t)0x10b4, (int16_t)0x6626, (int16_t)0x095a,
(int16_t)0xffd0, (int16_t)0x0f83, (int16_t)0x6669, (int16_t)0x0a44,
(int16_t)0xffd8, (int16_t)0x0e5f, (int16_t)0x6696, (int16_t)0x0b39,
(int16_t)0xffdf, (int16_t)0x0d46, (int16_t)0x66ad, (int16_t)0x0c39
};
int32_t rdot(size_t n, const int16_t *x, const int16_t *y)

View File

@ -29,13 +29,7 @@ extern const int16_t RESAMPLE_LUT[64 * 4];
int32_t rdot(size_t n, const int16_t *x, const int16_t *y);
#ifdef _MSC_VER
#define INLINE __forceinline
#else
#define INLINE inline __attribute__((always_inline))
#endif
INLINE static int16_t adpcm_predict_sample(uint8_t byte, uint8_t mask,
static inline int16_t adpcm_predict_sample(uint8_t byte, uint8_t mask,
unsigned lshift, unsigned rshift)
{
int16_t sample = (uint16_t)(byte & mask) << lshift;

View File

@ -0,0 +1,39 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Mupen64plus-rsp-hle - common.h *
* Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ *
* Copyright (C) 2014 Bobby Smiles *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
#ifndef COMMON_H
#define COMMON_H
/* macro for unused variable warning suppression */
#ifdef __GNUC__
# define UNUSED(x) UNUSED_ ## x __attribute__((__unused__))
#else
# define UNUSED(x) UNUSED_ ## x
#endif
#ifdef _MSC_VER
# define inline __forceinline
#elif defined __GNUC__
# define inline inline __attribute__((always_inline))
#endif
#endif

View File

@ -32,6 +32,8 @@
#include <stdio.h>
#endif
#include "common.h"
#include "hle_external.h"
#include "hle_internal.h"
#include "memory.h"

View File

@ -72,12 +72,6 @@ struct hle_t
/* mp3.c */
uint8_t mp3_buffer[0x1000];
/* FIXME: rewrite mp3 module to avoid these */
uint32_t mp3_inPtr;
uint32_t mp3_outPtr;
uint32_t mp3_t6;
uint32_t mp3_t5;
uint32_t mp3_t4;
};
#endif

View File

@ -25,6 +25,8 @@
#include <stdint.h>
#include <stdlib.h>
#include "common.h"
#include "arithmetics.h"
#include "hle_external.h"
#include "hle_internal.h"

View File

@ -21,103 +21,56 @@
#include <string.h>
#include "common.h"
#include "memory.h"
/* Global functions */
void dmem_load_u8(struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count)
void load_u8(uint8_t* dst, const unsigned char* buffer, unsigned address, size_t count)
{
while (count != 0) {
*(dst++) = *dmem_u8(hle, address);
*(dst++) = *u8(buffer, address);
address += 1;
--count;
}
}
void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count)
void load_u16(uint16_t* dst, const unsigned char* buffer, unsigned address, size_t count)
{
while (count != 0) {
*(dst++) = *dmem_u16(hle, address);
*(dst++) = *u16(buffer, address);
address += 2;
--count;
}
}
void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count)
void load_u32(uint32_t* dst, const unsigned char* buffer, unsigned address, size_t count)
{
/* Optimization for uint32_t */
memcpy(dst, dmem_u32(hle, address), count * sizeof(uint32_t));
memcpy(dst, u32(buffer, address), count * sizeof(uint32_t));
}
void dmem_store_u8(struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count)
void store_u8(unsigned char* buffer, unsigned address, const uint8_t* src, size_t count)
{
while (count != 0) {
*dmem_u8(hle, address) = *(src++);
*u8(buffer, address) = *(src++);
address += 1;
--count;
}
}
void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count)
void store_u16(unsigned char* buffer, unsigned address, const uint16_t* src, size_t count)
{
while (count != 0) {
*dmem_u16(hle, address) = *(src++);
*u16(buffer, address) = *(src++);
address += 2;
--count;
}
}
void dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count)
void store_u32(unsigned char* buffer, unsigned address, const uint32_t* src, size_t count)
{
/* Optimization for uint32_t */
memcpy(dmem_u32(hle, address), src, count * sizeof(uint32_t));
}
void dram_load_u8(struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count)
{
while (count != 0) {
*(dst++) = *dram_u8(hle, address);
address += 1;
--count;
}
}
void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count)
{
while (count != 0) {
*(dst++) = *dram_u16(hle, address);
address += 2;
--count;
}
}
void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count)
{
/* Optimization for uint32_t */
memcpy(dst, dram_u32(hle, address), count * sizeof(uint32_t));
}
void dram_store_u8(struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count)
{
while (count != 0) {
*dram_u8(hle, address) = *(src++);
address += 1;
--count;
}
}
void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count)
{
while (count != 0) {
*dram_u16(hle, address) = *(src++);
address += 2;
--count;
}
}
void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count)
{
/* Optimization for uint32_t */
memcpy(dram_u32(hle, address), src, count * sizeof(uint32_t));
memcpy(u32(buffer, address), src, count * sizeof(uint32_t));
}

View File

@ -57,65 +57,128 @@ enum {
TASK_YIELD_DATA_SIZE = 0xffc
};
#ifdef _MSC_VER
#define INLINE __forceinline
#else
#define INLINE inline __attribute__((always_inline))
#endif
INLINE static unsigned int align(unsigned int x, unsigned amount)
static inline unsigned int align(unsigned int x, unsigned amount)
{
--amount;
return (x + amount) & ~amount;
}
INLINE static uint8_t* const dmem_u8(struct hle_t* hle, uint16_t address)
static inline uint8_t* u8(const unsigned char* buffer, unsigned address)
{
return (uint8_t*)(&hle->dmem[(address & 0xfff) ^ S8]);
return (uint8_t*)(buffer + (address ^ S8));
}
INLINE static uint16_t* const dmem_u16(struct hle_t* hle, uint16_t address)
static inline uint16_t* u16(const unsigned char* buffer, unsigned address)
{
assert((address & 1) == 0);
return (uint16_t*)(&hle->dmem[(address & 0xfff) ^ S16]);
return (uint16_t*)(buffer + (address ^ S16));
}
INLINE static uint32_t* const dmem_u32(struct hle_t* hle, uint16_t address)
static inline uint32_t* u32(const unsigned char* buffer, unsigned address)
{
assert((address & 3) == 0);
return (uint32_t*)(&hle->dmem[(address & 0xfff)]);
return (uint32_t*)(buffer + address);
}
INLINE static uint8_t* const dram_u8(struct hle_t* hle, uint32_t address)
void load_u8 (uint8_t* dst, const unsigned char* buffer, unsigned address, size_t count);
void load_u16(uint16_t* dst, const unsigned char* buffer, unsigned address, size_t count);
void load_u32(uint32_t* dst, const unsigned char* buffer, unsigned address, size_t count);
void store_u8 (unsigned char* buffer, unsigned address, const uint8_t* src, size_t count);
void store_u16(unsigned char* buffer, unsigned address, const uint16_t* src, size_t count);
void store_u32(unsigned char* buffer, unsigned address, const uint32_t* src, size_t count);
/* convenient functions for DMEM access */
static inline uint8_t* dmem_u8(struct hle_t* hle, uint16_t address)
{
return (uint8_t*)&hle->dram[(address & 0xffffff) ^ S8];
return u8(hle->dmem, address & 0xfff);
}
INLINE static uint16_t* const dram_u16(struct hle_t* hle, uint32_t address)
static inline uint16_t* dmem_u16(struct hle_t* hle, uint16_t address)
{
assert((address & 1) == 0);
return (uint16_t*)&hle->dram[(address & 0xffffff) ^ S16];
return u16(hle->dmem, address & 0xfff);
}
INLINE static uint32_t* const dram_u32(struct hle_t* hle, uint32_t address)
static inline uint32_t* dmem_u32(struct hle_t* hle, uint16_t address)
{
assert((address & 3) == 0);
return (uint32_t*)&hle->dram[address & 0xffffff];
return u32(hle->dmem, address & 0xfff);
}
void dmem_load_u8 (struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count);
void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count);
void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count);
void dmem_store_u8 (struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count);
void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count);
void dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count);
static inline void dmem_load_u8(struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count)
{
load_u8(dst, hle->dmem, address & 0xfff, count);
}
void dram_load_u8 (struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count);
void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count);
void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count);
void dram_store_u8 (struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count);
void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count);
void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count);
static inline void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count)
{
load_u16(dst, hle->dmem, address & 0xfff, count);
}
static inline void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count)
{
load_u32(dst, hle->dmem, address & 0xfff, count);
}
static inline void dmem_store_u8(struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count)
{
store_u8(hle->dmem, address & 0xfff, src, count);
}
static inline void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count)
{
store_u16(hle->dmem, address & 0xfff, src, count);
}
static inline void dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count)
{
store_u32(hle->dmem, address & 0xfff, src, count);
}
/* convenient functions DRAM access */
static inline uint8_t* dram_u8(struct hle_t* hle, uint32_t address)
{
return u8(hle->dram, address & 0xffffff);
}
static inline uint16_t* dram_u16(struct hle_t* hle, uint32_t address)
{
return u16(hle->dram, address & 0xffffff);
}
static inline uint32_t* dram_u32(struct hle_t* hle, uint32_t address)
{
return u32(hle->dram, address & 0xffffff);
}
static inline void dram_load_u8(struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count)
{
load_u8(dst, hle->dram, address & 0xffffff, count);
}
static inline void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count)
{
load_u16(dst, hle->dram, address & 0xffffff, count);
}
static inline void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count)
{
load_u32(dst, hle->dram, address & 0xffffff, count);
}
static inline void dram_store_u8(struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count)
{
store_u8(hle->dram, address & 0xffffff, src, count);
}
static inline void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count)
{
store_u16(hle->dram, address & 0xffffff, src, count);
}
static inline void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count)
{
store_u32(hle->dram, address & 0xffffff, src, count);
}
#endif

View File

@ -1,6 +1,7 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Mupen64plus-rsp-hle - ucode3mp3.h *
* Mupen64plus-rsp-hle - mp3.c *
* Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ *
* Copyright (C) 2014 Bobby Smiles *
* Copyright (C) 2009 Richard Goedeken *
* Copyright (C) 2002 Hacktarux *
* *
@ -23,10 +24,16 @@
#include <string.h>
#include <stdint.h>
#include "hle_external.h"
#include "common.h"
#include "arithmetics.h"
#include "hle_internal.h"
#include "memory.h"
static void InnerLoop(struct hle_t* hle,
uint32_t outPtr, uint32_t inPtr,
uint32_t t6, uint32_t t5, uint32_t t4);
static const uint16_t DeWindowLUT [0x420] = {
0x0000, 0xFFF3, 0x005D, 0xFF38, 0x037A, 0xF736, 0x0B37, 0xC00E,
0x7FFF, 0x3FF2, 0x0B37, 0x08CA, 0x037A, 0x00C8, 0x005D, 0x000D,
@ -198,10 +205,13 @@ static void MP3AB0(int32_t* v)
}
}
static void InnerLoop(struct hle_t* hle);
void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
{
uint32_t inPtr, outPtr;
uint32_t t6;/* = 0x08A0; - I think these are temporary storage buffers */
uint32_t t5;/* = 0x0AC0; */
uint32_t t4;/* = (w1 & 0x1E); */
/* Initialization Code */
uint32_t readPtr; /* s5 */
uint32_t writePtr; /* s6 */
@ -209,9 +219,9 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
int cnt, cnt2;
/* I think these are temporary storage buffers */
hle->mp3_t6 = 0x08A0;
hle->mp3_t5 = 0x0AC0;
hle->mp3_t4 = index;
t6 = 0x08A0;
t5 = 0x0AC0;
t4 = index;
writePtr = readPtr = address;
/* Just do that for efficiency... may remove and use directly later anyway */
@ -222,20 +232,21 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
for (cnt = 0; cnt < 0x480; cnt += 0x180) {
/* DMA: 0xCF0 <- RDRAM[s5] : 0x180 */
memcpy(hle->mp3_buffer + 0xCF0, hle->dram + readPtr, 0x180);
hle->mp3_inPtr = 0xCF0; /* s7 */
hle->mp3_outPtr = 0xE70; /* s3 */
inPtr = 0xCF0; /* s7 */
outPtr = 0xE70; /* s3 */
/* --------------- Inner Loop Start -------------------- */
for (cnt2 = 0; cnt2 < 0x180; cnt2 += 0x40) {
hle->mp3_t6 &= 0xFFE0;
hle->mp3_t5 &= 0xFFE0;
hle->mp3_t6 |= hle->mp3_t4;
hle->mp3_t5 |= hle->mp3_t4;
InnerLoop(hle);
hle->mp3_t4 = (hle->mp3_t4 - 2) & 0x1E;
tmp = hle->mp3_t6;
hle->mp3_t6 = hle->mp3_t5;
hle->mp3_t5 = tmp;
hle->mp3_inPtr += 0x40;
t6 &= 0xFFE0;
t5 &= 0xFFE0;
t6 |= t4;
t5 |= t4;
InnerLoop(hle, outPtr, inPtr, t6, t5, t4);
t4 = (t4 - 2) & 0x1E;
tmp = t6;
t6 = t5;
t5 = tmp;
inPtr += 0x40;
outPtr += 0x40;
}
/* --------------- Inner Loop End -------------------- */
memcpy(hle->dram + writePtr, hle->mp3_buffer + 0xe70, 0x180);
@ -244,9 +255,9 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
}
}
static void InnerLoop(struct hle_t* hle)
static void InnerLoop(struct hle_t* hle,
uint32_t outPtr, uint32_t inPtr,
uint32_t t6, uint32_t t5, uint32_t t4)
{
/* Part 1: 100% Accurate */
@ -274,56 +285,56 @@ static void InnerLoop(struct hle_t* hle)
int32_t vt;
int32_t v[32];
v[0] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3E ^ S16));
v[0] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3E ^ S16));
v[0] += v[31];
v[1] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3C ^ S16));
v[1] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3C ^ S16));
v[1] += v[30];
v[2] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x38 ^ S16));
v[2] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x38 ^ S16));
v[2] += v[28];
v[3] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3A ^ S16));
v[3] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3A ^ S16));
v[3] += v[29];
v[4] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x30 ^ S16));
v[4] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x30 ^ S16));
v[4] += v[24];
v[5] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x32 ^ S16));
v[5] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x32 ^ S16));
v[5] += v[25];
v[6] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x36 ^ S16));
v[6] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x36 ^ S16));
v[6] += v[27];
v[7] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x34 ^ S16));
v[7] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x34 ^ S16));
v[7] += v[26];
v[8] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x20 ^ S16));
v[8] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x20 ^ S16));
v[8] += v[16];
v[9] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x22 ^ S16));
v[9] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x22 ^ S16));
v[9] += v[17];
v[10] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x26 ^ S16));
v[10] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x26 ^ S16));
v[10] += v[19];
v[11] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1A ^ S16));
v[18] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x24 ^ S16));
v[11] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1A ^ S16));
v[18] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x24 ^ S16));
v[11] += v[18];
v[12] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x10 ^ S16));
v[23] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2E ^ S16));
v[12] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x10 ^ S16));
v[23] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2E ^ S16));
v[12] += v[23];
v[13] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x12 ^ S16));
v[22] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2C ^ S16));
v[13] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x12 ^ S16));
v[22] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2C ^ S16));
v[13] += v[22];
v[14] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x16 ^ S16));
v[20] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x28 ^ S16));
v[14] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x16 ^ S16));
v[20] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x28 ^ S16));
v[14] += v[20];
v[15] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x14 ^ S16));
v[21] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2A ^ S16));
v[15] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x14 ^ S16));
v[21] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2A ^ S16));
v[15] += v[21];
/* Part 2-4 */
@ -332,10 +343,10 @@ static void InnerLoop(struct hle_t* hle)
/* Part 5 - 1-Wide Butterflies - 100% Accurate but need SSVs!!! */
t0 = hle->mp3_t6 + 0x100;
t1 = hle->mp3_t6 + 0x200;
t2 = hle->mp3_t5 + 0x100;
t3 = hle->mp3_t5 + 0x200;
t0 = t6 + 0x100;
t1 = t6 + 0x200;
t2 = t5 + 0x100;
t3 = t5 + 0x200;
/* 0x13A8 */
v[1] = 0;
@ -344,14 +355,14 @@ static void InnerLoop(struct hle_t* hle)
v[16] = -v[16] - v[17];
v[2] = v[18] + v[19];
/* ** Store v[11] -> (T6 + 0)** */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x0))) = (short)v[11];
*(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x0))) = (short)v[11];
v[11] = -v[11];
/* ** Store v[16] -> (T3 + 0)** */
*(int16_t *)(hle->mp3_buffer + ((t3 + (short)0x0))) = (short)v[16];
/* ** Store v[11] -> (T5 + 0)** */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x0))) = (short)v[11];
*(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x0))) = (short)v[11];
/* 0x13E8 - Verified.... */
v[2] = -v[2];
/* ** Store v[2] -> (T2 + 0)** */
@ -398,7 +409,7 @@ static void InnerLoop(struct hle_t* hle)
v[17] = v[13] - v[10];
v[9] = v[9] + v[14];
/* ** Store v[9] -> (T6 + 0x40) */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x40))) = (short)v[9];
*(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x40))) = (short)v[9];
v[11] = v[11] - v[13];
/* ** Store v[17] -> (T0 + 0xFFC0) */
*(int16_t *)(hle->mp3_buffer + ((t0 + (short)0xFFC0))) = (short)v[17];
@ -414,63 +425,63 @@ static void InnerLoop(struct hle_t* hle)
/* ** Store v[8] -> (T3 + 0xFFC0) */
*(int16_t *)(hle->mp3_buffer + ((t3 + (short)0xFFC0))) = (short)v[8];
/* ** Store v[14] -> (T5 + 0x40) */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x40))) = (short)v[14];
*(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x40))) = (short)v[14];
/* ** Store v[10] -> (T2 + 0xFFC0) */
*(int16_t *)(hle->mp3_buffer + ((t2 + (short)0xFFC0))) = (short)v[10];
/* 0x14FC - Verified... */
/* Part 6 - 100% Accurate */
v[0] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3E ^ S16));
v[0] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3E ^ S16));
v[0] -= v[31];
v[1] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3C ^ S16));
v[1] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3C ^ S16));
v[1] -= v[30];
v[2] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x38 ^ S16));
v[2] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x38 ^ S16));
v[2] -= v[28];
v[3] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3A ^ S16));
v[3] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3A ^ S16));
v[3] -= v[29];
v[4] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x30 ^ S16));
v[4] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x30 ^ S16));
v[4] -= v[24];
v[5] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x32 ^ S16));
v[5] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x32 ^ S16));
v[5] -= v[25];
v[6] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x36 ^ S16));
v[6] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x36 ^ S16));
v[6] -= v[27];
v[7] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x34 ^ S16));
v[7] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x34 ^ S16));
v[7] -= v[26];
v[8] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x20 ^ S16));
v[8] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x20 ^ S16));
v[8] -= v[16];
v[9] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x22 ^ S16));
v[9] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x22 ^ S16));
v[9] -= v[17];
v[10] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x26 ^ S16));
v[10] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x26 ^ S16));
v[10] -= v[19];
v[11] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1A ^ S16));
v[18] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x24 ^ S16));
v[11] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1A ^ S16));
v[18] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x24 ^ S16));
v[11] -= v[18];
v[12] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x10 ^ S16));
v[23] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2E ^ S16));
v[12] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x10 ^ S16));
v[23] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2E ^ S16));
v[12] -= v[23];
v[13] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x12 ^ S16));
v[22] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2C ^ S16));
v[13] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x12 ^ S16));
v[22] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2C ^ S16));
v[13] -= v[22];
v[14] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x16 ^ S16));
v[20] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x28 ^ S16));
v[14] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x16 ^ S16));
v[20] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x28 ^ S16));
v[14] -= v[20];
v[15] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x14 ^ S16));
v[21] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2A ^ S16));
v[15] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x14 ^ S16));
v[21] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2A ^ S16));
v[15] -= v[21];
for (i = 0; i < 16; i++)
@ -516,10 +527,10 @@ static void InnerLoop(struct hle_t* hle)
v[14] = v[6] - v[14];
v[15] = (((v[30] - v[31]) * 0x5A827) >> 0x10) - v[7];
/* Store v14 -> (T5 + 0x20) */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x20))) = (short)v[14];
*(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x20))) = (short)v[14];
v[14] = v[14] + v[1];
/* Store v[14] -> (T6 + 0x20) */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x20))) = (short)v[14];
*(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x20))) = (short)v[14];
/* Store v[15] -> (T1 + 0xFFE0) */
*(int16_t *)(hle->mp3_buffer + ((t1 + (short)0xFFE0))) = (short)v[15];
v[9] = v[9] + v[10];
@ -527,7 +538,7 @@ static void InnerLoop(struct hle_t* hle)
v[6] = v[10] - v[6];
v[1] = v[9] - v[1];
/* Store v[6] -> (T5 + 0x60) */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x60))) = (short)v[6];
*(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x60))) = (short)v[6];
v[10] = v[10] + v[2];
v[10] = v[4] - v[10];
/* Store v[10] -> (T2 + 0xFFA0) */
@ -547,7 +558,7 @@ static void InnerLoop(struct hle_t* hle)
*(int16_t *)(hle->mp3_buffer + ((t1 + (short)0xFFA0))) = (short)v[7];
v[11] = v[11] - v[3];
/* Store v[1] -> (T6 + 0x60) */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x60))) = (short)v[1];
*(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x60))) = (short)v[1];
v[11] = v[11] - v[5];
/* Store v[11] -> (T0 + 0x60) */
*(int16_t *)(hle->mp3_buffer + ((t0 + (short)0x60))) = (short)v[11];
@ -564,9 +575,9 @@ static void InnerLoop(struct hle_t* hle)
/* Step 8 - Dewindowing */
addptr = hle->mp3_t6 & 0xFFE0;
addptr = t6 & 0xFFE0;
offset = 0x10 - (hle->mp3_t4 >> 1);
offset = 0x10 - (t4 >> 1);
for (x = 0; x < 8; x++) {
int32_t v0;
int32_t v18;
@ -585,14 +596,14 @@ static void InnerLoop(struct hle_t* hle)
/* Clamp(v0); */
/* Clamp(v18); */
/* clamp??? */
*(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v0;
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 2)^S16)) = v18;
hle->mp3_outPtr += 4;
*(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v0;
*(int16_t *)(hle->mp3_buffer + ((outPtr + 2)^S16)) = v18;
outPtr += 4;
addptr += 0x30;
offset += 0x38;
}
offset = 0x10 - (hle->mp3_t4 >> 1) + 8 * 0x40;
offset = 0x10 - (t4 >> 1) + 8 * 0x40;
v2 = v4 = 0;
for (i = 0; i < 4; i++) {
v2 += ((int) * (int16_t *)(hle->mp3_buffer + (addptr) + 0x00) * (short)DeWindowLUT[offset + 0x00] + 0x4000) >> 0xF;
@ -606,12 +617,12 @@ static void InnerLoop(struct hle_t* hle)
}
mult6 = *(int32_t *)(hle->mp3_buffer + 0xCE8);
mult4 = *(int32_t *)(hle->mp3_buffer + 0xCEC);
if (hle->mp3_t4 & 0x2) {
if (t4 & 0x2) {
v2 = (v2 **(uint32_t *)(hle->mp3_buffer + 0xCE8)) >> 0x10;
*(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v2;
*(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v2;
} else {
v4 = (v4 **(uint32_t *)(hle->mp3_buffer + 0xCE8)) >> 0x10;
*(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v4;
*(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v4;
mult4 = *(uint32_t *)(hle->mp3_buffer + 0xCE8);
}
addptr -= 0x50;
@ -621,7 +632,7 @@ static void InnerLoop(struct hle_t* hle)
int32_t v18;
v2 = v4 = v6 = v8 = 0;
offset = (0x22F - (hle->mp3_t4 >> 1) + x * 0x40);
offset = (0x22F - (t4 >> 1) + x * 0x40);
for (i = 0; i < 4; i++) {
v2 += ((int) * (int16_t *)(hle->mp3_buffer + (addptr) + 0x20) * (short)DeWindowLUT[offset + 0x00] + 0x4000) >> 0xF;
@ -640,13 +651,13 @@ static void InnerLoop(struct hle_t* hle)
/* Clamp(v0); */
/* Clamp(v18); */
/* clamp??? */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 2)^S16)) = v0;
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 4)^S16)) = v18;
hle->mp3_outPtr += 4;
*(int16_t *)(hle->mp3_buffer + ((outPtr + 2)^S16)) = v0;
*(int16_t *)(hle->mp3_buffer + ((outPtr + 4)^S16)) = v18;
outPtr += 4;
addptr -= 0x50;
}
tmp = hle->mp3_outPtr;
tmp = outPtr;
hi0 = mult6;
hi1 = mult4;
@ -655,43 +666,20 @@ static void InnerLoop(struct hle_t* hle)
for (i = 0; i < 8; i++) {
/* v0 */
vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x40)^S16)) * hi0);
if (vt > 32767) {
vt = 32767;
} else {
if (vt < -32767)
vt = -32767;
}
*(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x40)^S16)) = (int16_t)vt;
*(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x40)^S16)) = clamp_s16(vt);
/* v17 */
vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x30)^S16)) * hi0);
if (vt > 32767) {
vt = 32767;
} else {
if (vt < -32767)
vt = -32767;
}
*(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x30)^S16)) = vt;
*(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x30)^S16)) = clamp_s16(vt);
/* v2 */
vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x1E)^S16)) * hi1);
if (vt > 32767) {
vt = 32767;
} else {
if (vt < -32767)
vt = -32767;
}
*(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x1E)^S16)) = vt;
*(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x1E)^S16)) = clamp_s16(vt);
/* v4 */
vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0xE)^S16)) * hi1);
if (vt > 32767) {
vt = 32767;
} else {
if (vt < -32767)
vt = -32767;
}
*(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0xE)^S16)) = vt;
*(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0xE)^S16)) = clamp_s16(vt);
tmp += 2;
}
}

View File

@ -28,6 +28,8 @@
#include <string.h>
#include <stddef.h>
#include "common.h"
#include "arithmetics.h"
#include "audio.h"
#include "hle_external.h"
@ -436,20 +438,16 @@ static void init_subframes_v1(musyx_t *musyx)
static void init_subframes_v2(musyx_t *musyx)
{
unsigned i,k;
int16_t values[4];
int16_t* subframes[4];
int16_t values[4] = {
clamp_s16(musyx->base_vol[0]),
clamp_s16(musyx->base_vol[1]),
clamp_s16(musyx->base_vol[2]),
clamp_s16(musyx->base_vol[3])
};
for(k = 0; k < 4; ++k)
values[k] = clamp_s16(musyx->base_vol[k]);
int16_t* subframes[4] = {
musyx->left,
musyx->right,
musyx->cc0,
musyx->e50
};
subframes[0] = musyx->left;
subframes[1] = musyx->right;
subframes[2] = musyx->cc0;
subframes[3] = musyx->e50;
for (i = 0; i < SUBFRAME_SIZE; ++i) {
@ -855,7 +853,7 @@ static void sfx_stage(struct hle_t* hle, mix_sfx_with_main_subframes_t mix_sfx_w
}
static void mix_sfx_with_main_subframes_v1(musyx_t *musyx, const int16_t *subframe,
const uint16_t* gains)
const uint16_t* UNUSED(gains))
{
unsigned i;

View File

@ -47,6 +47,12 @@ int usf_upload_section(void * state, const uint8_t * data, size_t size);
Requesting zero samples with a null pointer is an acceptable way to
force at least one block of samples to render and return the current
sample rate in the variable passed in.
Requesting a non-zero number of samples with a null buffer pointer will
result in exactly count samples being rendered and discarded.
Emulation runs in whole blocks until there have been exactly enough
Audio Interface DMA transfers to at least fill count samples, at which
point the remainder is buffered in the emulator state until the next
usf_render() call.
Returns 0 on success, or a pointer to the last error message on failure. */
const char * usf_render(void * state, int16_t * buffer, size_t count, int32_t * sample_rate);

View File

@ -77,10 +77,20 @@ struct usf_state
int16_t * sample_buffer;
// audio.c
// SampleRate is usually guaranteed to stay the same for the duration
// of a given track, and depends on the game.
int32_t SampleRate;
// Audio is rendered in whole Audio Interface DMA transfers, which are
// then copied directly to the caller's buffer. Any left over samples
// from the last DMA transfer that fills the caller's buffer will be
// stored here until the next call to usf_render()
int16_t samplebuf[16384];
size_t samples_in_buffer;
// This buffer does not really need to be that large, as it is likely
// to only accumulate a handlful of error messages, at which point
// emulation is immediately halted and the messages are returned to
// the caller.
const char * last_error;
char error_message[1024];

View File

@ -1030,6 +1030,8 @@ static int usf_info(void * context, const char * name, const char * value)
sampleRate = samplerate;
usfRemoveSilence = YES;
silence_seconds = 10;
}
else if ( type == 0x22 )
{