Update LazyUSF and increase the silence detection threshold for USF files to 10 seconds, which fixes Majora's Mask - Staff Roll
parent ef2c8efdf9
commit 0fb8aa57bb
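The threshold itself lives in the player's silence-detection logic rather than in the emulator sources patched below. As a rough, hypothetical sketch only (names and structure assumed, not taken from this commit), a 10-second silence gate over interleaved 16-bit stereo output could look like:

/* Illustrative sketch, not the actual player code: counts consecutive
 * all-zero output frames and trips once 10 seconds' worth accumulates. */
#include <stddef.h>
#include <stdint.h>

#define SILENCE_SECONDS 10 /* long enough to survive the quiet gaps in
                              Majora's Mask - Staff Roll */

static size_t silent_frames = 0;

static int silence_exceeded(const int16_t *buf, size_t frames, unsigned sample_rate)
{
    size_t i;
    for (i = 0; i < frames * 2; i++) { /* stereo: two samples per frame */
        if (buf[i] != 0) {
            silent_frames = 0; /* any audible sample resets the window */
            return 0;
        }
    }
    silent_frames += frames;
    return silent_frames >= (size_t)sample_rate * SILENCE_SECONDS;
}
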
@@ -191,7 +191,7 @@ int32_t r4300i_LB_NonMemory ( usf_state_t * state, uint32_t PAddr, uint32_t * Va
 }
 
 uint32_t r4300i_LB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t * Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
     if (address == 0) { return 0; }
     *Value = *(uint8_t *)(address + (VAddr ^ 3));
@@ -199,7 +199,7 @@ uint32_t r4300i_LB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t * Value
 }
 
 uint32_t r4300i_LD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t * Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
     if (address == 0) { return 0; }
     *((uint32_t *)(Value) + 1) = *(uint32_t *)(address + VAddr);
@@ -220,7 +220,7 @@ int32_t r4300i_LH_NonMemory ( usf_state_t * state, uint32_t PAddr, uint32_t * Va
 }
 
 uint32_t r4300i_LH_VAddr ( usf_state_t * state, uint32_t VAddr, uint16_t * Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
     if (address == 0)
         return 0;
@@ -426,7 +426,7 @@ int32_t r4300i_SB_NonMemory ( usf_state_t * state, uint32_t PAddr, uint8_t Value
 }
 
 uint32_t r4300i_SB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
 
     if (address == 0) { return 0; }
@@ -457,7 +457,7 @@ int32_t r4300i_SH_NonMemory ( usf_state_t * state, uint32_t PAddr, uint16_t Valu
 }
 
 uint32_t r4300i_SD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
     if (address == 0) { return 0; }
     *(uint32_t *)(address + VAddr) = *((uint32_t *)(&Value) + 1);
@@ -466,7 +466,7 @@ uint32_t r4300i_SD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t Value )
 }
 
 uint32_t r4300i_SH_VAddr ( usf_state_t * state, uint32_t VAddr, uint16_t Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
 
     if (address == 0) { return 0; }

@@ -22,6 +22,9 @@ NOINLINE void run_task(usf_state_t * state)
 {
     register int PC;
     int wrap_count = 0;
+#ifdef SP_EXECUTE_LOG
+    int last_PC;
+#endif
 
     if (CFG_WAIT_FOR_CPU_HOST != 0)
     {
@@ -36,6 +39,9 @@ NOINLINE void run_task(usf_state_t * state)
         register uint32_t inst;
 
         inst = *(uint32_t *)(state->IMEM + FIT_IMEM(PC));
+#ifdef SP_EXECUTE_LOG
+        last_PC = PC;
+#endif
 #ifdef EMULATE_STATIC_PC
         PC = (PC + 0x004);
         if ( FIT_IMEM(PC) == 0 && ++wrap_count == 32 )
@@ -46,7 +52,7 @@ NOINLINE void run_task(usf_state_t * state)
 EX:
 #endif
 #ifdef SP_EXECUTE_LOG
-        step_SP_commands(inst);
+        step_SP_commands(state, last_PC, inst);
 #endif
         if (inst >> 25 == 0x25) /* is a VU instruction */
         {
@@ -463,6 +469,9 @@ EX:
         continue;
 BRANCH:
         inst = *(uint32_t *)(state->IMEM + FIT_IMEM(PC));
+#ifdef SP_EXECUTE_LOG
+        last_PC = PC;
+#endif
         PC = state->temp_PC & 0x00000FFC;
         goto EX;
 #endif

@@ -58,12 +58,10 @@ NOINLINE static void message(usf_state_t * state, const char* body, int priority
 */
 #define CHARACTERS_PER_LINE (80)
 /* typical standard DOS text file limit per line */
-#if 0
 NOINLINE static void update_conf(const char* source)
 {
     (void)source;
 }
-#endif
 
 #ifdef SP_EXECUTE_LOG
 extern void step_SP_commands(usf_state_t * state, int PC, uint32_t inst);

@@ -180,7 +180,6 @@ void set_VCC(usf_state_t * state, unsigned short VCC)
         state->clip[i] = (VCC >> (i + 0x8)) & 1;
     return; /* Little endian becomes big. */
 }
-#if 0
 void set_VCE(usf_state_t * state, unsigned char VCE)
 {
     register int i;
@@ -190,4 +189,3 @@ void set_VCE(usf_state_t * state, unsigned char VCE)
     return; /* Little endian becomes big. */
 }
 #endif
-#endif

@@ -42,6 +42,22 @@
 static INLINE void merge(short* VD, short* cmp, short* pass, short* fail)
 {
     register int i;
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t p,f,d,c,vd,temp;
+
+    p = vld1q_s16((const int16_t*)pass);
+    f = vld1q_s16((const int16_t*)fail);
+    c = vld1q_s16((const int16_t*)cmp);
+
+    d = vsubq_s16(p,f);
+    vd = vmlaq_s16(f, c, d); //vd = f + (cmp * d)
+    vst1q_s16(VD, vd);
+    return;
+
+#else
+
 #if (0)
     /* Do not use this version yet, as it still does not vectorize to SSE2. */
     for (i = 0; i < N; i++)
@@ -55,9 +71,92 @@ static INLINE void merge(short* VD, short* cmp, short* pass, short* fail)
         VD[i] = fail[i] + cmp[i]*diff[i]; /* actually `(cmp[i] != 0)*diff[i]` */
 #endif
     return;
+
+#endif
 }
 
 #ifndef ARCH_MIN_SSE2
+#ifdef ARCH_MIN_ARM_NEON
+static INLINE void vector_copy(short * VD, short * VS)
+{
+    int16x8_t xmm;
+    xmm = vld1q_s16((const int16_t*)VS);
+    vst1q_s16(VD, xmm);
+
+    return;
+}
+
+static INLINE void SIGNED_CLAMP_ADD(usf_state_t * state, short* VD, short* VS, short* VT)
+{
+    int16x8_t dst, src, vco, max, min;
+
+    src = vld1q_s16((const int16_t*)VS);
+    dst = vld1q_s16((const int16_t*)VT);
+    vco = vld1q_s16((const int16_t*)state->co);
+
+    max = vmaxq_s16(dst, src);
+    min = vminq_s16(dst, src);
+
+    min = vqaddq_s16(min, vco);
+    max = vqaddq_s16(max, min);
+
+    vst1q_s16(VD, max);
+    return;
+
+}
+
+static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, short* VT)
+{
+    int16x8_t dst, src, vco, dif, res, xmm,vd;
+
+    src = vld1q_s16((const int16_t*)VS);
+    vd = vld1q_s16((const int16_t*)VD);
+    dst = vld1q_s16((const int16_t*)VT);
+    vco = vld1q_s16((const int16_t*)state->co);
+
+    res = vqsubq_s16(src, dst);
+
+    dif = vaddq_s16(res, vco);
+    dif = veorq_s16(dif, res);
+    dif = vandq_s16(dif, dst);
+    xmm = vsubq_s16(src, dst);
+    src = vbicq_s16(dif, src);
+    xmm = vandq_s16(xmm, src);
+    xmm = vshrq_n_s16(xmm, 15);
+
+    xmm = vbicq_s16(vco, xmm);
+    res = vqsubq_s16(res, xmm);
+    vst1q_s16(VD, res);
+
+    return;
+
+}
+
+static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
+{
+    int16x8_t pvs, pvd;
+    int16x8x2_t packed;
+    int16x8_t result;
+    int16x4_t low, high;
+
+    pvs = vld1q_s16((const int16_t*)VACC_H);
+    pvd = vld1q_s16((const int16_t*)VACC_M);
+
+    packed = vzipq_s16(pvd,pvs);
+
+    low = vqmovn_s32((int32x4_t)packed.val[0]);
+    high = vqmovn_s32((int32x4_t)packed.val[1]);
+
+    result = vcombine_s16(low,high);
+
+    vst1q_s16(VD,result);
+
+    return;
+}
+
+#endif
+
+#if !defined ARCH_MIN_SSE2 && !defined ARCH_MIN_ARM_NEON
+
 static INLINE void vector_copy(short* VD, short* VS)
 {
 #if (0)
@@ -92,6 +191,8 @@ static INLINE void SIGNED_CLAMP_ADD(usf_state_t * state, short* VD, short* VS, s
         VD[i] ^= 0x8000 & (hi[i] | lo[i]);
     return;
 }
+
+
 static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, short* VT)
 {
     ALIGNED int32_t dif[N];
@@ -113,8 +214,9 @@ static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, s
         VD[i] ^= 0x8000 & (hi[i] | lo[i]);
     return;
 }
+
 static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
-{ /* typical sign-clamp of accumulator-mid (bits 31:16) */
+{
     ALIGNED short hi[N], lo[N];
     register int i;
 
@@ -135,7 +237,9 @@ static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
         VD[i] ^= 0x8000 * (hi[i] | lo[i]);
     return;
 }
-#else
+#endif
+
+#ifdef ARCH_MIN_SSE2
 /*
  * We actually need to write explicit SSE2 code for this because GCC 4.8.1
  * (and possibly later versions) has a code generation bug with vectorizing
@@ -225,10 +329,29 @@
 
 static INLINE void UNSIGNED_CLAMP(usf_state_t * state, short* VD)
 { /* sign-zero hybrid clamp of accumulator-mid (bits 31:16) */
+
     ALIGNED short cond[N];
     ALIGNED short temp[N];
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x8_t c;
+    int16x8_t t = vld1q_s16((const int16_t*)temp);
+    int16x8_t vaccm = vld1q_s16((const int16_t*)VACC_M);
+
+    SIGNED_CLAMP_AM(state, temp);
+
+    c = vcgtq_s16(t,vaccm);
+    int16x8_t t_ = vshrq_n_s16(t,15);
+    int16x8_t vd = vbicq_s16(t,t_);
+    vd = vorrq_s16(vd,(int16x8_t)c);
+    vst1q_s16(VD, vd);
+
+    return;
+
+#else
+
     SIGNED_CLAMP_AM(state, temp); /* no direct map in SSE, but closely based on this */
     for (i = 0; i < N; i++)
         cond[i] = -(temp[i] > VACC_M[i]); /* VD |= -(ACC47..16 > +32767) */
@@ -237,12 +360,36 @@ static INLINE void UNSIGNED_CLAMP(usf_state_t * state, short* VD)
     for (i = 0; i < N; i++)
         VD[i] = VD[i] | cond[i];
     return;
+#endif
 }
 
 static INLINE void SIGNED_CLAMP_AL(usf_state_t * state, short* VD)
 { /* sign-clamp accumulator-low (bits 15:0) */
+
     ALIGNED short cond[N];
     ALIGNED short temp[N];
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+    SIGNED_CLAMP_AM(state, temp);
+
+    uint16x8_t c;
+    int16x8_t eightk = vdupq_n_s16(0x8000);
+    uint16x8_t one = vdupq_n_u16(1);
+    int16x8_t t = vld1q_s16((const int16_t*)temp);
+    int16x8_t vaccm = vld1q_s16((const int16_t*)VACC_M);
+
+    c = vceqq_s16(t,vaccm);
+    c = vaddq_u16(c, one);
+    t = veorq_s16(t, eightk);
+    vst1q_u16(cond,c);
+    vst1q_s16(temp,t);
+    merge(VD, cond, temp, VACC_L);
+
+    return;
+#else
+
     SIGNED_CLAMP_AM(state, temp); /* no direct map in SSE, but closely based on this */
     for (i = 0; i < N; i++)
@@ -251,5 +398,6 @@ static INLINE void SIGNED_CLAMP_AL(usf_state_t * state, short* VD)
         temp[i] ^= 0x8000; /* half-assed unsigned saturation mix in the clamp */
     merge(VD, cond, temp, VACC_L);
     return;
+#endif
 }
 #endif

@@ -22,65 +22,42 @@
 #define INLINE
 #endif
 
-#ifndef ARCH_MIN_SSE2
-/*
- * vector-scalar element decoding
- * Obsolete. Consider using at least the SSE2 algorithms instead.
- */
-static const int ei[16][8] = {
-    { 00, 01, 02, 03, 04, 05, 06, 07 }, /* none (vector-only operand) */
-    { 00, 01, 02, 03, 04, 05, 06, 07 },
-    { 00, 00, 02, 02, 04, 04, 06, 06 }, /* 0Q */
-    { 01, 01, 03, 03, 05, 05, 07, 07 }, /* 1Q */
-    { 00, 00, 00, 00, 04, 04, 04, 04 }, /* 0H */
-    { 01, 01, 01, 01, 05, 05, 05, 05 }, /* 1H */
-    { 02, 02, 02, 02, 06, 06, 06, 06 }, /* 2H */
-    { 03, 03, 03, 03, 07, 07, 07, 07 }, /* 3H */
-    { 00, 00, 00, 00, 00, 00, 00, 00 }, /* 0 */
-    { 01, 01, 01, 01, 01, 01, 01, 01 }, /* 1 */
-    { 02, 02, 02, 02, 02, 02, 02, 02 }, /* 2 */
-    { 03, 03, 03, 03, 03, 03, 03, 03 }, /* 3 */
-    { 04, 04, 04, 04, 04, 04, 04, 04 }, /* 4 */
-    { 05, 05, 05, 05, 05, 05, 05, 05 }, /* 5 */
-    { 06, 06, 06, 06, 06, 06, 06, 06 }, /* 6 */
-    { 07, 07, 07, 07, 07, 07, 07, 07 }  /* 7 */
+#ifdef ARCH_MIN_ARM_NEON
+static const unsigned char smask[16][16] = {
+    {0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF},
+    {0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF},
+    {0x0,0x1,0x0,0x1,0x4,0x5,0x4,0x5,0x8,0x9,0x8,0x9,0xC,0xD,0xC,0xD},
+    {0x2,0x3,0x2,0x3,0x6,0x7,0x6,0x7,0xA,0xB,0xA,0xB,0xE,0xF,0xE,0xF},
+    {0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9},
+    {0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB},
+    {0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD},
+    {0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF},
+    {0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1},
+    {0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3},
+    {0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5},
+    {0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7},
+    {0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9},
+    {0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB},
+    {0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD},
+    {0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF}
 };
 
+int sub_mask[16] = {
+    0x0,
+    0x0,
+    0x1, 0x1,
+    0x3, 0x3, 0x3, 0x3,
+    0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7
+};
+#define conv816_to_882(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })
+
 INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
-{
-    ALIGNED short SV[8];
-    register int i, j;
-#if (0 == 0)
-    j = sub_mask[e];
-    for (i = 0; i < N; i++)
-        SV[i] = VT[(i & ~j) | (e & j)];
-#else
-    if (e & 0x8)
-        for (i = 0; i < N; i++)
-            SV[i] = VT[(i & 0x0) | (e & 0x7)];
-    else if (e & 0x4)
-        for (i = 0; i < N; i++)
-            SV[i] = VT[(i & 0xC) | (e & 0x3)];
-    else if (e & 0x2)
-        for (i = 0; i < N; i++)
-            SV[i] = VT[(i & 0xE) | (e & 0x1)];
-    else /* if ((e == 0b0000) || (e == 0b0001)) */
-        for (i = 0; i < N; i++)
-            SV[i] = VT[(i & 0x7) | (e & 0x0)];
-#endif
-    for (i = 0; i < N; i++)
-        *(VD + i) = *(SV + i);
+{
+    int8x16_t xmm;
+    int8x16_t key;
+
+    xmm = vld1q_s8((const int8_t*)VT);
+    key = vld1q_s8(smask[e]);
+    xmm = vcombine_s8(vtbl2_s8(conv816_to_882(xmm), vget_low_s8(key)), vtbl2_s8(conv816_to_882(xmm), vget_high_s8(key)));
+    vst1q_s8((int8_t*)VD, xmm);
+
     return;
 }
 #else
 #endif
 
 #ifdef ARCH_MIN_SSSE3
 static const unsigned char smask[16][16] = {
     {0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF},
@@ -112,7 +89,9 @@ INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
     _mm_store_si128((__m128i *)VD, xmm);
     return;
 }
 #else
+#endif
 
+#if defined ARCH_MIN_SSE2 && !defined ARCH_MIN_SSSE3
 #define B(x) ((x) & 3)
 #define SHUFFLE(a,b,c,d) ((B(d)<<6) | (B(c)<<4) | (B(b)<<2) | (B(a)<<0))
@@ -274,5 +253,65 @@ INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
     return;
 }
 #endif
 
+#if !defined ARCH_MIN_ARM_NEON && !defined ARCH_MIN_SSE2 && !defined ARCH_MIN_SSSE3
+/*
+ * vector-scalar element decoding
+ * Obsolete. Consider using at least the SSE2 algorithms instead.
+ */
+static const int ei[16][8] = {
+    { 00, 01, 02, 03, 04, 05, 06, 07 }, /* none (vector-only operand) */
+    { 00, 01, 02, 03, 04, 05, 06, 07 },
+    { 00, 00, 02, 02, 04, 04, 06, 06 }, /* 0Q */
+    { 01, 01, 03, 03, 05, 05, 07, 07 }, /* 1Q */
+    { 00, 00, 00, 00, 04, 04, 04, 04 }, /* 0H */
+    { 01, 01, 01, 01, 05, 05, 05, 05 }, /* 1H */
+    { 02, 02, 02, 02, 06, 06, 06, 06 }, /* 2H */
+    { 03, 03, 03, 03, 07, 07, 07, 07 }, /* 3H */
+    { 00, 00, 00, 00, 00, 00, 00, 00 }, /* 0 */
+    { 01, 01, 01, 01, 01, 01, 01, 01 }, /* 1 */
+    { 02, 02, 02, 02, 02, 02, 02, 02 }, /* 2 */
+    { 03, 03, 03, 03, 03, 03, 03, 03 }, /* 3 */
+    { 04, 04, 04, 04, 04, 04, 04, 04 }, /* 4 */
+    { 05, 05, 05, 05, 05, 05, 05, 05 }, /* 5 */
+    { 06, 06, 06, 06, 06, 06, 06, 06 }, /* 6 */
+    { 07, 07, 07, 07, 07, 07, 07, 07 }  /* 7 */
+};
+
+static const int sub_mask[16] = {
+    0x0,
+    0x0,
+    0x1, 0x1,
+    0x3, 0x3, 0x3, 0x3,
+    0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7
+};
+
+INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
+{
+    ALIGNED short SV[8];
+    register int i, j;
+#if (0 == 0)
+    j = sub_mask[e];
+    for (i = 0; i < N; i++)
+        SV[i] = VT[(i & ~j) | (e & j)];
+#else
+    if (e & 0x8)
+        for (i = 0; i < N; i++)
+            SV[i] = VT[(i & 0x0) | (e & 0x7)];
+    else if (e & 0x4)
+        for (i = 0; i < N; i++)
+            SV[i] = VT[(i & 0xC) | (e & 0x3)];
+    else if (e & 0x2)
+        for (i = 0; i < N; i++)
+            SV[i] = VT[(i & 0xE) | (e & 0x1)];
+    else /* if ((e == 0b0000) || (e == 0b0001)) */
+        for (i = 0; i < N; i++)
+            SV[i] = VT[(i & 0x7) | (e & 0x0)];
+#endif
+    for (i = 0; i < N; i++)
+        *(VD + i) = *(SV + i);
+    return;
+}
+
+#endif
 #endif

@@ -27,6 +27,30 @@ INLINE static void do_abs(usf_state_t * state, short* VD, short* VS, short* VT)
     register int i;
 
     vector_copy(res, VT);
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vs = vld1q_s16((const int16_t*)VS);
+    int16x8_t resi = vld1q_s16((const int16_t*)VT);
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t eightk = vdupq_n_s16(0x8000);
+    int16x8_t one = vdupq_n_s16(1);
+
+    uint16x8_t negi = vcltq_s16(vs,zero);
+    int16x8_t posi = vaddq_s16((int16x8_t)negi,one);
+    posi = vorrq_s16(posi,(int16x8_t)negi);
+    resi = veorq_s16(resi,posi);
+    uint16x8_t ccch = vcgeq_s16(resi,eightk);
+    int16x8_t ch = vnegq_s16((int16x8_t)ccch);
+    resi = vaddq_s16(resi, (int16x8_t)ch);
+
+    vst1q_s16(VACC_L, resi);
+    vector_copy(VD, VACC_L);
+    return;
+
+#else
+
+
 #ifndef ARCH_MIN_SSE2
 #define MASK_XOR
 #endif
@@ -65,6 +89,7 @@ INLINE static void do_abs(usf_state_t * state, short* VD, short* VS, short* VT)
     vector_copy(VACC_L, res);
     vector_copy(VD, VACC_L);
     return;
+#endif
 }
 
 static void VABS(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,50 @@
 
 INLINE static void clr_ci(usf_state_t * state, short* VD, short* VS, short* VT)
 { /* clear CARRY and carry in to accumulators */
+#ifdef ARCH_MIN_SSE2
+
+    __m128i xmm,vs,vt,co; /*,ne;*/
+
+    xmm = _mm_setzero_si128();
+    vs = _mm_load_si128((const __m128i*)VS);
+    vt = _mm_load_si128((const __m128i*)VT);
+    co = _mm_load_si128((const __m128i*)state->co);
+
+    vs = _mm_add_epi16(vs,vt);
+    vs = _mm_add_epi16(vs,co);
+
+    _mm_store_si128((__m128i*)VACC_L, vs);
+
+    SIGNED_CLAMP_ADD(state, VD, VS, VT);
+
+    _mm_storeu_si128((__m128i*)state->ne, xmm);
+    _mm_storeu_si128((__m128i*)state->co, xmm);
+
+    return;
+#endif
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vs, vt, zero1,co;
+
+    zero1 = vdupq_n_s16(0);
+    vs = vld1q_s16((const int16_t*)VS);
+    vt = vld1q_s16((const int16_t*)VT);
+    co = vld1q_s16((const int16_t*)state->co);
+
+    vs = vaddq_s16(vs,vt);
+    vs = vaddq_s16(vs,co);
+
+    vst1q_s16(VACC_L, vs);
+
+    SIGNED_CLAMP_ADD(state, VD, VS, VT);
+    vst1q_s16(state->ne, zero1);
+    vst1q_s16(state->co, zero1);
+
+    return;
+#endif
+
+#if !defined ARCH_MIN_ARM_NEON && !defined ARCH_MIN_SSE2
     register int i;
 
     for (i = 0; i < N; i++)
@@ -24,7 +68,10 @@ INLINE static void clr_ci(usf_state_t * state, short* VD, short* VS, short* VT)
         state->ne[i] = 0;
     for (i = 0; i < N; i++)
         state->co[i] = 0;
 
+    return;
+#endif
+
 }
 
 static void VADD(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,30 @@
 
 INLINE static void set_co(usf_state_t * state, short* VD, short* VS, short* VT)
 { /* set CARRY and carry out from sum */
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x4_t vs_low = vld1_u16((const uint16_t*)VS);
+    uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4);
+    uint16x4_t vt_low = vld1_u16((const uint16_t*)VT);
+    uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4);
+    uint32x4_t zero = vdupq_n_u32(0);
+
+    uint32x4_t v_l = vaddl_u16(vs_low, vt_low);
+    uint32x4_t v_h = vaddl_u16(vs_high, vt_high);
+    uint16x4_t vl16 = vaddhn_u32(v_l,zero);
+    uint16x4_t vh16 = vaddhn_u32(v_h,zero);
+    uint16x8_t vaccl = vcombine_u16(vmovn_u32(v_l),vmovn_u32(v_h));
+    uint16x8_t co = vcombine_u16(vl16,vh16);
+
+    vst1q_u16(VACC_L, vaccl);
+    vector_copy(VD, VACC_L);
+    vst1q_u16(state->ne, (uint16x8_t)zero);
+    vst1q_u16(state->co, co);
+
+    return;
+#else
+
     ALIGNED int32_t sum[N];
     register int i;
 
@@ -28,6 +52,8 @@ INLINE static void set_co(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = sum[i] >> 16; /* native: (sum[i] > +65535) */
     return;
+
+#endif
 }
 
 static void VADDC(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -13,14 +13,25 @@
 \******************************************************************************/
 #include "vu.h"
 
-static INLINE void do_and(usf_state_t * state, short* VD, short* VS, short* VT)
+INLINE void do_and(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t vaccl = vandq_s16(vs,vt);
+    vst1q_s16(VACC_L, vaccl);
+    vector_copy(VD, VACC_L);
+    return;
+
+#else
     register int i;
 
     for (i = 0; i < N; i++)
         VACC_L[i] = VS[i] & VT[i];
     vector_copy(VD, VACC_L);
     return;
+#endif
 }
 
 static void VAND(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -19,6 +19,58 @@ INLINE static void do_ch(usf_state_t * state, short* VD, short* VS, short* VT)
     ALIGNED short sn[N];
     ALIGNED short VC[N];
     ALIGNED short diff[N];
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t v_vc,neg_sn,vce,v_eq;
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t one = vdupq_n_s16(1);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    v_vc = vt;
+    int16x8_t v_sn = veorq_s16(vs,v_vc);
+    uint16x8_t sn_u = vcltq_s16(v_sn,zero);
+    v_vc = veorq_s16(v_vc, (int16x8_t)sn_u);
+    neg_sn = vnegq_s16((int16x8_t)sn_u);
+
+    uint16x8_t vs_vc_eq = vceqq_s16(vs,v_vc);
+    vce = vandq_s16((int16x8_t)vs_vc_eq,v_sn);
+
+    v_vc = vaddq_s16(v_vc,v_sn);
+    v_eq = vorrq_s16(vce,(int16x8_t)vs_vc_eq);
+    v_eq = vnegq_s16(v_eq);
+
+    int16x8_t not_sn = vsubq_s16(neg_sn, one);
+    int16x8_t neg_vs = vmvnq_s16(vs);
+    int16x8_t v_diff = vorrq_s16(neg_vs, not_sn);
+    uint16x8_t ule = vcleq_s16(vt,v_diff);
+    int16x8_t v_le = vnegq_s16((int16x8_t)ule);
+
+    v_diff = vorrq_s16(vs, (int16x8_t)sn_u);
+    uint16x8_t uge = vcgeq_s16(v_diff,vt);
+    int16x8_t v_ge = vnegq_s16((int16x8_t)uge);
+
+    vst1q_s16(ge, v_ge);
+    vst1q_s16(le, v_le);
+    vst1q_s16(sn, v_sn);
+    vst1q_s16(VC, v_vc);
+
+    merge(state->comp, sn, le, ge);
+    merge(VACC_L, state->comp, VC, VS);
+    vector_copy(VD, VACC_L);
+
+    v_eq = veorq_s16(v_eq, one);
+
+    vst1q_s16(state->clip, v_ge);
+    vst1q_s16(state->comp, v_le);
+    vst1q_s16(state->ne, v_eq);
+    vst1q_s16(state->co, v_sn);
+
+    return;
+
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -72,6 +124,7 @@ INLINE static void do_ch(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = sn[i];
     return;
+#endif
 }
 
 static void VCH(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,7 @@
 
 INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
     ALIGNED short eq[N], ge[N], le[N];
     ALIGNED short gen[N], len[N], lz[N], uz[N], sn[N];
     ALIGNED short diff[N];
@@ -22,6 +23,88 @@ INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT)
     ALIGNED unsigned short VB[N], VC[N];
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t v_vc,neg_sn,v_eq,v_vb,v_sn,v_diff,v_uz,v_cmp;
+
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t minus1 = vdupq_n_s16(-1);
+    int16x8_t one = vdupq_n_s16(1);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t ne = vld1q_s16((const int16_t *)state->ne);
+    int16x8_t co = vld1q_s16((const int16_t *)state->co);
+    int16x8_t vce = vld1q_s16((const int16_t *)state->vce);
+
+    v_vb = vs;
+    v_vc = vt;
+
+    v_eq = veorq_s16(ne, one);
+    v_sn = co;
+
+    neg_sn = vnegq_s16((int16x8_t)v_sn);
+    v_vc = veorq_s16(v_vc, (int16x8_t)neg_sn);
+    v_vc = vaddq_s16(v_vc, v_sn);
+
+    v_diff = vsubq_s16(v_vb,v_vc);
+
+    uint16x8_t vb_cond = vceqq_s16(v_vb,minus1);
+    uint16x8_t vc_cond = vceqq_s16(v_vc,zero);
+    vb_cond = vmvnq_u16(vb_cond);
+    vc_cond = vmvnq_u16(vc_cond);
+    v_uz = vorrq_s16((int16x8_t)vb_cond, (int16x8_t)vc_cond);
+
+    uint16x8_t v_lz = vceqq_s16(v_diff,zero);
+    int16x8_t lz_s = vnegq_s16((int16x8_t)v_lz);
+
+    int16x8_t v_gen = vorrq_s16(lz_s,v_uz);
+    int16x8_t v_len = vandq_s16(lz_s,v_uz);
+    v_gen = vandq_s16(v_gen,vce);
+
+    vce = veorq_s16(vce,one);
+    v_len = vandq_s16(v_len,vce);
+
+    v_len = vorrq_s16(v_len,v_gen);
+    uint16x8_t gen_u = vcgeq_u16((uint16x8_t)v_vb,(uint16x8_t)v_vc);
+    v_gen = vnegq_s16((int16x8_t)gen_u);
+
+    v_cmp = vandq_s16(v_eq,v_sn);
+
+    vst1q_s16(cmp, v_cmp);
+    vst1q_s16(len, v_len);
+
+    merge(le, cmp, len, state->comp);
+    int16x8_t sn_xor = veorq_s16(v_sn,one);
+    v_cmp = vandq_s16(v_eq,sn_xor);
+
+    vst1q_s16(cmp, v_cmp);
+    vst1q_s16(gen, v_gen);
+    vst1q_s16(sn, v_sn);
+    vst1q_s16(VC, v_vc);
+
+    merge(ge, cmp, gen, state->clip);
+
+    merge(cmp, sn, le, ge);
+    merge(VACC_L, cmp, (short *)VC, VS);
+    vector_copy(VD, VACC_L);
+
+    int16x8_t v_ge = vld1q_s16((const int16_t *)ge);
+    int16x8_t v_le = vld1q_s16((const int16_t *)le);
+
+    vst1q_s16(state->clip,v_ge);
+    vst1q_s16(state->comp,v_le);
+
+    vst1q_s16(state->ne,zero);
+    vst1q_s16(state->co,zero);
+    vst1q_s16(state->vce,zero);
+
+    return;
+
+#else
+
+
+
     for (i = 0; i < N; i++)
         VB[i] = VS[i];
     for (i = 0; i < N; i++)
@@ -88,6 +171,7 @@ INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->vce[i] = 0;
     return;
+#endif
 }
 
 static void VCL(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -20,6 +20,48 @@ INLINE static void do_cr(usf_state_t * state, short* VD, short* VS, short* VT)
     ALIGNED short cmp[N];
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t v_sn, v_cmp, v_vc;
+
+    int16x8_t zero = vdupq_n_s16(0);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+
+    v_vc = vt;
+    v_sn = veorq_s16(vs,vt);
+    v_sn = vshrq_n_s16(v_sn,15);
+
+    v_cmp = vandq_s16(vs, v_sn);
+    v_cmp = vmvnq_s16(v_cmp);
+    uint16x8_t v_le = vcleq_s16(vt,v_cmp);
+    int16x8_t v_le_ = vnegq_s16((int16x8_t)v_le);
+
+    v_cmp = vorrq_s16(vs, v_sn);
+    uint16x8_t v_ge = vcgeq_s16(v_cmp, vt);
+    int16x8_t v_ge_ = vnegq_s16((int16x8_t)v_ge);
+
+    v_vc = veorq_s16(v_vc,v_sn);
+
+    vst1q_s16(VC, v_vc);
+    vst1q_s16(le, v_le_);
+    vst1q_s16(ge, v_ge_);
+
+    merge(VACC_L, le, VC, VS);
+    vector_copy(VD, VACC_L);
+
+    vst1q_s16(state->clip, v_ge_);
+    vst1q_s16(state->comp, v_le_);
+    vst1q_s16(state->ne,zero);
+    vst1q_s16(state->co,zero);
+    vst1q_s16(state->vce,zero);
+
+    return;
+
+#else
+
+
     for (i = 0; i < N; i++)
         VC[i] = VT[i];
     for (i = 0; i < N; i++)
@@ -55,6 +97,7 @@ INLINE static void do_cr(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->vce[i] = 0;
     return;
+#endif
 }
 
 static void VCR(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -17,6 +17,30 @@ INLINE static void do_eq(usf_state_t * state, short* VD, short* VS, short* VT)
 {
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t one = vdupq_n_s16(1);
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne);
+
+    uint16x8_t v_comp = vceqq_s16(vs, vt);
+    int16x8_t v_comp_ = vnegq_s16((int16x8_t)v_comp);
+    v_ne = veorq_s16(v_ne, one);
+    v_comp_ = vandq_s16(v_comp_, v_ne);
+
+    vector_copy(VACC_L, VT);
+    vector_copy(VD, VACC_L);
+
+    vst1q_s16(state->comp,v_comp_);
+    vst1q_s16(state->ne,zero);
+    vst1q_s16(state->co,zero);
+
+    return;
+
+#else
+
     for (i = 0; i < N; i++)
         state->clip[i] = 0;
     for (i = 0; i < N; i++)
@@ -35,6 +59,7 @@ INLINE static void do_eq(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = 0;
     return;
+#endif
 }
 
 static void VEQ(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,10 +15,44 @@
 
 INLINE static void do_ge(usf_state_t * state, short* VD, short* VS, short* VT)
 {
 
     ALIGNED short ce[N];
     ALIGNED short eq[N];
     register int i;
+
+#ifdef ARCH_MIN_ARM_NEON
+
+
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t one = vdupq_n_s16(1);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne);
+    int16x8_t v_co = vld1q_s16((const int16_t *)state->co);
+
+    uint16x8_t v_eq_u = vceqq_s16(vs,vt);
+    int16x8_t v_eq_s = vnegq_s16((int16x8_t)v_eq_u);
+
+    int16x8_t v_ce = vandq_s16(v_ne,v_co);
+    v_ce = veorq_s16(v_ce,one);
+    v_eq_s = vandq_s16(v_eq_s, v_ce);
+    vst1q_s16(state->clip, zero);
+    uint16x8_t v_comp = vcgtq_s16(vs, vt);
+    int16x8_t v_comp_s = vnegq_s16((int16x8_t)v_comp);
+
+    v_comp_s = vorrq_s16(v_comp_s, v_eq_s);
+    vst1q_s16(state->comp, v_comp_s);
+
+    merge(VACC_L, state->comp, VS, VT);
+    vector_copy(VD, VACC_L);
+
+    vst1q_s16(state->ne,zero);
+    vst1q_s16(state->co,zero);
+
+    return;
+#else
+
     for (i = 0; i < N; i++)
         eq[i] = (VS[i] == VT[i]);
     for (i = 0; i < N; i++)
@@ -39,6 +73,7 @@ INLINE static void do_ge(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = 0;
     return;
+#endif
 }
 
 static void VGE(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,37 @@
 
 INLINE static void do_lt(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t zero = vdupq_n_s16(0);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne);
+    int16x8_t v_co = vld1q_s16((const int16_t *)state->co);
+
+    uint16x8_t v_eq_u = vceqq_s16(vs,vt);
+    int16x8_t v_cn = vandq_s16(v_ne,v_co);
+    v_eq_u = vandq_u16(v_eq_u,(uint16x8_t)v_cn);
+
+    vst1q_s16(state->clip, zero);
+
+    uint16x8_t v_comp = vcltq_s16(vs, vt);
+    int16x8_t v_comp_s = vnegq_s16((int16x8_t)v_comp);
+    v_comp_s = vorrq_s16(v_comp_s, (int16x8_t)v_eq_u);
+
+    vst1q_s16(state->comp, v_comp_s);
+
+    merge(VACC_L, state->comp, VS, VT);
+    vector_copy(VD, VACC_L);
+
+    vst1q_s16(state->ne, zero);
+    vst1q_s16(state->co, zero);
+    return;
+
+#else
+
     ALIGNED short cn[N];
     ALIGNED short eq[N];
     register int i;
@@ -39,6 +70,7 @@ INLINE static void do_lt(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = 0;
     return;
+#endif
 }
 
 static void VLT(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,10 +15,81 @@
 
 INLINE static void do_macf(usf_state_t * state, short* VD, short* VS, short* VT)
 {
-    ALIGNED int32_t product[N];
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x8_t vs = vld1q_u16((const uint16_t*)VS);
+    uint16x8_t vt = vld1q_u16((const uint16_t*)VT);
+    uint16x4_t v_vaccl_low = vld1_u16((const uint16_t*)VACC_L);
+    uint16x4_t v_vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
+    uint16x4_t v_vaccm_low = vld1_u16((const uint16_t*)VACC_M);
+    uint16x4_t v_vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
+    uint16x4_t v_vacch_low = vld1_u16((const uint16_t*)VACC_H);
+    uint16x4_t v_vacch_high = vld1_u16((const uint16_t*)VACC_H+4);
+    int32x4_t zero = vdupq_n_s32(0);
+    uint32x4_t zero_u = vdupq_n_u32(0);
+    uint32x4_t highmask = vdupq_n_u32(0x0000ffff);
+
+    uint16x4_t vs_low = vget_low_u16(vs);
+    uint16x4_t vs_high = vget_high_u16(vs);
+    uint16x4_t vt_low = vget_low_u16(vt);
+    uint16x4_t vt_high = vget_high_u16(vt);
+
+    int32x4_t product_L = vqdmlal_s16(zero, (int16x4_t)vs_low, (int16x4_t)vt_low);
+    int32x4_t product_H = vqdmlal_s16(zero, (int16x4_t)vs_high, (int16x4_t)vt_high);
+    uint32x4_t addend_L = vandq_u32(highmask, (uint32x4_t)product_L);
+    uint32x4_t addend_H = vandq_u32(highmask, (uint32x4_t)product_H);
+
+    addend_L = vaddw_u16((uint32x4_t)addend_L, v_vaccl_low);
+    addend_H = vaddw_u16((uint32x4_t)addend_H, v_vaccl_high);
+
+    uint16x8_t v_vaccl = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
+
+
+    addend_L = vaddl_u16(
+        vaddhn_u32((uint32x4_t)product_L, zero_u),
+        vaddhn_u32((uint32x4_t)addend_L, zero_u)
+    );
+
+    addend_H = vaddl_u16(
+        vaddhn_u32((uint32x4_t)product_H, zero_u),
+        vaddhn_u32((uint32x4_t)addend_H, zero_u)
+    );
+
+    addend_L = vaddw_u16(addend_L, v_vaccm_low);
+    addend_H = vaddw_u16(addend_H, v_vaccm_high);
+
+    uint16x8_t v_vaccm = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
+
+    //product_L = vshrq_n_s32(product_L, 1);
+    //product_H = vshrq_n_s32(product_H, 1);
+
+    uint32x4_t cond_L = vcltq_s32(product_L,zero);
+    int32x4_t cond_L_s = vnegq_s32((int32x4_t)cond_L);
+    uint32x4_t cond_H = vcltq_s32(product_H,zero);
+    int32x4_t cond_H_s = vnegq_s32((int32x4_t)cond_H);
+
+    v_vacch_low = vsub_u16(v_vacch_low, vmovn_u32((uint32x4_t)cond_L_s));
+    v_vacch_high = vsub_u16(v_vacch_high, vmovn_u32((uint32x4_t)cond_H_s));
+
+    v_vacch_low = vadd_u16(vshrn_n_u32(addend_L,16), v_vacch_low);
+    v_vacch_high = vadd_u16(vshrn_n_u32(addend_H,16), v_vacch_high);
+
+    uint16x8_t v_vacch = vcombine_u16(v_vacch_low, v_vacch_high);
+
+    vst1q_s16(VACC_L, (int16x8_t)v_vaccl);
+    vst1q_s16(VACC_M, (int16x8_t)v_vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)v_vacch);
+
+    SIGNED_CLAMP_AM(state, VD);
+    return;
+
+#else
+
+    ALIGNED int32_t product[N];
     ALIGNED uint32_t addend[N];
     register int i;
 
     for (i = 0; i < N; i++)
         product[i] = VS[i] * VT[i];
     for (i = 0; i < N; i++)
@@ -38,7 +109,9 @@ INLINE static void do_macf(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         VACC_H[i] += addend[i] >> 16;
     SIGNED_CLAMP_AM(state, VD);
+
     return;
+#endif
 }
 
 static void VMACF(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,78 @@
 
 INLINE static void do_macu(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x8_t vs = vld1q_u16((const uint16_t*)VS);
+    uint16x8_t vt = vld1q_u16((const uint16_t*)VT);
+    uint16x4_t v_vaccl_low = vld1_u16((const uint16_t*)VACC_L);
+    uint16x4_t v_vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
+    uint16x4_t v_vaccm_low = vld1_u16((const uint16_t*)VACC_M);
+    uint16x4_t v_vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
+    uint16x4_t v_vacch_low = vld1_u16((const uint16_t*)VACC_H);
+    uint16x4_t v_vacch_high = vld1_u16((const uint16_t*)VACC_H+4);
+    int32x4_t zero = vdupq_n_s32(0);
+    uint32x4_t zero_u = vdupq_n_u32(0);
+    uint32x4_t highmask = vdupq_n_u32(0x0000ffff);
+
+    uint16x4_t vs_low = vget_low_u16(vs);
+    uint16x4_t vs_high = vget_high_u16(vs);
+    uint16x4_t vt_low = vget_low_u16(vt);
+    uint16x4_t vt_high = vget_high_u16(vt);
+
+    int32x4_t product_L = vqdmlal_s16(zero, (int16x4_t)vs_low, (int16x4_t)vt_low);
+    int32x4_t product_H = vqdmlal_s16(zero, (int16x4_t)vs_high, (int16x4_t)vt_high);
+    uint32x4_t addend_L = vandq_u32(highmask, (uint32x4_t)product_L);
+    uint32x4_t addend_H = vandq_u32(highmask, (uint32x4_t)product_H);
+
+    addend_L = vaddw_u16((uint32x4_t)addend_L, v_vaccl_low);
+    addend_H = vaddw_u16((uint32x4_t)addend_H, v_vaccl_high);
+
+    uint16x8_t v_vaccl = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
+
+
+    addend_L = vaddl_u16(
+        vaddhn_u32((uint32x4_t)product_L, zero_u),
+        vaddhn_u32((uint32x4_t)addend_L, zero_u)
+    );
+
+    addend_H = vaddl_u16(
+        vaddhn_u32((uint32x4_t)product_H, zero_u),
+        vaddhn_u32((uint32x4_t)addend_H, zero_u)
+    );
+
+    addend_L = vaddw_u16(addend_L, v_vaccm_low);
+    addend_H = vaddw_u16(addend_H, v_vaccm_high);
+
+    uint16x8_t v_vaccm = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
+
+    //product_L = vshrq_n_s32(product_L, 1);
+    //product_H = vshrq_n_s32(product_H, 1);
+
+    uint32x4_t cond_L = vcltq_s32(product_L,zero);
+    int32x4_t cond_L_s = vnegq_s32((int32x4_t)cond_L);
+    uint32x4_t cond_H = vcltq_s32(product_H,zero);
+    int32x4_t cond_H_s = vnegq_s32((int32x4_t)cond_H);
+
+    v_vacch_low = vsub_u16(v_vacch_low, vmovn_u32((uint32x4_t)cond_L_s));
+    v_vacch_high = vsub_u16(v_vacch_high, vmovn_u32((uint32x4_t)cond_H_s));
+
+    v_vacch_low = vadd_u16(vshrn_n_u32(addend_L,16), v_vacch_low);
+    v_vacch_high = vadd_u16(vshrn_n_u32(addend_H,16), v_vacch_high);
+
+    uint16x8_t v_vacch = vcombine_u16(v_vacch_low, v_vacch_high);
+
+    vst1q_s16(VACC_L, (int16x8_t)v_vaccl);
+    vst1q_s16(VACC_M, (int16x8_t)v_vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)v_vacch);
+
+    UNSIGNED_CLAMP(state, VD);
+    return;
+
+#else
+
+
     ALIGNED int32_t product[N];
     ALIGNED uint32_t addend[N];
     register int i;
@@ -39,6 +111,7 @@ INLINE static void do_macu(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] += addend[i] >> 16;
     UNSIGNED_CLAMP(state, VD);
     return;
+#endif
 }
 
 static void VMACU(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,39 @@
 
 INLINE static void do_madh(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vs,vt, vaccm, vacch, vacc_h, vacc_m,prod_low,prod_high,one,cond;
+    uint16x8_t cond_u,res;
+
+    one = vdupq_n_s16(1);
+    vs = vld1q_s16((const int16_t *)VS);
+    vt = vld1q_s16((const int16_t *)VT);
+    vaccm = vld1q_s16((const int16_t *)VACC_M);
+    vacch = vld1q_s16((const int16_t *)VACC_H);
+
+    prod_low = vmulq_s16(vs,vt);
+    vacc_m = vaddq_s16(vaccm,prod_low);
+
+    prod_high = vqdmulhq_s16(vs,vt);
+    prod_high = vshrq_n_s16(prod_high, 1);
+
+    res = vqaddq_u16((uint16x8_t)vaccm, (uint16x8_t)prod_low);
+    cond_u = vceqq_s16((int16x8_t)res,vacc_m);
+    cond_u = vaddq_u16(cond_u, (uint16x8_t)one);
+
+    vacc_h = vaddq_s16(prod_high, vacch);
+    vacc_h = vaddq_s16((int16x8_t)cond_u, vacc_h);
+
+    vst1q_s16(VACC_M, vacc_m);
+    vst1q_s16(VACC_H, vacc_h);
+
+    SIGNED_CLAMP_AM(state, VD);
+    return;
+
+#else
+
     ALIGNED int32_t product[N];
     ALIGNED uint32_t addend[N];
     register int i;
@@ -29,6 +62,8 @@ INLINE static void do_madh(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] += (addend[i] >> 16) + (product[i] >> 16);
     SIGNED_CLAMP_AM(state, VD);
     return;
+
+#endif
 }
 
 static void VMADH(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,56 @@
 
 INLINE static void do_madl(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x8_t vs = vld1q_u16((const uint16_t*)VS);
+    uint16x8_t vt = vld1q_u16((const uint16_t*)VT);
+    uint16x8_t v_vaccl = vld1q_u16((const uint16_t*)VACC_L);
+    uint16x8_t v_vaccm = vld1q_u16((const uint16_t*)VACC_M);
+    uint16x8_t v_vacch = vld1q_u16((const uint16_t*)VACC_H);
+
+    uint32x4_t zero = vdupq_n_u32(0);
+    uint16x4_t vs_low = vget_low_u16(vs);
+    uint16x4_t vs_high = vget_high_u16(vs);
+    uint16x4_t vt_low = vget_low_u16(vt);
+    uint16x4_t vt_high = vget_high_u16(vt);
+
+    uint32x4_t product_L = vmulq_u32( vmovl_u16(vs_low), vmovl_u16(vt_low) );
+    uint32x4_t product_H = vmulq_u32( vmovl_u16(vs_high), vmovl_u16(vt_high) );
+
+    uint16x4_t addend_L = vaddhn_u32(product_L, zero);
+    uint16x4_t addend_H = vaddhn_u32(product_H, zero);
+
+    uint32x4_t exceed1 = vaddl_u16(addend_L, vget_low_u16(v_vaccl));
+    uint32x4_t exceed2 = vaddl_u16(addend_H, vget_high_u16(v_vaccl));
+
+    v_vaccl = vcombine_u16(vmovn_u32(exceed1), vmovn_u32(exceed2));
+
+    addend_L = vaddhn_u32(exceed1, zero);
+    addend_H = vaddhn_u32(exceed2, zero);
+
+    exceed1 = vaddl_u16(addend_L, vget_low_u16(v_vaccm));
+    exceed2 = vaddl_u16(addend_H, vget_high_u16(v_vaccm));
+
+    v_vaccm = vcombine_u16(vmovn_u32(exceed1), vmovn_u32(exceed2));
+
+    addend_L = vaddhn_u32(exceed1, zero);
+    addend_H = vaddhn_u32(exceed2, zero);
+
+    uint16x8_t v_vacch2 = vcombine_u16(addend_L, addend_H);
+    v_vacch = vaddq_u16(v_vacch, v_vacch2);
+
+    vst1q_s16(VACC_L, (int16x8_t)v_vaccl);
+    vst1q_s16(VACC_M, (int16x8_t)v_vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)v_vacch);
+
+    SIGNED_CLAMP_AL(state, VD);
+    return;
+
+#else
+
+
     ALIGNED int32_t product[N];
     ALIGNED uint32_t addend[N];
     register int i;
@@ -37,6 +87,7 @@ INLINE static void do_madl(usf_state_t * state, short* VD, short* VS, short* VT)
     VACC_H[i] += addend[i] >> 16;
     SIGNED_CLAMP_AL(state, VD);
     return;
+#endif
 }
 
 static void VMADL(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,43 @@
 
 INLINE static void do_madm(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint32x4_t zero = vdupq_n_u32(0);
+    int16x4_t vs_low = vld1_s16((const int16_t*)VS);
+    int16x4_t vs_high = vld1_s16((const int16_t*)VS+4);
+    uint16x4_t vt_low = vld1_u16((const uint16_t*)VT);
+    uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4);
+    uint16x4_t vaccl_low = vld1_u16((const uint16_t*)VACC_L);
+    uint16x4_t vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
+    uint16x4_t vaccm_low = vld1_u16((const uint16_t*)VACC_M);
+    uint16x4_t vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
+    int16x4_t vacch_low = vld1_s16((const int16_t*)VACC_H);
+    int16x4_t vacch_high = vld1_s16((const int16_t*)VACC_H+4);
+
+    int32x4_t vaccl_l = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_low),vmovl_s16(vs_low),(int32x4_t)vmovl_u16(vt_low));
+    int32x4_t vaccl_h = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_high),vmovl_s16(vs_high),(int32x4_t)vmovl_u16(vt_high));
+    uint32x4_t vaccm_l = vaddq_u32(vmovl_u16(vaccm_low), (uint32x4_t)vshrq_n_s32(vaccl_l,16));
+    uint32x4_t vaccm_h = vaddq_u32(vmovl_u16(vaccm_high),(uint32x4_t)vshrq_n_s32(vaccl_h,16));
+    uint16x4_t vacch_l = vaddhn_u32(vaccm_l, zero);
+    uint16x4_t vacch_h = vaddhn_u32(vaccm_h, zero);
+    int16x4_t vacch_low2 = vadd_s16(vacch_low,(int16x4_t)vacch_l);
+    int16x4_t vacch_high2 = vadd_s16(vacch_high,(int16x4_t)vacch_h);
+
+    int16x8_t vaccl = vcombine_s16(vmovn_s32(vaccl_l),vmovn_s32(vaccl_h));
+    uint16x8_t vaccm = vcombine_u16(vmovn_u32(vaccm_l),vmovn_u32(vaccm_h));
+    int16x8_t vacch = vcombine_s16(vacch_low2,vacch_high2);
+
+    vst1q_s16(VACC_L, vaccl);
+    vst1q_s16(VACC_M, (int16x8_t)vaccm);
+    vst1q_s16(VACC_H, vacch);
+    SIGNED_CLAMP_AM(state, VD);
+
+    return;
+#else
+
+
     ALIGNED uint32_t addend[N];
     register int i;
 
@@ -32,6 +69,7 @@ INLINE static void do_madm(usf_state_t * state, short* VD, short* VS, short* VT)
     VACC_H[i] += addend[i] >> 16;
     SIGNED_CLAMP_AM(state, VD);
     return;
+#endif
 }
 
 static void VMADM(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,43 @@
 
 INLINE static void do_madn(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint32x4_t zero = vdupq_n_u32(0);
+    uint16x4_t vs_low = vld1_u16((const uint16_t*)VS);
+    uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4);
+    int16x4_t vt_low = vld1_s16((const int16_t*)VT);
+    int16x4_t vt_high = vld1_s16((const int16_t*)VT+4);
+    uint16x4_t vaccl_low = vld1_u16((const uint16_t*)VACC_L);
+    uint16x4_t vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
+    uint16x4_t vaccm_low = vld1_u16((const uint16_t*)VACC_M);
+    uint16x4_t vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
+    int16x4_t vacch_low = vld1_s16((const int16_t*)VACC_H);
+    int16x4_t vacch_high = vld1_s16((const int16_t*)VACC_H+4);
+
+    int32x4_t vaccl_l = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_low),(int32x4_t)vmovl_u16(vs_low),vmovl_s16(vt_low));
+    int32x4_t vaccl_h = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_high),(int32x4_t)vmovl_u16(vs_high),vmovl_s16(vt_high));
+    uint32x4_t vaccm_l = vaddq_u32(vmovl_u16(vaccm_low), (uint32x4_t)vshrq_n_s32(vaccl_l,16));
+    uint32x4_t vaccm_h = vaddq_u32(vmovl_u16(vaccm_high),(uint32x4_t)vshrq_n_s32(vaccl_h,16));
+    uint16x4_t vacch_l = vaddhn_u32(vaccm_l, zero);
+    uint16x4_t vacch_h = vaddhn_u32(vaccm_h, zero);
+    int16x4_t vacch_low2 = vadd_s16(vacch_low,(int16x4_t)vacch_l);
+    int16x4_t vacch_high2 = vadd_s16(vacch_high,(int16x4_t)vacch_h);
+
+    int16x8_t vaccl = vcombine_s16(vmovn_s32(vaccl_l),vmovn_s32(vaccl_h));
+    uint16x8_t vaccm = vcombine_u16(vmovn_u32(vaccm_l),vmovn_u32(vaccm_h));
+    int16x8_t vacch = vcombine_s16(vacch_low2,vacch_high2);
+
+    vst1q_s16(VACC_L, vaccl);
+    vst1q_s16(VACC_M, (int16x8_t)vaccm);
+    vst1q_s16(VACC_H, vacch);
+    SIGNED_CLAMP_AL(state, VD);
+    return;
+
+#else
+
+
     ALIGNED uint32_t addend[N];
     register int i;
 
@@ -32,6 +69,7 @@ INLINE static void do_madn(usf_state_t * state, short* VD, short* VS, short* VT)
     VACC_H[i] += addend[i] >> 16;
     SIGNED_CLAMP_AL(state, VD);
     return;
+#endif
 }
 
 static void VMADN(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,37 @@
 
 INLINE static void do_mudh(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t zero16 = vdupq_n_s16(0);
+    int32x4_t zero32 = vdupq_n_s32(0);
+
+    int16x4_t vs_low = vld1_s16((const int16_t *)VS);
+    int16x4_t vs_high = vld1_s16((const int16_t *)VS+4);
+    int16x4_t vt_low = vld1_s16((const int16_t *)VT);
+    int16x4_t vt_high = vld1_s16((const int16_t *)VT+4);
+
+    int32x4_t l1 = vmovl_s16(vs_low);
+    int32x4_t h1 = vmovl_s16(vs_high);
+    int32x4_t l2 = vmovl_s16(vt_low);
+    int32x4_t h2 = vmovl_s16(vt_high);
+
+    int32x4_t l = vmulq_s32(l1,l2);
+    int32x4_t h = vmulq_s32(h1,h2);
+
+    int16x8_t vaccm = vcombine_s16(vmovn_s32(l),vmovn_s32(h));
+    int16x8_t vacch = vcombine_s16(vaddhn_s32(l,zero32),vaddhn_s32(h,zero32));
+
+    vst1q_s16(VACC_L, zero16);
+    vst1q_s16(VACC_M, vaccm);
+    vst1q_s16(VACC_H, vacch);
+
+    SIGNED_CLAMP_AM(state, VD);
+    return;
+
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -25,6 +56,7 @@ INLINE static void do_mudh(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] = (short)(VS[i]*VT[i] >> 16);
     SIGNED_CLAMP_AM(state, VD);
     return;
+#endif
 }
 
 static void VMUDH(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,26 @@
 
 INLINE static void do_mudl(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x4_t vs_low = vld1_u16((const uint16_t*)VS);
+    uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4);
+    uint16x4_t vt_low = vld1_u16((const uint16_t*)VT);
+    uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4);
+    int16x8_t zero = vdupq_n_s16(0);
+
+    uint32x4_t lo = vmull_u16(vs_low, vt_low);
+    uint32x4_t high = vmull_u16(vs_high, vt_high);
+    uint16x8_t result = vcombine_u16(vshrn_n_u32(lo,16),vshrn_n_u32(high,16));
+    vst1q_u16(VACC_L, result);
+    vst1q_s16(VACC_M, zero);
+    vst1q_s16(VACC_H, zero);
+    vector_copy(VD, VACC_L);
+    return;
+
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -25,6 +45,7 @@ INLINE static void do_mudl(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] = 0x0000;
     vector_copy(VD, VACC_L); /* no possibilities to clamp */
     return;
+#endif
 }
 
 static void VMUDL(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,39 @@
 
 INLINE static void do_mudm(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vd, vs,vt,res,four,vacc_l, vacc_m, vacc_h;
+
+    int32x4_t zero = vdupq_n_s32(0);
+    int16x8_t zero16 = vdupq_n_s16(0);
+
+    int16x4_t vs_low = vld1_s16((const uint16_t *)VS);
+    int16x4_t vs_high = vld1_s16((const uint16_t *)VS+4);
+    uint16x4_t vt_low = vld1_u16((const int16_t *)VT);
+    uint16x4_t vt_high = vld1_u16((const int16_t *)VT+4);
+
+    int32x4_t l1 = vmovl_s16(vs_low);
+    int32x4_t h1 = vmovl_s16(vs_high);
+    uint32x4_t l2 = vmovl_u16(vt_low);
+    uint32x4_t h2 = vmovl_u16(vt_high);
+
+    int32x4_t l = vmulq_s32(l1,(int32x4_t)l2);
+    int32x4_t h = vmulq_s32(h1,(int32x4_t)h2);
+
+    int16x8_t vaccl = vcombine_s16(vmovn_s32(l),vmovn_s32(h));
+    int16x8_t vaccm = vcombine_s16(vaddhn_s32(l,zero),vaddhn_s32(h,zero));
+    uint16x8_t uvacch = vcltq_s16(vaccm, zero16);
+
+    vst1q_s16(VACC_L, vaccl);
+    vst1q_s16(VACC_M, vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)uvacch);
+
+    vector_copy(VD, VACC_M); /* no possibilities to clamp */
+
+    return;
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -25,6 +58,7 @@ INLINE static void do_mudm(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] = -(VACC_M[i] < 0);
     vector_copy(VD, VACC_M); /* no possibilities to clamp */
     return;
+#endif
 }
 
 static void VMUDM(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,40 @@
 
 INLINE static void do_mudn(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vd, vs,vt,res,four,vacc_l, vacc_m, vacc_h;
+
+    int32x4_t zero = vdupq_n_s32(0);
+    int16x8_t zero16 = vdupq_n_s16(0);
+
+    uint16x4_t vs_low = vld1_u16((const uint16_t *)VS);
+    uint16x4_t vs_high = vld1_u16((const uint16_t *)VS+4);
+    int16x4_t vt_low = vld1_s16((const int16_t *)VT);
+    int16x4_t vt_high = vld1_s16((const int16_t *)VT+4);
+
+    uint32x4_t l1 = vmovl_u16(vs_low);
+    uint32x4_t h1 = vmovl_u16(vs_high);
+    int32x4_t l2 = vmovl_s16(vt_low);
+    int32x4_t h2 = vmovl_s16(vt_high);
+
+    int32x4_t l = vmulq_s32((int32x4_t)l1,l2);
+    int32x4_t h = vmulq_s32((int32x4_t)h1,h2);
+
+    int16x8_t vaccl = vcombine_s16(vmovn_s32(l),vmovn_s32(h));
+    int16x8_t vaccm = vcombine_s16(vaddhn_s32(l,zero),vaddhn_s32(h,zero));
+    uint16x8_t uvacch = vcltq_s16(vaccm, zero16);
+    vst1q_s16(VACC_L, vaccl);
+    vst1q_s16(VACC_M, vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)uvacch);
+
+    vector_copy(VD, VACC_L); /* no possibilities to clamp */
+
+    return;
+
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -25,6 +59,7 @@ INLINE static void do_mudn(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] = -(VACC_M[i] < 0);
     vector_copy(VD, VACC_L); /* no possibilities to clamp */
     return;
+#endif
 }
 
 static void VMUDN(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -22,11 +22,44 @@
* Wrong: ACC(HI) = -((INT32)(acc) < 0)
* Right: ACC(HI) = -(SEMIFRAC < 0)
*/
#define SEMIFRAC (VS[i]*VT[i]*2/2 + 0x8000/2)
#define SEMIFRAC (VS[i]*VT[i] + 0x4000)
#endif

INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs,vt,res,four,zero,vacc_l, vacc_m, vacc_h;
uint16x8_t cond_u, vacc_m_cond_u,one;

one = vdupq_n_u16(1);
four = vdupq_n_s16(0x4000);
zero = vdupq_n_s16(0);

vs = vld1q_s16((const int16_t *)VS);
vt = vld1q_s16((const int16_t *)VT);

vacc_m = vqrdmulhq_s16(vs, vt);
vacc_l = vmlaq_s16(four, vs,vt);
vacc_l = vshlq_n_s16(vacc_l,1);

cond_u = vceqq_s16(vs,vt);
cond_u = vaddq_u16(cond_u, one);
vacc_m_cond_u = vcltq_s16(vacc_m, zero);
cond_u = vandq_u16(vacc_m_cond_u, cond_u);
vacc_h = vqnegq_s16((int16x8_t)cond_u);

vst1q_s16(VACC_L,vacc_l);
vst1q_s16(VACC_M,vacc_m);
vst1q_s16(VACC_H,vacc_h);

SIGNED_CLAMP_AM(state, VD);
return;

#else

register int i;

for (i = 0; i < N; i++)
@@ -35,7 +68,7 @@ INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_M[i] = (SEMIFRAC << 1) >> 16;
for (i = 0; i < N; i++)
VACC_H[i] = -((VACC_M[i] < 0) & (VS[i] != VT[i])); /* -32768 * -32768 */
#ifndef ARCH_MIN_SSE2
#if !defined ARCH_MIN_SSE2
vector_copy(VD, VACC_M);
for (i = 0; i < N; i++)
VD[i] -= (VACC_M[i] < 0) & (VS[i] == VT[i]); /* ACC b 31 set, min*min */
@@ -43,6 +76,7 @@ INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT)
SIGNED_CLAMP_AM(state, VD);
#endif
return;
#endif
}

static void VMULF(usf_state_t * state, int vd, int vs, int vt, int e)
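The NEON path leans on an identity worth spelling out: with the new SEMIFRAC, (SEMIFRAC << 1) >> 16 equals (2*VS[i]*VT[i] + 0x8000) >> 16, which is exactly the rounding-doubling-high multiply that vqrdmulhq_s16 performs (vqrdmulhq_s16 additionally saturates the -32768 * -32768 lane, which the cond_u/vqnegq_s16 sequence accounts for). A hedged scalar check of the identity, kept in 64 bits to sidestep the overflow corner:

#include <stdint.h>

static int16_t mulf_mid(int16_t vs, int16_t vt)
{
    /* 2*vs*vt + 0x8000 == 2 * (vs*vt + 0x4000) == SEMIFRAC << 1 */
    int64_t acc = 2 * (int64_t)vs * vt + 0x8000;
    return (int16_t)(acc >> 16);   /* the C path's VACC_M[i] */
}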
@@ -22,11 +22,50 @@
* Wrong: ACC(HI) = -((INT32)(acc) < 0)
* Right: ACC(HI) = -(SEMIFRAC < 0)
*/
#define SEMIFRAC (VS[i]*VT[i]*2/2 + 0x8000/2)
#define SEMIFRAC (VS[i]*VT[i]*2/2 + 0x4000)
#endif

INLINE static void do_mulu(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vd, vs,vt,res,four,zero,vacc_l, vacc_m, vacc_h;
uint16x8_t cond_u, vacc_m_cond_u,one;

one = vdupq_n_u16(1);
four = vdupq_n_s16(0x4000);
zero = vdupq_n_s16(0);

vs = vld1q_s16((const int16_t *)VS);
vt = vld1q_s16((const int16_t *)VT);

vacc_m = vqrdmulhq_s16(vs, vt);
vacc_l = vmlaq_s16(four, vs,vt);
vacc_l = vshlq_n_s16(vacc_l,1);

cond_u = vceqq_s16(vs,vt);
cond_u = vaddq_u16(cond_u, one);
vacc_m_cond_u = vcltq_s16(vacc_m, zero);
cond_u = vandq_u16(vacc_m_cond_u, cond_u);
vacc_h = vqnegq_s16((int16x8_t)cond_u);

vst1q_s16(VACC_L,vacc_l);
vst1q_s16(VACC_M,vacc_m);
vst1q_s16(VACC_H,vacc_h);

vd = vacc_m;

uint16x8_t vacc_m_u = vshrq_n_u16((uint16x8_t)vacc_m, 15);
vd = vorrq_s16(vd, (int16x8_t)vacc_m_u);
vd = vbicq_s16(vd, vacc_h);
vst1q_s16(VD, vd);
return;

#else

register int i;

for (i = 0; i < N; i++)
@@ -45,6 +84,7 @@ INLINE static void do_mulu(usf_state_t * state, short* VD, short* VS, short* VT)
VD[i] &= ~(VACC_H[i] >> 0); /* VD &= -(result >= 0x000000000000) */
#endif
return;
#endif
}

static void VMULU(usf_state_t * state, int vd, int vs, int vt, int e)
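The VD computation at the end of the NEON path is easier to audit lane by lane; this is a direct scalar transcription of those four intrinsics (helper name illustrative):

#include <stdint.h>

static int16_t mulu_vd(int16_t vacc_m, int16_t vacc_h)
{
    uint16_t sat = (uint16_t)vacc_m >> 15;   /* vshrq_n_u16((uint16x8_t)vacc_m, 15) */
    int16_t  vd  = vacc_m | (int16_t)sat;    /* vorrq_s16 */
    return (int16_t)(vd & ~vacc_h);          /* vbicq_s16: a VACC_H of -1 zeroes the lane */
}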
@@ -13,14 +13,27 @@
\******************************************************************************/
#include "vu.h"

static INLINE void do_nand(usf_state_t * state, short* VD, short* VS, short* VT)
INLINE void do_nand(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs, vt,vaccl;
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
vaccl = vandq_s16(vs,vt);
vaccl = vmvnq_s16(vaccl);
vst1q_s16(VACC_L, vaccl);
vector_copy(VD, VACC_L);
return;

#else
register int i;

for (i = 0; i < N; i++)
VACC_L[i] = ~(VS[i] & VT[i]);
vector_copy(VD, VACC_L);
return;
#endif
}

static void VNAND(usf_state_t * state, int vd, int vs, int vt, int e)
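All of the bitwise ops in this commit (NAND here, NOR/NXOR/OR/XOR below) share the same shape: load both vectors, combine, optionally invert with vmvnq_s16, store to VACC_L, and mirror VACC_L into VD. One lane of NAND as a scalar reference:

#include <stdint.h>

static int16_t nand_lane(int16_t vs, int16_t vt)
{
    return (int16_t)~(vs & vt);   /* VACC_L[i] = ~(VS[i] & VT[i]); VD copies VACC_L */
}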
@@ -15,6 +15,33 @@

INLINE static void do_ne(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt,vaccl, ne;
int16x8_t zero = vdupq_n_s16(0);
uint16x8_t cond;

vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
ne = vld1q_s16((const int16_t*)state->ne);

cond = vceqq_s16(vs,vt);
cond = vmvnq_u16(cond); // this is needed if you need to do "not-equal"
cond = (uint16x8_t)vnegq_s16((int16x8_t)cond);
uint16x8_t comp = vorrq_u16(cond, (uint16x8_t)ne);

vst1q_s16(state->clip, zero);
vst1q_s16(state->comp, (int16x8_t)cond);

vector_copy(VACC_L, VS);
vector_copy(VD, VACC_L);
vst1q_s16(state->ne, zero);
vst1q_s16(state->co, zero);

return;
#else

register int i;

for (i = 0; i < N; i++)
@@ -35,6 +62,7 @@ INLINE static void do_ne(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = 0;
return;
#endif
}

static void VNE(usf_state_t * state, int vd, int vs, int vt, int e)
@@ -13,14 +13,27 @@
\******************************************************************************/
#include "vu.h"

static INLINE void do_nor(usf_state_t * state, short* VD, short* VS, short* VT)
INLINE void do_nor(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs, vt, vaccl;
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
vaccl = vorrq_s16(vs,vt);
vaccl = vmvnq_s16(vaccl);

vst1q_s16(VACC_L, vaccl);
return;
#else

register int i;

for (i = 0; i < N; i++)
VACC_L[i] = ~(VS[i] | VT[i]);
vector_copy(VD, VACC_L);
return;
#endif
}

static void VNOR(usf_state_t * state, int vd, int vs, int vt, int e)
@@ -13,14 +13,29 @@
\******************************************************************************/
#include "vu.h"

static INLINE void do_nxor(usf_state_t * state, short* VD, short* VS, short* VT)
INLINE void do_nxor(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt,vaccl;
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
vaccl = veorq_s16(vs,vt);
vaccl = vmvnq_s16(vaccl);

vst1q_s16(VACC_L, vaccl);
vector_copy(VD, VACC_L);
return;
#else

register int i;

for (i = 0; i < N; i++)
VACC_L[i] = ~(VS[i] ^ VT[i]);
vector_copy(VD, VACC_L);
return;
#endif
}

static void VNXOR(usf_state_t * state, int vd, int vs, int vt, int e)
@@ -13,14 +13,28 @@
\******************************************************************************/
#include "vu.h"

static INLINE void do_or(usf_state_t * state, short* VD, short* VS, short* VT)
INLINE void do_or(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt,vaccl;
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
vaccl = vorrq_s16(vs,vt);
vst1q_s16(VACC_L, vaccl);
vector_copy(VD, VACC_L);
return;

#else

register int i;

for (i = 0; i < N; i++)
VACC_L[i] = VS[i] | VT[i];
vector_copy(VD, VACC_L);
return;
#endif
}

static void VOR(usf_state_t * state, int vd, int vs, int vt, int e)
@@ -34,13 +34,25 @@ static void VSAR(int vd, int vs, int vt, int e)
if (e > 2)
{
message(state, "VSAR\nInvalid mask.", 2);
#if ARCH_MIN_ARM_NEON
int16x8_t zero = vdupq_n_s16(0);
vst1q_s16(VR[vd], zero);
#else
for (i = 0; i < N; i++)
VR[vd][i] = 0x0000; /* override behavior (zilmar) */
#endif
}
else
{
#if ARCH_MIN_ARM_NEON
vector_copy(VR[vd], VACC[e]);
#else
for (i = 0; i < N; i++)
VR[vd][i] = VACC[e][i];
for (i = 0; i < N; i++)
#endif
}

for (i = 0; i < N; i++)
VACC[e][i] = oldval[i]; /* ... = VS */
return;
}
@@ -59,9 +71,19 @@ static void VSAW(usf_state_t * state, int vd, int vs, int vt, int e)
if (e > 0x2)
{ /* branch very unlikely...never seen a game do VSAW illegally */
message(state, "VSAW\nIllegal mask.", 2);
for (i = 0; i < N; i++)

#if ARCH_MIN_ARM_NEON

int16x8_t zero = vdupq_n_s16(0);
vst1q_s16(state->VR[vd], zero);

#else

for (i = 0; i < N; i++)
state->VR[vd][i] = 0x0000; /* override behavior (zilmar) */
return;

#endif
}
vector_copy(state->VR[vd], state->VACC[e]);
return;
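VSAR reads one 16-bit slice of the 48-bit accumulator back into a vector register and, per the override visible above, writes the saved VS contents into that slice. A hedged scalar sketch of the exchange, assuming the usual 8-lane RSP vector (N == 8):

#include <stdint.h>

static void vsar_exchange(int16_t *vr_vd, int16_t *vacc_e, const int16_t *oldval)
{
    int i;
    for (i = 0; i < 8; i++) {
        vr_vd[i]  = vacc_e[i];     /* vector_copy(VR[vd], VACC[e]) */
        vacc_e[i] = oldval[i];     /* VACC[e][i] = oldval[i];  ... = VS */
    }
}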
@@ -15,6 +15,25 @@

INLINE static void clr_bi(usf_state_t * state, short* VD, short* VS, short* VT)
{ /* clear CARRY and borrow in to accumulators */

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt, zero, co;
zero = vdupq_n_s16(0);
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
co = vld1q_s16((const int16_t*)state->co);

vs = vsubq_s16(vs,vt);
vs = vsubq_s16(vs,co);
vst1q_s16(VACC_L, vs);

SIGNED_CLAMP_SUB(state, VD, VS, VT);
vst1q_s16(state->ne, zero);
vst1q_s16(state->co, zero);
return;
#else

register int i;

for (i = 0; i < N; i++)
@@ -25,6 +44,7 @@ INLINE static void clr_bi(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = 0;
return;
#endif
}

static void VSUB(usf_state_t * state, int vd, int vs, int vt, int e)
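The two vsubq_s16 calls implement subtract-with-borrow-in: the lane-wise CARRY flags from a preceding VSUBC are subtracted along with VT. One lane as a scalar sketch (SIGNED_CLAMP_SUB is the emulator's own clamping helper and is assumed here, not redefined):

#include <stdint.h>

static int16_t vsub_acc_l(int16_t vs, int16_t vt, int16_t co)
{
    return (int16_t)(vs - vt - co);   /* VACC_L before clamping into VD */
}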
@@ -15,6 +15,36 @@

INLINE static void set_bo(usf_state_t * state, short* VD, short* VS, short* VT)
{ /* set CARRY and borrow out from difference */

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt,vaccl,ne, co2;
uint16x8_t cond;

int16x8_t zero = vdupq_n_s16(0);
int16x8_t one = vdupq_n_s16(1);
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);

vaccl = vsubq_s16(vs, vt);
uint16x8_t vdif = vqsubq_u16((uint16x8_t)vs, (uint16x8_t)vt);

vst1q_s16(VACC_L, vaccl);
vector_copy(VD, VACC_L);

cond = vceqq_s16(vs, vt);
ne = vaddq_s16((int16x8_t)cond, one);

vdif = vorrq_u16(vdif,cond);
cond = vceqq_u16(vdif, (uint16x8_t)zero);
co2 = vnegq_s16((int16x8_t)cond);

vst1q_s16(state->ne, ne);
vst1q_s16(state->co, co2);
return;

#else

ALIGNED int32_t dif[N];
register int i;

@@ -28,6 +58,7 @@ INLINE static void set_bo(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = (dif[i] < 0);
return;
#endif
}

static void VSUBC(usf_state_t * state, int vd, int vs, int vt, int e)
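The flag algebra here is compact: vceqq_s16 yields -1 on equal lanes, so adding one turns it into the NOTEQUAL flag (0 when equal, 1 otherwise), and vqsubq_u16 saturates to zero exactly when an unsigned borrow occurs (the OR with the equality mask keeps equal lanes from counting as borrows). A hedged scalar reading, assuming the C path's dif[] widens the operands as unsigned 16-bit values:

#include <stdint.h>

static void vsubc_flags(uint16_t vs, uint16_t vt, int16_t *ne, int16_t *co)
{
    *ne = (vs != vt);   /* vaddq_s16(vceqq_s16(vs, vt), 1) */
    *co = (vs < vt);    /* lanes whose difference borrows; cf. state->co[i] = (dif[i] < 0) */
}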
@@ -54,9 +54,18 @@ static void res_V(usf_state_t * state, int vd, int vs, int vt, int e)
if (vs != vt || vt != e)
return;
message(state, "C2\nRESERVED", 2); /* uncertain how to handle reserved, untested */

#ifdef ARCH_MIN_ARM_NEON
int16x8_t zero = vdupq_n_s16(0);
vst1q_s16(state->VR[vd], zero);
return;
#else

for (i = 0; i < N; i++)
state->VR[vd][i] = 0x0000; /* override behavior (bpoint) */
return;
#endif
}
static void res_M(usf_state_t * state, int vd, int vs, int vt, int e)
{
@@ -13,14 +13,28 @@
\******************************************************************************/
#include "vu.h"

static INLINE void do_xor(usf_state_t * state, short* VD, short* VS, short* VT)
INLINE void do_xor(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt;
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
vs = veorq_s16(vs, vt);
vst1q_s16(VACC_L, vs);
vector_copy(VD, VACC_L);

return;
#else

register int i;

for (i = 0; i < N; i++)
VACC_L[i] = VS[i] ^ VT[i];
vector_copy(VD, VACC_L);
return;
#endif
}

static void VXOR(usf_state_t * state, int vd, int vs, int vt, int e)
@@ -29,6 +29,8 @@
#include <stdint.h>
#include <string.h>

#include "common.h"

#include "alist.h"
#include "arithmetics.h"
#include "audio.h"
@@ -58,12 +60,12 @@ static int16_t* sample(struct hle_t* hle, unsigned pos)

static uint8_t* alist_u8(struct hle_t* hle, uint16_t dmem)
{
return &hle->alist_buffer[dmem ^ S8];
return u8(hle->alist_buffer, dmem);
}

static int16_t* alist_s16(struct hle_t* hle, uint16_t dmem)
{
return (int16_t*)(&hle->alist_buffer[dmem ^ S16]);
return (int16_t*)u16(hle->alist_buffer, dmem);
}

@@ -82,13 +84,13 @@ static void alist_envmix_mix(size_t n, int16_t** dst, const int16_t* gains, int1

static int16_t ramp_step(struct ramp_t* ramp)
{
bool target_reached;

bool target_reached;

ramp->value += ramp->step;

target_reached = (ramp->step <= 0)
? (ramp->value <= ramp->target)
: (ramp->value >= ramp->target);
? (ramp->value <= ramp->target)
: (ramp->value >= ramp->target);

if (target_reached)
{
@@ -184,7 +186,7 @@ void alist_move(struct hle_t* hle, uint16_t dmemo, uint16_t dmemi, uint16_t coun
void alist_copy_every_other_sample(struct hle_t* hle, uint16_t dmemo, uint16_t dmemi, uint16_t count)
{
while (count != 0) {
*(uint16_t*)(alist_u8(hle, dmemo)) = *(uint16_t*)(alist_u8(hle, dmemi));
*alist_s16(hle, dmemo) = *alist_s16(hle, dmemi);
dmemo += 2;
dmemi += 4;
--count;
@@ -373,7 +375,7 @@ void alist_envmix_ge(
const int32_t *rate,
uint32_t address)
{
unsigned k, i, ptr;
unsigned k;
size_t n = (aux) ? 4 : 2;

const int16_t* const in = (int16_t*)(hle->alist_buffer + dmemi);
@@ -390,8 +392,8 @@ void alist_envmix_ge(
ramps[1].value = (vol[1] << 16);
ramps[0].target = (target[0] << 16);
ramps[1].target = (target[1] << 16);
ramps[0].step = rate[0];
ramps[1].step = rate[1];
ramps[0].step = rate[0] / 8;
ramps[1].step = rate[1] / 8;
} else {
memcpy((uint8_t *)save_buffer, (hle->dram + address), 80);
wet = *(int16_t *)(save_buffer + 0); /* 0-1 */
@@ -407,26 +409,23 @@ void alist_envmix_ge(
}

count >>= 1;
for (ptr = 0, k = 0; k < count; k += 8) {
for (k = 0; k < count; k++) {
int16_t gains[4];
int16_t* buffers[4];
int16_t l_vol = ramp_step(&ramps[0]);
int16_t r_vol = ramp_step(&ramps[1]);

gains[0] = clamp_s16((l_vol * dry + 0x4000) >> 15);
gains[1] = clamp_s16((r_vol * dry + 0x4000) >> 15);
gains[2] = clamp_s16((l_vol * wet + 0x4000) >> 15);
gains[3] = clamp_s16((r_vol * wet + 0x4000) >> 15);
buffers[0] = dl + (k^S);
buffers[1] = dr + (k^S);
buffers[2] = wl + (k^S);
buffers[3] = wr + (k^S);

for (i = 0; i < 8; i++) {
buffers[0] = dl + (ptr^S);
buffers[1] = dr + (ptr^S);
buffers[2] = wl + (ptr^S);
buffers[3] = wr + (ptr^S);
gains[0] = clamp_s16((l_vol * dry + 0x4000) >> 15);
gains[1] = clamp_s16((r_vol * dry + 0x4000) >> 15);
gains[2] = clamp_s16((l_vol * wet + 0x4000) >> 15);
gains[3] = clamp_s16((r_vol * wet + 0x4000) >> 15);

alist_envmix_mix(n, buffers, gains, in[ptr^S]);
ptr++;
}
alist_envmix_mix(n, buffers, gains, in[k^S]);
}

*(int16_t *)(save_buffer + 0) = wet; /* 0-1 */
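The envmix rewrite above replaces the blocked loop (one ramp_step per 8-sample block, with an inner i loop reusing the same gains) by a per-sample loop, dividing each ramp step by 8 so the overall envelope slope is unchanged. A small self-contained check of that equivalence (exact only when the rate is a multiple of 8, hence a hedged sketch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t rate = 0x1000, blocked = 0, per_sample = 0;
    int i;
    blocked += rate;              /* old: one step per 8-sample block */
    for (i = 0; i < 8; i++)
        per_sample += rate / 8;   /* new: one step per sample */
    printf("%d %d\n", blocked, per_sample);   /* prints: 4096 4096 */
    return 0;
}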
@@ -29,6 +29,8 @@
#include <stdint.h>
#include <string.h>

#include "common.h"

#include "alist.h"
#include "hle_internal.h"
#include "memory.h"
@@ -52,7 +54,7 @@ static void clear_segments(struct hle_t* hle)
}

/* audio commands definition */
static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}

@@ -142,7 +144,7 @@ static void SETVOL(struct hle_t* hle, uint32_t w1, uint32_t w2)
}
}

static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_audio.loop = get_address(hle, w2);
}
@@ -165,7 +167,7 @@ static void ADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2)
address);
}

static void LOADBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void LOADBUFF(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint32_t address = get_address(hle, w2);

@@ -175,7 +177,7 @@ static void LOADBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_load(hle, hle->alist_audio.in, address, hle->alist_audio.count);
}

static void SAVEBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SAVEBUFF(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint32_t address = get_address(hle, w2);

@@ -220,7 +222,7 @@ static void LOADADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2)
dram_load_u16(hle, (uint16_t*)hle->alist_audio.table, address, align(count, 8) >> 1);
}

static void INTERLEAVE(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void INTERLEAVE(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint16_t left = (w2 >> 16) + DMEM_BASE;
uint16_t right = w2 + DMEM_BASE;
@@ -243,7 +245,7 @@ static void MIXER(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_mix(hle, dmemo, dmemi, align(hle->alist_audio.count, 32), gain);
}

static void SEGMENT(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SEGMENT(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
set_address(hle, w2);
}
@@ -28,12 +28,13 @@
#endif
#include <stdint.h>

#include "common.h"

#include "alist.h"
#include "hle_external.h"
#include "hle_internal.h"
#include "memory.h"

static void MP3(struct hle_t* hle, uint32_t w1, uint32_t w2);
#include "ucodes.h"

enum { NAUDIO_COUNT = 0x170 }; /* ie 184 samples */
enum {
@@ -57,7 +58,7 @@ static void UNKNOWN(struct hle_t* hle, uint32_t w1, uint32_t w2)
}

static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}

@@ -67,10 +68,11 @@ static void NAUDIO_0000(struct hle_t* hle, uint32_t w1, uint32_t w2)
UNKNOWN(hle, w1, w2);
}

static void NAUDIO_02B0(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void NAUDIO_02B0(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint32_t rate = (hle->alist_naudio.rate[1] & 0xffff0000) | (w2 & 0xffff);
hle->alist_naudio.rate[1] = rate;
/* emulate code at 0x12b0 (inside SETVOL), because PC always execute in IMEM */
hle->alist_naudio.rate[1] &= ~0xffff;
hle->alist_naudio.rate[1] |= (w2 & 0xffff);
}

static void NAUDIO_14(struct hle_t* hle, uint32_t w1, uint32_t w2)
@@ -196,7 +198,7 @@ static void DMEMMOVE(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_move(hle, dmemo, dmemi, (count + 3) & ~3);
}

static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_naudio.loop = (w2 & 0xffffff);
}
@@ -241,12 +243,12 @@ static void RESAMPLE(struct hle_t* hle, uint32_t w1, uint32_t w2)
address);
}

static void INTERLEAVE(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void INTERLEAVE(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
alist_interleave(hle, NAUDIO_MAIN, NAUDIO_DRY_LEFT, NAUDIO_DRY_RIGHT, NAUDIO_COUNT);
}

static void MP3ADDY(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void MP3ADDY(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}
@@ -28,6 +28,8 @@
#endif
#include <stdint.h>

#include "common.h"

#include "alist.h"
#include "hle_external.h"
#include "hle_internal.h"
@@ -49,7 +51,7 @@ static void UNKNOWN(struct hle_t* hle, uint32_t w1, uint32_t w2)
}

static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}

@@ -61,7 +63,7 @@ static void LOADADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2)
dram_load_u16(hle, (uint16_t*)hle->alist_nead.table, address, count >> 1);
}

static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_nead.loop = w2 & 0xffffff;
}
@@ -190,7 +192,7 @@ static void ENVSETUP1(struct hle_t* hle, uint32_t w1, uint32_t w2)
hle->alist_nead.env_steps[1] = w2;
}

static void ENVSETUP2(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void ENVSETUP2(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_nead.env_values[0] = (w2 >> 16);
hle->alist_nead.env_values[1] = w2;
@@ -206,6 +208,7 @@ static void ENVMIXER_MK(struct hle_t* hle, uint32_t w1, uint32_t w2)
uint16_t dmem_dr = (w2 >> 12) & 0xff0;
uint16_t dmem_wl = (w2 >> 4) & 0xff0;
uint16_t dmem_wr = (w2 << 4) & 0xff0;

xors[2] = 0; /* unsupported by this ucode */
xors[3] = 0; /* unsupported by this ucode */
xors[0] = 0 - (int16_t)((w1 & 0x2) >> 1);
@@ -233,6 +236,7 @@ static void ENVMIXER(struct hle_t* hle, uint32_t w1, uint32_t w2)
uint16_t dmem_dr = (w2 >> 12) & 0xff0;
uint16_t dmem_wl = (w2 >> 4) & 0xff0;
uint16_t dmem_wr = (w2 << 4) & 0xff0;

xors[2] = 0 - (int16_t)((w1 & 0x8) >> 1);
xors[3] = 0 - (int16_t)((w1 & 0x4) >> 1);
xors[0] = 0 - (int16_t)((w1 & 0x2) >> 1);
@@ -267,7 +271,7 @@ static void INTERL(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_copy_every_other_sample(hle, dmemo, dmemi, count);
}

static void INTERLEAVE_MK(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void INTERLEAVE_MK(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint16_t left = (w2 >> 16);
uint16_t right = w2;
@@ -323,7 +327,7 @@ static void FILTER(struct hle_t* hle, uint32_t w1, uint32_t w2)
}
}

static void SEGMENT(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SEGMENT(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}
@@ -24,13 +24,7 @@

#include <stdint.h>

#ifdef _MSC_VER
#define INLINE __forceinline
#else
#define INLINE inline __attribute__((always_inline))
#endif

INLINE static int16_t clamp_s16(int_fast32_t x)
static inline int16_t clamp_s16(int_fast32_t x)
{
x = (x < INT16_MIN) ? INT16_MIN: x;
x = (x > INT16_MAX) ? INT16_MAX: x;
@@ -23,41 +23,75 @@
#include <stddef.h>
#include <stdint.h>

#include "common.h"

#include "arithmetics.h"

const int16_t RESAMPLE_LUT[64 * 4] = {
0x0c39, 0x66ad, 0x0d46, 0xffdf, 0x0b39, 0x6696, 0x0e5f, 0xffd8,
0x0a44, 0x6669, 0x0f83, 0xffd0, 0x095a, 0x6626, 0x10b4, 0xffc8,
0x087d, 0x65cd, 0x11f0, 0xffbf, 0x07ab, 0x655e, 0x1338, 0xffb6,
0x06e4, 0x64d9, 0x148c, 0xffac, 0x0628, 0x643f, 0x15eb, 0xffa1,
0x0577, 0x638f, 0x1756, 0xff96, 0x04d1, 0x62cb, 0x18cb, 0xff8a,
0x0435, 0x61f3, 0x1a4c, 0xff7e, 0x03a4, 0x6106, 0x1bd7, 0xff71,
0x031c, 0x6007, 0x1d6c, 0xff64, 0x029f, 0x5ef5, 0x1f0b, 0xff56,
0x022a, 0x5dd0, 0x20b3, 0xff48, 0x01be, 0x5c9a, 0x2264, 0xff3a,
0x015b, 0x5b53, 0x241e, 0xff2c, 0x0101, 0x59fc, 0x25e0, 0xff1e,
0x00ae, 0x5896, 0x27a9, 0xff10, 0x0063, 0x5720, 0x297a, 0xff02,
0x001f, 0x559d, 0x2b50, 0xfef4, 0xffe2, 0x540d, 0x2d2c, 0xfee8,
0xffac, 0x5270, 0x2f0d, 0xfedb, 0xff7c, 0x50c7, 0x30f3, 0xfed0,
0xff53, 0x4f14, 0x32dc, 0xfec6, 0xff2e, 0x4d57, 0x34c8, 0xfebd,
0xff0f, 0x4b91, 0x36b6, 0xfeb6, 0xfef5, 0x49c2, 0x38a5, 0xfeb0,
0xfedf, 0x47ed, 0x3a95, 0xfeac, 0xfece, 0x4611, 0x3c85, 0xfeab,
0xfec0, 0x4430, 0x3e74, 0xfeac, 0xfeb6, 0x424a, 0x4060, 0xfeaf,
0xfeaf, 0x4060, 0x424a, 0xfeb6, 0xfeac, 0x3e74, 0x4430, 0xfec0,
0xfeab, 0x3c85, 0x4611, 0xfece, 0xfeac, 0x3a95, 0x47ed, 0xfedf,
0xfeb0, 0x38a5, 0x49c2, 0xfef5, 0xfeb6, 0x36b6, 0x4b91, 0xff0f,
0xfebd, 0x34c8, 0x4d57, 0xff2e, 0xfec6, 0x32dc, 0x4f14, 0xff53,
0xfed0, 0x30f3, 0x50c7, 0xff7c, 0xfedb, 0x2f0d, 0x5270, 0xffac,
0xfee8, 0x2d2c, 0x540d, 0xffe2, 0xfef4, 0x2b50, 0x559d, 0x001f,
0xff02, 0x297a, 0x5720, 0x0063, 0xff10, 0x27a9, 0x5896, 0x00ae,
0xff1e, 0x25e0, 0x59fc, 0x0101, 0xff2c, 0x241e, 0x5b53, 0x015b,
0xff3a, 0x2264, 0x5c9a, 0x01be, 0xff48, 0x20b3, 0x5dd0, 0x022a,
0xff56, 0x1f0b, 0x5ef5, 0x029f, 0xff64, 0x1d6c, 0x6007, 0x031c,
0xff71, 0x1bd7, 0x6106, 0x03a4, 0xff7e, 0x1a4c, 0x61f3, 0x0435,
0xff8a, 0x18cb, 0x62cb, 0x04d1, 0xff96, 0x1756, 0x638f, 0x0577,
0xffa1, 0x15eb, 0x643f, 0x0628, 0xffac, 0x148c, 0x64d9, 0x06e4,
0xffb6, 0x1338, 0x655e, 0x07ab, 0xffbf, 0x11f0, 0x65cd, 0x087d,
0xffc8, 0x10b4, 0x6626, 0x095a, 0xffd0, 0x0f83, 0x6669, 0x0a44,
0xffd8, 0x0e5f, 0x6696, 0x0b39, 0xffdf, 0x0d46, 0x66ad, 0x0c39
(int16_t)0x0c39, (int16_t)0x66ad, (int16_t)0x0d46, (int16_t)0xffdf,
(int16_t)0x0b39, (int16_t)0x6696, (int16_t)0x0e5f, (int16_t)0xffd8,
(int16_t)0x0a44, (int16_t)0x6669, (int16_t)0x0f83, (int16_t)0xffd0,
(int16_t)0x095a, (int16_t)0x6626, (int16_t)0x10b4, (int16_t)0xffc8,
(int16_t)0x087d, (int16_t)0x65cd, (int16_t)0x11f0, (int16_t)0xffbf,
(int16_t)0x07ab, (int16_t)0x655e, (int16_t)0x1338, (int16_t)0xffb6,
(int16_t)0x06e4, (int16_t)0x64d9, (int16_t)0x148c, (int16_t)0xffac,
(int16_t)0x0628, (int16_t)0x643f, (int16_t)0x15eb, (int16_t)0xffa1,
(int16_t)0x0577, (int16_t)0x638f, (int16_t)0x1756, (int16_t)0xff96,
(int16_t)0x04d1, (int16_t)0x62cb, (int16_t)0x18cb, (int16_t)0xff8a,
(int16_t)0x0435, (int16_t)0x61f3, (int16_t)0x1a4c, (int16_t)0xff7e,
(int16_t)0x03a4, (int16_t)0x6106, (int16_t)0x1bd7, (int16_t)0xff71,
(int16_t)0x031c, (int16_t)0x6007, (int16_t)0x1d6c, (int16_t)0xff64,
(int16_t)0x029f, (int16_t)0x5ef5, (int16_t)0x1f0b, (int16_t)0xff56,
(int16_t)0x022a, (int16_t)0x5dd0, (int16_t)0x20b3, (int16_t)0xff48,
(int16_t)0x01be, (int16_t)0x5c9a, (int16_t)0x2264, (int16_t)0xff3a,
(int16_t)0x015b, (int16_t)0x5b53, (int16_t)0x241e, (int16_t)0xff2c,
(int16_t)0x0101, (int16_t)0x59fc, (int16_t)0x25e0, (int16_t)0xff1e,
(int16_t)0x00ae, (int16_t)0x5896, (int16_t)0x27a9, (int16_t)0xff10,
(int16_t)0x0063, (int16_t)0x5720, (int16_t)0x297a, (int16_t)0xff02,
(int16_t)0x001f, (int16_t)0x559d, (int16_t)0x2b50, (int16_t)0xfef4,
(int16_t)0xffe2, (int16_t)0x540d, (int16_t)0x2d2c, (int16_t)0xfee8,
(int16_t)0xffac, (int16_t)0x5270, (int16_t)0x2f0d, (int16_t)0xfedb,
(int16_t)0xff7c, (int16_t)0x50c7, (int16_t)0x30f3, (int16_t)0xfed0,
(int16_t)0xff53, (int16_t)0x4f14, (int16_t)0x32dc, (int16_t)0xfec6,
(int16_t)0xff2e, (int16_t)0x4d57, (int16_t)0x34c8, (int16_t)0xfebd,
(int16_t)0xff0f, (int16_t)0x4b91, (int16_t)0x36b6, (int16_t)0xfeb6,
(int16_t)0xfef5, (int16_t)0x49c2, (int16_t)0x38a5, (int16_t)0xfeb0,
(int16_t)0xfedf, (int16_t)0x47ed, (int16_t)0x3a95, (int16_t)0xfeac,
(int16_t)0xfece, (int16_t)0x4611, (int16_t)0x3c85, (int16_t)0xfeab,
(int16_t)0xfec0, (int16_t)0x4430, (int16_t)0x3e74, (int16_t)0xfeac,
(int16_t)0xfeb6, (int16_t)0x424a, (int16_t)0x4060, (int16_t)0xfeaf,
(int16_t)0xfeaf, (int16_t)0x4060, (int16_t)0x424a, (int16_t)0xfeb6,
(int16_t)0xfeac, (int16_t)0x3e74, (int16_t)0x4430, (int16_t)0xfec0,
(int16_t)0xfeab, (int16_t)0x3c85, (int16_t)0x4611, (int16_t)0xfece,
(int16_t)0xfeac, (int16_t)0x3a95, (int16_t)0x47ed, (int16_t)0xfedf,
(int16_t)0xfeb0, (int16_t)0x38a5, (int16_t)0x49c2, (int16_t)0xfef5,
(int16_t)0xfeb6, (int16_t)0x36b6, (int16_t)0x4b91, (int16_t)0xff0f,
(int16_t)0xfebd, (int16_t)0x34c8, (int16_t)0x4d57, (int16_t)0xff2e,
(int16_t)0xfec6, (int16_t)0x32dc, (int16_t)0x4f14, (int16_t)0xff53,
(int16_t)0xfed0, (int16_t)0x30f3, (int16_t)0x50c7, (int16_t)0xff7c,
(int16_t)0xfedb, (int16_t)0x2f0d, (int16_t)0x5270, (int16_t)0xffac,
(int16_t)0xfee8, (int16_t)0x2d2c, (int16_t)0x540d, (int16_t)0xffe2,
(int16_t)0xfef4, (int16_t)0x2b50, (int16_t)0x559d, (int16_t)0x001f,
(int16_t)0xff02, (int16_t)0x297a, (int16_t)0x5720, (int16_t)0x0063,
(int16_t)0xff10, (int16_t)0x27a9, (int16_t)0x5896, (int16_t)0x00ae,
(int16_t)0xff1e, (int16_t)0x25e0, (int16_t)0x59fc, (int16_t)0x0101,
(int16_t)0xff2c, (int16_t)0x241e, (int16_t)0x5b53, (int16_t)0x015b,
(int16_t)0xff3a, (int16_t)0x2264, (int16_t)0x5c9a, (int16_t)0x01be,
(int16_t)0xff48, (int16_t)0x20b3, (int16_t)0x5dd0, (int16_t)0x022a,
(int16_t)0xff56, (int16_t)0x1f0b, (int16_t)0x5ef5, (int16_t)0x029f,
(int16_t)0xff64, (int16_t)0x1d6c, (int16_t)0x6007, (int16_t)0x031c,
(int16_t)0xff71, (int16_t)0x1bd7, (int16_t)0x6106, (int16_t)0x03a4,
(int16_t)0xff7e, (int16_t)0x1a4c, (int16_t)0x61f3, (int16_t)0x0435,
(int16_t)0xff8a, (int16_t)0x18cb, (int16_t)0x62cb, (int16_t)0x04d1,
(int16_t)0xff96, (int16_t)0x1756, (int16_t)0x638f, (int16_t)0x0577,
(int16_t)0xffa1, (int16_t)0x15eb, (int16_t)0x643f, (int16_t)0x0628,
(int16_t)0xffac, (int16_t)0x148c, (int16_t)0x64d9, (int16_t)0x06e4,
(int16_t)0xffb6, (int16_t)0x1338, (int16_t)0x655e, (int16_t)0x07ab,
(int16_t)0xffbf, (int16_t)0x11f0, (int16_t)0x65cd, (int16_t)0x087d,
(int16_t)0xffc8, (int16_t)0x10b4, (int16_t)0x6626, (int16_t)0x095a,
(int16_t)0xffd0, (int16_t)0x0f83, (int16_t)0x6669, (int16_t)0x0a44,
(int16_t)0xffd8, (int16_t)0x0e5f, (int16_t)0x6696, (int16_t)0x0b39,
(int16_t)0xffdf, (int16_t)0x0d46, (int16_t)0x66ad, (int16_t)0x0c39
};

int32_t rdot(size_t n, const int16_t *x, const int16_t *y)
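The LUT rewrite is behavior-neutral; the likely motivation for the explicit casts is that entries such as 0xffdf have type int with value 65503, which does not fit in int16_t and draws narrowing diagnostics from strict compilers (and is an outright error in C++11 list-initialization). A two-line illustration of the difference:

#include <stdint.h>

static const int16_t with_cast = (int16_t)0xffdf;   /* explicit wrap to -33, no warning */
/* static const int16_t no_cast = 0xffdf; */        /* implementation-defined narrowing,
                                                       typically warned about */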
@@ -29,13 +29,7 @@ extern const int16_t RESAMPLE_LUT[64 * 4];

int32_t rdot(size_t n, const int16_t *x, const int16_t *y);

#ifdef _MSC_VER
#define INLINE __forceinline
#else
#define INLINE inline __attribute__((always_inline))
#endif

INLINE static int16_t adpcm_predict_sample(uint8_t byte, uint8_t mask,
static inline int16_t adpcm_predict_sample(uint8_t byte, uint8_t mask,
unsigned lshift, unsigned rshift)
{
int16_t sample = (uint16_t)(byte & mask) << lshift;
@@ -0,0 +1,39 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Mupen64plus-rsp-hle - common.h *
* Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ *
* Copyright (C) 2014 Bobby Smiles *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#ifndef COMMON_H
#define COMMON_H

/* macro for unused variable warning suppression */
#ifdef __GNUC__
# define UNUSED(x) UNUSED_ ## x __attribute__((__unused__))
#else
# define UNUSED(x) UNUSED_ ## x
#endif

#ifdef _MSC_VER
# define inline __forceinline
#elif defined __GNUC__
# define inline inline __attribute__((always_inline))
#endif

#endif
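A usage sketch for the new header (the handler name is invented for illustration): the macro renames the parameter, so the function body cannot reference it by its original name, and GCC's __unused__ attribute suppresses the -Wunused-parameter warning that the ucode handlers above would otherwise trigger.

#include "common.h"

static void handler(int UNUSED(argument), int used)
{
    /* 'argument' is now spelled UNUSED_argument and tagged unused under GCC */
    (void)used;
}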
@@ -32,6 +32,8 @@
#include <stdio.h>
#endif

#include "common.h"

#include "hle_external.h"
#include "hle_internal.h"
#include "memory.h"
@@ -72,12 +72,6 @@ struct hle_t

/* mp3.c */
uint8_t mp3_buffer[0x1000];
/* FIXME: rewrite mp3 module to avoid these */
uint32_t mp3_inPtr;
uint32_t mp3_outPtr;
uint32_t mp3_t6;
uint32_t mp3_t5;
uint32_t mp3_t4;
};

#endif
@@ -25,6 +25,8 @@
#include <stdint.h>
#include <stdlib.h>

#include "common.h"

#include "arithmetics.h"
#include "hle_external.h"
#include "hle_internal.h"
@@ -21,103 +21,56 @@

#include <string.h>

#include "common.h"

#include "memory.h"

/* Global functions */
void dmem_load_u8(struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count)
void load_u8(uint8_t* dst, const unsigned char* buffer, unsigned address, size_t count)
{
while (count != 0) {
*(dst++) = *dmem_u8(hle, address);
*(dst++) = *u8(buffer, address);
address += 1;
--count;
}
}

void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count)
void load_u16(uint16_t* dst, const unsigned char* buffer, unsigned address, size_t count)
{
while (count != 0) {
*(dst++) = *dmem_u16(hle, address);
*(dst++) = *u16(buffer, address);
address += 2;
--count;
}
}

void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count)
void load_u32(uint32_t* dst, const unsigned char* buffer, unsigned address, size_t count)
{
/* Optimization for uint32_t */
memcpy(dst, dmem_u32(hle, address), count * sizeof(uint32_t));
memcpy(dst, u32(buffer, address), count * sizeof(uint32_t));
}

void dmem_store_u8(struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count)
void store_u8(unsigned char* buffer, unsigned address, const uint8_t* src, size_t count)
{
while (count != 0) {
*dmem_u8(hle, address) = *(src++);
*u8(buffer, address) = *(src++);
address += 1;
--count;
}
}

void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count)
void store_u16(unsigned char* buffer, unsigned address, const uint16_t* src, size_t count)
{
while (count != 0) {
*dmem_u16(hle, address) = *(src++);
*u16(buffer, address) = *(src++);
address += 2;
--count;
}
}

void dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count)
void store_u32(unsigned char* buffer, unsigned address, const uint32_t* src, size_t count)
{
/* Optimization for uint32_t */
memcpy(dmem_u32(hle, address), src, count * sizeof(uint32_t));
}

void dram_load_u8(struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count)
{
while (count != 0) {
*(dst++) = *dram_u8(hle, address);
address += 1;
--count;
}
}

void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count)
{
while (count != 0) {
*(dst++) = *dram_u16(hle, address);
address += 2;
--count;
}
}

void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count)
{
/* Optimization for uint32_t */
memcpy(dst, dram_u32(hle, address), count * sizeof(uint32_t));
}

void dram_store_u8(struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count)
{
while (count != 0) {
*dram_u8(hle, address) = *(src++);
address += 1;
--count;
}
}

void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count)
{
while (count != 0) {
*dram_u16(hle, address) = *(src++);
address += 2;
--count;
}
}

void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count)
{
/* Optimization for uint32_t */
memcpy(dram_u32(hle, address), src, count * sizeof(uint32_t));
memcpy(u32(buffer, address), src, count * sizeof(uint32_t));
}
@@ -57,65 +57,128 @@ enum {
TASK_YIELD_DATA_SIZE = 0xffc
};

#ifdef _MSC_VER
#define INLINE __forceinline
#else
#define INLINE inline __attribute__((always_inline))
#endif

INLINE static unsigned int align(unsigned int x, unsigned amount)
static inline unsigned int align(unsigned int x, unsigned amount)
{
--amount;
return (x + amount) & ~amount;
}

INLINE static uint8_t* const dmem_u8(struct hle_t* hle, uint16_t address)
static inline uint8_t* u8(const unsigned char* buffer, unsigned address)
{
return (uint8_t*)(&hle->dmem[(address & 0xfff) ^ S8]);
return (uint8_t*)(buffer + (address ^ S8));
}

INLINE static uint16_t* const dmem_u16(struct hle_t* hle, uint16_t address)
static inline uint16_t* u16(const unsigned char* buffer, unsigned address)
{
assert((address & 1) == 0);
return (uint16_t*)(&hle->dmem[(address & 0xfff) ^ S16]);
return (uint16_t*)(buffer + (address ^ S16));
}

INLINE static uint32_t* const dmem_u32(struct hle_t* hle, uint16_t address)
static inline uint32_t* u32(const unsigned char* buffer, unsigned address)
{
assert((address & 3) == 0);
return (uint32_t*)(&hle->dmem[(address & 0xfff)]);
return (uint32_t*)(buffer + address);
}

INLINE static uint8_t* const dram_u8(struct hle_t* hle, uint32_t address)
void load_u8 (uint8_t* dst, const unsigned char* buffer, unsigned address, size_t count);
void load_u16(uint16_t* dst, const unsigned char* buffer, unsigned address, size_t count);
void load_u32(uint32_t* dst, const unsigned char* buffer, unsigned address, size_t count);
void store_u8 (unsigned char* buffer, unsigned address, const uint8_t* src, size_t count);
void store_u16(unsigned char* buffer, unsigned address, const uint16_t* src, size_t count);
void store_u32(unsigned char* buffer, unsigned address, const uint32_t* src, size_t count);

/* convenient functions for DMEM access */
static inline uint8_t* dmem_u8(struct hle_t* hle, uint16_t address)
{
return (uint8_t*)&hle->dram[(address & 0xffffff) ^ S8];
return u8(hle->dmem, address & 0xfff);
}

INLINE static uint16_t* const dram_u16(struct hle_t* hle, uint32_t address)
static inline uint16_t* dmem_u16(struct hle_t* hle, uint16_t address)
{
assert((address & 1) == 0);
return (uint16_t*)&hle->dram[(address & 0xffffff) ^ S16];
return u16(hle->dmem, address & 0xfff);
}

INLINE static uint32_t* const dram_u32(struct hle_t* hle, uint32_t address)
static inline uint32_t* dmem_u32(struct hle_t* hle, uint16_t address)
{
assert((address & 3) == 0);
return (uint32_t*)&hle->dram[address & 0xffffff];
return u32(hle->dmem, address & 0xfff);
}

void dmem_load_u8 (struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count);
void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count);
void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count);
void dmem_store_u8 (struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count);
void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count);
void dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count);
static inline void dmem_load_u8(struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count)
{
load_u8(dst, hle->dmem, address & 0xfff, count);
}

void dram_load_u8 (struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count);
void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count);
void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count);
void dram_store_u8 (struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count);
void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count);
void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count);
static inline void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count)
{
load_u16(dst, hle->dmem, address & 0xfff, count);
}

static inline void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count)
{
load_u32(dst, hle->dmem, address & 0xfff, count);
}

static inline void dmem_store_u8(struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count)
{
store_u8(hle->dmem, address & 0xfff, src, count);
}

static inline void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count)
{
store_u16(hle->dmem, address & 0xfff, src, count);
}

static inline void dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count)
{
store_u32(hle->dmem, address & 0xfff, src, count);
}

/* convenient functions DRAM access */
static inline uint8_t* dram_u8(struct hle_t* hle, uint32_t address)
{
return u8(hle->dram, address & 0xffffff);
}

static inline uint16_t* dram_u16(struct hle_t* hle, uint32_t address)
{
return u16(hle->dram, address & 0xffffff);
}

static inline uint32_t* dram_u32(struct hle_t* hle, uint32_t address)
{
return u32(hle->dram, address & 0xffffff);
}

static inline void dram_load_u8(struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count)
{
load_u8(dst, hle->dram, address & 0xffffff, count);
}

static inline void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count)
{
load_u16(dst, hle->dram, address & 0xffffff, count);
}

static inline void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count)
{
load_u32(dst, hle->dram, address & 0xffffff, count);
}

static inline void dram_store_u8(struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count)
{
store_u8(hle->dram, address & 0xffffff, src, count);
}

static inline void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count)
{
store_u16(hle->dram, address & 0xffffff, src, count);
}

static inline void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count)
{
store_u32(hle->dram, address & 0xffffff, src, count);
}

#endif
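The u8()/u16() helpers fold the old per-space XOR swizzles into one place. The XOR exists because the emulator keeps DMEM/DRAM as big-endian 32-bit words on a little-endian host; assuming the usual definitions S8 == 3 and S16 == 2 (both 0 on big-endian builds, values not shown in this diff), byte and halfword accesses flip the low address bits to land on the right host bytes:

#include <stdint.h>

static uint8_t be_u8(const unsigned char *buffer, unsigned address)
{
    return buffer[address ^ 3];                           /* address ^ S8 */
}

static uint16_t be_u16(const unsigned char *buffer, unsigned address)
{
    /* address is asserted even upstream; XOR with 2 stays 2-byte aligned */
    return *(const uint16_t *)(buffer + (address ^ 2));   /* address ^ S16 */
}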
@@ -1,6 +1,7 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Mupen64plus-rsp-hle - ucode3mp3.h *
* Mupen64plus-rsp-hle - mp3.c *
* Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ *
* Copyright (C) 2014 Bobby Smiles *
* Copyright (C) 2009 Richard Goedeken *
* Copyright (C) 2002 Hacktarux *
* *
@@ -23,10 +24,16 @@
#include <string.h>
#include <stdint.h>

#include "hle_external.h"
#include "common.h"

#include "arithmetics.h"
#include "hle_internal.h"
#include "memory.h"

static void InnerLoop(struct hle_t* hle,
uint32_t outPtr, uint32_t inPtr,
uint32_t t6, uint32_t t5, uint32_t t4);

static const uint16_t DeWindowLUT [0x420] = {
0x0000, 0xFFF3, 0x005D, 0xFF38, 0x037A, 0xF736, 0x0B37, 0xC00E,
0x7FFF, 0x3FF2, 0x0B37, 0x08CA, 0x037A, 0x00C8, 0x005D, 0x000D,
@@ -198,10 +205,13 @@ static void MP3AB0(int32_t* v)
}
}

static void InnerLoop(struct hle_t* hle);

void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
{
uint32_t inPtr, outPtr;
uint32_t t6;/* = 0x08A0; - I think these are temporary storage buffers */
uint32_t t5;/* = 0x0AC0; */
uint32_t t4;/* = (w1 & 0x1E); */

/* Initialization Code */
uint32_t readPtr; /* s5 */
uint32_t writePtr; /* s6 */
@@ -209,9 +219,9 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
int cnt, cnt2;

/* I think these are temporary storage buffers */
hle->mp3_t6 = 0x08A0;
hle->mp3_t5 = 0x0AC0;
hle->mp3_t4 = index;
t6 = 0x08A0;
t5 = 0x0AC0;
t4 = index;

writePtr = readPtr = address;
/* Just do that for efficiency... may remove and use directly later anyway */
@@ -222,20 +232,21 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
for (cnt = 0; cnt < 0x480; cnt += 0x180) {
/* DMA: 0xCF0 <- RDRAM[s5] : 0x180 */
memcpy(hle->mp3_buffer + 0xCF0, hle->dram + readPtr, 0x180);
hle->mp3_inPtr = 0xCF0; /* s7 */
hle->mp3_outPtr = 0xE70; /* s3 */
inPtr = 0xCF0; /* s7 */
outPtr = 0xE70; /* s3 */
/* --------------- Inner Loop Start -------------------- */
for (cnt2 = 0; cnt2 < 0x180; cnt2 += 0x40) {
hle->mp3_t6 &= 0xFFE0;
hle->mp3_t5 &= 0xFFE0;
hle->mp3_t6 |= hle->mp3_t4;
hle->mp3_t5 |= hle->mp3_t4;
InnerLoop(hle);
hle->mp3_t4 = (hle->mp3_t4 - 2) & 0x1E;
tmp = hle->mp3_t6;
hle->mp3_t6 = hle->mp3_t5;
hle->mp3_t5 = tmp;
hle->mp3_inPtr += 0x40;
t6 &= 0xFFE0;
t5 &= 0xFFE0;
t6 |= t4;
t5 |= t4;
InnerLoop(hle, outPtr, inPtr, t6, t5, t4);
t4 = (t4 - 2) & 0x1E;
tmp = t6;
t6 = t5;
t5 = tmp;
inPtr += 0x40;
outPtr += 0x40;
}
/* --------------- Inner Loop End -------------------- */
memcpy(hle->dram + writePtr, hle->mp3_buffer + 0xe70, 0x180);
@@ -244,9 +255,9 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
}
}

static void InnerLoop(struct hle_t* hle)
static void InnerLoop(struct hle_t* hle,
uint32_t outPtr, uint32_t inPtr,
uint32_t t6, uint32_t t5, uint32_t t4)
{
/* Part 1: 100% Accurate */

@@ -274,56 +285,56 @@ static void InnerLoop(struct hle_t* hle)
int32_t vt;
int32_t v[32];

v[0] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3E ^ S16));
v[0] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3E ^ S16));
v[0] += v[31];
v[1] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3C ^ S16));
v[1] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3C ^ S16));
v[1] += v[30];
v[2] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x38 ^ S16));
v[2] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x38 ^ S16));
v[2] += v[28];
v[3] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3A ^ S16));
v[3] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3A ^ S16));
v[3] += v[29];

v[4] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x30 ^ S16));
v[4] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x30 ^ S16));
v[4] += v[24];
v[5] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x32 ^ S16));
v[5] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x32 ^ S16));
v[5] += v[25];
v[6] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x36 ^ S16));
v[6] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x36 ^ S16));
v[6] += v[27];
v[7] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x34 ^ S16));
v[7] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x34 ^ S16));
v[7] += v[26];

v[8] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x20 ^ S16));
v[8] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x20 ^ S16));
v[8] += v[16];
v[9] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x22 ^ S16));
v[9] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x22 ^ S16));
v[9] += v[17];
v[10] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x26 ^ S16));
v[10] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x26 ^ S16));
v[10] += v[19];
v[11] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1A ^ S16));
v[18] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x24 ^ S16));
v[11] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1A ^ S16));
v[18] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x24 ^ S16));
v[11] += v[18];

v[12] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x10 ^ S16));
v[23] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2E ^ S16));
v[12] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x10 ^ S16));
v[23] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2E ^ S16));
v[12] += v[23];
v[13] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x12 ^ S16));
v[22] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2C ^ S16));
v[13] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x12 ^ S16));
v[22] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2C ^ S16));
v[13] += v[22];
v[14] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x16 ^ S16));
v[20] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x28 ^ S16));
v[14] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x16 ^ S16));
v[20] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x28 ^ S16));
v[14] += v[20];
v[15] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x14 ^ S16));
v[21] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2A ^ S16));
v[15] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x14 ^ S16));
v[21] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2A ^ S16));
v[15] += v[21];

/* Part 2-4 */
@@ -332,10 +343,10 @@ static void InnerLoop(struct hle_t* hle)

/* Part 5 - 1-Wide Butterflies - 100% Accurate but need SSVs!!! */

t0 = hle->mp3_t6 + 0x100;
t1 = hle->mp3_t6 + 0x200;
t2 = hle->mp3_t5 + 0x100;
t3 = hle->mp3_t5 + 0x200;
t0 = t6 + 0x100;
t1 = t6 + 0x200;
t2 = t5 + 0x100;
t3 = t5 + 0x200;

/* 0x13A8 */
v[1] = 0;
@@ -344,14 +355,14 @@ static void InnerLoop(struct hle_t* hle)
v[16] = -v[16] - v[17];
v[2] = v[18] + v[19];
/* ** Store v[11] -> (T6 + 0)** */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x0))) = (short)v[11];
*(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x0))) = (short)v[11];

v[11] = -v[11];
/* ** Store v[16] -> (T3 + 0)** */
*(int16_t *)(hle->mp3_buffer + ((t3 + (short)0x0))) = (short)v[16];
/* ** Store v[11] -> (T5 + 0)** */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x0))) = (short)v[11];
*(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x0))) = (short)v[11];
/* 0x13E8 - Verified.... */
v[2] = -v[2];
/* ** Store v[2] -> (T2 + 0)** */
@@ -398,7 +409,7 @@ static void InnerLoop(struct hle_t* hle)
v[17] = v[13] - v[10];
v[9] = v[9] + v[14];
/* ** Store v[9] -> (T6 + 0x40) */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x40))) = (short)v[9];
*(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x40))) = (short)v[9];
v[11] = v[11] - v[13];
/* ** Store v[17] -> (T0 + 0xFFC0) */
*(int16_t *)(hle->mp3_buffer + ((t0 + (short)0xFFC0))) = (short)v[17];
@@ -414,63 +425,63 @@ static void InnerLoop(struct hle_t* hle)
/* ** Store v[8] -> (T3 + 0xFFC0) */
*(int16_t *)(hle->mp3_buffer + ((t3 + (short)0xFFC0))) = (short)v[8];
/* ** Store v[14] -> (T5 + 0x40) */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x40))) = (short)v[14];
*(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x40))) = (short)v[14];
/* ** Store v[10] -> (T2 + 0xFFC0) */
*(int16_t *)(hle->mp3_buffer + ((t2 + (short)0xFFC0))) = (short)v[10];
/* 0x14FC - Verified... */

/* Part 6 - 100% Accurate */

v[0] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3E ^ S16));
v[0] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3E ^ S16));
v[0] -= v[31];
v[1] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3C ^ S16));
v[1] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3C ^ S16));
v[1] -= v[30];
v[2] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x38 ^ S16));
v[2] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x38 ^ S16));
v[2] -= v[28];
v[3] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3A ^ S16));
v[3] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3A ^ S16));
v[3] -= v[29];

v[4] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x30 ^ S16));
v[4] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x30 ^ S16));
v[4] -= v[24];
v[5] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x32 ^ S16));
v[5] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x32 ^ S16));
v[5] -= v[25];
v[6] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x36 ^ S16));
v[6] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x36 ^ S16));
v[6] -= v[27];
v[7] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x34 ^ S16));
v[7] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x34 ^ S16));
v[7] -= v[26];

v[8] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x20 ^ S16));
v[8] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x20 ^ S16));
v[8] -= v[16];
v[9] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x22 ^ S16));
v[9] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x22 ^ S16));
v[9] -= v[17];
v[10] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x26 ^ S16));
v[10] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x26 ^ S16));
|
||||
v[10] -= v[19];
|
||||
v[11] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1A ^ S16));
|
||||
v[18] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x24 ^ S16));
|
||||
v[11] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1A ^ S16));
|
||||
v[18] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x24 ^ S16));
|
||||
v[11] -= v[18];
|
||||
|
||||
v[12] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x10 ^ S16));
|
||||
v[23] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2E ^ S16));
|
||||
v[12] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x10 ^ S16));
|
||||
v[23] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2E ^ S16));
|
||||
v[12] -= v[23];
|
||||
v[13] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x12 ^ S16));
|
||||
v[22] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2C ^ S16));
|
||||
v[13] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x12 ^ S16));
|
||||
v[22] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2C ^ S16));
|
||||
v[13] -= v[22];
|
||||
v[14] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x16 ^ S16));
|
||||
v[20] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x28 ^ S16));
|
||||
v[14] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x16 ^ S16));
|
||||
v[20] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x28 ^ S16));
|
||||
v[14] -= v[20];
|
||||
v[15] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x14 ^ S16));
|
||||
v[21] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2A ^ S16));
|
||||
v[15] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x14 ^ S16));
|
||||
v[21] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2A ^ S16));
|
||||
v[15] -= v[21];
|
||||
|
||||
for (i = 0; i < 16; i++)
|
||||
|
@@ -516,10 +527,10 @@ static void InnerLoop(struct hle_t* hle)
         v[14] = v[6] - v[14];
         v[15] = (((v[30] - v[31]) * 0x5A827) >> 0x10) - v[7];
         /* Store v14 -> (T5 + 0x20) */
-        *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x20))) = (short)v[14];
+        *(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x20))) = (short)v[14];
         v[14] = v[14] + v[1];
         /* Store v[14] -> (T6 + 0x20) */
-        *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x20))) = (short)v[14];
+        *(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x20))) = (short)v[14];
         /* Store v[15] -> (T1 + 0xFFE0) */
         *(int16_t *)(hle->mp3_buffer + ((t1 + (short)0xFFE0))) = (short)v[15];
         v[9] = v[9] + v[10];

@@ -527,7 +538,7 @@ static void InnerLoop(struct hle_t* hle)
         v[6] = v[10] - v[6];
         v[1] = v[9] - v[1];
         /* Store v[6] -> (T5 + 0x60) */
-        *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x60))) = (short)v[6];
+        *(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x60))) = (short)v[6];
         v[10] = v[10] + v[2];
         v[10] = v[4] - v[10];
         /* Store v[10] -> (T2 + 0xFFA0) */

@@ -547,7 +558,7 @@ static void InnerLoop(struct hle_t* hle)
         *(int16_t *)(hle->mp3_buffer + ((t1 + (short)0xFFA0))) = (short)v[7];
         v[11] = v[11] - v[3];
         /* Store v[1] -> (T6 + 0x60) */
-        *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x60))) = (short)v[1];
+        *(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x60))) = (short)v[1];
         v[11] = v[11] - v[5];
         /* Store v[11] -> (T0 + 0x60) */
         *(int16_t *)(hle->mp3_buffer + ((t0 + (short)0x60))) = (short)v[11];

@@ -564,9 +575,9 @@ static void InnerLoop(struct hle_t* hle)

         /* Step 8 - Dewindowing */

-        addptr = hle->mp3_t6 & 0xFFE0;
+        addptr = t6 & 0xFFE0;

-        offset = 0x10 - (hle->mp3_t4 >> 1);
+        offset = 0x10 - (t4 >> 1);
         for (x = 0; x < 8; x++) {
             int32_t v0;
             int32_t v18;

@@ -585,14 +596,14 @@ static void InnerLoop(struct hle_t* hle)
             /* Clamp(v0); */
             /* Clamp(v18); */
             /* clamp??? */
-            *(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v0;
-            *(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 2) ^ S16)) = v18;
-            hle->mp3_outPtr += 4;
+            *(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v0;
+            *(int16_t *)(hle->mp3_buffer + ((outPtr + 2) ^ S16)) = v18;
+            outPtr += 4;
             addptr += 0x30;
             offset += 0x38;
         }

-        offset = 0x10 - (hle->mp3_t4 >> 1) + 8 * 0x40;
+        offset = 0x10 - (t4 >> 1) + 8 * 0x40;
         v2 = v4 = 0;
         for (i = 0; i < 4; i++) {
             v2 += ((int)*(int16_t *)(hle->mp3_buffer + (addptr) + 0x00) * (short)DeWindowLUT[offset + 0x00] + 0x4000) >> 0xF;

@@ -606,12 +617,12 @@ static void InnerLoop(struct hle_t* hle)
         }
         mult6 = *(int32_t *)(hle->mp3_buffer + 0xCE8);
         mult4 = *(int32_t *)(hle->mp3_buffer + 0xCEC);
-        if (hle->mp3_t4 & 0x2) {
+        if (t4 & 0x2) {
             v2 = (v2 * *(uint32_t *)(hle->mp3_buffer + 0xCE8)) >> 0x10;
-            *(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v2;
+            *(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v2;
         } else {
             v4 = (v4 * *(uint32_t *)(hle->mp3_buffer + 0xCE8)) >> 0x10;
-            *(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v4;
+            *(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v4;
             mult4 = *(uint32_t *)(hle->mp3_buffer + 0xCE8);
         }
         addptr -= 0x50;

@@ -621,7 +632,7 @@ static void InnerLoop(struct hle_t* hle)
             int32_t v18;
             v2 = v4 = v6 = v8 = 0;

-            offset = (0x22F - (hle->mp3_t4 >> 1) + x * 0x40);
+            offset = (0x22F - (t4 >> 1) + x * 0x40);

             for (i = 0; i < 4; i++) {
                 v2 += ((int)*(int16_t *)(hle->mp3_buffer + (addptr) + 0x20) * (short)DeWindowLUT[offset + 0x00] + 0x4000) >> 0xF;

@@ -640,13 +651,13 @@ static void InnerLoop(struct hle_t* hle)
             /* Clamp(v0); */
             /* Clamp(v18); */
             /* clamp??? */
-            *(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 2) ^ S16)) = v0;
-            *(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 4) ^ S16)) = v18;
-            hle->mp3_outPtr += 4;
+            *(int16_t *)(hle->mp3_buffer + ((outPtr + 2) ^ S16)) = v0;
+            *(int16_t *)(hle->mp3_buffer + ((outPtr + 4) ^ S16)) = v18;
+            outPtr += 4;
             addptr -= 0x50;
         }

-        tmp = hle->mp3_outPtr;
+        tmp = outPtr;
         hi0 = mult6;
         hi1 = mult4;
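The multiply-accumulate lines above, of the shape ((int)*(int16_t *)(...) * (short)DeWindowLUT[...] + 0x4000) >> 0xF, are the usual Q15 fixed-point idiom: widen to 32 bits, multiply, add half an LSB (0x4000) to round to nearest, then shift back down by 15. A self-contained illustration, not project code:

#include <stdint.h>

/* Round-to-nearest Q15 multiply, the same shape as the DeWindowLUT
   accumulation in the loops above. */
static int16_t q15_mul(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b + 0x4000) >> 0xF);
}

/* Example: q15_mul(0x4000, 0x2000) == 0x1000,
   i.e. 0.5 * 0.25 == 0.125 in Q15. */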
@@ -655,43 +666,20 @@ static void InnerLoop(struct hle_t* hle)
         for (i = 0; i < 8; i++) {
             /* v0 */
             vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x40) ^ S16)) * hi0);
-            if (vt > 32767) {
-                vt = 32767;
-            } else {
-                if (vt < -32767)
-                    vt = -32767;
-            }
-            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x40) ^ S16)) = (int16_t)vt;
+            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x40) ^ S16)) = clamp_s16(vt);

             /* v17 */
             vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x30) ^ S16)) * hi0);
-            if (vt > 32767) {
-                vt = 32767;
-            } else {
-                if (vt < -32767)
-                    vt = -32767;
-            }
-            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x30) ^ S16)) = vt;
+            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x30) ^ S16)) = clamp_s16(vt);

             /* v2 */
             vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x1E) ^ S16)) * hi1);
-            if (vt > 32767) {
-                vt = 32767;
-            } else {
-                if (vt < -32767)
-                    vt = -32767;
-            }
-            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x1E) ^ S16)) = vt;
+            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x1E) ^ S16)) = clamp_s16(vt);

             /* v4 */
             vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0xE) ^ S16)) * hi1);
-            if (vt > 32767) {
-                vt = 32767;
-            } else {
-                if (vt < -32767)
-                    vt = -32767;
-            }
-            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0xE) ^ S16)) = vt;
+            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0xE) ^ S16)) = clamp_s16(vt);

             tmp += 2;
         }
     }
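The hunk above replaces four open-coded saturation blocks with calls to clamp_s16(). The helper itself is not part of this diff; a minimal sketch of what such a function looks like follows, assuming it saturates to the full int16_t range. Note the open-coded version it replaces pinned the low end at -32767, so the exact lower bound is worth checking in the project's own definition.

#include <stdint.h>

/* Hedged sketch of a 16-bit saturating clamp; the project's actual
   definition (and its exact lower bound) lives elsewhere in the tree. */
static int16_t clamp_s16(int32_t x)
{
    if (x > INT16_MAX)
        return INT16_MAX;
    if (x < INT16_MIN)
        return INT16_MIN;
    return (int16_t)x;
}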
@@ -28,6 +28,8 @@
 #include <string.h>
+#include <stddef.h>

+#include "common.h"

 #include "arithmetics.h"
 #include "audio.h"
 #include "hle_external.h"

@@ -436,20 +438,16 @@ static void init_subframes_v1(musyx_t *musyx)
 static void init_subframes_v2(musyx_t *musyx)
 {
+    unsigned i,k;
+    int16_t values[4];
+    int16_t* subframes[4];

-    int16_t values[4] = {
-        clamp_s16(musyx->base_vol[0]),
-        clamp_s16(musyx->base_vol[1]),
-        clamp_s16(musyx->base_vol[2]),
-        clamp_s16(musyx->base_vol[3])
-    };
+    for(k = 0; k < 4; ++k)
+        values[k] = clamp_s16(musyx->base_vol[k]);

-    int16_t* subframes[4] = {
-        musyx->left,
-        musyx->right,
-        musyx->cc0,
-        musyx->e50
-    };
+    subframes[0] = musyx->left;
+    subframes[1] = musyx->right;
+    subframes[2] = musyx->cc0;
+    subframes[3] = musyx->e50;

     for (i = 0; i < SUBFRAME_SIZE; ++i) {

@@ -855,7 +853,7 @@ static void sfx_stage(struct hle_t* hle, mix_sfx_with_main_subframes_t mix_sfx_w
 }

 static void mix_sfx_with_main_subframes_v1(musyx_t *musyx, const int16_t *subframe,
-                                           const uint16_t* gains)
+                                           const uint16_t* UNUSED(gains))
 {
     unsigned i;
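The gains parameter in the hunk above is wrapped in UNUSED(), presumably the macro the newly added common.h include provides. The definition below is a common pattern for such a macro and is an assumption, not this project's verbatim code:

/* Hypothetical UNUSED() parameter macro: keeps the parameter in the
   signature while silencing unused-parameter warnings under GCC/Clang,
   and degrades to a no-op elsewhere. */
#ifdef __GNUC__
#define UNUSED(x) x __attribute__((unused))
#else
#define UNUSED(x) x
#endif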
@@ -47,6 +47,12 @@ int usf_upload_section(void * state, const uint8_t * data, size_t size);
    Requesting zero samples with a null pointer is an acceptable way to
    force at least one block of samples to render and return the current
    sample rate in the variable passed in.
+   Requesting a non-zero number of samples with a null buffer pointer will
+   result in exactly count samples being rendered and discarded.
+   Emulation runs in whole blocks until there have been exactly enough
+   Audio Interface DMA transfers to at least fill count samples, at which
+   point the remainder is buffered in the emulator state until the next
+   usf_render() call.
    Returns 0 on success, or a pointer to the last error message on failure. */
 const char * usf_render(void * state, int16_t * buffer, size_t count, int32_t * sample_rate);
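Putting the documented contract together, a hedged usage sketch follows. It assumes state already holds a loaded USF (fed through usf_upload_section(), declared above) and that the buffer is interleaved stereo; both are assumptions from context, not guarantees made by this header.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Hedged sketch of the three documented usf_render() calling patterns. */
static int render_some(void * state)
{
    int32_t sample_rate = 0;
    int16_t pcm[2 * 1024];       /* interleaved stereo is an assumption */
    const char * err;

    /* Zero samples, null buffer: run one block and learn the rate. */
    err = usf_render(state, NULL, 0, &sample_rate);
    if (err) { fprintf(stderr, "usf: %s\n", err); return -1; }

    /* Non-zero count, null buffer: render and discard (e.g. seeking). */
    err = usf_render(state, NULL, 1024, &sample_rate);
    if (err) { fprintf(stderr, "usf: %s\n", err); return -1; }

    /* Normal call: fill pcm with 1024 samples. */
    err = usf_render(state, pcm, 1024, &sample_rate);
    if (err) { fprintf(stderr, "usf: %s\n", err); return -1; }
    return 0;
}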
@@ -77,10 +77,20 @@ struct usf_state
     int16_t * sample_buffer;

     // audio.c
+    // SampleRate is usually guaranteed to stay the same for the duration
+    // of a given track, and depends on the game.
     int32_t SampleRate;
+    // Audio is rendered in whole Audio Interface DMA transfers, which are
+    // then copied directly to the caller's buffer. Any leftover samples
+    // from the last DMA transfer that fills the caller's buffer will be
+    // stored here until the next call to usf_render().
     int16_t samplebuf[16384];
     size_t samples_in_buffer;

+    // This buffer does not really need to be that large, as it is likely
+    // to only accumulate a handful of error messages, at which point
+    // emulation is immediately halted and the messages are returned to
+    // the caller.
     const char * last_error;
     char error_message[1024];
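To make the new buffering comment concrete, here is a minimal sketch of how the samplebuf/samples_in_buffer pair might be drained at the start of the next usf_render() call. The helper name and the stereo (two int16_t per sample) layout are illustrative assumptions, not code from this repository.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Hedged sketch: copy up to want_frames leftover frames to the caller,
   then shift any remainder to the front for the call after this one. */
static size_t drain_leftover(struct usf_state * state, int16_t * out,
                             size_t want_frames)
{
    size_t take = state->samples_in_buffer < want_frames
                ? state->samples_in_buffer : want_frames;

    memcpy(out, state->samplebuf, take * 2 * sizeof(int16_t));
    state->samples_in_buffer -= take;
    memmove(state->samplebuf, state->samplebuf + take * 2,
            state->samples_in_buffer * 2 * sizeof(int16_t));
    return take;
}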
@@ -1030,6 +1030,8 @@ static int usf_info(void * context, const char * name, const char * value)
         sampleRate = samplerate;

         usfRemoveSilence = YES;
+
+        silence_seconds = 10;
     }
     else if ( type == 0x22 )
     {
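The silence_seconds value set above presumably feeds a consecutive-silence counter somewhere in the playback path; that code is not part of this diff. A hypothetical sketch of such a cutoff, with invented names throughout:

#include <stdint.h>
#include <stddef.h>

/* Hypothetical: count consecutive all-zero stereo frames and report once
   they span silence_seconds worth of audio. */
static size_t silent_run = 0;

static int hit_silence_limit(const int16_t * pcm, size_t frames,
                             int32_t sample_rate, int silence_seconds)
{
    size_t limit = (size_t)silence_seconds * (size_t)sample_rate;
    size_t i;
    for (i = 0; i < frames; ++i) {
        if (pcm[2 * i] == 0 && pcm[2 * i + 1] == 0)
            ++silent_run;
        else
            silent_run = 0;
        if (silent_run >= limit)
            return 1; /* treat as end of track */
    }
    return 0;
}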