Update LazyUSF and increase the silence detection threshold for USF files to 10 seconds, which fixes Majora's Mask - Staff Roll
parent ef2c8efdf9
commit 0fb8aa57bb
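The threshold itself lives in the player's silence-detection logic rather than in the emulator sources patched below. As a rough, hypothetical sketch only (names and structure assumed, not taken from this commit), a 10-second silence gate over interleaved 16-bit stereo output could look like:

/* Illustrative sketch, not the actual player code: counts consecutive
 * all-zero output frames and trips once 10 seconds' worth accumulates. */
#include <stddef.h>
#include <stdint.h>

#define SILENCE_SECONDS 10 /* long enough to survive the quiet gaps in
                              Majora's Mask - Staff Roll */

static size_t silent_frames = 0;

static int silence_exceeded(const int16_t *buf, size_t frames, unsigned sample_rate)
{
    size_t i;
    for (i = 0; i < frames * 2; i++) { /* stereo: two samples per frame */
        if (buf[i] != 0) {
            silent_frames = 0; /* any audible sample resets the window */
            return 0;
        }
    }
    silent_frames += frames;
    return silent_frames >= (size_t)sample_rate * SILENCE_SECONDS;
}
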
@@ -191,7 +191,7 @@ int32_t r4300i_LB_NonMemory ( usf_state_t * state, uint32_t PAddr, uint32_t * Va
 }
 
 uint32_t r4300i_LB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t * Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
     if (address == 0) { return 0; }
     *Value = *(uint8_t *)(address + (VAddr ^ 3));
@@ -199,7 +199,7 @@ uint32_t r4300i_LB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t * Value
 }
 
 uint32_t r4300i_LD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t * Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
     if (address == 0) { return 0; }
     *((uint32_t *)(Value) + 1) = *(uint32_t *)(address + VAddr);
@@ -220,7 +220,7 @@ int32_t r4300i_LH_NonMemory ( usf_state_t * state, uint32_t PAddr, uint32_t * Va
 }
 
 uint32_t r4300i_LH_VAddr ( usf_state_t * state, uint32_t VAddr, uint16_t * Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
     if (address == 0)
         return 0;
@@ -426,7 +426,7 @@ int32_t r4300i_SB_NonMemory ( usf_state_t * state, uint32_t PAddr, uint8_t Value
 }
 
 uint32_t r4300i_SB_VAddr ( usf_state_t * state, uint32_t VAddr, uint8_t Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
 
     if (address == 0) { return 0; }
@@ -457,7 +457,7 @@ int32_t r4300i_SH_NonMemory ( usf_state_t * state, uint32_t PAddr, uint16_t Valu
 }
 
 uint32_t r4300i_SD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
     if (address == 0) { return 0; }
     *(uint32_t *)(address + VAddr) = *((uint32_t *)(&Value) + 1);
@@ -466,7 +466,7 @@ uint32_t r4300i_SD_VAddr ( usf_state_t * state, uint32_t VAddr, uint64_t Value )
 }
 
 uint32_t r4300i_SH_VAddr ( usf_state_t * state, uint32_t VAddr, uint16_t Value ) {
-    uintptr_t address;
+    uint32_t address;
     address = state->TLB_Map[VAddr >> 12];
 
     if (address == 0) { return 0; }

@@ -22,6 +22,9 @@ NOINLINE void run_task(usf_state_t * state)
 {
     register int PC;
     int wrap_count = 0;
+#ifdef SP_EXECUTE_LOG
+    int last_PC;
+#endif
 
     if (CFG_WAIT_FOR_CPU_HOST != 0)
     {
@@ -36,6 +39,9 @@ NOINLINE void run_task(usf_state_t * state)
         register uint32_t inst;
 
         inst = *(uint32_t *)(state->IMEM + FIT_IMEM(PC));
+#ifdef SP_EXECUTE_LOG
+        last_PC = PC;
+#endif
 #ifdef EMULATE_STATIC_PC
         PC = (PC + 0x004);
         if ( FIT_IMEM(PC) == 0 && ++wrap_count == 32 )
@@ -46,7 +52,7 @@ NOINLINE void run_task(usf_state_t * state)
 EX:
 #endif
 #ifdef SP_EXECUTE_LOG
-        step_SP_commands(inst);
+        step_SP_commands(state, last_PC, inst);
 #endif
         if (inst >> 25 == 0x25) /* is a VU instruction */
         {
@@ -463,6 +469,9 @@ EX:
         continue;
 BRANCH:
         inst = *(uint32_t *)(state->IMEM + FIT_IMEM(PC));
+#ifdef SP_EXECUTE_LOG
+        last_PC = PC;
+#endif
         PC = state->temp_PC & 0x00000FFC;
         goto EX;
 #endif

@@ -58,12 +58,10 @@ NOINLINE static void message(usf_state_t * state, const char* body, int priority
 */
 #define CHARACTERS_PER_LINE (80)
 /* typical standard DOS text file limit per line */
-#if 0
 NOINLINE static void update_conf(const char* source)
 {
     (void)source;
 }
-#endif
 
 #ifdef SP_EXECUTE_LOG
 extern void step_SP_commands(usf_state_t * state, int PC, uint32_t inst);

@@ -180,7 +180,6 @@ void set_VCC(usf_state_t * state, unsigned short VCC)
         state->clip[i] = (VCC >> (i + 0x8)) & 1;
     return; /* Little endian becomes big. */
 }
-#if 0
 void set_VCE(usf_state_t * state, unsigned char VCE)
 {
     register int i;
@@ -190,4 +189,3 @@ void set_VCE(usf_state_t * state, unsigned char VCE)
     return; /* Little endian becomes big. */
 }
 #endif
-#endif

@@ -42,6 +42,22 @@
 static INLINE void merge(short* VD, short* cmp, short* pass, short* fail)
 {
     register int i;
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t p,f,d,c,vd,temp;
+
+    p = vld1q_s16((const int16_t*)pass);
+    f = vld1q_s16((const int16_t*)fail);
+    c = vld1q_s16((const int16_t*)cmp);
+
+    d = vsubq_s16(p,f);
+    vd = vmlaq_s16(f, c, d); //vd = f + (cmp * d)
+    vst1q_s16(VD, vd);
+    return;
+
+#else
+
 #if (0)
     /* Do not use this version yet, as it still does not vectorize to SSE2. */
     for (i = 0; i < N; i++)
@@ -55,9 +71,92 @@ static INLINE void merge(short* VD, short* cmp, short* pass, short* fail)
         VD[i] = fail[i] + cmp[i]*diff[i]; /* actually `(cmp[i] != 0)*diff[i]` */
 #endif
     return;
+
+#endif
 }
 
 #ifndef ARCH_MIN_SSE2
+#ifdef ARCH_MIN_ARM_NEON
+static INLINE void vector_copy(short * VD, short * VS)
+{
+    int16x8_t xmm;
+    xmm = vld1q_s16((const int16_t*)VS);
+    vst1q_s16(VD, xmm);
+
+    return;
+}
+
+static INLINE void SIGNED_CLAMP_ADD(usf_state_t * state, short* VD, short* VS, short* VT)
+{
+    int16x8_t dst, src, vco, max, min;
+
+    src = vld1q_s16((const int16_t*)VS);
+    dst = vld1q_s16((const int16_t*)VT);
+    vco = vld1q_s16((const int16_t*)state->co);
+
+    max = vmaxq_s16(dst, src);
+    min = vminq_s16(dst, src);
+
+    min = vqaddq_s16(min, vco);
+    max = vqaddq_s16(max, min);
+
+    vst1q_s16(VD, max);
+    return;
+
+}
+
+static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, short* VT)
+{
+    int16x8_t dst, src, vco, dif, res, xmm,vd;
+
+    src = vld1q_s16((const int16_t*)VS);
+    vd = vld1q_s16((const int16_t*)VD);
+    dst = vld1q_s16((const int16_t*)VT);
+    vco = vld1q_s16((const int16_t*)state->co);
+
+    res = vqsubq_s16(src, dst);
+
+    dif = vaddq_s16(res, vco);
+    dif = veorq_s16(dif, res);
+    dif = vandq_s16(dif, dst);
+    xmm = vsubq_s16(src, dst);
+    src = vbicq_s16(dif, src);
+    xmm = vandq_s16(xmm, src);
+    xmm = vshrq_n_s16(xmm, 15);
+
+    xmm = vbicq_s16(vco, xmm);
+    res = vqsubq_s16(res, xmm);
+    vst1q_s16(VD, res);
+
+    return;
+
+}
+
+static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
+{
+    int16x8_t pvs, pvd;
+    int16x8x2_t packed;
+    int16x8_t result;
+    int16x4_t low, high;
+
+    pvs = vld1q_s16((const int16_t*)VACC_H);
+    pvd = vld1q_s16((const int16_t*)VACC_M);
+
+    packed = vzipq_s16(pvd,pvs);
+
+    low = vqmovn_s32((int32x4_t)packed.val[0]);
+    high = vqmovn_s32((int32x4_t)packed.val[1]);
+
+    result = vcombine_s16(low,high);
+
+    vst1q_s16(VD,result);
+
+    return;
+}
+
+#endif
+
+#if !defined ARCH_MIN_SSE2 && !defined ARCH_MIN_ARM_NEON
+
 static INLINE void vector_copy(short* VD, short* VS)
 {
 #if (0)
@@ -92,6 +191,8 @@ static INLINE void SIGNED_CLAMP_ADD(usf_state_t * state, short* VD, short* VS, s
         VD[i] ^= 0x8000 & (hi[i] | lo[i]);
     return;
 }
+
+
 static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, short* VT)
 {
     ALIGNED int32_t dif[N];
@@ -113,8 +214,9 @@ static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, s
         VD[i] ^= 0x8000 & (hi[i] | lo[i]);
     return;
 }
+
 static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
-{ /* typical sign-clamp of accumulator-mid (bits 31:16) */
+{
     ALIGNED short hi[N], lo[N];
     register int i;
 
@@ -135,7 +237,9 @@ static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
         VD[i] ^= 0x8000 * (hi[i] | lo[i]);
     return;
 }
-#else
+#endif
+
+#ifdef ARCH_MIN_SSE2
 /*
  * We actually need to write explicit SSE2 code for this because GCC 4.8.1
  * (and possibly later versions) has a code generation bug with vectorizing
@@ -225,10 +329,29 @@
 
 static INLINE void UNSIGNED_CLAMP(usf_state_t * state, short* VD)
 { /* sign-zero hybrid clamp of accumulator-mid (bits 31:16) */
+
     ALIGNED short cond[N];
     ALIGNED short temp[N];
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x8_t c;
+    int16x8_t t = vld1q_s16((const int16_t*)temp);
+    int16x8_t vaccm = vld1q_s16((const int16_t*)VACC_M);
+
+    SIGNED_CLAMP_AM(state, temp);
+
+    c = vcgtq_s16(t,vaccm);
+    int16x8_t t_ = vshrq_n_s16(t,15);
+    int16x8_t vd = vbicq_s16(t,t_);
+    vd = vorrq_s16(vd,(int16x8_t)c);
+    vst1q_s16(VD, vd);
+
+    return;
+
+#else
+
     SIGNED_CLAMP_AM(state, temp); /* no direct map in SSE, but closely based on this */
     for (i = 0; i < N; i++)
         cond[i] = -(temp[i] > VACC_M[i]); /* VD |= -(ACC47..16 > +32767) */
@@ -237,12 +360,36 @@ static INLINE void UNSIGNED_CLAMP(usf_state_t * state, short* VD)
     for (i = 0; i < N; i++)
         VD[i] = VD[i] | cond[i];
     return;
+#endif
 }
 
 static INLINE void SIGNED_CLAMP_AL(usf_state_t * state, short* VD)
 { /* sign-clamp accumulator-low (bits 15:0) */
+
     ALIGNED short cond[N];
     ALIGNED short temp[N];
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+    SIGNED_CLAMP_AM(state, temp);
+
+    uint16x8_t c;
+    int16x8_t eightk = vdupq_n_s16(0x8000);
+    uint16x8_t one = vdupq_n_u16(1);
+    int16x8_t t = vld1q_s16((const int16_t*)temp);
+    int16x8_t vaccm = vld1q_s16((const int16_t*)VACC_M);
+
+    c = vceqq_s16(t,vaccm);
+    c = vaddq_u16(c, one);
+    t = veorq_s16(t, eightk);
+    vst1q_u16(cond,c);
+    vst1q_s16(temp,t);
+    merge(VD, cond, temp, VACC_L);
+
+    return;
+#else
+
     SIGNED_CLAMP_AM(state, temp); /* no direct map in SSE, but closely based on this */
     for (i = 0; i < N; i++)
@@ -251,5 +398,6 @@ static INLINE void SIGNED_CLAMP_AL(usf_state_t * state, short* VD)
         temp[i] ^= 0x8000; /* half-assed unsigned saturation mix in the clamp */
     merge(VD, cond, temp, VACC_L);
     return;
+#endif
 }
 #endif

@@ -22,65 +22,42 @@
 #define INLINE
 #endif
 
-#ifndef ARCH_MIN_SSE2
-/*
- * vector-scalar element decoding
- * Obsolete. Consider using at least the SSE2 algorithms instead.
- */
-static const int ei[16][8] = {
-    { 00, 01, 02, 03, 04, 05, 06, 07 }, /* none (vector-only operand) */
-    { 00, 01, 02, 03, 04, 05, 06, 07 },
-    { 00, 00, 02, 02, 04, 04, 06, 06 }, /* 0Q */
-    { 01, 01, 03, 03, 05, 05, 07, 07 }, /* 1Q */
-    { 00, 00, 00, 00, 04, 04, 04, 04 }, /* 0H */
-    { 01, 01, 01, 01, 05, 05, 05, 05 }, /* 1H */
-    { 02, 02, 02, 02, 06, 06, 06, 06 }, /* 2H */
-    { 03, 03, 03, 03, 07, 07, 07, 07 }, /* 3H */
-    { 00, 00, 00, 00, 00, 00, 00, 00 }, /* 0 */
-    { 01, 01, 01, 01, 01, 01, 01, 01 }, /* 1 */
-    { 02, 02, 02, 02, 02, 02, 02, 02 }, /* 2 */
-    { 03, 03, 03, 03, 03, 03, 03, 03 }, /* 3 */
-    { 04, 04, 04, 04, 04, 04, 04, 04 }, /* 4 */
-    { 05, 05, 05, 05, 05, 05, 05, 05 }, /* 5 */
-    { 06, 06, 06, 06, 06, 06, 06, 06 }, /* 6 */
-    { 07, 07, 07, 07, 07, 07, 07, 07 }  /* 7 */
+#ifdef ARCH_MIN_ARM_NEON
+static const unsigned char smask[16][16] = {
+    {0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF},
+    {0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF},
+    {0x0,0x1,0x0,0x1,0x4,0x5,0x4,0x5,0x8,0x9,0x8,0x9,0xC,0xD,0xC,0xD},
+    {0x2,0x3,0x2,0x3,0x6,0x7,0x6,0x7,0xA,0xB,0xA,0xB,0xE,0xF,0xE,0xF},
+    {0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9},
+    {0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB},
+    {0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD},
+    {0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF},
+    {0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1,0x0,0x1},
+    {0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3,0x2,0x3},
+    {0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5,0x4,0x5},
+    {0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7,0x6,0x7},
+    {0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9,0x8,0x9},
+    {0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB,0xA,0xB},
+    {0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD,0xC,0xD},
+    {0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF,0xE,0xF}
 };
 
+int sub_mask[16] = {
+    0x0,
+    0x0,
+    0x1, 0x1,
+    0x3, 0x3, 0x3, 0x3,
+    0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7
+};
+#define conv816_to_882(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })
+
 INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
-{
-    ALIGNED short SV[8];
-    register int i, j;
-#if (0 == 0)
-    j = sub_mask[e];
-    for (i = 0; i < N; i++)
-        SV[i] = VT[(i & ~j) | (e & j)];
-#else
-    if (e & 0x8)
-        for (i = 0; i < N; i++)
-            SV[i] = VT[(i & 0x0) | (e & 0x7)];
-    else if (e & 0x4)
-        for (i = 0; i < N; i++)
-            SV[i] = VT[(i & 0xC) | (e & 0x3)];
-    else if (e & 0x2)
-        for (i = 0; i < N; i++)
-            SV[i] = VT[(i & 0xE) | (e & 0x1)];
-    else /* if ((e == 0b0000) || (e == 0b0001)) */
-        for (i = 0; i < N; i++)
-            SV[i] = VT[(i & 0x7) | (e & 0x0)];
-#endif
-    for (i = 0; i < N; i++)
-        *(VD + i) = *(SV + i);
+{
+    int8x16_t xmm;
+    int8x16_t key;
+
+    xmm = vld1q_s8((const int8_t*)VT);
+    key = vld1q_s8(smask[e]);
+    xmm = vcombine_s8(vtbl2_s8(conv816_to_882(xmm), vget_low_s8(key)), vtbl2_s8(conv816_to_882(xmm), vget_high_s8(key)));
+    vst1q_s8((int8_t*)VD, xmm);
+
     return;
 }
 #else
 #endif
 
 #ifdef ARCH_MIN_SSSE3
 static const unsigned char smask[16][16] = {
     {0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF},
@@ -112,7 +89,9 @@ INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
     _mm_store_si128((__m128i *)VD, xmm);
     return;
 }
 #else
+#endif
 
+#if defined ARCH_MIN_SSE2 && !defined ARCH_MIN_SSSE3
 #define B(x) ((x) & 3)
 #define SHUFFLE(a,b,c,d) ((B(d)<<6) | (B(c)<<4) | (B(b)<<2) | (B(a)<<0))
@@ -274,5 +253,65 @@ INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
     return;
 }
 #endif
 
+#if !defined ARCH_MIN_ARM_NEON && !defined ARCH_MIN_SSE2 && !defined ARCH_MIN_SSSE3
+/*
+ * vector-scalar element decoding
+ * Obsolete. Consider using at least the SSE2 algorithms instead.
+ */
+static const int ei[16][8] = {
+    { 00, 01, 02, 03, 04, 05, 06, 07 }, /* none (vector-only operand) */
+    { 00, 01, 02, 03, 04, 05, 06, 07 },
+    { 00, 00, 02, 02, 04, 04, 06, 06 }, /* 0Q */
+    { 01, 01, 03, 03, 05, 05, 07, 07 }, /* 1Q */
+    { 00, 00, 00, 00, 04, 04, 04, 04 }, /* 0H */
+    { 01, 01, 01, 01, 05, 05, 05, 05 }, /* 1H */
+    { 02, 02, 02, 02, 06, 06, 06, 06 }, /* 2H */
+    { 03, 03, 03, 03, 07, 07, 07, 07 }, /* 3H */
+    { 00, 00, 00, 00, 00, 00, 00, 00 }, /* 0 */
+    { 01, 01, 01, 01, 01, 01, 01, 01 }, /* 1 */
+    { 02, 02, 02, 02, 02, 02, 02, 02 }, /* 2 */
+    { 03, 03, 03, 03, 03, 03, 03, 03 }, /* 3 */
+    { 04, 04, 04, 04, 04, 04, 04, 04 }, /* 4 */
+    { 05, 05, 05, 05, 05, 05, 05, 05 }, /* 5 */
+    { 06, 06, 06, 06, 06, 06, 06, 06 }, /* 6 */
+    { 07, 07, 07, 07, 07, 07, 07, 07 }  /* 7 */
+};
+
+static const int sub_mask[16] = {
+    0x0,
+    0x0,
+    0x1, 0x1,
+    0x3, 0x3, 0x3, 0x3,
+    0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7
+};
+
+INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
+{
+    ALIGNED short SV[8];
+    register int i, j;
+#if (0 == 0)
+    j = sub_mask[e];
+    for (i = 0; i < N; i++)
+        SV[i] = VT[(i & ~j) | (e & j)];
+#else
+    if (e & 0x8)
+        for (i = 0; i < N; i++)
+            SV[i] = VT[(i & 0x0) | (e & 0x7)];
+    else if (e & 0x4)
+        for (i = 0; i < N; i++)
+            SV[i] = VT[(i & 0xC) | (e & 0x3)];
+    else if (e & 0x2)
+        for (i = 0; i < N; i++)
+            SV[i] = VT[(i & 0xE) | (e & 0x1)];
+    else /* if ((e == 0b0000) || (e == 0b0001)) */
+        for (i = 0; i < N; i++)
+            SV[i] = VT[(i & 0x7) | (e & 0x0)];
+#endif
+    for (i = 0; i < N; i++)
+        *(VD + i) = *(SV + i);
+    return;
+}
+
+#endif
 #endif

@@ -27,6 +27,30 @@ INLINE static void do_abs(usf_state_t * state, short* VD, short* VS, short* VT)
     register int i;
 
     vector_copy(res, VT);
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vs = vld1q_s16((const int16_t*)VS);
+    int16x8_t resi = vld1q_s16((const int16_t*)VT);
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t eightk = vdupq_n_s16(0x8000);
+    int16x8_t one = vdupq_n_s16(1);
+
+    uint16x8_t negi = vcltq_s16(vs,zero);
+    int16x8_t posi = vaddq_s16((int16x8_t)negi,one);
+    posi = vorrq_s16(posi,(int16x8_t)negi);
+    resi = veorq_s16(resi,posi);
+    uint16x8_t ccch = vcgeq_s16(resi,eightk);
+    int16x8_t ch = vnegq_s16((int16x8_t)ccch);
+    resi = vaddq_s16(resi, (int16x8_t)ch);
+
+    vst1q_s16(VACC_L, resi);
+    vector_copy(VD, VACC_L);
+    return;
+
+#else
+
+
 #ifndef ARCH_MIN_SSE2
 #define MASK_XOR
 #endif
@@ -65,6 +89,7 @@ INLINE static void do_abs(usf_state_t * state, short* VD, short* VS, short* VT)
     vector_copy(VACC_L, res);
     vector_copy(VD, VACC_L);
     return;
+#endif
 }
 
 static void VABS(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,50 @@
 
 INLINE static void clr_ci(usf_state_t * state, short* VD, short* VS, short* VT)
 { /* clear CARRY and carry in to accumulators */
+#ifdef ARCH_MIN_SSE2
+
+    __m128i xmm,vs,vt,co; /*,ne;*/
+
+    xmm = _mm_setzero_si128();
+    vs = _mm_load_si128((const __m128i*)VS);
+    vt = _mm_load_si128((const __m128i*)VT);
+    co = _mm_load_si128((const __m128i*)state->co);
+
+    vs = _mm_add_epi16(vs,vt);
+    vs = _mm_add_epi16(vs,co);
+
+    _mm_store_si128((__m128i*)VACC_L, vs);
+
+    SIGNED_CLAMP_ADD(state, VD, VS, VT);
+
+    _mm_storeu_si128((__m128i*)state->ne, xmm);
+    _mm_storeu_si128((__m128i*)state->co, xmm);
+
+    return;
+#endif
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vs, vt, zero1,co;
+
+    zero1 = vdupq_n_s16(0);
+    vs = vld1q_s16((const int16_t*)VS);
+    vt = vld1q_s16((const int16_t*)VT);
+    co = vld1q_s16((const int16_t*)state->co);
+
+    vs = vaddq_s16(vs,vt);
+    vs = vaddq_s16(vs,co);
+
+    vst1q_s16(VACC_L, vs);
+
+    SIGNED_CLAMP_ADD(state, VD, VS, VT);
+    vst1q_s16(state->ne, zero1);
+    vst1q_s16(state->co, zero1);
+
+    return;
+#endif
+
+#if !defined ARCH_MIN_ARM_NEON && !defined ARCH_MIN_SSE2
     register int i;
 
     for (i = 0; i < N; i++)
@@ -24,7 +68,10 @@ INLINE static void clr_ci(usf_state_t * state, short* VD, short* VS, short* VT)
         state->ne[i] = 0;
     for (i = 0; i < N; i++)
         state->co[i] = 0;
 
+    return;
+#endif
+
 }
 
 static void VADD(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,30 @@
 
 INLINE static void set_co(usf_state_t * state, short* VD, short* VS, short* VT)
 { /* set CARRY and carry out from sum */
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x4_t vs_low = vld1_u16((const uint16_t*)VS);
+    uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4);
+    uint16x4_t vt_low = vld1_u16((const uint16_t*)VT);
+    uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4);
+    uint32x4_t zero = vdupq_n_u32(0);
+
+    uint32x4_t v_l = vaddl_u16(vs_low, vt_low);
+    uint32x4_t v_h = vaddl_u16(vs_high, vt_high);
+    uint16x4_t vl16 = vaddhn_u32(v_l,zero);
+    uint16x4_t vh16 = vaddhn_u32(v_h,zero);
+    uint16x8_t vaccl = vcombine_u16(vmovn_u32(v_l),vmovn_u32(v_h));
+    uint16x8_t co = vcombine_u16(vl16,vh16);
+
+    vst1q_u16(VACC_L, vaccl);
+    vector_copy(VD, VACC_L);
+    vst1q_u16(state->ne, (uint16x8_t)zero);
+    vst1q_u16(state->co, co);
+
+    return;
+#else
+
     ALIGNED int32_t sum[N];
     register int i;
 
@@ -28,6 +52,8 @@ INLINE static void set_co(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = sum[i] >> 16; /* native: (sum[i] > +65535) */
     return;
+
+#endif
 }
 
 static void VADDC(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -13,14 +13,25 @@
 \******************************************************************************/
 #include "vu.h"
 
-static INLINE void do_and(usf_state_t * state, short* VD, short* VS, short* VT)
+INLINE void do_and(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t vaccl = vandq_s16(vs,vt);
+    vst1q_s16(VACC_L, vaccl);
+    vector_copy(VD, VACC_L);
+    return;
+
+#else
     register int i;
 
     for (i = 0; i < N; i++)
         VACC_L[i] = VS[i] & VT[i];
     vector_copy(VD, VACC_L);
     return;
+#endif
 }
 
 static void VAND(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -19,6 +19,58 @@ INLINE static void do_ch(usf_state_t * state, short* VD, short* VS, short* VT)
     ALIGNED short sn[N];
     ALIGNED short VC[N];
     ALIGNED short diff[N];
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t v_vc,neg_sn,vce,v_eq;
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t one = vdupq_n_s16(1);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    v_vc = vt;
+    int16x8_t v_sn = veorq_s16(vs,v_vc);
+    uint16x8_t sn_u = vcltq_s16(v_sn,zero);
+    v_vc = veorq_s16(v_vc, (int16x8_t)sn_u);
+    neg_sn = vnegq_s16((int16x8_t)sn_u);
+
+    uint16x8_t vs_vc_eq = vceqq_s16(vs,v_vc);
+    vce = vandq_s16((int16x8_t)vs_vc_eq,v_sn);
+
+    v_vc = vaddq_s16(v_vc,v_sn);
+    v_eq = vorrq_s16(vce,(int16x8_t)vs_vc_eq);
+    v_eq = vnegq_s16(v_eq);
+
+    int16x8_t not_sn = vsubq_s16(neg_sn, one);
+    int16x8_t neg_vs = vmvnq_s16(vs);
+    int16x8_t v_diff = vorrq_s16(neg_vs, not_sn);
+    uint16x8_t ule = vcleq_s16(vt,v_diff);
+    int16x8_t v_le = vnegq_s16((int16x8_t)ule);
+
+    v_diff = vorrq_s16(vs, (int16x8_t)sn_u);
+    uint16x8_t uge = vcgeq_s16(v_diff,vt);
+    int16x8_t v_ge = vnegq_s16((int16x8_t)uge);
+
+    vst1q_s16(ge, v_ge);
+    vst1q_s16(le, v_le);
+    vst1q_s16(sn, v_sn);
+    vst1q_s16(VC, v_vc);
+
+    merge(state->comp, sn, le, ge);
+    merge(VACC_L, state->comp, VC, VS);
+    vector_copy(VD, VACC_L);
+
+    v_eq = veorq_s16(v_eq, one);
+
+    vst1q_s16(state->clip, v_ge);
+    vst1q_s16(state->comp, v_le);
+    vst1q_s16(state->ne, v_eq);
+    vst1q_s16(state->co, v_sn);
+
+    return;
+
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -72,6 +124,7 @@ INLINE static void do_ch(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = sn[i];
     return;
+#endif
 }
 
 static void VCH(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,7 @@
 
 INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
     ALIGNED short eq[N], ge[N], le[N];
     ALIGNED short gen[N], len[N], lz[N], uz[N], sn[N];
     ALIGNED short diff[N];
@@ -22,6 +23,88 @@ INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT)
     ALIGNED unsigned short VB[N], VC[N];
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t v_vc,neg_sn,v_eq,v_vb,v_sn,v_diff,v_uz,v_cmp;
+
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t minus1 = vdupq_n_s16(-1);
+    int16x8_t one = vdupq_n_s16(1);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t ne = vld1q_s16((const int16_t *)state->ne);
+    int16x8_t co = vld1q_s16((const int16_t *)state->co);
+    int16x8_t vce = vld1q_s16((const int16_t *)state->vce);
+
+    v_vb = vs;
+    v_vc = vt;
+
+    v_eq = veorq_s16(ne, one);
+    v_sn = co;
+
+    neg_sn = vnegq_s16((int16x8_t)v_sn);
+    v_vc = veorq_s16(v_vc, (int16x8_t)neg_sn);
+    v_vc = vaddq_s16(v_vc, v_sn);
+
+    v_diff = vsubq_s16(v_vb,v_vc);
+
+    uint16x8_t vb_cond = vceqq_s16(v_vb,minus1);
+    uint16x8_t vc_cond = vceqq_s16(v_vc,zero);
+    vb_cond = vmvnq_u16(vb_cond);
+    vc_cond = vmvnq_u16(vc_cond);
+    v_uz = vorrq_s16((int16x8_t)vb_cond, (int16x8_t)vc_cond);
+
+    uint16x8_t v_lz = vceqq_s16(v_diff,zero);
+    int16x8_t lz_s = vnegq_s16((int16x8_t)v_lz);
+
+    int16x8_t v_gen = vorrq_s16(lz_s,v_uz);
+    int16x8_t v_len = vandq_s16(lz_s,v_uz);
+    v_gen = vandq_s16(v_gen,vce);
+
+    vce = veorq_s16(vce,one);
+    v_len = vandq_s16(v_len,vce);
+
+    v_len = vorrq_s16(v_len,v_gen);
+    uint16x8_t gen_u = vcgeq_u16((uint16x8_t)v_vb,(uint16x8_t)v_vc);
+    v_gen = vnegq_s16((int16x8_t)gen_u);
+
+    v_cmp = vandq_s16(v_eq,v_sn);
+
+    vst1q_s16(cmp, v_cmp);
+    vst1q_s16(len, v_len);
+
+    merge(le, cmp, len, state->comp);
+    int16x8_t sn_xor = veorq_s16(v_sn,one);
+    v_cmp = vandq_s16(v_eq,sn_xor);
+
+    vst1q_s16(cmp, v_cmp);
+    vst1q_s16(gen, v_gen);
+    vst1q_s16(sn, v_sn);
+    vst1q_s16(VC, v_vc);
+
+    merge(ge, cmp, gen, state->clip);
+
+    merge(cmp, sn, le, ge);
+    merge(VACC_L, cmp, (short *)VC, VS);
+    vector_copy(VD, VACC_L);
+
+    int16x8_t v_ge = vld1q_s16((const int16_t *)ge);
+    int16x8_t v_le = vld1q_s16((const int16_t *)le);
+
+    vst1q_s16(state->clip,v_ge);
+    vst1q_s16(state->comp,v_le);
+
+    vst1q_s16(state->ne,zero);
+    vst1q_s16(state->co,zero);
+    vst1q_s16(state->vce,zero);
+
+    return;
+
+#else
+
+
+
     for (i = 0; i < N; i++)
         VB[i] = VS[i];
     for (i = 0; i < N; i++)
@@ -88,6 +171,7 @@ INLINE static void do_cl(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->vce[i] = 0;
     return;
+#endif
 }
 
 static void VCL(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -20,6 +20,48 @@ INLINE static void do_cr(usf_state_t * state, short* VD, short* VS, short* VT)
     ALIGNED short cmp[N];
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t v_sn, v_cmp, v_vc;
+
+    int16x8_t zero = vdupq_n_s16(0);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+
+    v_vc = vt;
+    v_sn = veorq_s16(vs,vt);
+    v_sn = vshrq_n_s16(v_sn,15);
+
+    v_cmp = vandq_s16(vs, v_sn);
+    v_cmp = vmvnq_s16(v_cmp);
+    uint16x8_t v_le = vcleq_s16(vt,v_cmp);
+    int16x8_t v_le_ = vnegq_s16((int16x8_t)v_le);
+
+    v_cmp = vorrq_s16(vs, v_sn);
+    uint16x8_t v_ge = vcgeq_s16(v_cmp, vt);
+    int16x8_t v_ge_ = vnegq_s16((int16x8_t)v_ge);
+
+    v_vc = veorq_s16(v_vc,v_sn);
+
+    vst1q_s16(VC, v_vc);
+    vst1q_s16(le, v_le_);
+    vst1q_s16(ge, v_ge_);
+
+    merge(VACC_L, le, VC, VS);
+    vector_copy(VD, VACC_L);
+
+    vst1q_s16(state->clip, v_ge_);
+    vst1q_s16(state->comp, v_le_);
+    vst1q_s16(state->ne,zero);
+    vst1q_s16(state->co,zero);
+    vst1q_s16(state->vce,zero);
+
+    return;
+
+#else
+
+
     for (i = 0; i < N; i++)
         VC[i] = VT[i];
     for (i = 0; i < N; i++)
@@ -55,6 +97,7 @@ INLINE static void do_cr(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->vce[i] = 0;
     return;
+#endif
 }
 
 static void VCR(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -17,6 +17,30 @@ INLINE static void do_eq(usf_state_t * state, short* VD, short* VS, short* VT)
 {
     register int i;
 
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t one = vdupq_n_s16(1);
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne);
+
+    uint16x8_t v_comp = vceqq_s16(vs, vt);
+    int16x8_t v_comp_ = vnegq_s16((int16x8_t)v_comp);
+    v_ne = veorq_s16(v_ne, one);
+    v_comp_ = vandq_s16(v_comp_, v_ne);
+
+    vector_copy(VACC_L, VT);
+    vector_copy(VD, VACC_L);
+
+    vst1q_s16(state->comp,v_comp_);
+    vst1q_s16(state->ne,zero);
+    vst1q_s16(state->co,zero);
+
+    return;
+
+#else
+
     for (i = 0; i < N; i++)
         state->clip[i] = 0;
     for (i = 0; i < N; i++)
@@ -35,6 +59,7 @@ INLINE static void do_eq(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = 0;
     return;
+#endif
 }
 
 static void VEQ(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,10 +15,44 @@
 
 INLINE static void do_ge(usf_state_t * state, short* VD, short* VS, short* VT)
 {
 
     ALIGNED short ce[N];
     ALIGNED short eq[N];
     register int i;
+
+#ifdef ARCH_MIN_ARM_NEON
+
+
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t one = vdupq_n_s16(1);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne);
+    int16x8_t v_co = vld1q_s16((const int16_t *)state->co);
+
+    uint16x8_t v_eq_u = vceqq_s16(vs,vt);
+    int16x8_t v_eq_s = vnegq_s16((int16x8_t)v_eq_u);
+
+    int16x8_t v_ce = vandq_s16(v_ne,v_co);
+    v_ce = veorq_s16(v_ce,one);
+    v_eq_s = vandq_s16(v_eq_s, v_ce);
+    vst1q_s16(state->clip, zero);
+    uint16x8_t v_comp = vcgtq_s16(vs, vt);
+    int16x8_t v_comp_s = vnegq_s16((int16x8_t)v_comp);
+
+    v_comp_s = vorrq_s16(v_comp_s, v_eq_s);
+    vst1q_s16(state->comp, v_comp_s);
+
+    merge(VACC_L, state->comp, VS, VT);
+    vector_copy(VD, VACC_L);
+
+    vst1q_s16(state->ne,zero);
+    vst1q_s16(state->co,zero);
+
+    return;
+#else
+
     for (i = 0; i < N; i++)
         eq[i] = (VS[i] == VT[i]);
     for (i = 0; i < N; i++)
@@ -39,6 +73,7 @@ INLINE static void do_ge(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = 0;
     return;
+#endif
 }
 
 static void VGE(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,37 @@
 
 INLINE static void do_lt(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t zero = vdupq_n_s16(0);
+
+    int16x8_t vs = vld1q_s16((const int16_t *)VS);
+    int16x8_t vt = vld1q_s16((const int16_t *)VT);
+    int16x8_t v_ne = vld1q_s16((const int16_t *)state->ne);
+    int16x8_t v_co = vld1q_s16((const int16_t *)state->co);
+
+    uint16x8_t v_eq_u = vceqq_s16(vs,vt);
+    int16x8_t v_cn = vandq_s16(v_ne,v_co);
+    v_eq_u = vandq_u16(v_eq_u,(uint16x8_t)v_cn);
+
+    vst1q_s16(state->clip, zero);
+
+    uint16x8_t v_comp = vcltq_s16(vs, vt);
+    int16x8_t v_comp_s = vnegq_s16((int16x8_t)v_comp);
+    v_comp_s = vorrq_s16(v_comp_s, (int16x8_t)v_eq_u);
+
+    vst1q_s16(state->comp, v_comp_s);
+
+    merge(VACC_L, state->comp, VS, VT);
+    vector_copy(VD, VACC_L);
+
+    vst1q_s16(state->ne, zero);
+    vst1q_s16(state->co, zero);
+    return;
+
+#else
+
     ALIGNED short cn[N];
     ALIGNED short eq[N];
     register int i;
@@ -39,6 +70,7 @@ INLINE static void do_lt(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         state->co[i] = 0;
     return;
+#endif
 }
 
 static void VLT(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,10 +15,81 @@
 
 INLINE static void do_macf(usf_state_t * state, short* VD, short* VS, short* VT)
 {
-    ALIGNED int32_t product[N];
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x8_t vs = vld1q_u16((const uint16_t*)VS);
+    uint16x8_t vt = vld1q_u16((const uint16_t*)VT);
+    uint16x4_t v_vaccl_low = vld1_u16((const uint16_t*)VACC_L);
+    uint16x4_t v_vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
+    uint16x4_t v_vaccm_low = vld1_u16((const uint16_t*)VACC_M);
+    uint16x4_t v_vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
+    uint16x4_t v_vacch_low = vld1_u16((const uint16_t*)VACC_H);
+    uint16x4_t v_vacch_high = vld1_u16((const uint16_t*)VACC_H+4);
+    int32x4_t zero = vdupq_n_s32(0);
+    uint32x4_t zero_u = vdupq_n_u32(0);
+    uint32x4_t highmask = vdupq_n_u32(0x0000ffff);
+
+    uint16x4_t vs_low = vget_low_u16(vs);
+    uint16x4_t vs_high = vget_high_u16(vs);
+    uint16x4_t vt_low = vget_low_u16(vt);
+    uint16x4_t vt_high = vget_high_u16(vt);
+
+    int32x4_t product_L = vqdmlal_s16(zero, (int16x4_t)vs_low, (int16x4_t)vt_low);
+    int32x4_t product_H = vqdmlal_s16(zero, (int16x4_t)vs_high, (int16x4_t)vt_high);
+    uint32x4_t addend_L = vandq_u32(highmask, (uint32x4_t)product_L);
+    uint32x4_t addend_H = vandq_u32(highmask, (uint32x4_t)product_H);
+
+    addend_L = vaddw_u16((uint32x4_t)addend_L, v_vaccl_low);
+    addend_H = vaddw_u16((uint32x4_t)addend_H, v_vaccl_high);
+
+    uint16x8_t v_vaccl = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
+
+
+    addend_L = vaddl_u16(
+        vaddhn_u32((uint32x4_t)product_L, zero_u),
+        vaddhn_u32((uint32x4_t)addend_L, zero_u)
+    );
+
+    addend_H = vaddl_u16(
+        vaddhn_u32((uint32x4_t)product_H, zero_u),
+        vaddhn_u32((uint32x4_t)addend_H, zero_u)
+    );
+
+    addend_L = vaddw_u16(addend_L, v_vaccm_low);
+    addend_H = vaddw_u16(addend_H, v_vaccm_high);
+
+    uint16x8_t v_vaccm = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
+
+    //product_L = vshrq_n_s32(product_L, 1);
+    //product_H = vshrq_n_s32(product_H, 1);
+
+    uint32x4_t cond_L = vcltq_s32(product_L,zero);
+    int32x4_t cond_L_s = vnegq_s32((int32x4_t)cond_L);
+    uint32x4_t cond_H = vcltq_s32(product_H,zero);
+    int32x4_t cond_H_s = vnegq_s32((int32x4_t)cond_H);
+
+    v_vacch_low = vsub_u16(v_vacch_low, vmovn_u32((uint32x4_t)cond_L_s));
+    v_vacch_high = vsub_u16(v_vacch_high, vmovn_u32((uint32x4_t)cond_H_s));
+
+    v_vacch_low = vadd_u16(vshrn_n_u32(addend_L,16), v_vacch_low);
+    v_vacch_high = vadd_u16(vshrn_n_u32(addend_H,16), v_vacch_high);
+
+    uint16x8_t v_vacch = vcombine_u16(v_vacch_low, v_vacch_high);
+
+    vst1q_s16(VACC_L, (int16x8_t)v_vaccl);
+    vst1q_s16(VACC_M, (int16x8_t)v_vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)v_vacch);
+
+    SIGNED_CLAMP_AM(state, VD);
+    return;
+
+#else
+
+    ALIGNED int32_t product[N];
     ALIGNED uint32_t addend[N];
     register int i;
 
     for (i = 0; i < N; i++)
         product[i] = VS[i] * VT[i];
     for (i = 0; i < N; i++)
@@ -38,7 +109,9 @@ INLINE static void do_macf(usf_state_t * state, short* VD, short* VS, short* VT)
     for (i = 0; i < N; i++)
         VACC_H[i] += addend[i] >> 16;
     SIGNED_CLAMP_AM(state, VD);
+
     return;
+#endif
 }
 
 static void VMACF(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,78 @@
 
 INLINE static void do_macu(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x8_t vs = vld1q_u16((const uint16_t*)VS);
+    uint16x8_t vt = vld1q_u16((const uint16_t*)VT);
+    uint16x4_t v_vaccl_low = vld1_u16((const uint16_t*)VACC_L);
+    uint16x4_t v_vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
+    uint16x4_t v_vaccm_low = vld1_u16((const uint16_t*)VACC_M);
+    uint16x4_t v_vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
+    uint16x4_t v_vacch_low = vld1_u16((const uint16_t*)VACC_H);
+    uint16x4_t v_vacch_high = vld1_u16((const uint16_t*)VACC_H+4);
+    int32x4_t zero = vdupq_n_s32(0);
+    uint32x4_t zero_u = vdupq_n_u32(0);
+    uint32x4_t highmask = vdupq_n_u32(0x0000ffff);
+
+    uint16x4_t vs_low = vget_low_u16(vs);
+    uint16x4_t vs_high = vget_high_u16(vs);
+    uint16x4_t vt_low = vget_low_u16(vt);
+    uint16x4_t vt_high = vget_high_u16(vt);
+
+    int32x4_t product_L = vqdmlal_s16(zero, (int16x4_t)vs_low, (int16x4_t)vt_low);
+    int32x4_t product_H = vqdmlal_s16(zero, (int16x4_t)vs_high, (int16x4_t)vt_high);
+    uint32x4_t addend_L = vandq_u32(highmask, (uint32x4_t)product_L);
+    uint32x4_t addend_H = vandq_u32(highmask, (uint32x4_t)product_H);
+
+    addend_L = vaddw_u16((uint32x4_t)addend_L, v_vaccl_low);
+    addend_H = vaddw_u16((uint32x4_t)addend_H, v_vaccl_high);
+
+    uint16x8_t v_vaccl = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
+
+
+    addend_L = vaddl_u16(
+        vaddhn_u32((uint32x4_t)product_L, zero_u),
+        vaddhn_u32((uint32x4_t)addend_L, zero_u)
+    );
+
+    addend_H = vaddl_u16(
+        vaddhn_u32((uint32x4_t)product_H, zero_u),
+        vaddhn_u32((uint32x4_t)addend_H, zero_u)
+    );
+
+    addend_L = vaddw_u16(addend_L, v_vaccm_low);
+    addend_H = vaddw_u16(addend_H, v_vaccm_high);
+
+    uint16x8_t v_vaccm = vcombine_u16(vmovn_u32(addend_L), vmovn_u32(addend_H));
+
+    //product_L = vshrq_n_s32(product_L, 1);
+    //product_H = vshrq_n_s32(product_H, 1);
+
+    uint32x4_t cond_L = vcltq_s32(product_L,zero);
+    int32x4_t cond_L_s = vnegq_s32((int32x4_t)cond_L);
+    uint32x4_t cond_H = vcltq_s32(product_H,zero);
+    int32x4_t cond_H_s = vnegq_s32((int32x4_t)cond_H);
+
+    v_vacch_low = vsub_u16(v_vacch_low, vmovn_u32((uint32x4_t)cond_L_s));
+    v_vacch_high = vsub_u16(v_vacch_high, vmovn_u32((uint32x4_t)cond_H_s));
+
+    v_vacch_low = vadd_u16(vshrn_n_u32(addend_L,16), v_vacch_low);
+    v_vacch_high = vadd_u16(vshrn_n_u32(addend_H,16), v_vacch_high);
+
+    uint16x8_t v_vacch = vcombine_u16(v_vacch_low, v_vacch_high);
+
+    vst1q_s16(VACC_L, (int16x8_t)v_vaccl);
+    vst1q_s16(VACC_M, (int16x8_t)v_vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)v_vacch);
+
+    UNSIGNED_CLAMP(state, VD);
+    return;
+
+#else
+
+
     ALIGNED int32_t product[N];
     ALIGNED uint32_t addend[N];
     register int i;
@@ -39,6 +111,7 @@ INLINE static void do_macu(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] += addend[i] >> 16;
     UNSIGNED_CLAMP(state, VD);
     return;
+#endif
 }
 
 static void VMACU(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,39 @@
 
 INLINE static void do_madh(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vs,vt, vaccm, vacch, vacc_h, vacc_m,prod_low,prod_high,one,cond;
+    uint16x8_t cond_u,res;
+
+    one = vdupq_n_s16(1);
+    vs = vld1q_s16((const int16_t *)VS);
+    vt = vld1q_s16((const int16_t *)VT);
+    vaccm = vld1q_s16((const int16_t *)VACC_M);
+    vacch = vld1q_s16((const int16_t *)VACC_H);
+
+    prod_low = vmulq_s16(vs,vt);
+    vacc_m = vaddq_s16(vaccm,prod_low);
+
+    prod_high = vqdmulhq_s16(vs,vt);
+    prod_high = vshrq_n_s16(prod_high, 1);
+
+    res = vqaddq_u16((uint16x8_t)vaccm, (uint16x8_t)prod_low);
+    cond_u = vceqq_s16((int16x8_t)res,vacc_m);
+    cond_u = vaddq_u16(cond_u, (uint16x8_t)one);
+
+    vacc_h = vaddq_s16(prod_high, vacch);
+    vacc_h = vaddq_s16((int16x8_t)cond_u, vacc_h);
+
+    vst1q_s16(VACC_M, vacc_m);
+    vst1q_s16(VACC_H, vacc_h);
+
+    SIGNED_CLAMP_AM(state, VD);
+    return;
+
+#else
+
     ALIGNED int32_t product[N];
     ALIGNED uint32_t addend[N];
     register int i;
@@ -29,6 +62,8 @@ INLINE static void do_madh(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] += (addend[i] >> 16) + (product[i] >> 16);
     SIGNED_CLAMP_AM(state, VD);
     return;
+
+#endif
 }
 
 static void VMADH(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,56 @@
 
 INLINE static void do_madl(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x8_t vs = vld1q_u16((const uint16_t*)VS);
+    uint16x8_t vt = vld1q_u16((const uint16_t*)VT);
+    uint16x8_t v_vaccl = vld1q_u16((const uint16_t*)VACC_L);
+    uint16x8_t v_vaccm = vld1q_u16((const uint16_t*)VACC_M);
+    uint16x8_t v_vacch = vld1q_u16((const uint16_t*)VACC_H);
+
+    uint32x4_t zero = vdupq_n_u32(0);
+    uint16x4_t vs_low = vget_low_u16(vs);
+    uint16x4_t vs_high = vget_high_u16(vs);
+    uint16x4_t vt_low = vget_low_u16(vt);
+    uint16x4_t vt_high = vget_high_u16(vt);
+
+    uint32x4_t product_L = vmulq_u32( vmovl_u16(vs_low), vmovl_u16(vt_low) );
+    uint32x4_t product_H = vmulq_u32( vmovl_u16(vs_high), vmovl_u16(vt_high) );
+
+    uint16x4_t addend_L = vaddhn_u32(product_L, zero);
+    uint16x4_t addend_H = vaddhn_u32(product_H, zero);
+
+    uint32x4_t exceed1 = vaddl_u16(addend_L, vget_low_u16(v_vaccl));
+    uint32x4_t exceed2 = vaddl_u16(addend_H, vget_high_u16(v_vaccl));
+
+    v_vaccl = vcombine_u16(vmovn_u32(exceed1), vmovn_u32(exceed2));
+
+    addend_L = vaddhn_u32(exceed1, zero);
+    addend_H = vaddhn_u32(exceed2, zero);
+
+    exceed1 = vaddl_u16(addend_L, vget_low_u16(v_vaccm));
+    exceed2 = vaddl_u16(addend_H, vget_high_u16(v_vaccm));
+
+    v_vaccm = vcombine_u16(vmovn_u32(exceed1), vmovn_u32(exceed2));
+
+    addend_L = vaddhn_u32(exceed1, zero);
+    addend_H = vaddhn_u32(exceed2, zero);
+
+    uint16x8_t v_vacch2 = vcombine_u16(addend_L, addend_H);
+    v_vacch = vaddq_u16(v_vacch, v_vacch2);
+
+    vst1q_s16(VACC_L, (int16x8_t)v_vaccl);
+    vst1q_s16(VACC_M, (int16x8_t)v_vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)v_vacch);
+
+    SIGNED_CLAMP_AL(state, VD);
+    return;
+
+#else
+
+
     ALIGNED int32_t product[N];
     ALIGNED uint32_t addend[N];
     register int i;
@@ -37,6 +87,7 @@ INLINE static void do_madl(usf_state_t * state, short* VD, short* VS, short* VT)
     VACC_H[i] += addend[i] >> 16;
     SIGNED_CLAMP_AL(state, VD);
     return;
+#endif
 }
 
 static void VMADL(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,43 @@
 
 INLINE static void do_madm(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint32x4_t zero = vdupq_n_u32(0);
+    int16x4_t vs_low = vld1_s16((const int16_t*)VS);
+    int16x4_t vs_high = vld1_s16((const int16_t*)VS+4);
+    uint16x4_t vt_low = vld1_u16((const uint16_t*)VT);
+    uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4);
+    uint16x4_t vaccl_low = vld1_u16((const uint16_t*)VACC_L);
+    uint16x4_t vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
+    uint16x4_t vaccm_low = vld1_u16((const uint16_t*)VACC_M);
+    uint16x4_t vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
+    int16x4_t vacch_low = vld1_s16((const int16_t*)VACC_H);
+    int16x4_t vacch_high = vld1_s16((const int16_t*)VACC_H+4);
+
+    int32x4_t vaccl_l = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_low),vmovl_s16(vs_low),(int32x4_t)vmovl_u16(vt_low));
+    int32x4_t vaccl_h = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_high),vmovl_s16(vs_high),(int32x4_t)vmovl_u16(vt_high));
+    uint32x4_t vaccm_l = vaddq_u32(vmovl_u16(vaccm_low), (uint32x4_t)vshrq_n_s32(vaccl_l,16));
+    uint32x4_t vaccm_h = vaddq_u32(vmovl_u16(vaccm_high),(uint32x4_t)vshrq_n_s32(vaccl_h,16));
+    uint16x4_t vacch_l = vaddhn_u32(vaccm_l, zero);
+    uint16x4_t vacch_h = vaddhn_u32(vaccm_h, zero);
+    int16x4_t vacch_low2 = vadd_s16(vacch_low,(int16x4_t)vacch_l);
+    int16x4_t vacch_high2 = vadd_s16(vacch_high,(int16x4_t)vacch_h);
+
+    int16x8_t vaccl = vcombine_s16(vmovn_s32(vaccl_l),vmovn_s32(vaccl_h));
+    uint16x8_t vaccm = vcombine_u16(vmovn_u32(vaccm_l),vmovn_u32(vaccm_h));
+    int16x8_t vacch = vcombine_s16(vacch_low2,vacch_high2);
+
+    vst1q_s16(VACC_L, vaccl);
+    vst1q_s16(VACC_M, (int16x8_t)vaccm);
+    vst1q_s16(VACC_H, vacch);
+    SIGNED_CLAMP_AM(state, VD);
+
+    return;
+#else
+
+
     ALIGNED uint32_t addend[N];
     register int i;
 
@@ -32,6 +69,7 @@ INLINE static void do_madm(usf_state_t * state, short* VD, short* VS, short* VT)
     VACC_H[i] += addend[i] >> 16;
     SIGNED_CLAMP_AM(state, VD);
     return;
+#endif
 }
 
 static void VMADM(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,43 @@
 
 INLINE static void do_madn(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint32x4_t zero = vdupq_n_u32(0);
+    uint16x4_t vs_low = vld1_u16((const uint16_t*)VS);
+    uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4);
+    int16x4_t vt_low = vld1_s16((const int16_t*)VT);
+    int16x4_t vt_high = vld1_s16((const int16_t*)VT+4);
+    uint16x4_t vaccl_low = vld1_u16((const uint16_t*)VACC_L);
+    uint16x4_t vaccl_high = vld1_u16((const uint16_t*)VACC_L+4);
+    uint16x4_t vaccm_low = vld1_u16((const uint16_t*)VACC_M);
+    uint16x4_t vaccm_high = vld1_u16((const uint16_t*)VACC_M+4);
+    int16x4_t vacch_low = vld1_s16((const int16_t*)VACC_H);
+    int16x4_t vacch_high = vld1_s16((const int16_t*)VACC_H+4);
+
+    int32x4_t vaccl_l = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_low),(int32x4_t)vmovl_u16(vs_low),vmovl_s16(vt_low));
+    int32x4_t vaccl_h = vmlaq_s32((int32x4_t)vmovl_u16(vaccl_high),(int32x4_t)vmovl_u16(vs_high),vmovl_s16(vt_high));
+    uint32x4_t vaccm_l = vaddq_u32(vmovl_u16(vaccm_low), (uint32x4_t)vshrq_n_s32(vaccl_l,16));
+    uint32x4_t vaccm_h = vaddq_u32(vmovl_u16(vaccm_high),(uint32x4_t)vshrq_n_s32(vaccl_h,16));
+    uint16x4_t vacch_l = vaddhn_u32(vaccm_l, zero);
+    uint16x4_t vacch_h = vaddhn_u32(vaccm_h, zero);
+    int16x4_t vacch_low2 = vadd_s16(vacch_low,(int16x4_t)vacch_l);
+    int16x4_t vacch_high2 = vadd_s16(vacch_high,(int16x4_t)vacch_h);
+
+    int16x8_t vaccl = vcombine_s16(vmovn_s32(vaccl_l),vmovn_s32(vaccl_h));
+    uint16x8_t vaccm = vcombine_u16(vmovn_u32(vaccm_l),vmovn_u32(vaccm_h));
+    int16x8_t vacch = vcombine_s16(vacch_low2,vacch_high2);
+
+    vst1q_s16(VACC_L, vaccl);
+    vst1q_s16(VACC_M, (int16x8_t)vaccm);
+    vst1q_s16(VACC_H, vacch);
+    SIGNED_CLAMP_AL(state, VD);
+    return;
+
+#else
+
+
     ALIGNED uint32_t addend[N];
     register int i;
 
@@ -32,6 +69,7 @@ INLINE static void do_madn(usf_state_t * state, short* VD, short* VS, short* VT)
     VACC_H[i] += addend[i] >> 16;
     SIGNED_CLAMP_AL(state, VD);
     return;
+#endif
 }
 
 static void VMADN(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,37 @@
 
 INLINE static void do_mudh(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t zero16 = vdupq_n_s16(0);
+    int32x4_t zero32 = vdupq_n_s32(0);
+
+    int16x4_t vs_low = vld1_s16((const int16_t *)VS);
+    int16x4_t vs_high = vld1_s16((const int16_t *)VS+4);
+    int16x4_t vt_low = vld1_s16((const int16_t *)VT);
+    int16x4_t vt_high = vld1_s16((const int16_t *)VT+4);
+
+    int32x4_t l1 = vmovl_s16(vs_low);
+    int32x4_t h1 = vmovl_s16(vs_high);
+    int32x4_t l2 = vmovl_s16(vt_low);
+    int32x4_t h2 = vmovl_s16(vt_high);
+
+    int32x4_t l = vmulq_s32(l1,l2);
+    int32x4_t h = vmulq_s32(h1,h2);
+
+    int16x8_t vaccm = vcombine_s16(vmovn_s32(l),vmovn_s32(h));
+    int16x8_t vacch = vcombine_s16(vaddhn_s32(l,zero32),vaddhn_s32(h,zero32));
+
+    vst1q_s16(VACC_L, zero16);
+    vst1q_s16(VACC_M, vaccm);
+    vst1q_s16(VACC_H, vacch);
+
+    SIGNED_CLAMP_AM(state, VD);
+    return;
+
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -25,6 +56,7 @@ INLINE static void do_mudh(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] = (short)(VS[i]*VT[i] >> 16);
     SIGNED_CLAMP_AM(state, VD);
     return;
+#endif
 }
 
 static void VMUDH(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,26 @@
 
 INLINE static void do_mudl(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    uint16x4_t vs_low = vld1_u16((const uint16_t*)VS);
+    uint16x4_t vs_high = vld1_u16((const uint16_t*)VS+4);
+    uint16x4_t vt_low = vld1_u16((const uint16_t*)VT);
+    uint16x4_t vt_high = vld1_u16((const uint16_t*)VT+4);
+    int16x8_t zero = vdupq_n_s16(0);
+
+    uint32x4_t lo = vmull_u16(vs_low, vt_low);
+    uint32x4_t high = vmull_u16(vs_high, vt_high);
+    uint16x8_t result = vcombine_u16(vshrn_n_u32(lo,16),vshrn_n_u32(high,16));
+    vst1q_u16(VACC_L, result);
+    vst1q_s16(VACC_M, zero);
+    vst1q_s16(VACC_H, zero);
+    vector_copy(VD, VACC_L);
+    return;
+
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -25,6 +45,7 @@ INLINE static void do_mudl(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] = 0x0000;
     vector_copy(VD, VACC_L); /* no possibilities to clamp */
     return;
+#endif
 }
 
 static void VMUDL(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,39 @@
 
 INLINE static void do_mudm(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vd, vs,vt,res,four,vacc_l, vacc_m, vacc_h;
+
+    int32x4_t zero = vdupq_n_s32(0);
+    int16x8_t zero16 = vdupq_n_s16(0);
+
+    int16x4_t vs_low = vld1_s16((const uint16_t *)VS);
+    int16x4_t vs_high = vld1_s16((const uint16_t *)VS+4);
+    uint16x4_t vt_low = vld1_u16((const int16_t *)VT);
+    uint16x4_t vt_high = vld1_u16((const int16_t *)VT+4);
+
+    int32x4_t l1 = vmovl_s16(vs_low);
+    int32x4_t h1 = vmovl_s16(vs_high);
+    uint32x4_t l2 = vmovl_u16(vt_low);
+    uint32x4_t h2 = vmovl_u16(vt_high);
+
+    int32x4_t l = vmulq_s32(l1,(int32x4_t)l2);
+    int32x4_t h = vmulq_s32(h1,(int32x4_t)h2);
+
+    int16x8_t vaccl = vcombine_s16(vmovn_s32(l),vmovn_s32(h));
+    int16x8_t vaccm = vcombine_s16(vaddhn_s32(l,zero),vaddhn_s32(h,zero));
+    uint16x8_t uvacch = vcltq_s16(vaccm, zero16);
+
+    vst1q_s16(VACC_L, vaccl);
+    vst1q_s16(VACC_M, vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)uvacch);
+
+    vector_copy(VD, VACC_M); /* no possibilities to clamp */
+
+    return;
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -25,6 +58,7 @@ INLINE static void do_mudm(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] = -(VACC_M[i] < 0);
     vector_copy(VD, VACC_M); /* no possibilities to clamp */
     return;
+#endif
 }
 
 static void VMUDM(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -15,6 +15,40 @@
 
 INLINE static void do_mudn(usf_state_t * state, short* VD, short* VS, short* VT)
 {
+
+#ifdef ARCH_MIN_ARM_NEON
+
+    int16x8_t vd, vs,vt,res,four,vacc_l, vacc_m, vacc_h;
+
+    int32x4_t zero = vdupq_n_s32(0);
+    int16x8_t zero16 = vdupq_n_s16(0);
+
+    uint16x4_t vs_low = vld1_u16((const uint16_t *)VS);
+    uint16x4_t vs_high = vld1_u16((const uint16_t *)VS+4);
+    int16x4_t vt_low = vld1_s16((const int16_t *)VT);
+    int16x4_t vt_high = vld1_s16((const int16_t *)VT+4);
+
+    uint32x4_t l1 = vmovl_u16(vs_low);
+    uint32x4_t h1 = vmovl_u16(vs_high);
+    int32x4_t l2 = vmovl_s16(vt_low);
+    int32x4_t h2 = vmovl_s16(vt_high);
+
+    int32x4_t l = vmulq_s32((int32x4_t)l1,l2);
+    int32x4_t h = vmulq_s32((int32x4_t)h1,h2);
+
+    int16x8_t vaccl = vcombine_s16(vmovn_s32(l),vmovn_s32(h));
+    int16x8_t vaccm = vcombine_s16(vaddhn_s32(l,zero),vaddhn_s32(h,zero));
+    uint16x8_t uvacch = vcltq_s16(vaccm, zero16);
+    vst1q_s16(VACC_L, vaccl);
+    vst1q_s16(VACC_M, vaccm);
+    vst1q_s16(VACC_H, (int16x8_t)uvacch);
+
+    vector_copy(VD, VACC_L); /* no possibilities to clamp */
+
+    return;
+
+#else
+
     register int i;
 
     for (i = 0; i < N; i++)
@@ -25,6 +59,7 @@ INLINE static void do_mudn(usf_state_t * state, short* VD, short* VS, short* VT)
         VACC_H[i] = -(VACC_M[i] < 0);
     vector_copy(VD, VACC_L); /* no possibilities to clamp */
     return;
+#endif
 }
 
 static void VMUDN(usf_state_t * state, int vd, int vs, int vt, int e)

@@ -22,11 +22,44 @@
* Wrong: ACC(HI) = -((INT32)(acc) < 0)
* Right: ACC(HI) = -(SEMIFRAC < 0)
*/
#define SEMIFRAC (VS[i]*VT[i]*2/2 + 0x8000/2)
#define SEMIFRAC (VS[i]*VT[i] + 0x4000)
#endif

INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs,vt,res,four,zero,vacc_l, vacc_m, vacc_h;
uint16x8_t cond_u, vacc_m_cond_u,one;

one = vdupq_n_u16(1);
four = vdupq_n_s16(0x4000);
zero = vdupq_n_s16(0);

vs = vld1q_s16((const int16_t *)VS);
vt = vld1q_s16((const int16_t *)VT);

vacc_m = vqrdmulhq_s16(vs, vt);
vacc_l = vmlaq_s16(four, vs,vt);
vacc_l = vshlq_n_s16(vacc_l,1);

cond_u = vceqq_s16(vs,vt);
cond_u = vaddq_u16(cond_u, one);
vacc_m_cond_u = vcltq_s16(vacc_m, zero);
cond_u = vandq_u16(vacc_m_cond_u, cond_u);
vacc_h = vqnegq_s16((int16x8_t)cond_u);

vst1q_s16(VACC_L,vacc_l);
vst1q_s16(VACC_M,vacc_m);
vst1q_s16(VACC_H,vacc_h);

SIGNED_CLAMP_AM(state, VD);
return;

#else

register int i;

for (i = 0; i < N; i++)
@@ -35,7 +68,7 @@ INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT)
VACC_M[i] = (SEMIFRAC << 1) >> 16;
for (i = 0; i < N; i++)
VACC_H[i] = -((VACC_M[i] < 0) & (VS[i] != VT[i])); /* -32768 * -32768 */
#ifndef ARCH_MIN_SSE2
#if !defined ARCH_MIN_SSE2
vector_copy(VD, VACC_M);
for (i = 0; i < N; i++)
VD[i] -= (VACC_M[i] < 0) & (VS[i] == VT[i]); /* ACC b 31 set, min*min */
@@ -43,6 +76,7 @@ INLINE static void do_mulf(usf_state_t * state, short* VD, short* VS, short* VT)
SIGNED_CLAMP_AM(state, VD);
#endif
return;
#endif
}

static void VMULF(usf_state_t * state, int vd, int vs, int vt, int e)
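The NEON path leans on an identity worth spelling out: with the new SEMIFRAC, (SEMIFRAC << 1) >> 16 equals (2*VS[i]*VT[i] + 0x8000) >> 16, which is exactly the rounding-doubling-high multiply that vqrdmulhq_s16 performs (vqrdmulhq_s16 additionally saturates the -32768 * -32768 lane, which the cond_u/vqnegq_s16 sequence accounts for). A hedged scalar check of the identity, kept in 64 bits to sidestep the overflow corner:

#include <stdint.h>

static int16_t mulf_mid(int16_t vs, int16_t vt)
{
    /* 2*vs*vt + 0x8000 == 2 * (vs*vt + 0x4000) == SEMIFRAC << 1 */
    int64_t acc = 2 * (int64_t)vs * vt + 0x8000;
    return (int16_t)(acc >> 16);   /* the C path's VACC_M[i] */
}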
@@ -22,11 +22,50 @@
* Wrong: ACC(HI) = -((INT32)(acc) < 0)
* Right: ACC(HI) = -(SEMIFRAC < 0)
*/
#define SEMIFRAC (VS[i]*VT[i]*2/2 + 0x8000/2)
#define SEMIFRAC (VS[i]*VT[i]*2/2 + 0x4000)
#endif

INLINE static void do_mulu(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vd, vs,vt,res,four,zero,vacc_l, vacc_m, vacc_h;
uint16x8_t cond_u, vacc_m_cond_u,one;

one = vdupq_n_u16(1);
four = vdupq_n_s16(0x4000);
zero = vdupq_n_s16(0);

vs = vld1q_s16((const int16_t *)VS);
vt = vld1q_s16((const int16_t *)VT);

vacc_m = vqrdmulhq_s16(vs, vt);
vacc_l = vmlaq_s16(four, vs,vt);
vacc_l = vshlq_n_s16(vacc_l,1);

cond_u = vceqq_s16(vs,vt);
cond_u = vaddq_u16(cond_u, one);
vacc_m_cond_u = vcltq_s16(vacc_m, zero);
cond_u = vandq_u16(vacc_m_cond_u, cond_u);
vacc_h = vqnegq_s16((int16x8_t)cond_u);

vst1q_s16(VACC_L,vacc_l);
vst1q_s16(VACC_M,vacc_m);
vst1q_s16(VACC_H,vacc_h);

vd = vacc_m;

uint16x8_t vacc_m_u = vshrq_n_u16((uint16x8_t)vacc_m, 15);
vd = vorrq_s16(vd, (int16x8_t)vacc_m_u);
vd = vbicq_s16(vd, vacc_h);
vst1q_s16(VD, vd);
return;

#else

register int i;

for (i = 0; i < N; i++)
@@ -45,6 +84,7 @@ INLINE static void do_mulu(usf_state_t * state, short* VD, short* VS, short* VT)
VD[i] &= ~(VACC_H[i] >> 0); /* VD &= -(result >= 0x000000000000) */
#endif
return;
#endif
}

static void VMULU(usf_state_t * state, int vd, int vs, int vt, int e)
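The VD computation at the end of the NEON path is easier to audit lane by lane; this is a direct scalar transcription of those four intrinsics (helper name illustrative):

#include <stdint.h>

static int16_t mulu_vd(int16_t vacc_m, int16_t vacc_h)
{
    uint16_t sat = (uint16_t)vacc_m >> 15;   /* vshrq_n_u16((uint16x8_t)vacc_m, 15) */
    int16_t  vd  = vacc_m | (int16_t)sat;    /* vorrq_s16 */
    return (int16_t)(vd & ~vacc_h);          /* vbicq_s16: a VACC_H of -1 zeroes the lane */
}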
@@ -13,14 +13,27 @@
\******************************************************************************/
#include "vu.h"

static INLINE void do_nand(usf_state_t * state, short* VD, short* VS, short* VT)
INLINE void do_nand(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs, vt,vaccl;
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
vaccl = vandq_s16(vs,vt);
vaccl = vmvnq_s16(vaccl);
vst1q_s16(VACC_L, vaccl);
vector_copy(VD, VACC_L);
return;

#else
register int i;

for (i = 0; i < N; i++)
VACC_L[i] = ~(VS[i] & VT[i]);
vector_copy(VD, VACC_L);
return;
#endif
}

static void VNAND(usf_state_t * state, int vd, int vs, int vt, int e)
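All of the bitwise ops in this commit (NAND here, NOR/NXOR/OR/XOR below) share the same shape: load both vectors, combine, optionally invert with vmvnq_s16, store to VACC_L, and mirror VACC_L into VD. One lane of NAND as a scalar reference:

#include <stdint.h>

static int16_t nand_lane(int16_t vs, int16_t vt)
{
    return (int16_t)~(vs & vt);   /* VACC_L[i] = ~(VS[i] & VT[i]); VD copies VACC_L */
}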
@@ -15,6 +15,33 @@

INLINE static void do_ne(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt,vaccl, ne;
int16x8_t zero = vdupq_n_s16(0);
uint16x8_t cond;

vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
ne = vld1q_s16((const int16_t*)state->ne);

cond = vceqq_s16(vs,vt);
cond = vmvnq_u16(cond); // this is needed if you need to do "not-equal"
cond = (uint16x8_t)vnegq_s16((int16x8_t)cond);
uint16x8_t comp = vorrq_u16(cond, (uint16x8_t)ne);

vst1q_s16(state->clip, zero);
vst1q_s16(state->comp, (int16x8_t)cond);

vector_copy(VACC_L, VS);
vector_copy(VD, VACC_L);
vst1q_s16(state->ne, zero);
vst1q_s16(state->co, zero);

return;
#else

register int i;

for (i = 0; i < N; i++)
@@ -35,6 +62,7 @@ INLINE static void do_ne(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = 0;
return;
#endif
}

static void VNE(usf_state_t * state, int vd, int vs, int vt, int e)
@@ -13,14 +13,27 @@
\******************************************************************************/
#include "vu.h"

static INLINE void do_nor(usf_state_t * state, short* VD, short* VS, short* VT)
INLINE void do_nor(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON
int16x8_t vs, vt, vaccl;
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
vaccl = vorrq_s16(vs,vt);
vaccl = vmvnq_s16(vaccl);

vst1q_s16(VACC_L, vaccl);
return;
#else

register int i;

for (i = 0; i < N; i++)
VACC_L[i] = ~(VS[i] | VT[i]);
vector_copy(VD, VACC_L);
return;
#endif
}

static void VNOR(usf_state_t * state, int vd, int vs, int vt, int e)
@@ -13,14 +13,29 @@
\******************************************************************************/
#include "vu.h"

static INLINE void do_nxor(usf_state_t * state, short* VD, short* VS, short* VT)
INLINE void do_nxor(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt,vaccl;
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
vaccl = veorq_s16(vs,vt);
vaccl = vmvnq_s16(vaccl);

vst1q_s16(VACC_L, vaccl);
vector_copy(VD, VACC_L);
return;
#else

register int i;

for (i = 0; i < N; i++)
VACC_L[i] = ~(VS[i] ^ VT[i]);
vector_copy(VD, VACC_L);
return;
#endif
}

static void VNXOR(usf_state_t * state, int vd, int vs, int vt, int e)
@@ -13,14 +13,28 @@
\******************************************************************************/
#include "vu.h"

static INLINE void do_or(usf_state_t * state, short* VD, short* VS, short* VT)
INLINE void do_or(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt,vaccl;
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
vaccl = vorrq_s16(vs,vt);
vst1q_s16(VACC_L, vaccl);
vector_copy(VD, VACC_L);
return;

#else

register int i;

for (i = 0; i < N; i++)
VACC_L[i] = VS[i] | VT[i];
vector_copy(VD, VACC_L);
return;
#endif
}

static void VOR(usf_state_t * state, int vd, int vs, int vt, int e)
@@ -34,13 +34,25 @@ static void VSAR(int vd, int vs, int vt, int e)
if (e > 2)
{
message(state, "VSAR\nInvalid mask.", 2);
#if ARCH_MIN_ARM_NEON
int16x8_t zero = vdupq_n_s16(0);
vst1q_s16(VR[vd], zero);
#else
for (i = 0; i < N; i++)
VR[vd][i] = 0x0000; /* override behavior (zilmar) */
#endif
}
else
{
#if ARCH_MIN_ARM_NEON
vector_copy(VR[vd], VACC[e]);
#else
for (i = 0; i < N; i++)
VR[vd][i] = VACC[e][i];
for (i = 0; i < N; i++)
#endif
}

for (i = 0; i < N; i++)
VACC[e][i] = oldval[i]; /* ... = VS */
return;
}
@@ -59,9 +71,19 @@ static void VSAW(usf_state_t * state, int vd, int vs, int vt, int e)
if (e > 0x2)
{ /* branch very unlikely...never seen a game do VSAW illegally */
message(state, "VSAW\nIllegal mask.", 2);
for (i = 0; i < N; i++)

#if ARCH_MIN_ARM_NEON

int16x8_t zero = vdupq_n_s16(0);
vst1q_s16(state->VR[vd], zero);

#else

for (i = 0; i < N; i++)
state->VR[vd][i] = 0x0000; /* override behavior (zilmar) */
return;

#endif
}
vector_copy(state->VR[vd], state->VACC[e]);
return;
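VSAR reads one 16-bit slice of the 48-bit accumulator back into a vector register and, per the override visible above, writes the saved VS contents into that slice. A hedged scalar sketch of the exchange, assuming the usual 8-lane RSP vector (N == 8):

#include <stdint.h>

static void vsar_exchange(int16_t *vr_vd, int16_t *vacc_e, const int16_t *oldval)
{
    int i;
    for (i = 0; i < 8; i++) {
        vr_vd[i]  = vacc_e[i];     /* vector_copy(VR[vd], VACC[e]) */
        vacc_e[i] = oldval[i];     /* VACC[e][i] = oldval[i];  ... = VS */
    }
}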
@@ -15,6 +15,25 @@

INLINE static void clr_bi(usf_state_t * state, short* VD, short* VS, short* VT)
{ /* clear CARRY and borrow in to accumulators */

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt, zero, co;
zero = vdupq_n_s16(0);
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
co = vld1q_s16((const int16_t*)state->co);

vs = vsubq_s16(vs,vt);
vs = vsubq_s16(vs,co);
vst1q_s16(VACC_L, vs);

SIGNED_CLAMP_SUB(state, VD, VS, VT);
vst1q_s16(state->ne, zero);
vst1q_s16(state->co, zero);
return;
#else

register int i;

for (i = 0; i < N; i++)
@@ -25,6 +44,7 @@ INLINE static void clr_bi(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = 0;
return;
#endif
}

static void VSUB(usf_state_t * state, int vd, int vs, int vt, int e)
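The two vsubq_s16 calls implement subtract-with-borrow-in: the lane-wise CARRY flags from a preceding VSUBC are subtracted along with VT. One lane as a scalar sketch (SIGNED_CLAMP_SUB is the emulator's own clamping helper and is assumed here, not redefined):

#include <stdint.h>

static int16_t vsub_acc_l(int16_t vs, int16_t vt, int16_t co)
{
    return (int16_t)(vs - vt - co);   /* VACC_L before clamping into VD */
}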
@@ -15,6 +15,36 @@

INLINE static void set_bo(usf_state_t * state, short* VD, short* VS, short* VT)
{ /* set CARRY and borrow out from difference */

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt,vaccl,ne, co2;
uint16x8_t cond;

int16x8_t zero = vdupq_n_s16(0);
int16x8_t one = vdupq_n_s16(1);
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);

vaccl = vsubq_s16(vs, vt);
uint16x8_t vdif = vqsubq_u16((uint16x8_t)vs, (uint16x8_t)vt);

vst1q_s16(VACC_L, vaccl);
vector_copy(VD, VACC_L);

cond = vceqq_s16(vs, vt);
ne = vaddq_s16((int16x8_t)cond, one);

vdif = vorrq_u16(vdif,cond);
cond = vceqq_u16(vdif, (uint16x8_t)zero);
co2 = vnegq_s16((int16x8_t)cond);

vst1q_s16(state->ne, ne);
vst1q_s16(state->co, co2);
return;

#else

ALIGNED int32_t dif[N];
register int i;

@@ -28,6 +58,7 @@ INLINE static void set_bo(usf_state_t * state, short* VD, short* VS, short* VT)
for (i = 0; i < N; i++)
state->co[i] = (dif[i] < 0);
return;
#endif
}

static void VSUBC(usf_state_t * state, int vd, int vs, int vt, int e)
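The flag algebra here is compact: vceqq_s16 yields -1 on equal lanes, so adding one turns it into the NOTEQUAL flag (0 when equal, 1 otherwise), and vqsubq_u16 saturates to zero exactly when an unsigned borrow occurs (the OR with the equality mask keeps equal lanes from counting as borrows). A hedged scalar reading, assuming the C path's dif[] widens the operands as unsigned 16-bit values:

#include <stdint.h>

static void vsubc_flags(uint16_t vs, uint16_t vt, int16_t *ne, int16_t *co)
{
    *ne = (vs != vt);   /* vaddq_s16(vceqq_s16(vs, vt), 1) */
    *co = (vs < vt);    /* lanes whose difference borrows; cf. state->co[i] = (dif[i] < 0) */
}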
@@ -54,9 +54,18 @@ static void res_V(usf_state_t * state, int vd, int vs, int vt, int e)
if (vs != vt || vt != e)
return;
message(state, "C2\nRESERVED", 2); /* uncertain how to handle reserved, untested */

#ifdef ARCH_MIN_ARM_NEON
int16x8_t zero = vdupq_n_s16(0);
vst1q_s16(state->VR[vd], zero);
return;
#else

for (i = 0; i < N; i++)
state->VR[vd][i] = 0x0000; /* override behavior (bpoint) */
return;
#endif
}
static void res_M(usf_state_t * state, int vd, int vs, int vt, int e)
{
@@ -13,14 +13,28 @@
\******************************************************************************/
#include "vu.h"

static INLINE void do_xor(usf_state_t * state, short* VD, short* VS, short* VT)
INLINE void do_xor(usf_state_t * state, short* VD, short* VS, short* VT)
{

#ifdef ARCH_MIN_ARM_NEON

int16x8_t vs, vt;
vs = vld1q_s16((const int16_t*)VS);
vt = vld1q_s16((const int16_t*)VT);
vs = veorq_s16(vs, vt);
vst1q_s16(VACC_L, vs);
vector_copy(VD, VACC_L);

return;
#else

register int i;

for (i = 0; i < N; i++)
VACC_L[i] = VS[i] ^ VT[i];
vector_copy(VD, VACC_L);
return;
#endif
}

static void VXOR(usf_state_t * state, int vd, int vs, int vt, int e)
@@ -29,6 +29,8 @@
#include <stdint.h>
#include <string.h>

#include "common.h"

#include "alist.h"
#include "arithmetics.h"
#include "audio.h"
@@ -58,12 +60,12 @@ static int16_t* sample(struct hle_t* hle, unsigned pos)

static uint8_t* alist_u8(struct hle_t* hle, uint16_t dmem)
{
return &hle->alist_buffer[dmem ^ S8];
return u8(hle->alist_buffer, dmem);
}

static int16_t* alist_s16(struct hle_t* hle, uint16_t dmem)
{
return (int16_t*)(&hle->alist_buffer[dmem ^ S16]);
return (int16_t*)u16(hle->alist_buffer, dmem);
}

@@ -82,13 +84,13 @@ static void alist_envmix_mix(size_t n, int16_t** dst, const int16_t* gains, int1

static int16_t ramp_step(struct ramp_t* ramp)
{
bool target_reached;

bool target_reached;

ramp->value += ramp->step;

target_reached = (ramp->step <= 0)
? (ramp->value <= ramp->target)
: (ramp->value >= ramp->target);
? (ramp->value <= ramp->target)
: (ramp->value >= ramp->target);

if (target_reached)
{
@@ -184,7 +186,7 @@ void alist_move(struct hle_t* hle, uint16_t dmemo, uint16_t dmemi, uint16_t coun
void alist_copy_every_other_sample(struct hle_t* hle, uint16_t dmemo, uint16_t dmemi, uint16_t count)
{
while (count != 0) {
*(uint16_t*)(alist_u8(hle, dmemo)) = *(uint16_t*)(alist_u8(hle, dmemi));
*alist_s16(hle, dmemo) = *alist_s16(hle, dmemi);
dmemo += 2;
dmemi += 4;
--count;
@@ -373,7 +375,7 @@ void alist_envmix_ge(
const int32_t *rate,
uint32_t address)
{
unsigned k, i, ptr;
unsigned k;
size_t n = (aux) ? 4 : 2;

const int16_t* const in = (int16_t*)(hle->alist_buffer + dmemi);
@@ -390,8 +392,8 @@ void alist_envmix_ge(
ramps[1].value = (vol[1] << 16);
ramps[0].target = (target[0] << 16);
ramps[1].target = (target[1] << 16);
ramps[0].step = rate[0];
ramps[1].step = rate[1];
ramps[0].step = rate[0] / 8;
ramps[1].step = rate[1] / 8;
} else {
memcpy((uint8_t *)save_buffer, (hle->dram + address), 80);
wet = *(int16_t *)(save_buffer + 0); /* 0-1 */
@@ -407,26 +409,23 @@ void alist_envmix_ge(
}

count >>= 1;
for (ptr = 0, k = 0; k < count; k += 8) {
for (k = 0; k < count; k++) {
int16_t gains[4];
int16_t* buffers[4];
int16_t l_vol = ramp_step(&ramps[0]);
int16_t r_vol = ramp_step(&ramps[1]);

gains[0] = clamp_s16((l_vol * dry + 0x4000) >> 15);
gains[1] = clamp_s16((r_vol * dry + 0x4000) >> 15);
gains[2] = clamp_s16((l_vol * wet + 0x4000) >> 15);
gains[3] = clamp_s16((r_vol * wet + 0x4000) >> 15);
buffers[0] = dl + (k^S);
buffers[1] = dr + (k^S);
buffers[2] = wl + (k^S);
buffers[3] = wr + (k^S);

for (i = 0; i < 8; i++) {
buffers[0] = dl + (ptr^S);
buffers[1] = dr + (ptr^S);
buffers[2] = wl + (ptr^S);
buffers[3] = wr + (ptr^S);
gains[0] = clamp_s16((l_vol * dry + 0x4000) >> 15);
gains[1] = clamp_s16((r_vol * dry + 0x4000) >> 15);
gains[2] = clamp_s16((l_vol * wet + 0x4000) >> 15);
gains[3] = clamp_s16((r_vol * wet + 0x4000) >> 15);

alist_envmix_mix(n, buffers, gains, in[ptr^S]);
ptr++;
}
alist_envmix_mix(n, buffers, gains, in[k^S]);
}

*(int16_t *)(save_buffer + 0) = wet; /* 0-1 */
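The envmix rewrite above replaces the blocked loop (one ramp_step per 8-sample block, with an inner i loop reusing the same gains) by a per-sample loop, dividing each ramp step by 8 so the overall envelope slope is unchanged. A small self-contained check of that equivalence (exact only when the rate is a multiple of 8, hence a hedged sketch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t rate = 0x1000, blocked = 0, per_sample = 0;
    int i;
    blocked += rate;              /* old: one step per 8-sample block */
    for (i = 0; i < 8; i++)
        per_sample += rate / 8;   /* new: one step per sample */
    printf("%d %d\n", blocked, per_sample);   /* prints: 4096 4096 */
    return 0;
}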
@@ -29,6 +29,8 @@
#include <stdint.h>
#include <string.h>

#include "common.h"

#include "alist.h"
#include "hle_internal.h"
#include "memory.h"
@@ -52,7 +54,7 @@ static void clear_segments(struct hle_t* hle)
}

/* audio commands definition */
static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}

@@ -142,7 +144,7 @@ static void SETVOL(struct hle_t* hle, uint32_t w1, uint32_t w2)
}
}

static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_audio.loop = get_address(hle, w2);
}
@@ -165,7 +167,7 @@ static void ADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2)
address);
}

static void LOADBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void LOADBUFF(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint32_t address = get_address(hle, w2);

@@ -175,7 +177,7 @@ static void LOADBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_load(hle, hle->alist_audio.in, address, hle->alist_audio.count);
}

static void SAVEBUFF(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SAVEBUFF(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint32_t address = get_address(hle, w2);

@@ -220,7 +222,7 @@ static void LOADADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2)
dram_load_u16(hle, (uint16_t*)hle->alist_audio.table, address, align(count, 8) >> 1);
}

static void INTERLEAVE(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void INTERLEAVE(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint16_t left = (w2 >> 16) + DMEM_BASE;
uint16_t right = w2 + DMEM_BASE;
@@ -243,7 +245,7 @@ static void MIXER(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_mix(hle, dmemo, dmemi, align(hle->alist_audio.count, 32), gain);
}

static void SEGMENT(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SEGMENT(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
set_address(hle, w2);
}
@@ -28,12 +28,13 @@
#endif
#include <stdint.h>

#include "common.h"

#include "alist.h"
#include "hle_external.h"
#include "hle_internal.h"
#include "memory.h"

static void MP3(struct hle_t* hle, uint32_t w1, uint32_t w2);
#include "ucodes.h"

enum { NAUDIO_COUNT = 0x170 }; /* ie 184 samples */
enum {
@@ -57,7 +58,7 @@ static void UNKNOWN(struct hle_t* hle, uint32_t w1, uint32_t w2)
}

static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}

@@ -67,10 +68,11 @@ static void NAUDIO_0000(struct hle_t* hle, uint32_t w1, uint32_t w2)
UNKNOWN(hle, w1, w2);
}

static void NAUDIO_02B0(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void NAUDIO_02B0(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint32_t rate = (hle->alist_naudio.rate[1] & 0xffff0000) | (w2 & 0xffff);
hle->alist_naudio.rate[1] = rate;
/* emulate code at 0x12b0 (inside SETVOL), because PC always execute in IMEM */
hle->alist_naudio.rate[1] &= ~0xffff;
hle->alist_naudio.rate[1] |= (w2 & 0xffff);
}

static void NAUDIO_14(struct hle_t* hle, uint32_t w1, uint32_t w2)
@@ -196,7 +198,7 @@ static void DMEMMOVE(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_move(hle, dmemo, dmemi, (count + 3) & ~3);
}

static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_naudio.loop = (w2 & 0xffffff);
}
@@ -241,12 +243,12 @@ static void RESAMPLE(struct hle_t* hle, uint32_t w1, uint32_t w2)
address);
}

static void INTERLEAVE(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void INTERLEAVE(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
alist_interleave(hle, NAUDIO_MAIN, NAUDIO_DRY_LEFT, NAUDIO_DRY_RIGHT, NAUDIO_COUNT);
}

static void MP3ADDY(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void MP3ADDY(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}
@@ -28,6 +28,8 @@
#endif
#include <stdint.h>

#include "common.h"

#include "alist.h"
#include "hle_external.h"
#include "hle_internal.h"
@@ -49,7 +51,7 @@ static void UNKNOWN(struct hle_t* hle, uint32_t w1, uint32_t w2)
}

static void SPNOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SPNOOP(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}

@@ -61,7 +63,7 @@ static void LOADADPCM(struct hle_t* hle, uint32_t w1, uint32_t w2)
dram_load_u16(hle, (uint16_t*)hle->alist_nead.table, address, count >> 1);
}

static void SETLOOP(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SETLOOP(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_nead.loop = w2 & 0xffffff;
}
@@ -190,7 +192,7 @@ static void ENVSETUP1(struct hle_t* hle, uint32_t w1, uint32_t w2)
hle->alist_nead.env_steps[1] = w2;
}

static void ENVSETUP2(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void ENVSETUP2(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
hle->alist_nead.env_values[0] = (w2 >> 16);
hle->alist_nead.env_values[1] = w2;
@@ -206,6 +208,7 @@ static void ENVMIXER_MK(struct hle_t* hle, uint32_t w1, uint32_t w2)
uint16_t dmem_dr = (w2 >> 12) & 0xff0;
uint16_t dmem_wl = (w2 >> 4) & 0xff0;
uint16_t dmem_wr = (w2 << 4) & 0xff0;

xors[2] = 0; /* unsupported by this ucode */
xors[3] = 0; /* unsupported by this ucode */
xors[0] = 0 - (int16_t)((w1 & 0x2) >> 1);
@@ -233,6 +236,7 @@ static void ENVMIXER(struct hle_t* hle, uint32_t w1, uint32_t w2)
uint16_t dmem_dr = (w2 >> 12) & 0xff0;
uint16_t dmem_wl = (w2 >> 4) & 0xff0;
uint16_t dmem_wr = (w2 << 4) & 0xff0;

xors[2] = 0 - (int16_t)((w1 & 0x8) >> 1);
xors[3] = 0 - (int16_t)((w1 & 0x4) >> 1);
xors[0] = 0 - (int16_t)((w1 & 0x2) >> 1);
@@ -267,7 +271,7 @@ static void INTERL(struct hle_t* hle, uint32_t w1, uint32_t w2)
alist_copy_every_other_sample(hle, dmemo, dmemi, count);
}

static void INTERLEAVE_MK(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void INTERLEAVE_MK(struct hle_t* hle, uint32_t UNUSED(w1), uint32_t w2)
{
uint16_t left = (w2 >> 16);
uint16_t right = w2;
@@ -323,7 +327,7 @@ static void FILTER(struct hle_t* hle, uint32_t w1, uint32_t w2)
}
}

static void SEGMENT(struct hle_t* hle, uint32_t w1, uint32_t w2)
static void SEGMENT(struct hle_t* UNUSED(hle), uint32_t UNUSED(w1), uint32_t UNUSED(w2))
{
}
@@ -24,13 +24,7 @@

#include <stdint.h>

#ifdef _MSC_VER
#define INLINE __forceinline
#else
#define INLINE inline __attribute__((always_inline))
#endif

INLINE static int16_t clamp_s16(int_fast32_t x)
static inline int16_t clamp_s16(int_fast32_t x)
{
x = (x < INT16_MIN) ? INT16_MIN: x;
x = (x > INT16_MAX) ? INT16_MAX: x;
@@ -23,41 +23,75 @@
#include <stddef.h>
#include <stdint.h>

#include "common.h"

#include "arithmetics.h"

const int16_t RESAMPLE_LUT[64 * 4] = {
0x0c39, 0x66ad, 0x0d46, 0xffdf, 0x0b39, 0x6696, 0x0e5f, 0xffd8,
0x0a44, 0x6669, 0x0f83, 0xffd0, 0x095a, 0x6626, 0x10b4, 0xffc8,
0x087d, 0x65cd, 0x11f0, 0xffbf, 0x07ab, 0x655e, 0x1338, 0xffb6,
0x06e4, 0x64d9, 0x148c, 0xffac, 0x0628, 0x643f, 0x15eb, 0xffa1,
0x0577, 0x638f, 0x1756, 0xff96, 0x04d1, 0x62cb, 0x18cb, 0xff8a,
0x0435, 0x61f3, 0x1a4c, 0xff7e, 0x03a4, 0x6106, 0x1bd7, 0xff71,
0x031c, 0x6007, 0x1d6c, 0xff64, 0x029f, 0x5ef5, 0x1f0b, 0xff56,
0x022a, 0x5dd0, 0x20b3, 0xff48, 0x01be, 0x5c9a, 0x2264, 0xff3a,
0x015b, 0x5b53, 0x241e, 0xff2c, 0x0101, 0x59fc, 0x25e0, 0xff1e,
0x00ae, 0x5896, 0x27a9, 0xff10, 0x0063, 0x5720, 0x297a, 0xff02,
0x001f, 0x559d, 0x2b50, 0xfef4, 0xffe2, 0x540d, 0x2d2c, 0xfee8,
0xffac, 0x5270, 0x2f0d, 0xfedb, 0xff7c, 0x50c7, 0x30f3, 0xfed0,
0xff53, 0x4f14, 0x32dc, 0xfec6, 0xff2e, 0x4d57, 0x34c8, 0xfebd,
0xff0f, 0x4b91, 0x36b6, 0xfeb6, 0xfef5, 0x49c2, 0x38a5, 0xfeb0,
0xfedf, 0x47ed, 0x3a95, 0xfeac, 0xfece, 0x4611, 0x3c85, 0xfeab,
0xfec0, 0x4430, 0x3e74, 0xfeac, 0xfeb6, 0x424a, 0x4060, 0xfeaf,
0xfeaf, 0x4060, 0x424a, 0xfeb6, 0xfeac, 0x3e74, 0x4430, 0xfec0,
0xfeab, 0x3c85, 0x4611, 0xfece, 0xfeac, 0x3a95, 0x47ed, 0xfedf,
0xfeb0, 0x38a5, 0x49c2, 0xfef5, 0xfeb6, 0x36b6, 0x4b91, 0xff0f,
0xfebd, 0x34c8, 0x4d57, 0xff2e, 0xfec6, 0x32dc, 0x4f14, 0xff53,
0xfed0, 0x30f3, 0x50c7, 0xff7c, 0xfedb, 0x2f0d, 0x5270, 0xffac,
0xfee8, 0x2d2c, 0x540d, 0xffe2, 0xfef4, 0x2b50, 0x559d, 0x001f,
0xff02, 0x297a, 0x5720, 0x0063, 0xff10, 0x27a9, 0x5896, 0x00ae,
0xff1e, 0x25e0, 0x59fc, 0x0101, 0xff2c, 0x241e, 0x5b53, 0x015b,
0xff3a, 0x2264, 0x5c9a, 0x01be, 0xff48, 0x20b3, 0x5dd0, 0x022a,
0xff56, 0x1f0b, 0x5ef5, 0x029f, 0xff64, 0x1d6c, 0x6007, 0x031c,
0xff71, 0x1bd7, 0x6106, 0x03a4, 0xff7e, 0x1a4c, 0x61f3, 0x0435,
0xff8a, 0x18cb, 0x62cb, 0x04d1, 0xff96, 0x1756, 0x638f, 0x0577,
0xffa1, 0x15eb, 0x643f, 0x0628, 0xffac, 0x148c, 0x64d9, 0x06e4,
0xffb6, 0x1338, 0x655e, 0x07ab, 0xffbf, 0x11f0, 0x65cd, 0x087d,
0xffc8, 0x10b4, 0x6626, 0x095a, 0xffd0, 0x0f83, 0x6669, 0x0a44,
0xffd8, 0x0e5f, 0x6696, 0x0b39, 0xffdf, 0x0d46, 0x66ad, 0x0c39
(int16_t)0x0c39, (int16_t)0x66ad, (int16_t)0x0d46, (int16_t)0xffdf,
(int16_t)0x0b39, (int16_t)0x6696, (int16_t)0x0e5f, (int16_t)0xffd8,
(int16_t)0x0a44, (int16_t)0x6669, (int16_t)0x0f83, (int16_t)0xffd0,
(int16_t)0x095a, (int16_t)0x6626, (int16_t)0x10b4, (int16_t)0xffc8,
(int16_t)0x087d, (int16_t)0x65cd, (int16_t)0x11f0, (int16_t)0xffbf,
(int16_t)0x07ab, (int16_t)0x655e, (int16_t)0x1338, (int16_t)0xffb6,
(int16_t)0x06e4, (int16_t)0x64d9, (int16_t)0x148c, (int16_t)0xffac,
(int16_t)0x0628, (int16_t)0x643f, (int16_t)0x15eb, (int16_t)0xffa1,
(int16_t)0x0577, (int16_t)0x638f, (int16_t)0x1756, (int16_t)0xff96,
(int16_t)0x04d1, (int16_t)0x62cb, (int16_t)0x18cb, (int16_t)0xff8a,
(int16_t)0x0435, (int16_t)0x61f3, (int16_t)0x1a4c, (int16_t)0xff7e,
(int16_t)0x03a4, (int16_t)0x6106, (int16_t)0x1bd7, (int16_t)0xff71,
(int16_t)0x031c, (int16_t)0x6007, (int16_t)0x1d6c, (int16_t)0xff64,
(int16_t)0x029f, (int16_t)0x5ef5, (int16_t)0x1f0b, (int16_t)0xff56,
(int16_t)0x022a, (int16_t)0x5dd0, (int16_t)0x20b3, (int16_t)0xff48,
(int16_t)0x01be, (int16_t)0x5c9a, (int16_t)0x2264, (int16_t)0xff3a,
(int16_t)0x015b, (int16_t)0x5b53, (int16_t)0x241e, (int16_t)0xff2c,
(int16_t)0x0101, (int16_t)0x59fc, (int16_t)0x25e0, (int16_t)0xff1e,
(int16_t)0x00ae, (int16_t)0x5896, (int16_t)0x27a9, (int16_t)0xff10,
(int16_t)0x0063, (int16_t)0x5720, (int16_t)0x297a, (int16_t)0xff02,
(int16_t)0x001f, (int16_t)0x559d, (int16_t)0x2b50, (int16_t)0xfef4,
(int16_t)0xffe2, (int16_t)0x540d, (int16_t)0x2d2c, (int16_t)0xfee8,
(int16_t)0xffac, (int16_t)0x5270, (int16_t)0x2f0d, (int16_t)0xfedb,
(int16_t)0xff7c, (int16_t)0x50c7, (int16_t)0x30f3, (int16_t)0xfed0,
(int16_t)0xff53, (int16_t)0x4f14, (int16_t)0x32dc, (int16_t)0xfec6,
(int16_t)0xff2e, (int16_t)0x4d57, (int16_t)0x34c8, (int16_t)0xfebd,
(int16_t)0xff0f, (int16_t)0x4b91, (int16_t)0x36b6, (int16_t)0xfeb6,
(int16_t)0xfef5, (int16_t)0x49c2, (int16_t)0x38a5, (int16_t)0xfeb0,
(int16_t)0xfedf, (int16_t)0x47ed, (int16_t)0x3a95, (int16_t)0xfeac,
(int16_t)0xfece, (int16_t)0x4611, (int16_t)0x3c85, (int16_t)0xfeab,
(int16_t)0xfec0, (int16_t)0x4430, (int16_t)0x3e74, (int16_t)0xfeac,
(int16_t)0xfeb6, (int16_t)0x424a, (int16_t)0x4060, (int16_t)0xfeaf,
(int16_t)0xfeaf, (int16_t)0x4060, (int16_t)0x424a, (int16_t)0xfeb6,
(int16_t)0xfeac, (int16_t)0x3e74, (int16_t)0x4430, (int16_t)0xfec0,
(int16_t)0xfeab, (int16_t)0x3c85, (int16_t)0x4611, (int16_t)0xfece,
(int16_t)0xfeac, (int16_t)0x3a95, (int16_t)0x47ed, (int16_t)0xfedf,
(int16_t)0xfeb0, (int16_t)0x38a5, (int16_t)0x49c2, (int16_t)0xfef5,
(int16_t)0xfeb6, (int16_t)0x36b6, (int16_t)0x4b91, (int16_t)0xff0f,
(int16_t)0xfebd, (int16_t)0x34c8, (int16_t)0x4d57, (int16_t)0xff2e,
(int16_t)0xfec6, (int16_t)0x32dc, (int16_t)0x4f14, (int16_t)0xff53,
(int16_t)0xfed0, (int16_t)0x30f3, (int16_t)0x50c7, (int16_t)0xff7c,
(int16_t)0xfedb, (int16_t)0x2f0d, (int16_t)0x5270, (int16_t)0xffac,
(int16_t)0xfee8, (int16_t)0x2d2c, (int16_t)0x540d, (int16_t)0xffe2,
(int16_t)0xfef4, (int16_t)0x2b50, (int16_t)0x559d, (int16_t)0x001f,
(int16_t)0xff02, (int16_t)0x297a, (int16_t)0x5720, (int16_t)0x0063,
(int16_t)0xff10, (int16_t)0x27a9, (int16_t)0x5896, (int16_t)0x00ae,
(int16_t)0xff1e, (int16_t)0x25e0, (int16_t)0x59fc, (int16_t)0x0101,
(int16_t)0xff2c, (int16_t)0x241e, (int16_t)0x5b53, (int16_t)0x015b,
(int16_t)0xff3a, (int16_t)0x2264, (int16_t)0x5c9a, (int16_t)0x01be,
(int16_t)0xff48, (int16_t)0x20b3, (int16_t)0x5dd0, (int16_t)0x022a,
(int16_t)0xff56, (int16_t)0x1f0b, (int16_t)0x5ef5, (int16_t)0x029f,
(int16_t)0xff64, (int16_t)0x1d6c, (int16_t)0x6007, (int16_t)0x031c,
(int16_t)0xff71, (int16_t)0x1bd7, (int16_t)0x6106, (int16_t)0x03a4,
(int16_t)0xff7e, (int16_t)0x1a4c, (int16_t)0x61f3, (int16_t)0x0435,
(int16_t)0xff8a, (int16_t)0x18cb, (int16_t)0x62cb, (int16_t)0x04d1,
(int16_t)0xff96, (int16_t)0x1756, (int16_t)0x638f, (int16_t)0x0577,
(int16_t)0xffa1, (int16_t)0x15eb, (int16_t)0x643f, (int16_t)0x0628,
(int16_t)0xffac, (int16_t)0x148c, (int16_t)0x64d9, (int16_t)0x06e4,
(int16_t)0xffb6, (int16_t)0x1338, (int16_t)0x655e, (int16_t)0x07ab,
(int16_t)0xffbf, (int16_t)0x11f0, (int16_t)0x65cd, (int16_t)0x087d,
(int16_t)0xffc8, (int16_t)0x10b4, (int16_t)0x6626, (int16_t)0x095a,
(int16_t)0xffd0, (int16_t)0x0f83, (int16_t)0x6669, (int16_t)0x0a44,
(int16_t)0xffd8, (int16_t)0x0e5f, (int16_t)0x6696, (int16_t)0x0b39,
(int16_t)0xffdf, (int16_t)0x0d46, (int16_t)0x66ad, (int16_t)0x0c39
};

int32_t rdot(size_t n, const int16_t *x, const int16_t *y)
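The LUT rewrite is behavior-neutral; the likely motivation for the explicit casts is that entries such as 0xffdf have type int with value 65503, which does not fit in int16_t and draws narrowing diagnostics from strict compilers (and is an outright error in C++11 list-initialization). A two-line illustration of the difference:

#include <stdint.h>

static const int16_t with_cast = (int16_t)0xffdf;   /* explicit wrap to -33, no warning */
/* static const int16_t no_cast = 0xffdf; */        /* implementation-defined narrowing,
                                                       typically warned about */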
@@ -29,13 +29,7 @@ extern const int16_t RESAMPLE_LUT[64 * 4];

int32_t rdot(size_t n, const int16_t *x, const int16_t *y);

#ifdef _MSC_VER
#define INLINE __forceinline
#else
#define INLINE inline __attribute__((always_inline))
#endif

INLINE static int16_t adpcm_predict_sample(uint8_t byte, uint8_t mask,
static inline int16_t adpcm_predict_sample(uint8_t byte, uint8_t mask,
unsigned lshift, unsigned rshift)
{
int16_t sample = (uint16_t)(byte & mask) << lshift;
@@ -0,0 +1,39 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Mupen64plus-rsp-hle - common.h *
* Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ *
* Copyright (C) 2014 Bobby Smiles *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#ifndef COMMON_H
#define COMMON_H

/* macro for unused variable warning suppression */
#ifdef __GNUC__
# define UNUSED(x) UNUSED_ ## x __attribute__((__unused__))
#else
# define UNUSED(x) UNUSED_ ## x
#endif

#ifdef _MSC_VER
# define inline __forceinline
#elif defined __GNUC__
# define inline inline __attribute__((always_inline))
#endif

#endif
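A usage sketch for the new header (the handler name is invented for illustration): the macro renames the parameter, so the function body cannot reference it by its original name, and GCC's __unused__ attribute suppresses the -Wunused-parameter warning that the ucode handlers above would otherwise trigger.

#include "common.h"

static void handler(int UNUSED(argument), int used)
{
    /* 'argument' is now spelled UNUSED_argument and tagged unused under GCC */
    (void)used;
}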
@@ -32,6 +32,8 @@
#include <stdio.h>
#endif

#include "common.h"

#include "hle_external.h"
#include "hle_internal.h"
#include "memory.h"
@@ -72,12 +72,6 @@ struct hle_t

/* mp3.c */
uint8_t mp3_buffer[0x1000];
/* FIXME: rewrite mp3 module to avoid these */
uint32_t mp3_inPtr;
uint32_t mp3_outPtr;
uint32_t mp3_t6;
uint32_t mp3_t5;
uint32_t mp3_t4;
};

#endif
@@ -25,6 +25,8 @@
#include <stdint.h>
#include <stdlib.h>

#include "common.h"

#include "arithmetics.h"
#include "hle_external.h"
#include "hle_internal.h"
@@ -21,103 +21,56 @@

#include <string.h>

#include "common.h"

#include "memory.h"

/* Global functions */
void dmem_load_u8(struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count)
void load_u8(uint8_t* dst, const unsigned char* buffer, unsigned address, size_t count)
{
while (count != 0) {
*(dst++) = *dmem_u8(hle, address);
*(dst++) = *u8(buffer, address);
address += 1;
--count;
}
}

void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count)
void load_u16(uint16_t* dst, const unsigned char* buffer, unsigned address, size_t count)
{
while (count != 0) {
*(dst++) = *dmem_u16(hle, address);
*(dst++) = *u16(buffer, address);
address += 2;
--count;
}
}

void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count)
void load_u32(uint32_t* dst, const unsigned char* buffer, unsigned address, size_t count)
{
/* Optimization for uint32_t */
memcpy(dst, dmem_u32(hle, address), count * sizeof(uint32_t));
memcpy(dst, u32(buffer, address), count * sizeof(uint32_t));
}

void dmem_store_u8(struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count)
void store_u8(unsigned char* buffer, unsigned address, const uint8_t* src, size_t count)
{
while (count != 0) {
*dmem_u8(hle, address) = *(src++);
*u8(buffer, address) = *(src++);
address += 1;
--count;
}
}

void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count)
void store_u16(unsigned char* buffer, unsigned address, const uint16_t* src, size_t count)
{
while (count != 0) {
*dmem_u16(hle, address) = *(src++);
*u16(buffer, address) = *(src++);
address += 2;
--count;
}
}

void dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count)
void store_u32(unsigned char* buffer, unsigned address, const uint32_t* src, size_t count)
{
/* Optimization for uint32_t */
memcpy(dmem_u32(hle, address), src, count * sizeof(uint32_t));
}

void dram_load_u8(struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count)
{
while (count != 0) {
*(dst++) = *dram_u8(hle, address);
address += 1;
--count;
}
}

void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count)
{
while (count != 0) {
*(dst++) = *dram_u16(hle, address);
address += 2;
--count;
}
}

void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count)
{
/* Optimization for uint32_t */
memcpy(dst, dram_u32(hle, address), count * sizeof(uint32_t));
}

void dram_store_u8(struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count)
{
while (count != 0) {
*dram_u8(hle, address) = *(src++);
address += 1;
--count;
}
}

void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count)
{
while (count != 0) {
*dram_u16(hle, address) = *(src++);
address += 2;
--count;
}
}

void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count)
{
/* Optimization for uint32_t */
memcpy(dram_u32(hle, address), src, count * sizeof(uint32_t));
memcpy(u32(buffer, address), src, count * sizeof(uint32_t));
}
@@ -57,65 +57,128 @@ enum {
TASK_YIELD_DATA_SIZE = 0xffc
};

#ifdef _MSC_VER
#define INLINE __forceinline
#else
#define INLINE inline __attribute__((always_inline))
#endif

INLINE static unsigned int align(unsigned int x, unsigned amount)
static inline unsigned int align(unsigned int x, unsigned amount)
{
--amount;
return (x + amount) & ~amount;
}

INLINE static uint8_t* const dmem_u8(struct hle_t* hle, uint16_t address)
static inline uint8_t* u8(const unsigned char* buffer, unsigned address)
{
return (uint8_t*)(&hle->dmem[(address & 0xfff) ^ S8]);
return (uint8_t*)(buffer + (address ^ S8));
}

INLINE static uint16_t* const dmem_u16(struct hle_t* hle, uint16_t address)
static inline uint16_t* u16(const unsigned char* buffer, unsigned address)
{
assert((address & 1) == 0);
return (uint16_t*)(&hle->dmem[(address & 0xfff) ^ S16]);
return (uint16_t*)(buffer + (address ^ S16));
}

INLINE static uint32_t* const dmem_u32(struct hle_t* hle, uint16_t address)
static inline uint32_t* u32(const unsigned char* buffer, unsigned address)
{
assert((address & 3) == 0);
return (uint32_t*)(&hle->dmem[(address & 0xfff)]);
return (uint32_t*)(buffer + address);
}

INLINE static uint8_t* const dram_u8(struct hle_t* hle, uint32_t address)
void load_u8 (uint8_t* dst, const unsigned char* buffer, unsigned address, size_t count);
void load_u16(uint16_t* dst, const unsigned char* buffer, unsigned address, size_t count);
void load_u32(uint32_t* dst, const unsigned char* buffer, unsigned address, size_t count);
void store_u8 (unsigned char* buffer, unsigned address, const uint8_t* src, size_t count);
void store_u16(unsigned char* buffer, unsigned address, const uint16_t* src, size_t count);
void store_u32(unsigned char* buffer, unsigned address, const uint32_t* src, size_t count);

/* convenient functions for DMEM access */
static inline uint8_t* dmem_u8(struct hle_t* hle, uint16_t address)
{
return (uint8_t*)&hle->dram[(address & 0xffffff) ^ S8];
return u8(hle->dmem, address & 0xfff);
}

INLINE static uint16_t* const dram_u16(struct hle_t* hle, uint32_t address)
static inline uint16_t* dmem_u16(struct hle_t* hle, uint16_t address)
{
assert((address & 1) == 0);
return (uint16_t*)&hle->dram[(address & 0xffffff) ^ S16];
return u16(hle->dmem, address & 0xfff);
}

INLINE static uint32_t* const dram_u32(struct hle_t* hle, uint32_t address)
static inline uint32_t* dmem_u32(struct hle_t* hle, uint16_t address)
{
assert((address & 3) == 0);
return (uint32_t*)&hle->dram[address & 0xffffff];
return u32(hle->dmem, address & 0xfff);
}

void dmem_load_u8 (struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count);
void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count);
void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count);
void dmem_store_u8 (struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count);
void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count);
void dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count);
static inline void dmem_load_u8(struct hle_t* hle, uint8_t* dst, uint16_t address, size_t count)
{
load_u8(dst, hle->dmem, address & 0xfff, count);
}

void dram_load_u8 (struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count);
void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count);
void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count);
void dram_store_u8 (struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count);
void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count);
void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count);
static inline void dmem_load_u16(struct hle_t* hle, uint16_t* dst, uint16_t address, size_t count)
{
load_u16(dst, hle->dmem, address & 0xfff, count);
}

static inline void dmem_load_u32(struct hle_t* hle, uint32_t* dst, uint16_t address, size_t count)
{
load_u32(dst, hle->dmem, address & 0xfff, count);
}

static inline void dmem_store_u8(struct hle_t* hle, const uint8_t* src, uint16_t address, size_t count)
{
store_u8(hle->dmem, address & 0xfff, src, count);
}

static inline void dmem_store_u16(struct hle_t* hle, const uint16_t* src, uint16_t address, size_t count)
{
store_u16(hle->dmem, address & 0xfff, src, count);
}

static inline void dmem_store_u32(struct hle_t* hle, const uint32_t* src, uint16_t address, size_t count)
{
store_u32(hle->dmem, address & 0xfff, src, count);
}

/* convenient functions DRAM access */
static inline uint8_t* dram_u8(struct hle_t* hle, uint32_t address)
{
return u8(hle->dram, address & 0xffffff);
}

static inline uint16_t* dram_u16(struct hle_t* hle, uint32_t address)
{
return u16(hle->dram, address & 0xffffff);
}

static inline uint32_t* dram_u32(struct hle_t* hle, uint32_t address)
{
return u32(hle->dram, address & 0xffffff);
}

static inline void dram_load_u8(struct hle_t* hle, uint8_t* dst, uint32_t address, size_t count)
{
load_u8(dst, hle->dram, address & 0xffffff, count);
}

static inline void dram_load_u16(struct hle_t* hle, uint16_t* dst, uint32_t address, size_t count)
{
load_u16(dst, hle->dram, address & 0xffffff, count);
}

static inline void dram_load_u32(struct hle_t* hle, uint32_t* dst, uint32_t address, size_t count)
{
load_u32(dst, hle->dram, address & 0xffffff, count);
}

static inline void dram_store_u8(struct hle_t* hle, const uint8_t* src, uint32_t address, size_t count)
{
store_u8(hle->dram, address & 0xffffff, src, count);
}

static inline void dram_store_u16(struct hle_t* hle, const uint16_t* src, uint32_t address, size_t count)
{
store_u16(hle->dram, address & 0xffffff, src, count);
}

static inline void dram_store_u32(struct hle_t* hle, const uint32_t* src, uint32_t address, size_t count)
{
store_u32(hle->dram, address & 0xffffff, src, count);
}

#endif
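The u8()/u16() helpers fold the old per-space XOR swizzles into one place. The XOR exists because the emulator keeps DMEM/DRAM as big-endian 32-bit words on a little-endian host; assuming the usual definitions S8 == 3 and S16 == 2 (both 0 on big-endian builds, values not shown in this diff), byte and halfword accesses flip the low address bits to land on the right host bytes:

#include <stdint.h>

static uint8_t be_u8(const unsigned char *buffer, unsigned address)
{
    return buffer[address ^ 3];                           /* address ^ S8 */
}

static uint16_t be_u16(const unsigned char *buffer, unsigned address)
{
    /* address is asserted even upstream; XOR with 2 stays 2-byte aligned */
    return *(const uint16_t *)(buffer + (address ^ 2));   /* address ^ S16 */
}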
@@ -1,6 +1,7 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Mupen64plus-rsp-hle - ucode3mp3.h *
* Mupen64plus-rsp-hle - mp3.c *
* Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ *
* Copyright (C) 2014 Bobby Smiles *
* Copyright (C) 2009 Richard Goedeken *
* Copyright (C) 2002 Hacktarux *
* *
@@ -23,10 +24,16 @@
#include <string.h>
#include <stdint.h>

#include "hle_external.h"
#include "common.h"

#include "arithmetics.h"
#include "hle_internal.h"
#include "memory.h"

static void InnerLoop(struct hle_t* hle,
uint32_t outPtr, uint32_t inPtr,
uint32_t t6, uint32_t t5, uint32_t t4);

static const uint16_t DeWindowLUT [0x420] = {
0x0000, 0xFFF3, 0x005D, 0xFF38, 0x037A, 0xF736, 0x0B37, 0xC00E,
0x7FFF, 0x3FF2, 0x0B37, 0x08CA, 0x037A, 0x00C8, 0x005D, 0x000D,
@@ -198,10 +205,13 @@ static void MP3AB0(int32_t* v)
}
}

static void InnerLoop(struct hle_t* hle);

void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
{
uint32_t inPtr, outPtr;
uint32_t t6;/* = 0x08A0; - I think these are temporary storage buffers */
uint32_t t5;/* = 0x0AC0; */
uint32_t t4;/* = (w1 & 0x1E); */

/* Initialization Code */
uint32_t readPtr; /* s5 */
uint32_t writePtr; /* s6 */
@@ -209,9 +219,9 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
int cnt, cnt2;

/* I think these are temporary storage buffers */
hle->mp3_t6 = 0x08A0;
hle->mp3_t5 = 0x0AC0;
hle->mp3_t4 = index;
t6 = 0x08A0;
t5 = 0x0AC0;
t4 = index;

writePtr = readPtr = address;
/* Just do that for efficiency... may remove and use directly later anyway */
@@ -222,20 +232,21 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
for (cnt = 0; cnt < 0x480; cnt += 0x180) {
/* DMA: 0xCF0 <- RDRAM[s5] : 0x180 */
memcpy(hle->mp3_buffer + 0xCF0, hle->dram + readPtr, 0x180);
hle->mp3_inPtr = 0xCF0; /* s7 */
hle->mp3_outPtr = 0xE70; /* s3 */
inPtr = 0xCF0; /* s7 */
outPtr = 0xE70; /* s3 */
/* --------------- Inner Loop Start -------------------- */
for (cnt2 = 0; cnt2 < 0x180; cnt2 += 0x40) {
hle->mp3_t6 &= 0xFFE0;
hle->mp3_t5 &= 0xFFE0;
hle->mp3_t6 |= hle->mp3_t4;
hle->mp3_t5 |= hle->mp3_t4;
InnerLoop(hle);
hle->mp3_t4 = (hle->mp3_t4 - 2) & 0x1E;
tmp = hle->mp3_t6;
hle->mp3_t6 = hle->mp3_t5;
hle->mp3_t5 = tmp;
hle->mp3_inPtr += 0x40;
t6 &= 0xFFE0;
t5 &= 0xFFE0;
t6 |= t4;
t5 |= t4;
InnerLoop(hle, outPtr, inPtr, t6, t5, t4);
t4 = (t4 - 2) & 0x1E;
tmp = t6;
t6 = t5;
t5 = tmp;
inPtr += 0x40;
outPtr += 0x40;
}
/* --------------- Inner Loop End -------------------- */
memcpy(hle->dram + writePtr, hle->mp3_buffer + 0xe70, 0x180);
@@ -244,9 +255,9 @@ void mp3_task(struct hle_t* hle, unsigned int index, uint32_t address)
}
}

static void InnerLoop(struct hle_t* hle)
static void InnerLoop(struct hle_t* hle,
uint32_t outPtr, uint32_t inPtr,
uint32_t t6, uint32_t t5, uint32_t t4)
{
/* Part 1: 100% Accurate */

@@ -274,56 +285,56 @@ static void InnerLoop(struct hle_t* hle)
int32_t vt;
int32_t v[32];

v[0] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3E ^ S16));
v[0] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3E ^ S16));
v[0] += v[31];
v[1] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3C ^ S16));
v[1] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3C ^ S16));
v[1] += v[30];
v[2] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x38 ^ S16));
v[2] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x38 ^ S16));
v[2] += v[28];
v[3] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3A ^ S16));
v[3] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3A ^ S16));
v[3] += v[29];

v[4] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x30 ^ S16));
v[4] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x30 ^ S16));
v[4] += v[24];
v[5] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x32 ^ S16));
v[5] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x32 ^ S16));
v[5] += v[25];
v[6] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x36 ^ S16));
v[6] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x36 ^ S16));
v[6] += v[27];
v[7] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x34 ^ S16));
v[7] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x34 ^ S16));
v[7] += v[26];

v[8] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x20 ^ S16));
v[8] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x20 ^ S16));
v[8] += v[16];
v[9] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x22 ^ S16));
v[9] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x22 ^ S16));
v[9] += v[17];
v[10] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x26 ^ S16));
v[10] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x26 ^ S16));
v[10] += v[19];
v[11] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1A ^ S16));
v[18] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x24 ^ S16));
v[11] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1A ^ S16));
v[18] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x24 ^ S16));
v[11] += v[18];

v[12] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x10 ^ S16));
v[23] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2E ^ S16));
v[12] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x10 ^ S16));
v[23] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2E ^ S16));
v[12] += v[23];
v[13] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x12 ^ S16));
v[22] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2C ^ S16));
v[13] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x12 ^ S16));
v[22] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2C ^ S16));
v[13] += v[22];
v[14] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x16 ^ S16));
v[20] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x28 ^ S16));
v[14] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x16 ^ S16));
v[20] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x28 ^ S16));
v[14] += v[20];
v[15] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x14 ^ S16));
v[21] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2A ^ S16));
v[15] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x14 ^ S16));
v[21] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2A ^ S16));
v[15] += v[21];

/* Part 2-4 */
@@ -332,10 +343,10 @@ static void InnerLoop(struct hle_t* hle)

/* Part 5 - 1-Wide Butterflies - 100% Accurate but need SSVs!!! */

t0 = hle->mp3_t6 + 0x100;
t1 = hle->mp3_t6 + 0x200;
t2 = hle->mp3_t5 + 0x100;
t3 = hle->mp3_t5 + 0x200;
t0 = t6 + 0x100;
t1 = t6 + 0x200;
t2 = t5 + 0x100;
t3 = t5 + 0x200;

/* 0x13A8 */
v[1] = 0;
@@ -344,14 +355,14 @@ static void InnerLoop(struct hle_t* hle)
v[16] = -v[16] - v[17];
v[2] = v[18] + v[19];
/* ** Store v[11] -> (T6 + 0)** */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x0))) = (short)v[11];
*(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x0))) = (short)v[11];

v[11] = -v[11];
/* ** Store v[16] -> (T3 + 0)** */
*(int16_t *)(hle->mp3_buffer + ((t3 + (short)0x0))) = (short)v[16];
/* ** Store v[11] -> (T5 + 0)** */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x0))) = (short)v[11];
*(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x0))) = (short)v[11];
/* 0x13E8 - Verified.... */
v[2] = -v[2];
/* ** Store v[2] -> (T2 + 0)** */
@@ -398,7 +409,7 @@ static void InnerLoop(struct hle_t* hle)
v[17] = v[13] - v[10];
v[9] = v[9] + v[14];
/* ** Store v[9] -> (T6 + 0x40) */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x40))) = (short)v[9];
*(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x40))) = (short)v[9];
v[11] = v[11] - v[13];
/* ** Store v[17] -> (T0 + 0xFFC0) */
*(int16_t *)(hle->mp3_buffer + ((t0 + (short)0xFFC0))) = (short)v[17];
@@ -414,63 +425,63 @@ static void InnerLoop(struct hle_t* hle)
/* ** Store v[8] -> (T3 + 0xFFC0) */
*(int16_t *)(hle->mp3_buffer + ((t3 + (short)0xFFC0))) = (short)v[8];
/* ** Store v[14] -> (T5 + 0x40) */
*(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x40))) = (short)v[14];
*(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x40))) = (short)v[14];
/* ** Store v[10] -> (T2 + 0xFFC0) */
*(int16_t *)(hle->mp3_buffer + ((t2 + (short)0xFFC0))) = (short)v[10];
/* 0x14FC - Verified... */

/* Part 6 - 100% Accurate */

v[0] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3E ^ S16));
v[0] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x00 ^ S16));
v[31] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3E ^ S16));
v[0] -= v[31];
v[1] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3C ^ S16));
v[1] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x02 ^ S16));
v[30] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3C ^ S16));
v[1] -= v[30];
v[2] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x38 ^ S16));
v[2] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x06 ^ S16));
v[28] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x38 ^ S16));
v[2] -= v[28];
v[3] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x3A ^ S16));
v[3] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x04 ^ S16));
v[29] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x3A ^ S16));
v[3] -= v[29];

v[4] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x30 ^ S16));
v[4] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0E ^ S16));
v[24] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x30 ^ S16));
v[4] -= v[24];
v[5] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x32 ^ S16));
v[5] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0C ^ S16));
v[25] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x32 ^ S16));
v[5] -= v[25];
v[6] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x36 ^ S16));
v[6] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x08 ^ S16));
v[27] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x36 ^ S16));
v[6] -= v[27];
v[7] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x34 ^ S16));
v[7] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x0A ^ S16));
v[26] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x34 ^ S16));
v[7] -= v[26];

v[8] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x20 ^ S16));
v[8] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1E ^ S16));
v[16] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x20 ^ S16));
v[8] -= v[16];
v[9] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x22 ^ S16));
v[9] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1C ^ S16));
v[17] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x22 ^ S16));
v[9] -= v[17];
v[10] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x26 ^ S16));
v[10] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x18 ^ S16));
v[19] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x26 ^ S16));
|
||||
v[10] -= v[19];
|
||||
v[11] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x1A ^ S16));
|
||||
v[18] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x24 ^ S16));
|
||||
v[11] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x1A ^ S16));
|
||||
v[18] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x24 ^ S16));
|
||||
v[11] -= v[18];
|
||||
|
||||
v[12] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x10 ^ S16));
|
||||
v[23] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2E ^ S16));
|
||||
v[12] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x10 ^ S16));
|
||||
v[23] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2E ^ S16));
|
||||
v[12] -= v[23];
|
||||
v[13] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x12 ^ S16));
|
||||
v[22] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2C ^ S16));
|
||||
v[13] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x12 ^ S16));
|
||||
v[22] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2C ^ S16));
|
||||
v[13] -= v[22];
|
||||
v[14] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x16 ^ S16));
|
||||
v[20] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x28 ^ S16));
|
||||
v[14] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x16 ^ S16));
|
||||
v[20] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x28 ^ S16));
|
||||
v[14] -= v[20];
|
||||
v[15] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x14 ^ S16));
|
||||
v[21] = *(int16_t *)(hle->mp3_buffer + hle->mp3_inPtr + (0x2A ^ S16));
|
||||
v[15] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x14 ^ S16));
|
||||
v[21] = *(int16_t *)(hle->mp3_buffer + inPtr + (0x2A ^ S16));
|
||||
v[15] -= v[21];
|
||||
|
||||
for (i = 0; i < 16; i++)
|
||||
|
@@ -516,10 +527,10 @@ static void InnerLoop(struct hle_t* hle)
         v[14] = v[6] - v[14];
         v[15] = (((v[30] - v[31]) * 0x5A827) >> 0x10) - v[7];
         /* Store v14 -> (T5 + 0x20) */
-        *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x20))) = (short)v[14];
+        *(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x20))) = (short)v[14];
         v[14] = v[14] + v[1];
         /* Store v[14] -> (T6 + 0x20) */
-        *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x20))) = (short)v[14];
+        *(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x20))) = (short)v[14];
         /* Store v[15] -> (T1 + 0xFFE0) */
         *(int16_t *)(hle->mp3_buffer + ((t1 + (short)0xFFE0))) = (short)v[15];
         v[9] = v[9] + v[10];

@@ -527,7 +538,7 @@ static void InnerLoop(struct hle_t* hle)
         v[6] = v[10] - v[6];
         v[1] = v[9] - v[1];
         /* Store v[6] -> (T5 + 0x60) */
-        *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t5 + (short)0x60))) = (short)v[6];
+        *(int16_t *)(hle->mp3_buffer + ((t5 + (short)0x60))) = (short)v[6];
         v[10] = v[10] + v[2];
         v[10] = v[4] - v[10];
         /* Store v[10] -> (T2 + 0xFFA0) */

@@ -547,7 +558,7 @@ static void InnerLoop(struct hle_t* hle)
         *(int16_t *)(hle->mp3_buffer + ((t1 + (short)0xFFA0))) = (short)v[7];
         v[11] = v[11] - v[3];
         /* Store v[1] -> (T6 + 0x60) */
-        *(int16_t *)(hle->mp3_buffer + ((hle->mp3_t6 + (short)0x60))) = (short)v[1];
+        *(int16_t *)(hle->mp3_buffer + ((t6 + (short)0x60))) = (short)v[1];
         v[11] = v[11] - v[5];
         /* Store v[11] -> (T0 + 0x60) */
         *(int16_t *)(hle->mp3_buffer + ((t0 + (short)0x60))) = (short)v[11];

@@ -564,9 +575,9 @@ static void InnerLoop(struct hle_t* hle)

         /* Step 8 - Dewindowing */

-        addptr = hle->mp3_t6 & 0xFFE0;
+        addptr = t6 & 0xFFE0;

-        offset = 0x10 - (hle->mp3_t4 >> 1);
+        offset = 0x10 - (t4 >> 1);
         for (x = 0; x < 8; x++) {
             int32_t v0;
             int32_t v18;

@@ -585,14 +596,14 @@ static void InnerLoop(struct hle_t* hle)
             /* Clamp(v0); */
             /* Clamp(v18); */
             /* clamp??? */
-            *(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v0;
-            *(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 2) ^ S16)) = v18;
-            hle->mp3_outPtr += 4;
+            *(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v0;
+            *(int16_t *)(hle->mp3_buffer + ((outPtr + 2) ^ S16)) = v18;
+            outPtr += 4;
             addptr += 0x30;
             offset += 0x38;
         }

-        offset = 0x10 - (hle->mp3_t4 >> 1) + 8 * 0x40;
+        offset = 0x10 - (t4 >> 1) + 8 * 0x40;
         v2 = v4 = 0;
         for (i = 0; i < 4; i++) {
             v2 += ((int)*(int16_t *)(hle->mp3_buffer + (addptr) + 0x00) * (short)DeWindowLUT[offset + 0x00] + 0x4000) >> 0xF;

@@ -606,12 +617,12 @@ static void InnerLoop(struct hle_t* hle)
         }
         mult6 = *(int32_t *)(hle->mp3_buffer + 0xCE8);
         mult4 = *(int32_t *)(hle->mp3_buffer + 0xCEC);
-        if (hle->mp3_t4 & 0x2) {
+        if (t4 & 0x2) {
             v2 = (v2 * *(uint32_t *)(hle->mp3_buffer + 0xCE8)) >> 0x10;
-            *(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v2;
+            *(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v2;
         } else {
             v4 = (v4 * *(uint32_t *)(hle->mp3_buffer + 0xCE8)) >> 0x10;
-            *(int16_t *)(hle->mp3_buffer + (hle->mp3_outPtr ^ S16)) = v4;
+            *(int16_t *)(hle->mp3_buffer + (outPtr ^ S16)) = v4;
             mult4 = *(uint32_t *)(hle->mp3_buffer + 0xCE8);
         }
         addptr -= 0x50;

@@ -621,7 +632,7 @@ static void InnerLoop(struct hle_t* hle)
             int32_t v18;
             v2 = v4 = v6 = v8 = 0;

-            offset = (0x22F - (hle->mp3_t4 >> 1) + x * 0x40);
+            offset = (0x22F - (t4 >> 1) + x * 0x40);

             for (i = 0; i < 4; i++) {
                 v2 += ((int)*(int16_t *)(hle->mp3_buffer + (addptr) + 0x20) * (short)DeWindowLUT[offset + 0x00] + 0x4000) >> 0xF;

@@ -640,13 +651,13 @@ static void InnerLoop(struct hle_t* hle)
             /* Clamp(v0); */
             /* Clamp(v18); */
             /* clamp??? */
-            *(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 2) ^ S16)) = v0;
-            *(int16_t *)(hle->mp3_buffer + ((hle->mp3_outPtr + 4) ^ S16)) = v18;
-            hle->mp3_outPtr += 4;
+            *(int16_t *)(hle->mp3_buffer + ((outPtr + 2) ^ S16)) = v0;
+            *(int16_t *)(hle->mp3_buffer + ((outPtr + 4) ^ S16)) = v18;
+            outPtr += 4;
             addptr -= 0x50;
         }

-        tmp = hle->mp3_outPtr;
+        tmp = outPtr;
         hi0 = mult6;
         hi1 = mult4;
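The multiply-accumulate lines above, of the shape ((int)*(int16_t *)(...) * (short)DeWindowLUT[...] + 0x4000) >> 0xF, are the usual Q15 fixed-point idiom: widen to 32 bits, multiply, add half an LSB (0x4000) to round to nearest, then shift back down by 15. A self-contained illustration, not project code:

#include <stdint.h>

/* Round-to-nearest Q15 multiply, the same shape as the DeWindowLUT
   accumulation in the loops above. */
static int16_t q15_mul(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b + 0x4000) >> 0xF);
}

/* Example: q15_mul(0x4000, 0x2000) == 0x1000,
   i.e. 0.5 * 0.25 == 0.125 in Q15. */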
@@ -655,43 +666,20 @@ static void InnerLoop(struct hle_t* hle)
         for (i = 0; i < 8; i++) {
             /* v0 */
             vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x40) ^ S16)) * hi0);
-            if (vt > 32767) {
-                vt = 32767;
-            } else {
-                if (vt < -32767)
-                    vt = -32767;
-            }
-            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x40) ^ S16)) = (int16_t)vt;
+            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x40) ^ S16)) = clamp_s16(vt);

             /* v17 */
             vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x30) ^ S16)) * hi0);
-            if (vt > 32767) {
-                vt = 32767;
-            } else {
-                if (vt < -32767)
-                    vt = -32767;
-            }
-            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x30) ^ S16)) = vt;
+            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x30) ^ S16)) = clamp_s16(vt);

             /* v2 */
             vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0x1E) ^ S16)) * hi1);
-            if (vt > 32767) {
-                vt = 32767;
-            } else {
-                if (vt < -32767)
-                    vt = -32767;
-            }
-            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x1E) ^ S16)) = vt;
+            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0x1E) ^ S16)) = clamp_s16(vt);

             /* v4 */
             vt = (*(int16_t *)(hle->mp3_buffer + ((tmp - 0xE) ^ S16)) * hi1);
-            if (vt > 32767) {
-                vt = 32767;
-            } else {
-                if (vt < -32767)
-                    vt = -32767;
-            }
-            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0xE) ^ S16)) = vt;
+            *(int16_t *)((uint8_t *)hle->mp3_buffer + ((tmp - 0xE) ^ S16)) = clamp_s16(vt);

             tmp += 2;
         }
     }
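The hunk above replaces four open-coded saturation blocks with calls to clamp_s16(). The helper itself is not part of this diff; a minimal sketch of what such a function looks like follows, assuming it saturates to the full int16_t range. Note the open-coded version it replaces pinned the low end at -32767, so the exact lower bound is worth checking in the project's own definition.

#include <stdint.h>

/* Hedged sketch of a 16-bit saturating clamp; the project's actual
   definition (and its exact lower bound) lives elsewhere in the tree. */
static int16_t clamp_s16(int32_t x)
{
    if (x > INT16_MAX)
        return INT16_MAX;
    if (x < INT16_MIN)
        return INT16_MIN;
    return (int16_t)x;
}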
@@ -28,6 +28,8 @@
 #include <string.h>
+#include <stddef.h>

+#include "common.h"

 #include "arithmetics.h"
 #include "audio.h"
 #include "hle_external.h"

@@ -436,20 +438,16 @@ static void init_subframes_v1(musyx_t *musyx)
 static void init_subframes_v2(musyx_t *musyx)
 {
+    unsigned i,k;
+    int16_t values[4];
+    int16_t* subframes[4];

-    int16_t values[4] = {
-        clamp_s16(musyx->base_vol[0]),
-        clamp_s16(musyx->base_vol[1]),
-        clamp_s16(musyx->base_vol[2]),
-        clamp_s16(musyx->base_vol[3])
-    };
+    for(k = 0; k < 4; ++k)
+        values[k] = clamp_s16(musyx->base_vol[k]);

-    int16_t* subframes[4] = {
-        musyx->left,
-        musyx->right,
-        musyx->cc0,
-        musyx->e50
-    };
+    subframes[0] = musyx->left;
+    subframes[1] = musyx->right;
+    subframes[2] = musyx->cc0;
+    subframes[3] = musyx->e50;

     for (i = 0; i < SUBFRAME_SIZE; ++i) {

@@ -855,7 +853,7 @@ static void sfx_stage(struct hle_t* hle, mix_sfx_with_main_subframes_t mix_sfx_w
 }

 static void mix_sfx_with_main_subframes_v1(musyx_t *musyx, const int16_t *subframe,
-                                           const uint16_t* gains)
+                                           const uint16_t* UNUSED(gains))
 {
     unsigned i;
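The gains parameter in the hunk above is wrapped in UNUSED(), presumably the macro the newly added common.h include provides. The definition below is a common pattern for such a macro and is an assumption, not this project's verbatim code:

/* Hypothetical UNUSED() parameter macro: keeps the parameter in the
   signature while silencing unused-parameter warnings under GCC/Clang,
   and degrades to a no-op elsewhere. */
#ifdef __GNUC__
#define UNUSED(x) x __attribute__((unused))
#else
#define UNUSED(x) x
#endif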
@@ -47,6 +47,12 @@ int usf_upload_section(void * state, const uint8_t * data, size_t size);
    Requesting zero samples with a null pointer is an acceptable way to
    force at least one block of samples to render and return the current
    sample rate in the variable passed in.
+   Requesting a non-zero number of samples with a null buffer pointer will
+   result in exactly count samples being rendered and discarded.
+   Emulation runs in whole blocks until there have been exactly enough
+   Audio Interface DMA transfers to at least fill count samples, at which
+   point the remainder is buffered in the emulator state until the next
+   usf_render() call.
    Returns 0 on success, or a pointer to the last error message on failure. */
 const char * usf_render(void * state, int16_t * buffer, size_t count, int32_t * sample_rate);
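Putting the documented contract together, a hedged usage sketch follows. It assumes state already holds a loaded USF (fed through usf_upload_section(), declared above) and that the buffer is interleaved stereo; both are assumptions from context, not guarantees made by this header.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Hedged sketch of the three documented usf_render() calling patterns. */
static int render_some(void * state)
{
    int32_t sample_rate = 0;
    int16_t pcm[2 * 1024];       /* interleaved stereo is an assumption */
    const char * err;

    /* Zero samples, null buffer: run one block and learn the rate. */
    err = usf_render(state, NULL, 0, &sample_rate);
    if (err) { fprintf(stderr, "usf: %s\n", err); return -1; }

    /* Non-zero count, null buffer: render and discard (e.g. seeking). */
    err = usf_render(state, NULL, 1024, &sample_rate);
    if (err) { fprintf(stderr, "usf: %s\n", err); return -1; }

    /* Normal call: fill pcm with 1024 samples. */
    err = usf_render(state, pcm, 1024, &sample_rate);
    if (err) { fprintf(stderr, "usf: %s\n", err); return -1; }
    return 0;
}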
@@ -77,10 +77,20 @@ struct usf_state
     int16_t * sample_buffer;

     // audio.c
+    // SampleRate is usually guaranteed to stay the same for the duration
+    // of a given track, and depends on the game.
     int32_t SampleRate;
+    // Audio is rendered in whole Audio Interface DMA transfers, which are
+    // then copied directly to the caller's buffer. Any leftover samples
+    // from the last DMA transfer that fills the caller's buffer will be
+    // stored here until the next call to usf_render().
     int16_t samplebuf[16384];
     size_t samples_in_buffer;

+    // This buffer does not really need to be that large, as it is likely
+    // to only accumulate a handful of error messages, at which point
+    // emulation is immediately halted and the messages are returned to
+    // the caller.
     const char * last_error;
     char error_message[1024];
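To make the new buffering comment concrete, here is a minimal sketch of how the samplebuf/samples_in_buffer pair might be drained at the start of the next usf_render() call. The helper name and the stereo (two int16_t per sample) layout are illustrative assumptions, not code from this repository.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Hedged sketch: copy up to want_frames leftover frames to the caller,
   then shift any remainder to the front for the call after this one. */
static size_t drain_leftover(struct usf_state * state, int16_t * out,
                             size_t want_frames)
{
    size_t take = state->samples_in_buffer < want_frames
                ? state->samples_in_buffer : want_frames;

    memcpy(out, state->samplebuf, take * 2 * sizeof(int16_t));
    state->samples_in_buffer -= take;
    memmove(state->samplebuf, state->samplebuf + take * 2,
            state->samples_in_buffer * 2 * sizeof(int16_t));
    return take;
}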
@@ -1030,6 +1030,8 @@ static int usf_info(void * context, const char * name, const char * value)
         sampleRate = samplerate;

         usfRemoveSilence = YES;
+
+        silence_seconds = 10;
     }
     else if ( type == 0x22 )
     {
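The silence_seconds value set above presumably feeds a consecutive-silence counter somewhere in the playback path; that code is not part of this diff. A hypothetical sketch of such a cutoff, with invented names throughout:

#include <stdint.h>
#include <stddef.h>

/* Hypothetical: count consecutive all-zero stereo frames and report once
   they span silence_seconds worth of audio. */
static size_t silent_run = 0;

static int hit_silence_limit(const int16_t * pcm, size_t frames,
                             int32_t sample_rate, int silence_seconds)
{
    size_t limit = (size_t)silence_seconds * (size_t)sample_rate;
    size_t i;
    for (i = 0; i < frames; ++i) {
        if (pcm[2 * i] == 0 && pcm[2 * i + 1] == 0)
            ++silent_run;
        else
            silent_run = 0;
        if (silent_run >= limit)
            return 1; /* treat as end of track */
    }
    return 0;
}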