OpenMPT: Enable SSE2 and ARM NEON optimizations for a slight improvement in performance

CQTexperiment
Christopher Snowhill 2021-12-22 00:18:57 -08:00
parent c2585f5567
commit 7da3324e32
8 changed files with 8912 additions and 23 deletions

View File

@ -183,12 +183,12 @@
//#define MPT_WITH_MEDIAFOUNDATION
//#define MPT_WITH_MINIMP3
//#define MPT_WITH_MINIZ
#define MPT_WITH_MPG123
#define MPT_WITH_OGG
//#define MPT_WITH_MPG123
//#define MPT_WITH_OGG
//#define MPT_WITH_STBVORBIS
#define MPT_WITH_VORBIS
#define MPT_WITH_VORBISFILE
#define MPT_WITH_ZLIB
//#define MPT_WITH_VORBIS
//#define MPT_WITH_VORBISFILE
//#define MPT_WITH_ZLIB
#endif // LIBOPENMPT_BUILD
@ -405,10 +405,6 @@
#endif
#endif
#if !MPT_COMPILER_MSVC && defined(ENABLE_ASM)
#undef ENABLE_ASM // inline assembly requires MSVC compiler
#endif
#if defined(ENABLE_ASM)
#if MPT_COMPILER_MSVC && defined(_M_IX86)
@ -448,6 +444,24 @@
// Generate AVX2 instructions (only used when the CPU supports it).
#define ENABLE_AVX2
#elif MPT_BUILD_XCODE && defined(__x86_64__)
// No CPUID enabled, only one code path supported anyway
// #define ENABLE_CPUID
// Enable the SSE2 intrinsic functions unconditionally
#define ENABLE_SSE2
#elif MPT_BUILD_XCODE && defined(__arm64__)
// No CPUID, it's kind of a pain on ARM anyway
// #define ENABLE_CPUID
// Enable the NEON intrinsic functions unconditionally
#define ENABLE_NEON
#else
#undef ENABLE_ASM
#endif // arch
#endif // ENABLE_ASM

View File

@ -218,6 +218,54 @@ void InitProcSupport()
}
#elif MPT_BUILD_XCODE && defined(__x86_64__)
// Xcode x86_64 variant: no CPUID query is made here. The identification
// fields are cleared and the feature mask is set to a fixed value that
// reports intrinsics and SSE2 as available.
void InitProcSupport()
{
	// Nothing is probed, so all identification data stays empty / zero.
	mpt::String::WriteAutoBuf(ProcVendorID) = "";
	mpt::String::WriteAutoBuf(ProcBrandID) = "";
	ProcRawCPUID = 0;
	ProcFamily = 0;
	ProcModel = 0;
	ProcStepping = 0;

	// Fixed feature set for this target; the "real" mask mirrors it.
	ProcSupport = PROCSUPPORT_ASM_INTRIN | PROCSUPPORT_SSE2;
	RealProcSupport = ProcSupport;
}
#elif MPT_BUILD_XCODE && defined(__arm64__)
// Xcode arm64 variant: no CPU feature detection is performed here. The
// identification fields are cleared and the feature mask is set to a fixed
// value that reports intrinsics and NEON as available.
void InitProcSupport()
{
	// Nothing is probed, so all identification data stays empty / zero.
	mpt::String::WriteAutoBuf(ProcVendorID) = "";
	mpt::String::WriteAutoBuf(ProcBrandID) = "";
	ProcRawCPUID = 0;
	ProcFamily = 0;
	ProcModel = 0;
	ProcStepping = 0;

	// Fixed feature set for this target; the "real" mask mirrors it.
	ProcSupport = PROCSUPPORT_ASM_INTRIN | PROCSUPPORT_NEON;
	RealProcSupport = ProcSupport;
}
#else // !( MPT_COMPILER_MSVC && ENABLE_X86 )
@ -229,6 +277,16 @@ void InitProcSupport()
#endif // MPT_COMPILER_MSVC && ENABLE_X86
#ifndef MODPLUG_TRACKER
// File-local helper object: its constructor calls InitProcSupport(), so the
// processor feature flags are filled in when this translation unit's static
// objects are constructed, without an explicit call from elsewhere.
static struct initProcSupport
{
	initProcSupport() { InitProcSupport(); }
} doInitProcSupport;
#endif
#endif // ENABLE_ASM

View File

@ -15,8 +15,7 @@
OPENMPT_NAMESPACE_BEGIN
#ifdef MODPLUG_TRACKER
#ifdef ENABLE_ASM
#define PROCSUPPORT_ASM_INTRIN 0x00001 // assembly and intrinsics are enabled at runtime
#define PROCSUPPORT_CPUID 0x00002 // Processor supports modern cpuid
@ -31,15 +30,16 @@ OPENMPT_NAMESPACE_BEGIN
#define PROCSUPPORT_AVX 0x10000 // Processor supports AVX instructions
#define PROCSUPPORT_AVX2 0x20000 // Processor supports AVX2 instructions
#define PROCSUPPORT_NEON 0x40000 // Processor supports NEON instructions
static constexpr uint32 PROCSUPPORT_i586 = 0u ;
static constexpr uint32 PROCSUPPORT_x86_SSE = 0u | PROCSUPPORT_SSE ;
static constexpr uint32 PROCSUPPORT_x86_SSE2 = 0u | PROCSUPPORT_SSE | PROCSUPPORT_SSE2 ;
static constexpr uint32 PROCSUPPORT_AMD64 = 0u | PROCSUPPORT_SSE | PROCSUPPORT_SSE2 | PROCSUPPORT_LM;
#endif
static constexpr uint32 PROCSUPPORT_ARM64 = 0u | PROCSUPPORT_NEON ;
#ifdef ENABLE_ASM
extern uint32 RealProcSupport;
extern uint32 ProcSupport;

File diff suppressed because it is too large Load Diff

View File

@ -19,6 +19,10 @@
#include <emmintrin.h>
#endif
#ifdef ENABLE_NEON
#include "../common/sse2neon.h"
#endif
#endif // NO_REVERB
@ -28,7 +32,7 @@ OPENMPT_NAMESPACE_BEGIN
#ifndef NO_REVERB
#ifdef ENABLE_SSE2
#if defined(ENABLE_SSE2) || defined(ENABLE_NEON)
// Load two 32-bit values
static MPT_FORCEINLINE __m128i Load64SSE(const int32 *x) { return _mm_loadl_epi64(reinterpret_cast<const __m128i *>(x)); }
// Load four 16-bit values
@ -594,6 +598,11 @@ void CReverb::ReverbProcessPostFiltering1x(const int32 * MPT_RESTRICT pRvb, int3
{
#ifdef ENABLE_SSE2
if(GetProcSupport() & PROCSUPPORT_SSE2)
#endif
#ifdef ENABLE_NEON
if(GetProcSupport() & PROCSUPPORT_NEON)
#endif
#if defined(ENABLE_SSE2) || defined(ENABLE_NEON)
{
__m128i nDCRRvb_Y1 = Load64SSE(gnDCRRvb_Y1);
__m128i nDCRRvb_X1 = Load64SSE(gnDCRRvb_X1);
@ -656,6 +665,11 @@ void CReverb::ReverbDCRemoval(int32 * MPT_RESTRICT pBuffer, uint32 nSamples)
{
#ifdef ENABLE_SSE2
if(GetProcSupport() & PROCSUPPORT_SSE2)
#endif
#ifdef ENABLE_NEON
if(GetProcSupport() & PROCSUPPORT_NEON)
#endif
#if defined(ENABLE_SSE2) || defined(ENABLE_NEON)
{
__m128i nDCRRvb_Y1 = Load64SSE(gnDCRRvb_Y1);
__m128i nDCRRvb_X1 = Load64SSE(gnDCRRvb_X1);
@ -721,6 +735,11 @@ void CReverb::ProcessPreDelay(SWRvbRefDelay * MPT_RESTRICT pPreDelay, const int3
uint32 delayPos = pPreDelay->nDelayPos - 1;
#ifdef ENABLE_SSE2
if(GetProcSupport() & PROCSUPPORT_SSE2)
#endif
#ifdef ENABLE_NEON
if(GetProcSupport() & PROCSUPPORT_NEON)
#endif
#if defined(ENABLE_SSE2) || defined(ENABLE_NEON)
{
__m128i coeffs = _mm_cvtsi32_si128(pPreDelay->nCoeffs.lr);
__m128i history = _mm_cvtsi32_si128(pPreDelay->History.lr);
@ -793,6 +812,11 @@ void CReverb::ProcessReflections(SWRvbRefDelay * MPT_RESTRICT pPreDelay, LR16 *
{
#ifdef ENABLE_SSE2
if(GetProcSupport() & PROCSUPPORT_SSE2)
#endif
#ifdef ENABLE_NEON
if(GetProcSupport() & PROCSUPPORT_NEON)
#endif
#if defined(ENABLE_SSE2) || defined(ENABLE_NEON)
{
union
{
@ -888,6 +912,11 @@ void CReverb::ProcessLateReverb(SWLateReverb * MPT_RESTRICT pReverb, LR16 * MPT_
#ifdef ENABLE_SSE2
if(GetProcSupport() & PROCSUPPORT_SSE2)
#endif
#ifdef ENABLE_NEON
if(GetProcSupport() & PROCSUPPORT_NEON)
#endif
#if defined(ENABLE_SSE2) || defined(ENABLE_NEON)
{
int delayPos = pReverb->nDelayPos & RVBDLY_MASK;
__m128i rvbOutGains = Load64SSE(pReverb->RvbOutGains);

View File

@ -18,6 +18,9 @@
#ifdef ENABLE_SSE2
#include <emmintrin.h>
#endif
#ifdef ENABLE_NEON
#include "../common/sse2neon.h"
#endif
OPENMPT_NAMESPACE_BEGIN
@ -25,7 +28,7 @@ OPENMPT_NAMESPACE_BEGIN
///////////////////////////////////////////////////////////////////////////////////////
// SSE Optimizations
#ifdef ENABLE_SSE2
#if defined(ENABLE_SSE2) || defined(ENABLE_NEON)
static void SSE2_StereoMixToFloat(const int32 *pSrc, float *pOut1, float *pOut2, uint32 nCount, const float _i2fc)
{
@ -182,12 +185,17 @@ void StereoMixToFloat(const int32 *pSrc, float *pOut1, float *pOut2, uint32 nCou
#ifdef ENABLE_SSE2
if(GetProcSupport() & PROCSUPPORT_SSE2)
#endif
#ifdef ENABLE_NEON
if(GetProcSupport() & PROCSUPPORT_NEON)
#endif
#if defined(ENABLE_SSE2) || defined(ENABLE_NEON)
{
SSE2_StereoMixToFloat(pSrc, pOut1, pOut2, nCount, _i2fc);
return;
}
#endif // ENABLE_SSE2
#endif // ENABLE_SSE2 || ENABLE_NEON
{
C_StereoMixToFloat(pSrc, pOut1, pOut2, nCount, _i2fc);
}
@ -199,12 +207,17 @@ void FloatToStereoMix(const float *pIn1, const float *pIn2, int32 *pOut, uint32
{
#ifdef ENABLE_SSE2
if(GetProcSupport() & PROCSUPPORT_SSE2)
#endif
#ifdef ENABLE_NEON
if(GetProcSupport() & PROCSUPPORT_NEON)
#endif
#if defined(ENABLE_SSE2) || defined(ENABLE_NEON)
{
SSE2_FloatToStereoMix(pIn1, pIn2, pOut, nCount, _f2ic);
return;
}
#endif // ENABLE_SSE2
#endif // ENABLE_SSE2 || ENABLE_NEON
{
C_FloatToStereoMix(pIn1, pIn2, pOut, nCount, _f2ic);
}

View File

@ -13,5 +13,8 @@
#define MPT_WITH_ZLIB 1
#define MPT_BUILD_XCODE 1
#define ENABLE_ASM 1
#endif

View File

@ -87,7 +87,6 @@
83E5FC951FFEFA0D00659F0F /* mptOS.h in Headers */ = {isa = PBXBuildFile; fileRef = 83E5FC5C1FFEFA0D00659F0F /* mptOS.h */; };
83E5FC961FFEFA0D00659F0F /* mptIO.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 83E5FC5D1FFEFA0D00659F0F /* mptIO.cpp */; };
83E5FC971FFEFA0D00659F0F /* mptCPU.h in Headers */ = {isa = PBXBuildFile; fileRef = 83E5FC5E1FFEFA0D00659F0F /* mptCPU.h */; };
83E5FC981FFEFA0D00659F0F /* mptBufferIO.h in Headers */ = {isa = PBXBuildFile; fileRef = 83E5FC5F1FFEFA0D00659F0F /* mptBufferIO.h */; };
83E5FC991FFEFA0D00659F0F /* versionNumber.h in Headers */ = {isa = PBXBuildFile; fileRef = 83E5FC601FFEFA0D00659F0F /* versionNumber.h */; };
83E5FC9A1FFEFA0D00659F0F /* misc_util.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 83E5FC611FFEFA0D00659F0F /* misc_util.cpp */; };
83E5FCCB1FFEFA1A00659F0F /* libopenmpt_impl.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 83E5FC9C1FFEFA1A00659F0F /* libopenmpt_impl.hpp */; };
@ -370,7 +369,6 @@
83E5FC5C1FFEFA0D00659F0F /* mptOS.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mptOS.h; sourceTree = "<group>"; };
83E5FC5D1FFEFA0D00659F0F /* mptIO.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mptIO.cpp; sourceTree = "<group>"; };
83E5FC5E1FFEFA0D00659F0F /* mptCPU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mptCPU.h; sourceTree = "<group>"; };
83E5FC5F1FFEFA0D00659F0F /* mptBufferIO.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mptBufferIO.h; sourceTree = "<group>"; };
83E5FC601FFEFA0D00659F0F /* versionNumber.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = versionNumber.h; sourceTree = "<group>"; };
83E5FC611FFEFA0D00659F0F /* misc_util.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = misc_util.cpp; sourceTree = "<group>"; };
83E5FC9C1FFEFA1A00659F0F /* libopenmpt_impl.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = libopenmpt_impl.hpp; sourceTree = "<group>"; };
@ -665,7 +663,6 @@
831132CB21F955B0001F678F /* mptBaseMacros.h */,
831132CE21F955B1001F678F /* mptBaseTypes.h */,
831132D121F955B1001F678F /* mptBaseUtils.h */,
83E5FC5F1FFEFA0D00659F0F /* mptBufferIO.h */,
83E5FC3D1FFEFA0D00659F0F /* mptCPU.cpp */,
83E5FC5E1FFEFA0D00659F0F /* mptCPU.h */,
83E5FC2A1FFEFA0D00659F0F /* mptCRC.h */,
@ -1050,7 +1047,6 @@
83E5FE121FFEFA8500659F0F /* XMTools.h in Headers */,
83E5FCCB1FFEFA1A00659F0F /* libopenmpt_impl.hpp in Headers */,
831132E821F9565F001F678F /* BitReader.h in Headers */,
83E5FC981FFEFA0D00659F0F /* mptBufferIO.h in Headers */,
83E5FC741FFEFA0D00659F0F /* BuildSettings.h in Headers */,
831132D521F955B2001F678F /* mptMemory.h in Headers */,
831132DE21F955B2001F678F /* mptException.h in Headers */,