/*
 * DMOPlugin.h
 * -----------
 * Purpose: DirectX Media Object plugin handling / processing.
 * Notes  : Some default plugins only have the same output characteristics in the floating point code path (compared to integer PCM)
 *          if we feed them input in the range [-32768, +32768] rather than the more usual [-1, +1].
 *          Hence, OpenMPT uses this range for both the floating-point and integer path.
 * Authors: OpenMPT Devs
 * The OpenMPT source code is released under the BSD license. Read LICENSE for more details.
 */


#include "stdafx.h"

#if defined(MPT_WITH_DMO)
#include "../../Sndfile.h"
#include "../../../common/mptUUID.h"
#include "DMOPlugin.h"
#include "../PluginManager.h"
#include <uuids.h>
#include <medparam.h>
#include <mmsystem.h>
#endif // MPT_WITH_DMO

OPENMPT_NAMESPACE_BEGIN


#if defined(MPT_WITH_DMO)


// DMO_LOG gates the MPT_LOG debug messages emitted further down in this file.
// NOTE(review): both branches below define DMO_LOG, so the MPT_ALL_LOGGING check
// currently makes no difference - confirm whether the #else branch was meant to leave it undefined.
#ifdef MPT_ALL_LOGGING
#define DMO_LOG
#else
#define DMO_LOG
#endif


// Factory function: instantiates the DMO whose CLSID is stored (as a string) in factory.dllPath
// and wraps it in a DMOPlugin.
// Only objects that expose both IMediaObject and IMediaObjectInPlace and that have exactly
// one input stream and one output stream are accepted.
// Returns the new plugin instance, or nullptr if the DMO could not be created / used or if
// allocating the wrapper failed. On success, ownership of both COM references passes to the
// returned DMOPlugin (released in its destructor); on every failure path they are released here.
IMixPlugin* DMOPlugin::Create(VSTPluginLib &factory, CSoundFile &sndFile, SNDMIXPLUGIN *mixStruct)
{
	CLSID clsid;
	if(Util::VerifyStringToCLSID(factory.dllPath.AsNative(), clsid))
	{
		IMediaObject *pMO = nullptr;
		IMediaObjectInPlace *pMOIP = nullptr;
		if ((CoCreateInstance(clsid, nullptr, CLSCTX_INPROC_SERVER, IID_IMediaObject, (VOID **)&pMO) == S_OK) && (pMO))
		{
			if (pMO->QueryInterface(IID_IMediaObjectInPlace, (void **)&pMOIP) != S_OK) pMOIP = nullptr;
		} else pMO = nullptr;
		if ((pMO) && (pMOIP))
		{
			DWORD dwInputs = 0, dwOutputs = 0;
			pMO->GetStreamCount(&dwInputs, &dwOutputs);
			if (dwInputs == 1 && dwOutputs == 1)
			{
				DMOPlugin *p = new (std::nothrow) DMOPlugin(factory, sndFile, mixStruct, pMO, pMOIP, clsid.Data1);
				if(p != nullptr)
				{
					return p;
				}
				// Allocation failed: nobody took ownership of the interfaces,
				// so fall through to the common cleanup below instead of leaking them.
			}
#ifdef DMO_LOG
			MPT_LOG(LogDebug, "DMO", factory.libraryName.ToUnicode() + U_(": Unable to use this DMO"));
#endif
		}
#ifdef DMO_LOG
		else MPT_LOG(LogDebug, "DMO", factory.libraryName.ToUnicode() + U_(": Failed to get IMediaObject & IMediaObjectInPlace interfaces"));
#endif
		if (pMO) pMO->Release();
		if (pMOIP) pMOIP->Release();
	}
	return nullptr;
}


// Wraps an already created DMO.
// pMO / pMOIP are stored as-is (no AddRef here) and released in the destructor.
// The optional parameter interfaces are queried from the media object; if either query fails,
// the corresponding member stays nullptr.
DMOPlugin::DMOPlugin(VSTPluginLib &factory, CSoundFile &sndFile, SNDMIXPLUGIN *mixStruct, IMediaObject *pMO, IMediaObjectInPlace *pMOIP, uint32 uid)
	: IMixPlugin(factory, sndFile, mixStruct)
	, m_pMediaObject(pMO)
	, m_pMediaProcess(pMOIP)
	, m_pParamInfo(nullptr)
	, m_pMediaParams(nullptr)
	, m_nSamplesPerSec(sndFile.GetSampleRate())
	, m_uid(uid)
{
	// Parameter description interface (names, ranges, types)
	const HRESULT infoResult = m_pMediaObject->QueryInterface(IID_IMediaParamInfo, (void **)&m_pParamInfo);
	if(FAILED(infoResult))
	{
		m_pParamInfo = nullptr;
	}

	// Parameter value interface (get / set)
	const HRESULT paramsResult = m_pMediaObject->QueryInterface(IID_IMediaParams, (void **)&m_pMediaParams);
	if(FAILED(paramsResult))
	{
		m_pMediaParams = nullptr;
	}

	// Round the start of the interleaved buffer up to the next multiple of 16 bytes
	const intptr_t rawAddress = (intptr_t)m_interleavedBuffer.f32;
	m_alignedBuffer.f32 = (float *)((rawAddress + 15) & ~15);

	m_mixBuffer.Initialize(2, 2);
	InsertIntoFactoryList();
}


// Releases all COM interfaces held by this plugin.
// Order: parameter interfaces first, then the in-place processing interface, then the media object itself.
DMOPlugin::~DMOPlugin()
{
	// Release a COM interface pointer if it is set, and reset it to nullptr.
	const auto releaseAndClear = [](auto *&iface)
	{
		if(iface)
		{
			iface->Release();
			iface = nullptr;
		}
	};

	releaseAndClear(m_pMediaParams);
	releaseAndClear(m_pParamInfo);
	releaseAndClear(m_pMediaProcess);
	releaseAndClear(m_pMediaObject);
}


// Returns the latency reported by the DMO, converted from 100-nanosecond units to
// sample frames at the current sample rate. Returns 0 if the DMO does not report S_OK.
uint32 DMOPlugin::GetLatency() const
{
	REFERENCE_TIME latency;	// Unit 100-nanoseconds
	if(m_pMediaProcess->GetLatency(&latency) != S_OK)
	{
		return 0;
	}
	// 10,000,000 units of 100 ns make up one second
	return static_cast<uint32>(latency * m_nSamplesPerSec / (10 * 1000 * 1000));
}


// Scale factors between the mixer's float samples and the 16-bit integer sample range
// that is fed to the DMO in both the float and the int16 code path (see the note in the file header).
static constexpr float _f2si = 32768.0f;			// float -> 16-bit range
static constexpr float _si2f = 1.0f / 32768.0f;	// 16-bit range -> float


// Interleave two planar float channels into one stereo float stream (L R L R ...),
// multiplying every sample by _f2si on the way.
// The SSE path stores with aligned writes, so output must be 16-byte aligned there.
static void InterleaveStereo(const float * MPT_RESTRICT inputL, const float * MPT_RESTRICT inputR, float * MPT_RESTRICT output, uint32 numFrames)
{
#if defined(ENABLE_SSE)
	if(GetProcSupport() & PROCSUPPORT_SSE)
	{
		// Frames are processed in groups of four, so up to three frames beyond numFrames may be touched.
		// This works because we know that we will always work on our buffers of size MIXBUFFERSIZE.
		static_assert((MIXBUFFERSIZE & 7) == 0);
		const __m128 scale = _mm_set_ps1(_f2si);
		uint32 groups = (numFrames + 3) / 4;
		do
		{
			const __m128 left = _mm_mul_ps(_mm_loadu_ps(inputL), scale);	// Four scaled left samples, LLLL
			const __m128 right = _mm_mul_ps(_mm_loadu_ps(inputR), scale);	// Four scaled right samples, RRRR
			inputL += 4;
			inputR += 4;
			_mm_store_ps(output, _mm_unpacklo_ps(left, right));		// Frames 1+2 as LRLR (four floats)
			_mm_store_ps(output + 4, _mm_unpackhi_ps(left, right));	// Frames 3+4 as LRLR (four floats)
			output += 8;
		} while(--groups);
		return;
	}
#endif
	// Plain C++ path: one frame per iteration
	for(; numFrames != 0; numFrames--)
	{
		output[0] = *(inputL++) * _f2si;
		output[1] = *(inputR++) * _f2si;
		output += 2;
	}
}


// Split one interleaved stereo float stream (L R L R ...) into two planar channels,
// multiplying every sample by _si2f on the way. The output buffers are overwritten.
// The SSE path reads with aligned loads, so input must be 16-byte aligned there.
static void DeinterleaveStereo(const float * MPT_RESTRICT input, float * MPT_RESTRICT outputL, float * MPT_RESTRICT outputR, uint32 numFrames)
{
#if defined(ENABLE_SSE)
	if(GetProcSupport() & PROCSUPPORT_SSE)
	{
		// Frames are processed in groups of four, so up to three frames beyond numFrames may be touched.
		// This works because we know that we will always work on our buffers of size MIXBUFFERSIZE.
		static_assert((MIXBUFFERSIZE & 7) == 0);
		const __m128 scale = _mm_set_ps1(_si2f);
		uint32 groups = (numFrames + 3) / 4;
		do
		{
			const __m128 frames12 = _mm_mul_ps(_mm_load_ps(input), scale);		// Frames 1+2, LRLR, scaled
			const __m128 frames34 = _mm_mul_ps(_mm_load_ps(input + 4), scale);	// Frames 3+4, LRLR, scaled
			input += 8;
			// Even elements are the left samples, odd elements the right samples
			_mm_storeu_ps(outputL, _mm_shuffle_ps(frames12, frames34, _MM_SHUFFLE(2, 0, 2, 0)));	// LLLL
			_mm_storeu_ps(outputR, _mm_shuffle_ps(frames12, frames34, _MM_SHUFFLE(3, 1, 3, 1)));	// RRRR
			outputL += 4;
			outputR += 4;
		} while(--groups);
		return;
	}
#endif
	// Plain C++ path: one frame per iteration
	for(; numFrames != 0; numFrames--)
	{
		*(outputL++) = input[0] * _si2f;
		*(outputR++) = input[1] * _si2f;
		input += 2;
	}
}


// Interleave two float streams into one int16 stereo stream.
// Every sample is multiplied by _f2si and saturated to the int16 range.
// The output is laid out as L1 R1 L2 R2 ...
static void InterleaveFloatToInt16(const float * MPT_RESTRICT inputL, const float * MPT_RESTRICT inputR, int16 * MPT_RESTRICT output, uint32 numFrames)
{
#if defined(ENABLE_MMX) && defined(ENABLE_SSE)
	// This uses __m64, so it's not available on the MSVC 64-bit compiler.
	// But if the user runs a 64-bit operating system, they will go the floating-point path anyway.
	if((GetProcSupport() & (PROCSUPPORT_MMX | PROCSUPPORT_SSE)) == (PROCSUPPORT_MMX | PROCSUPPORT_SSE))
	{
		// We may read beyond the wanted length... this works because we know that we will always work on our buffers of size MIXBUFFERSIZE
		static_assert((MIXBUFFERSIZE & 7) == 0);
		__m64 *out = reinterpret_cast<__m64 *>(output);
		__m128 factor = _mm_set_ps1(_f2si);
		numFrames = (numFrames + 3) / 4;
		do
		{
			__m128 fl = _mm_loadu_ps(inputL);		// Load four float values, L1L2L3L4
			__m128 fr = _mm_loadu_ps(inputR);		// Load four float values, R1R2R3R4
			fl = _mm_mul_ps(fl, factor);			// Scale them
			fr = _mm_mul_ps(fr, factor);			// Scale them
			inputL += 4;
			inputR += 4;

			// First two stereo pairs: take the lower halves of both registers
			__m128 f12 = _mm_shuffle_ps(fl, fr, _MM_SHUFFLE(1, 0, 1, 0));	// L1 L2 R1 R2
			f12 = _mm_shuffle_ps(f12 , f12, _MM_SHUFFLE(3, 1, 2, 0));		// L1 R1 L2 R2
			__m64 i1 = _mm_cvtps_pi32(f12);									// Convert to two ints, L1R1
			f12 = _mm_shuffle_ps(f12 , f12, _MM_SHUFFLE(1, 0, 3, 2));		// L2 R2 L1 R1
			__m64 i2 = _mm_cvtps_pi32(f12);									// Convert to two ints, L2R2
			__m64 sat12 = _mm_packs_pi32(i1, i2);							// Pack and saturate them to 16-bit
			*(out++) = sat12;												// Store L1R1L2R2

			// Second two stereo pairs: take the upper halves of both registers.
			// The selector must pick elements 2 and 3 of each source; picking elements 1 and 3
			// would yield L2 L4 R2 R4, i.e. frame 2 would be repeated and frame 3 dropped.
			__m128 f34 = _mm_shuffle_ps(fl, fr, _MM_SHUFFLE(3, 2, 3, 2));	// L3 L4 R3 R4
			f34 = _mm_shuffle_ps(f34 , f34, _MM_SHUFFLE(3, 1, 2, 0));		// L3 R3 L4 R4
			__m64 i3 = _mm_cvtps_pi32(f34);									// Convert to two ints, L3R3
			f34 = _mm_shuffle_ps(f34 , f34, _MM_SHUFFLE(1, 0, 3, 2));		// L4 R4 L3 R3
			__m64 i4 = _mm_cvtps_pi32(f34);									// Convert to two ints, L4R4
			__m64 sat34 = _mm_packs_pi32(i3, i4);							// Pack and saturate them to 16-bit
			*(out++) = sat34;												// Store L3R3L4R4
		} while(--numFrames);
		_mm_empty();	// Leave MMX state so that x87 floating point code can run again
		return;
	}
#endif
	// Plain C++ path: scale, clamp to the int16 range, then convert
	while(numFrames--)
	{
		*(output++) = static_cast<int16>(Clamp(*(inputL++) * _f2si, static_cast<float>(int16_min), static_cast<float>(int16_max)));
		*(output++) = static_cast<int16>(Clamp(*(inputR++) * _f2si, static_cast<float>(int16_min), static_cast<float>(int16_max)));
	}
}


// Deinterleave an int16 stereo stream into two float streams.
// Every sample is converted to float and multiplied by _si2f.
// Both code paths overwrite the output buffers (they do not mix into existing content).
static void DeinterleaveInt16ToFloat(const int16 * MPT_RESTRICT input, float * MPT_RESTRICT outputL, float * MPT_RESTRICT outputR, uint32 numFrames)
{
#if defined(ENABLE_MMX) && defined(ENABLE_SSE)
	// This uses __m64, so it's not available on the MSVC 64-bit compiler.
	// But if the user runs a 64-bit operating system, they will go the floating-point path anyway.
	if((GetProcSupport() & (PROCSUPPORT_MMX | PROCSUPPORT_SSE)) == (PROCSUPPORT_MMX | PROCSUPPORT_SSE))
	{
		// We may read beyond the wanted length... this works because we know that we will always work on our buffers of size MIXBUFFERSIZE
		static_assert((MIXBUFFERSIZE & 7) == 0);
		const __m128i *in = reinterpret_cast<const __m128i *>(input);
		__m128 factor = _mm_set_ps1(_si2f);
		numFrames = (numFrames + 3) / 4;
		do
		{
			__m128i in16 = _mm_load_si128(in);		// Load eight int16 values, LRLRLRLR (aligned load)
			in++;
			// Sign-extend 16 -> 32 bits: place each sample in the upper half of a 32-bit lane,
			// then shift it back down arithmetically.
			__m128i lo = _mm_unpacklo_epi16(_mm_setzero_si128(), in16);	// 0L0R0L0R (1)
			__m128i hi = _mm_unpackhi_epi16(_mm_setzero_si128(), in16);	// 0L0R0L0R (2)
			lo = _mm_srai_epi32(lo, 16);			// LsRsLsRs, s = sign (1)
			hi = _mm_srai_epi32(hi, 16);			// LsRsLsRs, s = sign (2)

			__m64 lo1, lo2, hi1, hi2;
			_mm_storel_pi(&lo1, _mm_castsi128_ps(lo));				// L1R1
			_mm_storeh_pi(&lo2, _mm_castsi128_ps(lo));				// L2R2
			_mm_storel_pi(&hi1, _mm_castsi128_ps(hi));				// L3R3
			_mm_storeh_pi(&hi2, _mm_castsi128_ps(hi));				// L4R4
			__m128 f1 = _mm_cvt_pi2ps(_mm_setzero_ps(), lo1);		// Convert to two floats, L1R1
			__m128 f2 = _mm_cvt_pi2ps(_mm_setzero_ps(), lo2);		// Convert to two floats, L2R2
			f1 = _mm_shuffle_ps(f1, f1, _MM_SHUFFLE(1, 0, 1, 0));	// Move to upper
			f2 = _mm_shuffle_ps(f2, f2, _MM_SHUFFLE(1, 0, 1, 0));	// Move to upper
			f1 = _mm_cvt_pi2ps(f1, hi1);							// Convert to two floats, L3R3 | L1R1
			f2 = _mm_cvt_pi2ps(f2, hi2);							// Convert to two floats, L4R4 | L2R2

			__m128 fl = _mm_shuffle_ps(f1, f2, _MM_SHUFFLE(0, 2, 0, 2));	// => L1L3L2L4
			__m128 fr = _mm_shuffle_ps(f1, f2, _MM_SHUFFLE(1, 3, 1, 3));	// => R1R3R2R4
			fl = _mm_shuffle_ps(fl, fl, _MM_SHUFFLE(3, 1, 2, 0));			// => L1L2L3L4
			fr = _mm_shuffle_ps(fr, fr, _MM_SHUFFLE(3, 1, 2, 0));			// => R1R2R3R4
			fl = _mm_mul_ps(fl, factor);			// Scale them
			fr = _mm_mul_ps(fr, factor);			// Scale them
			_mm_storeu_ps(outputL, fl);				// Store four float values, LLLL
			_mm_storeu_ps(outputR, fr);				// Store four float values, RRRR
			outputL += 4;
			outputR += 4;
		} while(--numFrames);
		_mm_empty();	// Leave MMX state so that x87 floating point code can run again
		return;
	}
#endif
	// Plain C++ path. Assign rather than accumulate so that the result matches the SIMD path
	// above (and DeinterleaveStereo) regardless of what the output buffers held before.
	while(numFrames--)
	{
		*outputL++ = _si2f * static_cast<float>(*input++);
		*outputR++ = _si2f * static_cast<float>(*input++);
	}
}


void DMOPlugin::Process(float *pOutL, float *pOutR, uint32 numFrames)
{
	if(!numFrames || !m_mixBuffer.Ok())
		return;
	m_mixBuffer.ClearOutputBuffers(numFrames);
	REFERENCE_TIME startTime = Util::muldiv(m_SndFile.GetTotalSampleCount(), 10000000, m_nSamplesPerSec);
	
	if(m_useFloat)
	{
		InterleaveStereo(m_mixBuffer.GetInputBuffer(0), m_mixBuffer.GetInputBuffer(1), m_alignedBuffer.f32, numFrames);
		m_pMediaProcess->Process(numFrames * 2 * sizeof(float), reinterpret_cast<BYTE *>(m_alignedBuffer.f32), startTime, DMO_INPLACE_NORMAL);
		DeinterleaveStereo(m_alignedBuffer.f32, m_mixBuffer.GetOutputBuffer(0), m_mixBuffer.GetOutputBuffer(1), numFrames);
	} else
	{
		InterleaveFloatToInt16(m_mixBuffer.GetInputBuffer(0), m_mixBuffer.GetInputBuffer(1), m_alignedBuffer.i16, numFrames);
		m_pMediaProcess->Process(numFrames * 2 * sizeof(int16), reinterpret_cast<BYTE *>(m_alignedBuffer.i16), startTime, DMO_INPLACE_NORMAL);
		DeinterleaveInt16ToFloat(m_alignedBuffer.i16, m_mixBuffer.GetOutputBuffer(0), m_mixBuffer.GetOutputBuffer(1), numFrames);
	}

	ProcessMixOps(pOutL, pOutR, m_mixBuffer.GetOutputBuffer(0), m_mixBuffer.GetOutputBuffer(1), numFrames);
}


// Returns the number of parameters the DMO reports through IMediaParamInfo.
// Returns 0 if the DMO does not provide that interface (the constructor leaves m_pParamInfo
// at nullptr in that case) or if querying the count fails.
PlugParamIndex DMOPlugin::GetNumParameters() const
{
	if(m_pParamInfo == nullptr)
	{
		return 0;
	}
	DWORD dwParamCount = 0;
	if(FAILED(m_pParamInfo->GetParamCount(&dwParamCount)))
	{
		return 0;
	}
	return dwParamCount;
}


// Returns the current value of parameter 'index', normalized from the DMO's
// [mpdMinValue, mpdMaxValue] range to [0, 1] (boolean parameters use the range [0, 1] directly).
// Returns 0 if the parameter interfaces are missing, the index is out of range,
// or the DMO fails to report the parameter info / value.
PlugParamValue DMOPlugin::GetParameter(PlugParamIndex index)
{
	// The interface pointers must be validated before anything goes through them,
	// which includes the GetNumParameters() call.
	if(m_pParamInfo == nullptr || m_pMediaParams == nullptr || index >= GetNumParameters())
	{
		return 0;
	}

	MP_PARAMINFO mpi;
	MemsetZero(mpi);
	MP_DATA md = 0;
	if(m_pParamInfo->GetParamInfo(index, &mpi) != S_OK
		|| m_pMediaParams->GetParam(index, &md) != S_OK)
	{
		return 0;
	}

	float fMin = mpi.mpdMinValue;
	float fMax = mpi.mpdMaxValue;
	if(mpi.mpType == MPT_BOOL)
	{
		fMin = 0;
		fMax = 1;
	}

	float fValue = md;
	fValue -= fMin;
	// Only divide if the range is not empty / inverted
	if(fMax > fMin) fValue /= (fMax - fMin);
	return fValue;
}


// Sets parameter 'index' from a normalized value in [0, 1].
// The value is mapped to the DMO's [mpdMinValue, mpdMaxValue] range, limited to that range
// and rounded for every parameter type other than MPT_FLOAT. Boolean parameters are
// thresholded at 0.5 first.
// Does nothing if the parameter interfaces are missing, the index is out of range,
// or the DMO fails to report the parameter info.
void DMOPlugin::SetParameter(PlugParamIndex index, PlugParamValue value)
{
	// The interface pointers must be validated before anything goes through them,
	// which includes the GetNumParameters() call.
	if(m_pParamInfo == nullptr || m_pMediaParams == nullptr || index >= GetNumParameters())
	{
		return;
	}

	MP_PARAMINFO mpi;
	MemsetZero(mpi);
	if(m_pParamInfo->GetParamInfo(index, &mpi) != S_OK)
	{
		return;
	}

	float fMin = mpi.mpdMinValue;
	float fMax = mpi.mpdMaxValue;

	if(mpi.mpType == MPT_BOOL)
	{
		fMin = 0;
		fMax = 1;
		value = (value > 0.5f) ? 1.0f : 0.0f;
	}
	// Only scale if the range is not empty / inverted
	if(fMax > fMin) value *= (fMax - fMin);
	value += fMin;
	Limit(value, fMin, fMax);
	if(mpi.mpType != MPT_FLOAT) value = mpt::round(value);
	m_pMediaParams->SetParam(index, value);
}


void DMOPlugin::Resume()
{
	m_nSamplesPerSec = m_SndFile.GetSampleRate();
	m_isResumed = true;

	DMO_MEDIA_TYPE mt;
	WAVEFORMATEX wfx;

	mt.majortype = MEDIATYPE_Audio;
	mt.subtype = MEDIASUBTYPE_PCM;
	mt.bFixedSizeSamples = TRUE;
	mt.bTemporalCompression = FALSE;
	mt.formattype = FORMAT_WaveFormatEx;
	mt.pUnk = nullptr;
	mt.pbFormat = (LPBYTE)&wfx;
	mt.cbFormat = sizeof(WAVEFORMATEX);
	mt.lSampleSize = 2 * sizeof(float);
	wfx.wFormatTag = 3; // WAVE_FORMAT_IEEE_FLOAT;
	wfx.nChannels = 2;
	wfx.nSamplesPerSec = m_nSamplesPerSec;
	wfx.wBitsPerSample = sizeof(float) * 8;
	wfx.nBlockAlign = wfx.nChannels * (wfx.wBitsPerSample / 8);
	wfx.nAvgBytesPerSec = wfx.nSamplesPerSec * wfx.nBlockAlign;
	wfx.cbSize = 0;

	// First try 32-bit float (DirectX 9+)
	m_useFloat = true;
	if(FAILED(m_pMediaObject->SetInputType(0, &mt, 0))
		|| FAILED(m_pMediaObject->SetOutputType(0, &mt, 0)))
	{
		m_useFloat = false;
		// Try again with 16-bit PCM
		mt.lSampleSize = 2 * sizeof(int16);
		wfx.wFormatTag = WAVE_FORMAT_PCM;
		wfx.wBitsPerSample = sizeof(int16) * 8;
		wfx.nBlockAlign = wfx.nChannels * (wfx.wBitsPerSample / 8);
		wfx.nAvgBytesPerSec = wfx.nSamplesPerSec * wfx.nBlockAlign;
		if(FAILED(m_pMediaObject->SetInputType(0, &mt, 0))
			|| FAILED(m_pMediaObject->SetOutputType(0, &mt, 0)))
		{
#ifdef DMO_LOG
			MPT_LOG(LogDebug, "DMO", U_("DMO: Failed to set I/O media type"));
#endif
		}
	}
}


void DMOPlugin::PositionChanged()
{
	m_pMediaObject->Discontinuity(0);
	m_pMediaObject->Flush();
}


void DMOPlugin::Suspend()
{
	m_isResumed = false;
	m_pMediaObject->Flush();
	m_pMediaObject->SetInputType(0, nullptr, DMO_SET_TYPEF_CLEAR);
	m_pMediaObject->SetOutputType(0, nullptr, DMO_SET_TYPEF_CLEAR);
}


#ifdef MODPLUG_TRACKER

// Returns the label (szLabel) the DMO reports for parameter 'param'.
// Returns an empty string if the parameter info interface is missing, the index is out of range,
// or the DMO fails to report the parameter info.
CString DMOPlugin::GetParamName(PlugParamIndex param)
{
	// The interface pointer must be validated before anything goes through it,
	// which includes the GetNumParameters() call.
	if(m_pParamInfo == nullptr || param >= GetNumParameters())
	{
		return CString();
	}

	MP_PARAMINFO mpi;
	mpi.mpType = MPT_INT;
	mpi.szUnitText[0] = 0;
	mpi.szLabel[0] = 0;
	if(m_pParamInfo->GetParamInfo(param, &mpi) == S_OK)
	{
		return mpi.szLabel;
	}
	return CString();
}


// Returns the unit text (szUnitText) the DMO reports for parameter 'param'.
// Returns an empty string if the parameter info interface is missing, the index is out of range,
// or the DMO fails to report the parameter info.
CString DMOPlugin::GetParamLabel(PlugParamIndex param)
{
	// The interface pointer must be validated before anything goes through it,
	// which includes the GetNumParameters() call.
	if(m_pParamInfo == nullptr || param >= GetNumParameters())
	{
		return CString();
	}

	MP_PARAMINFO mpi;
	mpi.mpType = MPT_INT;
	mpi.szUnitText[0] = 0;
	mpi.szLabel[0] = 0;
	if(m_pParamInfo->GetParamInfo(param, &mpi) == S_OK)
	{
		return mpi.szUnitText;
	}
	return CString();
}


// Returns a human-readable representation of the current value of parameter 'param':
//  - MPT_FLOAT: value with two decimals
//  - MPT_BOOL:  "Yes" / "No"
//  - MPT_ENUM:  the matching entry of the text list reported by the DMO
//  - MPT_INT and anything else: value rounded to an integer
// Returns an empty string if the parameter interfaces are missing, the index is out of range,
// or the DMO fails to report the parameter info, value or (for enums) text list.
CString DMOPlugin::GetParamDisplay(PlugParamIndex param)
{
	// The interface pointers must be validated before anything goes through them,
	// which includes the GetNumParameters() call.
	if(m_pParamInfo == nullptr || m_pMediaParams == nullptr || param >= GetNumParameters())
	{
		return CString();
	}

	MP_PARAMINFO mpi;
	mpi.mpType = MPT_INT;
	mpi.szUnitText[0] = 0;
	mpi.szLabel[0] = 0;
	if(m_pParamInfo->GetParamInfo(param, &mpi) != S_OK)
	{
		return CString();
	}

	MP_DATA md;
	if(m_pMediaParams->GetParam(param, &md) != S_OK)
	{
		return CString();
	}

	switch(mpi.mpType)
	{
	case MPT_FLOAT:
		{
			CString s;
			s.Format(_T("%.2f"), md);
			return s;
		}

	case MPT_BOOL:
		return ((int)md) ? _T("Yes") : _T("No");

	case MPT_ENUM:
		{
			// The DMO hands back a block of consecutive null-terminated strings
			// (param name, unit name, enum entries...) that ends with an empty string.
			WCHAR *text = nullptr;
			if(FAILED(m_pParamInfo->GetParamText(param, &text)) || text == nullptr)
			{
				return CString();
			}

			// NOTE(review): GetParameter() treats the value returned by GetParam() as lying in
			// [mpdMinValue, mpdMaxValue]; scaling it by the range here looks inconsistent with that
			// (md - mpdMinValue may be what is wanted) - confirm before changing.
			const int nValue = mpt::saturate_round<int>(md * (mpi.mpdMaxValue - mpi.mpdMinValue));

			// Always skip first two strings (param name, unit name), then nValue enum entries.
			// Counting from -2 avoids overflowing nValue + 2 for saturated values.
			const WCHAR *entry = text;
			for(int i = -2; i < nValue; i++)
			{
				// An empty string among the enum entries marks the end of the list: never walk past it.
				if(i >= 0 && *entry == 0)
				{
					break;
				}
				entry += wcslen(entry) + 1;
			}
			CString result(entry);
			// The text block is allocated by the DMO and owned by the caller.
			CoTaskMemFree(text);
			return result;
		}

	case MPT_INT:
	default:
		{
			CString s;
			s.Format(_T("%d"), mpt::saturate_round<int>(md));
			return s;
		}
	}
}

#endif // MODPLUG_TRACKER

#else // !MPT_WITH_DMO

// Without MPT_WITH_DMO, this translation unit would otherwise contain no definitions at all.
// NOTE(review): presumably this macro emits a dummy symbol to silence MSVC linker warning LNK4221 - confirm against the macro definition.
MPT_MSVC_WORKAROUND_LNK4221(DMOPlugin)

#endif // MPT_WITH_DMO

OPENMPT_NAMESPACE_END