/* Copyright (C) 2010-2021 The RetroArch team * * --------------------------------------------------------------------------------------- * The following license statement only applies to this file (s16_to_float.c). * --------------------------------------------------------------------------------------- * * Permission is hereby granted, free of charge, * to any person obtaining a copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation the rights to * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #if defined(__SSE2__) #include <emmintrin.h> #endif #include <boolean.h> #include <features/features_cpu.h> #include <audio/conversion/s16_to_float.h> #if (defined(__ARM_NEON__) || defined(HAVE_NEON)) static bool s16_to_float_neon_enabled = false; #include <arm_neon.h> void convert_s16_to_float(float *out, const int16_t *in, size_t samples, float gain) { unsigned i = 0; if (s16_to_float_neon_enabled) { float gf = gain / (1 << 15); float32x4_t vgf = {gf, gf, gf, gf}; while (samples >= 8) { float32x4x2_t oreg; int16x4x2_t inreg = vld1_s16_x2(in); // why were these interleaved before? int32x4_t p1 = vmovl_s16(inreg.val[0]); int32x4_t p2 = vmovl_s16(inreg.val[1]); oreg.val[0] = vmulq_f32(vcvtq_f32_s32(p1), vgf); oreg.val[1] = vmulq_f32(vcvtq_f32_s32(p2), vgf); vst1q_f32_x2(out, oreg); in += 8; out += 8; samples -= 8; } } gain /= 0x8000; for (; i < samples; i++) out[i] = (float)in[i] * gain; } void convert_s16_to_float_init_simd(void) { uint64_t cpu = cpu_features_get(); if (cpu & RETRO_SIMD_NEON) s16_to_float_neon_enabled = true; } #else void convert_s16_to_float(float *out, const int16_t *in, size_t samples, float gain) { unsigned i = 0; #if defined(__SSE2__) float fgain = gain / UINT32_C(0x80000000); __m128 factor = _mm_set1_ps(fgain); for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8) { __m128i input = _mm_loadu_si128((const __m128i *)in); __m128i regs_l = _mm_unpacklo_epi16(_mm_setzero_si128(), input); __m128i regs_r = _mm_unpackhi_epi16(_mm_setzero_si128(), input); __m128 output_l = _mm_mul_ps(_mm_cvtepi32_ps(regs_l), factor); __m128 output_r = _mm_mul_ps(_mm_cvtepi32_ps(regs_r), factor); _mm_storeu_ps(out + 0, output_l); _mm_storeu_ps(out + 4, output_r); } samples = samples - i; i = 0; #endif gain /= 0x8000; for (; i < samples; i++) out[i] = (float)in[i] * gain; } void convert_s16_to_float_init_simd(void) { } #endif