From 7a2dbca39e4fde30430a1717045134824940a73b Mon Sep 17 00:00:00 2001 From: amass <168062547@qq.com> Date: Sat, 7 Sep 2024 15:42:44 +0800 Subject: [PATCH] add code. --- CMakeLists.txt | 40 +- Record/CMakeLists.txt | 54 ++- Record/main.cpp | 12 +- VocieProcess/CMakeLists.txt | 36 +- .../resampler/sinc_resampler_avx2.cc | 66 +++ .../resampler/sinc_resampler_sse.cc | 63 +++ .../ooura/fft_size_128/ooura_fft_sse2.cc | 439 ++++++++++++++++++ .../aec3/adaptive_fir_filter_avx2.cc | 188 ++++++++ .../aec3/adaptive_fir_filter_erl_avx2.cc | 37 ++ .../audio_processing/aec3/fft_data_avx2.cc | 32 ++ .../aec3/matched_filter_avx2.cc | 261 +++++++++++ .../audio_processing/aec3/vector_math_avx2.cc | 81 ++++ .../system_wrappers/source/cpu_features.cc | 117 +++++ .../source/cpu_features_linux.cc | 97 ++++ 14 files changed, 1472 insertions(+), 51 deletions(-) create mode 100644 VocieProcess/common_audio/resampler/sinc_resampler_avx2.cc create mode 100644 VocieProcess/common_audio/resampler/sinc_resampler_sse.cc create mode 100644 VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_sse2.cc create mode 100644 VocieProcess/modules/audio_processing/aec3/adaptive_fir_filter_avx2.cc create mode 100644 VocieProcess/modules/audio_processing/aec3/adaptive_fir_filter_erl_avx2.cc create mode 100644 VocieProcess/modules/audio_processing/aec3/fft_data_avx2.cc create mode 100644 VocieProcess/modules/audio_processing/aec3/matched_filter_avx2.cc create mode 100644 VocieProcess/modules/audio_processing/aec3/vector_math_avx2.cc create mode 100644 VocieProcess/system_wrappers/source/cpu_features.cc create mode 100644 VocieProcess/system_wrappers/source/cpu_features_linux.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 202d334..de7bf10 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,24 +4,34 @@ cmake_minimum_required(VERSION 3.27) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(OPENSSL_ROOT /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/openssl-3.3.1) -set(OPENSSL_INCLUDE_DIR ${OPENSSL_ROOT}/include) -set(OPENSSL_LIBRARY_DIRS ${OPENSSL_ROOT}/lib) +option(CROSS_BUILD "build for arm." ON) + +if(CROSS_BUILD) + set(OPENSSL_ROOT /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/openssl-3.3.1) + set(OPENSSL_INCLUDE_DIR ${OPENSSL_ROOT}/include) + set(OPENSSL_LIBRARY_DIRS ${OPENSSL_ROOT}/lib) + + set(ALSA_ROOT /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/libalsa-1.1.5) + set(ALSA_INCLUDE_DIR ${ALSA_ROOT}/include) + set(ALSA_LIBRARY_DIRS ${ALSA_ROOT}/lib) + + set(FFMPEG_ROOT /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/ffmpeg-4.1.3) + set(FFMPEG_INCLUDE_DIR ${FFMPEG_ROOT}/include) + set(FFMPEG_LIBRARY_DIRS ${FFMPEG_ROOT}/lib) + + set(MPP_ROOT /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/rockchip_mpp) + set(MPP_INCLUDE_DIR ${MPP_ROOT}/include) + set(MPP_LIBRARY_DIRS ${MPP_ROOT}/rk-libs) + + set(KINESIS_ROOT /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/amazon-kinesis-video-streams-webrtc-sdk-c) +else() + set(BOOST_ROOT /opt/Libraries/boost_1_86_0) + set(KINESIS_ROOT /opt/Libraries/amazon-kinesis-video-streams-webrtc-sdk-c) +endif() +option(Boost_USE_STATIC_LIBS OFF) set(OPENSSL_LIBRARIES ssl crypto) - -set(ALSA_ROOT /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/libalsa-1.1.5) -set(ALSA_INCLUDE_DIR ${ALSA_ROOT}/include) -set(ALSA_LIBRARY_DIRS ${ALSA_ROOT}/lib) - -set(FFMPEG_ROOT /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/ffmpeg-4.1.3) -set(FFMPEG_INCLUDE_DIR ${FFMPEG_ROOT}/include) -set(FFMPEG_LIBRARY_DIRS ${FFMPEG_ROOT}/lib) set(FFMPEG_LIBRARY avcodec avdevice avfilter avformat avutil postproc swresample swscale) -set(MPP_ROOT /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/rockchip_mpp) -set(MPP_INCLUDE_DIR ${MPP_ROOT}/include) -set(MPP_LIBRARY_DIRS ${MPP_ROOT}/rk-libs) - include(FetchContent) FetchContent_Declare(Kylin diff --git a/Record/CMakeLists.txt b/Record/CMakeLists.txt index 4b86771..f43c7b4 100644 --- a/Record/CMakeLists.txt +++ b/Record/CMakeLists.txt @@ -1,17 +1,17 @@ find_package(Boost REQUIRED COMPONENTS json) add_executable(Record main.cpp - RkAudio.h RkAudio.cpp + $<$:RkAudio.h RkAudio.cpp> OpusCodec.h OpusCodec.cpp FFmpegResample.h FFmpegResample.cpp - EchoRecord.cpp - Player.cpp ProcessFile.cpp - Recorder.cpp SpeexDsp.h SpeexDsp.cpp Utility.h Utility.cpp WebRtcAecm.h WebRtcAecm.cpp WebRTCPublisher.h WebRTCPublisher.cpp + $<$:Player.cpp> + $<$:EchoRecord.cpp> + $<$:Recorder.cpp> ) target_include_directories(Record @@ -19,7 +19,7 @@ target_include_directories(Record PRIVATE ${MPP_INCLUDE_DIR} PRIVATE ${MPP_INCLUDE_DIR}/rkmedia PRIVATE /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/opus-1.4/include - PRIVATE /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/amazon-kinesis-video-streams-webrtc-sdk-c/include + PRIVATE ${KINESIS_ROOT}/include PRIVATE /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/speexdsp-1.2.1/include PRIVATE ${FFMPEG_INCLUDE_DIR} # PRIVATE ${CMAKE_SOURCE_DIR}/rkap/include @@ -32,30 +32,38 @@ target_link_directories(Record PRIVATE ${FFMPEG_LIBRARY_DIRS} PRIVATE /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/speexdsp-1.2.1/lib PRIVATE /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/opus-1.4/lib - PRIVATE /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/amazon-kinesis-video-streams-webrtc-sdk-c/lib + PRIVATE ${KINESIS_ROOT}/lib PRIVATE /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/usrsctp-0.9.5.0/lib PRIVATE /opt/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/lib/libsrtp-2.6.0/lib # PRIVATE ${CMAKE_SOURCE_DIR}/rkap/lib ) +if(CROSS_BUILD) + set(RK_LIBS + asound + easymedia + drm + rkaiq + rockchip_mpp + v4l2 + v4lconvert + jpeg + png16 + fontconfig + freetype + expat + rga + glib-2.0 + pcre + RKAP_ANR + RKAP_Common + uuid + ) +endif() + target_link_libraries(Record PRIVATE VocieProcess PRIVATE absl::optional - PRIVATE asound - PRIVATE easymedia - PRIVATE drm - PRIVATE rkaiq - PRIVATE rockchip_mpp - PRIVATE v4l2 - PRIVATE v4lconvert - PRIVATE jpeg - PRIVATE png16 - PRIVATE fontconfig - PRIVATE freetype - PRIVATE expat - PRIVATE rga - PRIVATE glib-2.0 - PRIVATE pcre PRIVATE opus PRIVATE speexdsp PRIVATE Boost::json @@ -70,12 +78,10 @@ target_link_libraries(Record PRIVATE Universal PRIVATE HttpProxy PRIVATE stdc++fs - PRIVATE RKAP_ANR - PRIVATE RKAP_Common - PRIVATE uuid PRIVATE dl PRIVATE z PRIVATE ${FFMPEG_LIBRARY} + ${RK_LIBS} # PRIVATE RKAP_Common # PRIVATE RKAP_3A ) \ No newline at end of file diff --git a/Record/main.cpp b/Record/main.cpp index 57d9206..b7325db 100644 --- a/Record/main.cpp +++ b/Record/main.cpp @@ -10,7 +10,9 @@ #include #include #include +#ifdef __RV1109__ #include +#endif void signal_handler(const boost::system::error_code &error, int signal_number) { if (!error) { @@ -54,14 +56,17 @@ int main(int argc, char **argv) { if (variablesMap.count("channels")) { channels = variablesMap["channels"].as(); } - +#ifdef __RV1109__ auto t = std::make_shared(); t->setDsp(dspFromString(variablesMap["dsp"].as())); t->setChannels(channels); t->setDumpEnabled(variablesMap["dump"].as()); task = std::dynamic_pointer_cast(t); +#endif } else if (variablesMap.count("record")) { +#ifdef __RV1109__ task = std::make_shared(); +#endif } else if (variablesMap.count("play")) { std::string path; if (variablesMap.count("path")) { @@ -72,11 +77,12 @@ int main(int argc, char **argv) { if (variablesMap.count("channels")) { channels = variablesMap["channels"].as(); } - +#ifdef __RV1109__ auto t = std::make_shared(); t->setChannels(channels); t->setPath(path); task = std::dynamic_pointer_cast(t); +#endif } else if (variablesMap.count("file")) { auto t = std::make_shared(); t->setDsp(dspFromString(variablesMap["dsp"].as())); @@ -94,7 +100,9 @@ int main(int argc, char **argv) { try { LOG(info) << "app start."; +#ifdef __RV1109__ RK_MPI_SYS_Init(); +#endif initKvsWebRtc(); auto ioConext = Singleton::instance(); boost::asio::signal_set signals(*ioConext->ioContext(), SIGINT, SIGTERM); diff --git a/VocieProcess/CMakeLists.txt b/VocieProcess/CMakeLists.txt index a3906f7..e23cfde 100644 --- a/VocieProcess/CMakeLists.txt +++ b/VocieProcess/CMakeLists.txt @@ -36,33 +36,36 @@ add_library(VocieProcess common_audio/audio_converter.h common_audio/audio_converter.cc common_audio/audio_util.cc common_audio/channel_buffer.h common_audio/channel_buffer.cc - common_audio/fir_filter_neon.h common_audio/fir_filter_neon.cc + $<$:common_audio/fir_filter_neon.h common_audio/fir_filter_neon.cc> common_audio/ring_buffer.h common_audio/ring_buffer.c common_audio/wav_file.h common_audio/wav_file.cc common_audio/wav_header.h common_audio/wav_header.cc common_audio/resampler/push_sinc_resampler.h common_audio/resampler/push_sinc_resampler.cc - common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler_neon.cc - common_audio/resampler/sinc_resampler.cc + common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler.cc + $<$:common_audio/resampler/sinc_resampler_neon.cc> + common_audio/resampler/sinc_resampler_sse.cc + common_audio/resampler/sinc_resampler_avx2.cc common_audio/signal_processing/complex_bit_reverse.c common_audio/signal_processing/complex_fft.c - common_audio/signal_processing/cross_correlation_neon.c common_audio/signal_processing/cross_correlation.c common_audio/signal_processing/division_operations.c common_audio/signal_processing/dot_product_with_scale.h common_audio/signal_processing/dot_product_with_scale.cc common_audio/signal_processing/downsample_fast.c - common_audio/signal_processing/downsample_fast_neon.c + $<$:common_audio/signal_processing/cross_correlation_neon.c> + $<$:common_audio/signal_processing/downsample_fast_neon.c> + $<$:common_audio/signal_processing/min_max_operations_neon.c> common_audio/signal_processing/min_max_operations.c - common_audio/signal_processing/min_max_operations_neon.c common_audio/signal_processing/randomization_functions.c common_audio/signal_processing/real_fft.c common_audio/signal_processing/spl_init.c common_audio/signal_processing/splitting_filter.c common_audio/signal_processing/vector_scaling_operations.c - common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc - common_audio/third_party/ooura/fft_size_128/ooura_fft.cc + common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft.cc + $<$:common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc> + common_audio/third_party/ooura/fft_size_128/ooura_fft_sse2.cc common_audio/third_party/ooura/fft_size_256/fft4g.h common_audio/third_party/ooura/fft_size_256/fft4g.cc common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c @@ -109,6 +112,8 @@ add_library(VocieProcess modules/audio_processing/include/aec_dump.h modules/audio_processing/include/aec_dump.cc modules/audio_processing/include/audio_frame_proxies.h modules/audio_processing/include/audio_frame_proxies.cc + modules/audio_processing/aec3/adaptive_fir_filter_avx2.cc + modules/audio_processing/aec3/adaptive_fir_filter_erl_avx2.cc modules/audio_processing/aec3/adaptive_fir_filter_erl.h modules/audio_processing/aec3/adaptive_fir_filter_erl.cc modules/audio_processing/aec3/adaptive_fir_filter.h modules/audio_processing/aec3/adaptive_fir_filter.cc modules/audio_processing/aec3/aec_state.h modules/audio_processing/aec3/aec_state.cc @@ -137,9 +142,11 @@ add_library(VocieProcess modules/audio_processing/aec3/erl_estimator.h modules/audio_processing/aec3/erl_estimator.cc modules/audio_processing/aec3/erle_estimator.h modules/audio_processing/aec3/erle_estimator.cc modules/audio_processing/aec3/fft_buffer.h modules/audio_processing/aec3/fft_buffer.cc + modules/audio_processing/aec3/fft_data_avx2.cc modules/audio_processing/aec3/filter_analyzer.h modules/audio_processing/aec3/filter_analyzer.cc modules/audio_processing/aec3/frame_blocker.h modules/audio_processing/aec3/frame_blocker.cc modules/audio_processing/aec3/fullband_erle_estimator.h modules/audio_processing/aec3/fullband_erle_estimator.cc + modules/audio_processing/aec3/matched_filter_avx2.cc modules/audio_processing/aec3/matched_filter_lag_aggregator.h modules/audio_processing/aec3/matched_filter_lag_aggregator.cc modules/audio_processing/aec3/matched_filter.h modules/audio_processing/aec3/matched_filter.cc modules/audio_processing/aec3/moving_average.h modules/audio_processing/aec3/moving_average.cc @@ -166,9 +173,10 @@ add_library(VocieProcess modules/audio_processing/aec3/suppression_filter.h modules/audio_processing/aec3/suppression_filter.cc modules/audio_processing/aec3/suppression_gain.h modules/audio_processing/aec3/suppression_gain.cc modules/audio_processing/aec3/transparent_mode.h modules/audio_processing/aec3/transparent_mode.cc + modules/audio_processing/aec3/vector_math_avx2.cc modules/audio_processing/aecm/aecm_core.h modules/audio_processing/aecm/aecm_core.cc modules/audio_processing/aecm/aecm_core_c.cc - modules/audio_processing/aecm/aecm_core_neon.cc + $<$:modules/audio_processing/aecm/aecm_core_neon.cc> modules/audio_processing/aecm/echo_control_mobile.h modules/audio_processing/aecm/echo_control_mobile.cc modules/audio_processing/capture_levels_adjuster/audio_samples_scaler.h modules/audio_processing/capture_levels_adjuster/audio_samples_scaler.cc @@ -205,18 +213,26 @@ add_library(VocieProcess modules/third_party/fft/fft.h modules/third_party/fft/fft.c + system_wrappers/source/cpu_features_linux.cc + system_wrappers/source/cpu_features.cc system_wrappers/source/field_trial.cc system_wrappers/source/metrics.cc ) +if(NOT CROSS_BUILD) + target_compile_options(VocieProcess + PRIVATE -Wpsabi -mavx2 -mfma + ) +endif() + target_compile_definitions(VocieProcess PRIVATE NOMINMAX # # PRIVATE RTC_DISABLE_LOGGING # PUBLIC RTC_DISABLE_METRICS - PUBLIC WEBRTC_HAS_NEON PUBLIC WEBRTC_APM_DEBUG_DUMP=1 $<$:WEBRTC_WIN> $<$:WEBRTC_POSIX WEBRTC_LINUX> + $<$:WEBRTC_HAS_NEON> ) target_include_directories(VocieProcess diff --git a/VocieProcess/common_audio/resampler/sinc_resampler_avx2.cc b/VocieProcess/common_audio/resampler/sinc_resampler_avx2.cc new file mode 100644 index 0000000..d945a10 --- /dev/null +++ b/VocieProcess/common_audio/resampler/sinc_resampler_avx2.cc @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "common_audio/resampler/sinc_resampler.h" + +namespace webrtc { + +float SincResampler::Convolve_AVX2(const float* input_ptr, + const float* k1, + const float* k2, + double kernel_interpolation_factor) { + __m256 m_input; + __m256 m_sums1 = _mm256_setzero_ps(); + __m256 m_sums2 = _mm256_setzero_ps(); + + // Based on `input_ptr` alignment, we need to use loadu or load. Unrolling + // these loops has not been tested or benchmarked. + bool aligned_input = (reinterpret_cast(input_ptr) & 0x1F) == 0; + if (!aligned_input) { + for (size_t i = 0; i < kKernelSize; i += 8) { + m_input = _mm256_loadu_ps(input_ptr + i); + m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1); + m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2); + } + } else { + for (size_t i = 0; i < kKernelSize; i += 8) { + m_input = _mm256_load_ps(input_ptr + i); + m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1); + m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2); + } + } + + // Linearly interpolate the two "convolutions". + __m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0), + _mm256_extractf128_ps(m_sums1, 1)); + __m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0), + _mm256_extractf128_ps(m_sums2, 1)); + m128_sums1 = _mm_mul_ps( + m128_sums1, + _mm_set_ps1(static_cast(1.0 - kernel_interpolation_factor))); + m128_sums2 = _mm_mul_ps( + m128_sums2, _mm_set_ps1(static_cast(kernel_interpolation_factor))); + m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2); + + // Sum components together. + float result; + m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1); + _mm_store_ss(&result, _mm_add_ss(m128_sums2, + _mm_shuffle_ps(m128_sums2, m128_sums2, 1))); + + return result; +} + +} // namespace webrtc diff --git a/VocieProcess/common_audio/resampler/sinc_resampler_sse.cc b/VocieProcess/common_audio/resampler/sinc_resampler_sse.cc new file mode 100644 index 0000000..30a8d1b --- /dev/null +++ b/VocieProcess/common_audio/resampler/sinc_resampler_sse.cc @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Modified from the Chromium original: +// src/media/base/simd/sinc_resampler_sse.cc + +#include +#include +#include + +#include "common_audio/resampler/sinc_resampler.h" + +namespace webrtc { + +float SincResampler::Convolve_SSE(const float* input_ptr, + const float* k1, + const float* k2, + double kernel_interpolation_factor) { + __m128 m_input; + __m128 m_sums1 = _mm_setzero_ps(); + __m128 m_sums2 = _mm_setzero_ps(); + + // Based on `input_ptr` alignment, we need to use loadu or load. Unrolling + // these loops hurt performance in local testing. + if (reinterpret_cast(input_ptr) & 0x0F) { + for (size_t i = 0; i < kKernelSize; i += 4) { + m_input = _mm_loadu_ps(input_ptr + i); + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); + } + } else { + for (size_t i = 0; i < kKernelSize; i += 4) { + m_input = _mm_load_ps(input_ptr + i); + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); + } + } + + // Linearly interpolate the two "convolutions". + m_sums1 = _mm_mul_ps( + m_sums1, + _mm_set_ps1(static_cast(1.0 - kernel_interpolation_factor))); + m_sums2 = _mm_mul_ps( + m_sums2, _mm_set_ps1(static_cast(kernel_interpolation_factor))); + m_sums1 = _mm_add_ps(m_sums1, m_sums2); + + // Sum components together. + float result; + m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); + _mm_store_ss(&result, + _mm_add_ss(m_sums2, _mm_shuffle_ps(m_sums2, m_sums2, 1))); + + return result; +} + +} // namespace webrtc diff --git a/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_sse2.cc b/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_sse2.cc new file mode 100644 index 0000000..7f0802d --- /dev/null +++ b/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_sse2.cc @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h" +#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h" +#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h" +#include "rtc_base/system/arch.h" + +namespace webrtc { + +#if defined(WEBRTC_ARCH_X86_FAMILY) + +namespace { +// These intrinsics were unavailable before VS 2008. +// TODO(andrew): move to a common file. +#if defined(_MSC_VER) && _MSC_VER < 1500 +static __inline __m128 _mm_castsi128_ps(__m128i a) { + return *(__m128*)&a; +} +static __inline __m128i _mm_castps_si128(__m128 a) { + return *(__m128i*)&a; +} +#endif + +} // namespace + +void cft1st_128_SSE2(float* a) { + const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); + int j, k2; + + for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { + __m128 a00v = _mm_loadu_ps(&a[j + 0]); + __m128 a04v = _mm_loadu_ps(&a[j + 4]); + __m128 a08v = _mm_loadu_ps(&a[j + 8]); + __m128 a12v = _mm_loadu_ps(&a[j + 12]); + __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0)); + __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2)); + __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0)); + __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2)); + + const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]); + const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]); + const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]); + const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]); + const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]); + const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]); + __m128 x0v = _mm_add_ps(a01v, a23v); + const __m128 x1v = _mm_sub_ps(a01v, a23v); + const __m128 x2v = _mm_add_ps(a45v, a67v); + const __m128 x3v = _mm_sub_ps(a45v, a67v); + __m128 x0w; + a01v = _mm_add_ps(x0v, x2v); + x0v = _mm_sub_ps(x0v, x2v); + x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1)); + { + const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v); + const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w); + a45v = _mm_add_ps(a45_0v, a45_1v); + } + { + __m128 a23_0v, a23_1v; + const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w); + x0v = _mm_add_ps(x1v, x3s); + x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1)); + a23_0v = _mm_mul_ps(wk1rv, x0v); + a23_1v = _mm_mul_ps(wk1iv, x0w); + a23v = _mm_add_ps(a23_0v, a23_1v); + + x0v = _mm_sub_ps(x1v, x3s); + x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1)); + } + { + const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v); + const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w); + a67v = _mm_add_ps(a67_0v, a67_1v); + } + + a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0)); + a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0)); + a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2)); + a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2)); + _mm_storeu_ps(&a[j + 0], a00v); + _mm_storeu_ps(&a[j + 4], a04v); + _mm_storeu_ps(&a[j + 8], a08v); + _mm_storeu_ps(&a[j + 12], a12v); + } +} + +void cftmdl_128_SSE2(float* a) { + const int l = 8; + const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); + int j0; + + __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); + for (j0 = 0; j0 < l; j0 += 2) { + const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); + const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); + const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); + const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); + const __m128 a_00_32 = + _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 a_08_40 = + _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), + _MM_SHUFFLE(1, 0, 1, 0)); + __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); + const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); + + const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); + const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); + const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); + const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); + const __m128 a_16_48 = + _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 a_24_56 = + _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); + const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); + + const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + + const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( + _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); + const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); + const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + + const __m128 yy0 = + _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2)); + const __m128 yy1 = + _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3)); + const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1); + const __m128 yy3 = _mm_add_ps(yy0, yy2); + const __m128 yy4 = _mm_mul_ps(wk1rv, yy3); + + _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 32], + _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2))); + + _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 48], + _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3))); + a[j0 + 48] = -a[j0 + 48]; + + _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add)); + _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub)); + + _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 56], + _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3))); + } + + { + int k = 64; + int k1 = 2; + int k2 = 2 * k1; + const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]); + const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); + const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); + const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); + const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); + wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); + for (j0 = k; j0 < l + k; j0 += 2) { + const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); + const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); + const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); + const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); + const __m128 a_00_32 = + _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 a_08_40 = + _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), + _MM_SHUFFLE(1, 0, 1, 0)); + __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); + const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); + + const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); + const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); + const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); + const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); + const __m128 a_16_48 = + _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 a_24_56 = + _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); + const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); + + const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); + const __m128 xx3 = _mm_mul_ps( + wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1), + _MM_SHUFFLE(2, 3, 0, 1)))); + const __m128 xx4 = _mm_add_ps(xx2, xx3); + + const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( + _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); + const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); + const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + + const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); + const __m128 xx11 = _mm_mul_ps( + wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), + _MM_SHUFFLE(2, 3, 0, 1)))); + const __m128 xx12 = _mm_add_ps(xx10, xx11); + + const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); + const __m128 xx21 = _mm_mul_ps( + wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), + _MM_SHUFFLE(2, 3, 0, 1)))); + const __m128 xx22 = _mm_add_ps(xx20, xx21); + + _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 32], + _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); + + _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 48], + _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); + + _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 40], + _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); + + _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 56], + _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); + } + } +} + +void rftfsub_128_SSE2(float* a) { + const float* c = rdft_w + 32; + int j1, j2, k1, k2; + float wkr, wki, xr, xi, yr, yi; + + static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, + 0.5f}; + const __m128 mm_half = _mm_load_ps(k_half); + + // Vectorized code (four at once). + // Note: commented number are indexes for the first iteration of the loop. + for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { + // Load 'wk'. + const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, + const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, + const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, + const __m128 wkr_ = + _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28, + const __m128 wki_ = c_j1; // 1, 2, 3, 4, + // Load and shuffle 'a'. + const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5, + const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9, + const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123, + const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127, + const __m128 a_j2_p0 = _mm_shuffle_ps( + a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8, + const __m128 a_j2_p1 = _mm_shuffle_ps( + a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9, + const __m128 a_k2_p0 = _mm_shuffle_ps( + a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120, + const __m128 a_k2_p1 = _mm_shuffle_ps( + a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121, + // Calculate 'x'. + const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0); + // 2-126, 4-124, 6-122, 8-120, + const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1); + // 3-127, 5-125, 7-123, 9-121, + // Calculate product into 'y'. + // yr = wkr * xr - wki * xi; + // yi = wkr * xi + wki * xr; + const __m128 a_ = _mm_mul_ps(wkr_, xr_); + const __m128 b_ = _mm_mul_ps(wki_, xi_); + const __m128 c_ = _mm_mul_ps(wkr_, xi_); + const __m128 d_ = _mm_mul_ps(wki_, xr_); + const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120, + const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121, + // Update 'a'. + // a[j2 + 0] -= yr; + // a[j2 + 1] -= yi; + // a[k2 + 0] += yr; + // a[k2 + 1] -= yi; + const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8, + const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_); // 3, 5, 7, 9, + const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120, + const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_); // 127, 125, 123, 121, + // Shuffle in right order and store. + const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n); + // 2, 3, 4, 5, + const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n); + // 6, 7, 8, 9, + const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n); + // 122, 123, 120, 121, + const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n); + // 126, 127, 124, 125, + const __m128 a_k2_0n = _mm_shuffle_ps( + a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123, + const __m128 a_k2_4n = _mm_shuffle_ps( + a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127, + _mm_storeu_ps(&a[0 + j2], a_j2_0n); + _mm_storeu_ps(&a[4 + j2], a_j2_4n); + _mm_storeu_ps(&a[122 - j2], a_k2_0n); + _mm_storeu_ps(&a[126 - j2], a_k2_4n); + } + // Scalar code for the remaining items. + for (; j2 < 64; j1 += 1, j2 += 2) { + k2 = 128 - j2; + k1 = 32 - j1; + wkr = 0.5f - c[k1]; + wki = c[j1]; + xr = a[j2 + 0] - a[k2 + 0]; + xi = a[j2 + 1] + a[k2 + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + a[j2 + 0] -= yr; + a[j2 + 1] -= yi; + a[k2 + 0] += yr; + a[k2 + 1] -= yi; + } +} + +void rftbsub_128_SSE2(float* a) { + const float* c = rdft_w + 32; + int j1, j2, k1, k2; + float wkr, wki, xr, xi, yr, yi; + + static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, + 0.5f}; + const __m128 mm_half = _mm_load_ps(k_half); + + a[1] = -a[1]; + // Vectorized code (four at once). + // Note: commented number are indexes for the first iteration of the loop. + for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { + // Load 'wk'. + const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, + const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, + const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, + const __m128 wkr_ = + _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28, + const __m128 wki_ = c_j1; // 1, 2, 3, 4, + // Load and shuffle 'a'. + const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5, + const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9, + const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123, + const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127, + const __m128 a_j2_p0 = _mm_shuffle_ps( + a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8, + const __m128 a_j2_p1 = _mm_shuffle_ps( + a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9, + const __m128 a_k2_p0 = _mm_shuffle_ps( + a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120, + const __m128 a_k2_p1 = _mm_shuffle_ps( + a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121, + // Calculate 'x'. + const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0); + // 2-126, 4-124, 6-122, 8-120, + const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1); + // 3-127, 5-125, 7-123, 9-121, + // Calculate product into 'y'. + // yr = wkr * xr + wki * xi; + // yi = wkr * xi - wki * xr; + const __m128 a_ = _mm_mul_ps(wkr_, xr_); + const __m128 b_ = _mm_mul_ps(wki_, xi_); + const __m128 c_ = _mm_mul_ps(wkr_, xi_); + const __m128 d_ = _mm_mul_ps(wki_, xr_); + const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120, + const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121, + // Update 'a'. + // a[j2 + 0] = a[j2 + 0] - yr; + // a[j2 + 1] = yi - a[j2 + 1]; + // a[k2 + 0] = yr + a[k2 + 0]; + // a[k2 + 1] = yi - a[k2 + 1]; + const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8, + const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1); // 3, 5, 7, 9, + const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120, + const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1); // 127, 125, 123, 121, + // Shuffle in right order and store. + const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n); + // 2, 3, 4, 5, + const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n); + // 6, 7, 8, 9, + const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n); + // 122, 123, 120, 121, + const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n); + // 126, 127, 124, 125, + const __m128 a_k2_0n = _mm_shuffle_ps( + a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123, + const __m128 a_k2_4n = _mm_shuffle_ps( + a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127, + _mm_storeu_ps(&a[0 + j2], a_j2_0n); + _mm_storeu_ps(&a[4 + j2], a_j2_4n); + _mm_storeu_ps(&a[122 - j2], a_k2_0n); + _mm_storeu_ps(&a[126 - j2], a_k2_4n); + } + // Scalar code for the remaining items. + for (; j2 < 64; j1 += 1, j2 += 2) { + k2 = 128 - j2; + k1 = 32 - j1; + wkr = 0.5f - c[k1]; + wki = c[j1]; + xr = a[j2 + 0] - a[k2 + 0]; + xi = a[j2 + 1] + a[k2 + 1]; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j2 + 0] = a[j2 + 0] - yr; + a[j2 + 1] = yi - a[j2 + 1]; + a[k2 + 0] = yr + a[k2 + 0]; + a[k2 + 1] = yi - a[k2 + 1]; + } + a[65] = -a[65]; +} +#endif + +} // namespace webrtc diff --git a/VocieProcess/modules/audio_processing/aec3/adaptive_fir_filter_avx2.cc b/VocieProcess/modules/audio_processing/aec3/adaptive_fir_filter_avx2.cc new file mode 100644 index 0000000..b6eda9f --- /dev/null +++ b/VocieProcess/modules/audio_processing/aec3/adaptive_fir_filter_avx2.cc @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "modules/audio_processing/aec3/adaptive_fir_filter.h" +#include "rtc_base/checks.h" + +namespace webrtc { + +namespace aec3 { + +// Computes and stores the frequency response of the filter. +void ComputeFrequencyResponse_Avx2( + size_t num_partitions, + const std::vector>& H, + std::vector>* H2) { + for (auto& H2_ch : *H2) { + H2_ch.fill(0.f); + } + + const size_t num_render_channels = H[0].size(); + RTC_DCHECK_EQ(H.size(), H2->capacity()); + for (size_t p = 0; p < num_partitions; ++p) { + RTC_DCHECK_EQ(kFftLengthBy2Plus1, (*H2)[p].size()); + auto& H2_p = (*H2)[p]; + for (size_t ch = 0; ch < num_render_channels; ++ch) { + const FftData& H_p_ch = H[p][ch]; + for (size_t j = 0; j < kFftLengthBy2; j += 8) { + __m256 re = _mm256_loadu_ps(&H_p_ch.re[j]); + __m256 re2 = _mm256_mul_ps(re, re); + __m256 im = _mm256_loadu_ps(&H_p_ch.im[j]); + re2 = _mm256_fmadd_ps(im, im, re2); + __m256 H2_k_j = _mm256_loadu_ps(&H2_p[j]); + H2_k_j = _mm256_max_ps(H2_k_j, re2); + _mm256_storeu_ps(&H2_p[j], H2_k_j); + } + float H2_new = H_p_ch.re[kFftLengthBy2] * H_p_ch.re[kFftLengthBy2] + + H_p_ch.im[kFftLengthBy2] * H_p_ch.im[kFftLengthBy2]; + H2_p[kFftLengthBy2] = std::max(H2_p[kFftLengthBy2], H2_new); + } + } +} + +// Adapts the filter partitions. +void AdaptPartitions_Avx2(const RenderBuffer& render_buffer, + const FftData& G, + size_t num_partitions, + std::vector>* H) { + rtc::ArrayView> render_buffer_data = + render_buffer.GetFftBuffer(); + const size_t num_render_channels = render_buffer_data[0].size(); + const size_t lim1 = std::min( + render_buffer_data.size() - render_buffer.Position(), num_partitions); + const size_t lim2 = num_partitions; + constexpr size_t kNumEightBinBands = kFftLengthBy2 / 8; + + size_t X_partition = render_buffer.Position(); + size_t limit = lim1; + size_t p = 0; + do { + for (; p < limit; ++p, ++X_partition) { + for (size_t ch = 0; ch < num_render_channels; ++ch) { + FftData& H_p_ch = (*H)[p][ch]; + const FftData& X = render_buffer_data[X_partition][ch]; + + for (size_t k = 0, n = 0; n < kNumEightBinBands; ++n, k += 8) { + const __m256 G_re = _mm256_loadu_ps(&G.re[k]); + const __m256 G_im = _mm256_loadu_ps(&G.im[k]); + const __m256 X_re = _mm256_loadu_ps(&X.re[k]); + const __m256 X_im = _mm256_loadu_ps(&X.im[k]); + const __m256 H_re = _mm256_loadu_ps(&H_p_ch.re[k]); + const __m256 H_im = _mm256_loadu_ps(&H_p_ch.im[k]); + const __m256 a = _mm256_mul_ps(X_re, G_re); + const __m256 b = _mm256_mul_ps(X_im, G_im); + const __m256 c = _mm256_mul_ps(X_re, G_im); + const __m256 d = _mm256_mul_ps(X_im, G_re); + const __m256 e = _mm256_add_ps(a, b); + const __m256 f = _mm256_sub_ps(c, d); + const __m256 g = _mm256_add_ps(H_re, e); + const __m256 h = _mm256_add_ps(H_im, f); + _mm256_storeu_ps(&H_p_ch.re[k], g); + _mm256_storeu_ps(&H_p_ch.im[k], h); + } + } + } + X_partition = 0; + limit = lim2; + } while (p < lim2); + + X_partition = render_buffer.Position(); + limit = lim1; + p = 0; + do { + for (; p < limit; ++p, ++X_partition) { + for (size_t ch = 0; ch < num_render_channels; ++ch) { + FftData& H_p_ch = (*H)[p][ch]; + const FftData& X = render_buffer_data[X_partition][ch]; + + H_p_ch.re[kFftLengthBy2] += X.re[kFftLengthBy2] * G.re[kFftLengthBy2] + + X.im[kFftLengthBy2] * G.im[kFftLengthBy2]; + H_p_ch.im[kFftLengthBy2] += X.re[kFftLengthBy2] * G.im[kFftLengthBy2] - + X.im[kFftLengthBy2] * G.re[kFftLengthBy2]; + } + } + + X_partition = 0; + limit = lim2; + } while (p < lim2); +} + +// Produces the filter output (AVX2 variant). +void ApplyFilter_Avx2(const RenderBuffer& render_buffer, + size_t num_partitions, + const std::vector>& H, + FftData* S) { + RTC_DCHECK_GE(H.size(), H.size() - 1); + S->re.fill(0.f); + S->im.fill(0.f); + + rtc::ArrayView> render_buffer_data = + render_buffer.GetFftBuffer(); + const size_t num_render_channels = render_buffer_data[0].size(); + const size_t lim1 = std::min( + render_buffer_data.size() - render_buffer.Position(), num_partitions); + const size_t lim2 = num_partitions; + constexpr size_t kNumEightBinBands = kFftLengthBy2 / 8; + + size_t X_partition = render_buffer.Position(); + size_t p = 0; + size_t limit = lim1; + do { + for (; p < limit; ++p, ++X_partition) { + for (size_t ch = 0; ch < num_render_channels; ++ch) { + const FftData& H_p_ch = H[p][ch]; + const FftData& X = render_buffer_data[X_partition][ch]; + for (size_t k = 0, n = 0; n < kNumEightBinBands; ++n, k += 8) { + const __m256 X_re = _mm256_loadu_ps(&X.re[k]); + const __m256 X_im = _mm256_loadu_ps(&X.im[k]); + const __m256 H_re = _mm256_loadu_ps(&H_p_ch.re[k]); + const __m256 H_im = _mm256_loadu_ps(&H_p_ch.im[k]); + const __m256 S_re = _mm256_loadu_ps(&S->re[k]); + const __m256 S_im = _mm256_loadu_ps(&S->im[k]); + const __m256 a = _mm256_mul_ps(X_re, H_re); + const __m256 b = _mm256_mul_ps(X_im, H_im); + const __m256 c = _mm256_mul_ps(X_re, H_im); + const __m256 d = _mm256_mul_ps(X_im, H_re); + const __m256 e = _mm256_sub_ps(a, b); + const __m256 f = _mm256_add_ps(c, d); + const __m256 g = _mm256_add_ps(S_re, e); + const __m256 h = _mm256_add_ps(S_im, f); + _mm256_storeu_ps(&S->re[k], g); + _mm256_storeu_ps(&S->im[k], h); + } + } + } + limit = lim2; + X_partition = 0; + } while (p < lim2); + + X_partition = render_buffer.Position(); + p = 0; + limit = lim1; + do { + for (; p < limit; ++p, ++X_partition) { + for (size_t ch = 0; ch < num_render_channels; ++ch) { + const FftData& H_p_ch = H[p][ch]; + const FftData& X = render_buffer_data[X_partition][ch]; + S->re[kFftLengthBy2] += X.re[kFftLengthBy2] * H_p_ch.re[kFftLengthBy2] - + X.im[kFftLengthBy2] * H_p_ch.im[kFftLengthBy2]; + S->im[kFftLengthBy2] += X.re[kFftLengthBy2] * H_p_ch.im[kFftLengthBy2] + + X.im[kFftLengthBy2] * H_p_ch.re[kFftLengthBy2]; + } + } + limit = lim2; + X_partition = 0; + } while (p < lim2); +} + +} // namespace aec3 +} // namespace webrtc diff --git a/VocieProcess/modules/audio_processing/aec3/adaptive_fir_filter_erl_avx2.cc b/VocieProcess/modules/audio_processing/aec3/adaptive_fir_filter_erl_avx2.cc new file mode 100644 index 0000000..1e63cf8 --- /dev/null +++ b/VocieProcess/modules/audio_processing/aec3/adaptive_fir_filter_erl_avx2.cc @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "modules/audio_processing/aec3/adaptive_fir_filter_erl.h" + +namespace webrtc { + +namespace aec3 { + +// Computes and stores the echo return loss estimate of the filter, which is the +// sum of the partition frequency responses. +void ErlComputer_AVX2( + const std::vector>& H2, + rtc::ArrayView erl) { + std::fill(erl.begin(), erl.end(), 0.f); + for (auto& H2_j : H2) { + for (size_t k = 0; k < kFftLengthBy2; k += 8) { + const __m256 H2_j_k = _mm256_loadu_ps(&H2_j[k]); + __m256 erl_k = _mm256_loadu_ps(&erl[k]); + erl_k = _mm256_add_ps(erl_k, H2_j_k); + _mm256_storeu_ps(&erl[k], erl_k); + } + erl[kFftLengthBy2] += H2_j[kFftLengthBy2]; + } +} + +} // namespace aec3 +} // namespace webrtc diff --git a/VocieProcess/modules/audio_processing/aec3/fft_data_avx2.cc b/VocieProcess/modules/audio_processing/aec3/fft_data_avx2.cc new file mode 100644 index 0000000..a4b3056 --- /dev/null +++ b/VocieProcess/modules/audio_processing/aec3/fft_data_avx2.cc @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "api/array_view.h" +#include "modules/audio_processing/aec3/fft_data.h" + +namespace webrtc { + +// Computes the power spectrum of the data. +void FftData::SpectrumAVX2(rtc::ArrayView power_spectrum) const { + RTC_DCHECK_EQ(kFftLengthBy2Plus1, power_spectrum.size()); + for (size_t k = 0; k < kFftLengthBy2; k += 8) { + __m256 r = _mm256_loadu_ps(&re[k]); + __m256 i = _mm256_loadu_ps(&im[k]); + __m256 ii = _mm256_mul_ps(i, i); + ii = _mm256_fmadd_ps(r, r, ii); + _mm256_storeu_ps(&power_spectrum[k], ii); + } + power_spectrum[kFftLengthBy2] = re[kFftLengthBy2] * re[kFftLengthBy2] + + im[kFftLengthBy2] * im[kFftLengthBy2]; +} + +} // namespace webrtc diff --git a/VocieProcess/modules/audio_processing/aec3/matched_filter_avx2.cc b/VocieProcess/modules/audio_processing/aec3/matched_filter_avx2.cc new file mode 100644 index 0000000..8c2ffcb --- /dev/null +++ b/VocieProcess/modules/audio_processing/aec3/matched_filter_avx2.cc @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "modules/audio_processing/aec3/matched_filter.h" +#include "rtc_base/checks.h" + +namespace webrtc { +namespace aec3 { + +// Let ha denote the horizontal of a, and hb the horizontal sum of b +// returns [ha, hb, ha, hb] +inline __m128 hsum_ab(__m256 a, __m256 b) { + __m256 s_256 = _mm256_hadd_ps(a, b); + const __m256i mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + s_256 = _mm256_permutevar8x32_ps(s_256, mask); + __m128 s = _mm_hadd_ps(_mm256_extractf128_ps(s_256, 0), + _mm256_extractf128_ps(s_256, 1)); + s = _mm_hadd_ps(s, s); + return s; +} + +void MatchedFilterCore_AccumulatedError_AVX2( + size_t x_start_index, + float x2_sum_threshold, + float smoothing, + rtc::ArrayView x, + rtc::ArrayView y, + rtc::ArrayView h, + bool* filters_updated, + float* error_sum, + rtc::ArrayView accumulated_error, + rtc::ArrayView scratch_memory) { + const int h_size = static_cast(h.size()); + const int x_size = static_cast(x.size()); + RTC_DCHECK_EQ(0, h_size % 16); + std::fill(accumulated_error.begin(), accumulated_error.end(), 0.0f); + + // Process for all samples in the sub-block. + for (size_t i = 0; i < y.size(); ++i) { + // Apply the matched filter as filter * x, and compute x * x. + RTC_DCHECK_GT(x_size, x_start_index); + const int chunk1 = + std::min(h_size, static_cast(x_size - x_start_index)); + if (chunk1 != h_size) { + const int chunk2 = h_size - chunk1; + std::copy(x.begin() + x_start_index, x.end(), scratch_memory.begin()); + std::copy(x.begin(), x.begin() + chunk2, scratch_memory.begin() + chunk1); + } + const float* x_p = + chunk1 != h_size ? scratch_memory.data() : &x[x_start_index]; + const float* h_p = &h[0]; + float* a_p = &accumulated_error[0]; + __m256 s_inst_hadd_256; + __m256 s_inst_256; + __m256 s_inst_256_8; + __m256 x2_sum_256 = _mm256_set1_ps(0); + __m256 x2_sum_256_8 = _mm256_set1_ps(0); + __m128 e_128; + float x2_sum = 0.0f; + float s_acum = 0; + const int limit_by_16 = h_size >> 4; + for (int k = limit_by_16; k > 0; --k, h_p += 16, x_p += 16, a_p += 4) { + // Load the data into 256 bit vectors. + __m256 x_k = _mm256_loadu_ps(x_p); + __m256 h_k = _mm256_loadu_ps(h_p); + __m256 x_k_8 = _mm256_loadu_ps(x_p + 8); + __m256 h_k_8 = _mm256_loadu_ps(h_p + 8); + // Compute and accumulate x * x and h * x. + x2_sum_256 = _mm256_fmadd_ps(x_k, x_k, x2_sum_256); + x2_sum_256_8 = _mm256_fmadd_ps(x_k_8, x_k_8, x2_sum_256_8); + s_inst_256 = _mm256_mul_ps(h_k, x_k); + s_inst_256_8 = _mm256_mul_ps(h_k_8, x_k_8); + s_inst_hadd_256 = _mm256_hadd_ps(s_inst_256, s_inst_256_8); + s_inst_hadd_256 = _mm256_hadd_ps(s_inst_hadd_256, s_inst_hadd_256); + s_acum += s_inst_hadd_256[0]; + e_128[0] = s_acum - y[i]; + s_acum += s_inst_hadd_256[4]; + e_128[1] = s_acum - y[i]; + s_acum += s_inst_hadd_256[1]; + e_128[2] = s_acum - y[i]; + s_acum += s_inst_hadd_256[5]; + e_128[3] = s_acum - y[i]; + + __m128 accumulated_error = _mm_load_ps(a_p); + accumulated_error = _mm_fmadd_ps(e_128, e_128, accumulated_error); + _mm_storeu_ps(a_p, accumulated_error); + } + // Sum components together. + x2_sum_256 = _mm256_add_ps(x2_sum_256, x2_sum_256_8); + __m128 x2_sum_128 = _mm_add_ps(_mm256_extractf128_ps(x2_sum_256, 0), + _mm256_extractf128_ps(x2_sum_256, 1)); + // Combine the accumulated vector and scalar values. + float* v = reinterpret_cast(&x2_sum_128); + x2_sum += v[0] + v[1] + v[2] + v[3]; + + // Compute the matched filter error. + float e = y[i] - s_acum; + const bool saturation = y[i] >= 32000.f || y[i] <= -32000.f; + (*error_sum) += e * e; + + // Update the matched filter estimate in an NLMS manner. + if (x2_sum > x2_sum_threshold && !saturation) { + RTC_DCHECK_LT(0.f, x2_sum); + const float alpha = smoothing * e / x2_sum; + const __m256 alpha_256 = _mm256_set1_ps(alpha); + + // filter = filter + smoothing * (y - filter * x) * x / x * x. + float* h_p = &h[0]; + const float* x_p = + chunk1 != h_size ? scratch_memory.data() : &x[x_start_index]; + // Perform 256 bit vector operations. + const int limit_by_8 = h_size >> 3; + for (int k = limit_by_8; k > 0; --k, h_p += 8, x_p += 8) { + // Load the data into 256 bit vectors. + __m256 h_k = _mm256_loadu_ps(h_p); + __m256 x_k = _mm256_loadu_ps(x_p); + // Compute h = h + alpha * x. + h_k = _mm256_fmadd_ps(x_k, alpha_256, h_k); + + // Store the result. + _mm256_storeu_ps(h_p, h_k); + } + *filters_updated = true; + } + + x_start_index = x_start_index > 0 ? x_start_index - 1 : x_size - 1; + } +} + +void MatchedFilterCore_AVX2(size_t x_start_index, + float x2_sum_threshold, + float smoothing, + rtc::ArrayView x, + rtc::ArrayView y, + rtc::ArrayView h, + bool* filters_updated, + float* error_sum, + bool compute_accumulated_error, + rtc::ArrayView accumulated_error, + rtc::ArrayView scratch_memory) { + if (compute_accumulated_error) { + return MatchedFilterCore_AccumulatedError_AVX2( + x_start_index, x2_sum_threshold, smoothing, x, y, h, filters_updated, + error_sum, accumulated_error, scratch_memory); + } + const int h_size = static_cast(h.size()); + const int x_size = static_cast(x.size()); + RTC_DCHECK_EQ(0, h_size % 8); + + // Process for all samples in the sub-block. + for (size_t i = 0; i < y.size(); ++i) { + // Apply the matched filter as filter * x, and compute x * x. + + RTC_DCHECK_GT(x_size, x_start_index); + const float* x_p = &x[x_start_index]; + const float* h_p = &h[0]; + + // Initialize values for the accumulation. + __m256 s_256 = _mm256_set1_ps(0); + __m256 s_256_8 = _mm256_set1_ps(0); + __m256 x2_sum_256 = _mm256_set1_ps(0); + __m256 x2_sum_256_8 = _mm256_set1_ps(0); + float x2_sum = 0.f; + float s = 0; + + // Compute loop chunk sizes until, and after, the wraparound of the circular + // buffer for x. + const int chunk1 = + std::min(h_size, static_cast(x_size - x_start_index)); + + // Perform the loop in two chunks. + const int chunk2 = h_size - chunk1; + for (int limit : {chunk1, chunk2}) { + // Perform 256 bit vector operations. + const int limit_by_16 = limit >> 4; + for (int k = limit_by_16; k > 0; --k, h_p += 16, x_p += 16) { + // Load the data into 256 bit vectors. + __m256 x_k = _mm256_loadu_ps(x_p); + __m256 h_k = _mm256_loadu_ps(h_p); + __m256 x_k_8 = _mm256_loadu_ps(x_p + 8); + __m256 h_k_8 = _mm256_loadu_ps(h_p + 8); + // Compute and accumulate x * x and h * x. + x2_sum_256 = _mm256_fmadd_ps(x_k, x_k, x2_sum_256); + x2_sum_256_8 = _mm256_fmadd_ps(x_k_8, x_k_8, x2_sum_256_8); + s_256 = _mm256_fmadd_ps(h_k, x_k, s_256); + s_256_8 = _mm256_fmadd_ps(h_k_8, x_k_8, s_256_8); + } + + // Perform non-vector operations for any remaining items. + for (int k = limit - limit_by_16 * 16; k > 0; --k, ++h_p, ++x_p) { + const float x_k = *x_p; + x2_sum += x_k * x_k; + s += *h_p * x_k; + } + + x_p = &x[0]; + } + + // Sum components together. + x2_sum_256 = _mm256_add_ps(x2_sum_256, x2_sum_256_8); + s_256 = _mm256_add_ps(s_256, s_256_8); + __m128 sum = hsum_ab(x2_sum_256, s_256); + x2_sum += sum[0]; + s += sum[1]; + + // Compute the matched filter error. + float e = y[i] - s; + const bool saturation = y[i] >= 32000.f || y[i] <= -32000.f; + (*error_sum) += e * e; + + // Update the matched filter estimate in an NLMS manner. + if (x2_sum > x2_sum_threshold && !saturation) { + RTC_DCHECK_LT(0.f, x2_sum); + const float alpha = smoothing * e / x2_sum; + const __m256 alpha_256 = _mm256_set1_ps(alpha); + + // filter = filter + smoothing * (y - filter * x) * x / x * x. + float* h_p = &h[0]; + x_p = &x[x_start_index]; + + // Perform the loop in two chunks. + for (int limit : {chunk1, chunk2}) { + // Perform 256 bit vector operations. + const int limit_by_8 = limit >> 3; + for (int k = limit_by_8; k > 0; --k, h_p += 8, x_p += 8) { + // Load the data into 256 bit vectors. + __m256 h_k = _mm256_loadu_ps(h_p); + __m256 x_k = _mm256_loadu_ps(x_p); + // Compute h = h + alpha * x. + h_k = _mm256_fmadd_ps(x_k, alpha_256, h_k); + + // Store the result. + _mm256_storeu_ps(h_p, h_k); + } + + // Perform non-vector operations for any remaining items. + for (int k = limit - limit_by_8 * 8; k > 0; --k, ++h_p, ++x_p) { + *h_p += alpha * *x_p; + } + + x_p = &x[0]; + } + + *filters_updated = true; + } + + x_start_index = x_start_index > 0 ? x_start_index - 1 : x_size - 1; + } +} + +} // namespace aec3 +} // namespace webrtc diff --git a/VocieProcess/modules/audio_processing/aec3/vector_math_avx2.cc b/VocieProcess/modules/audio_processing/aec3/vector_math_avx2.cc new file mode 100644 index 0000000..a9805da --- /dev/null +++ b/VocieProcess/modules/audio_processing/aec3/vector_math_avx2.cc @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "api/array_view.h" +#include "modules/audio_processing/aec3/vector_math.h" +#include "rtc_base/checks.h" + +namespace webrtc { +namespace aec3 { + +// Elementwise square root. +void VectorMath::SqrtAVX2(rtc::ArrayView x) { + const int x_size = static_cast(x.size()); + const int vector_limit = x_size >> 3; + + int j = 0; + for (; j < vector_limit * 8; j += 8) { + __m256 g = _mm256_loadu_ps(&x[j]); + g = _mm256_sqrt_ps(g); + _mm256_storeu_ps(&x[j], g); + } + + for (; j < x_size; ++j) { + x[j] = sqrtf(x[j]); + } +} + +// Elementwise vector multiplication z = x * y. +void VectorMath::MultiplyAVX2(rtc::ArrayView x, + rtc::ArrayView y, + rtc::ArrayView z) { + RTC_DCHECK_EQ(z.size(), x.size()); + RTC_DCHECK_EQ(z.size(), y.size()); + const int x_size = static_cast(x.size()); + const int vector_limit = x_size >> 3; + + int j = 0; + for (; j < vector_limit * 8; j += 8) { + const __m256 x_j = _mm256_loadu_ps(&x[j]); + const __m256 y_j = _mm256_loadu_ps(&y[j]); + const __m256 z_j = _mm256_mul_ps(x_j, y_j); + _mm256_storeu_ps(&z[j], z_j); + } + + for (; j < x_size; ++j) { + z[j] = x[j] * y[j]; + } +} + +// Elementwise vector accumulation z += x. +void VectorMath::AccumulateAVX2(rtc::ArrayView x, + rtc::ArrayView z) { + RTC_DCHECK_EQ(z.size(), x.size()); + const int x_size = static_cast(x.size()); + const int vector_limit = x_size >> 3; + + int j = 0; + for (; j < vector_limit * 8; j += 8) { + const __m256 x_j = _mm256_loadu_ps(&x[j]); + __m256 z_j = _mm256_loadu_ps(&z[j]); + z_j = _mm256_add_ps(x_j, z_j); + _mm256_storeu_ps(&z[j], z_j); + } + + for (; j < x_size; ++j) { + z[j] += x[j]; + } +} + +} // namespace aec3 +} // namespace webrtc diff --git a/VocieProcess/system_wrappers/source/cpu_features.cc b/VocieProcess/system_wrappers/source/cpu_features.cc new file mode 100644 index 0000000..4a6170c --- /dev/null +++ b/VocieProcess/system_wrappers/source/cpu_features.cc @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Parts of this file derived from Chromium's base/cpu.cc. + +#include "rtc_base/system/arch.h" +#include "system_wrappers/include/cpu_features_wrapper.h" + +#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(_MSC_VER) +#include +#endif + +namespace webrtc { + +// No CPU feature is available => straight C path. +int GetCPUInfoNoASM(CPUFeature feature) { + (void)feature; + return 0; +} + +#if defined(WEBRTC_ARCH_X86_FAMILY) + +#if defined(WEBRTC_ENABLE_AVX2) +// xgetbv returns the value of an Intel Extended Control Register (XCR). +// Currently only XCR0 is defined by Intel so `xcr` should always be zero. +static uint64_t xgetbv(uint32_t xcr) { +#if defined(_MSC_VER) + return _xgetbv(xcr); +#else + uint32_t eax, edx; + + __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return (static_cast(edx) << 32) | eax; +#endif // _MSC_VER +} +#endif // WEBRTC_ENABLE_AVX2 + +#ifndef _MSC_VER +// Intrinsic for "cpuid". +#if defined(__pic__) && defined(__i386__) +static inline void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile( + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), + "=d"(cpu_info[3]) + : "a"(info_type)); +} +#else +static inline void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile("cpuid\n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), + "=d"(cpu_info[3]) + : "a"(info_type), "c"(0)); +} +#endif +#endif // _MSC_VER +#endif // WEBRTC_ARCH_X86_FAMILY + +#if defined(WEBRTC_ARCH_X86_FAMILY) +// Actual feature detection for x86. +int GetCPUInfo(CPUFeature feature) { + int cpu_info[4]; + __cpuid(cpu_info, 1); + if (feature == kSSE2) { + return 0 != (cpu_info[3] & 0x04000000); + } + if (feature == kSSE3) { + return 0 != (cpu_info[2] & 0x00000001); + } +#if defined(WEBRTC_ENABLE_AVX2) + if (feature == kAVX2) { + int cpu_info7[4]; + __cpuid(cpu_info7, 0); + int num_ids = cpu_info7[0]; + if (num_ids < 7) { + return 0; + } + // Interpret CPU feature information. + __cpuid(cpu_info7, 7); + + // AVX instructions can be used when + // a) AVX are supported by the CPU, + // b) XSAVE is supported by the CPU, + // c) XSAVE is enabled by the kernel. + // Compiling with MSVC and /arch:AVX2 surprisingly generates BMI2 + // instructions (see crbug.com/1315519). + return (cpu_info[2] & 0x10000000) != 0 /* AVX */ && + (cpu_info[2] & 0x04000000) != 0 /* XSAVE */ && + (cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ && + (xgetbv(0) & 0x00000006) == 6 /* XSAVE enabled by kernel */ && + (cpu_info7[1] & 0x00000020) != 0 /* AVX2 */ && + (cpu_info7[1] & 0x00000100) != 0 /* BMI2 */; + } +#endif // WEBRTC_ENABLE_AVX2 + if (feature == kFMA3) { + return 0 != (cpu_info[2] & 0x00001000); + } + return 0; +} +#else +// Default to straight C for other platforms. +int GetCPUInfo(CPUFeature feature) { + (void)feature; + return 0; +} +#endif + +} // namespace webrtc diff --git a/VocieProcess/system_wrappers/source/cpu_features_linux.cc b/VocieProcess/system_wrappers/source/cpu_features_linux.cc new file mode 100644 index 0000000..2d79dde --- /dev/null +++ b/VocieProcess/system_wrappers/source/cpu_features_linux.cc @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#ifdef __GLIBC_PREREQ +#define WEBRTC_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b) +#else +#define WEBRTC_GLIBC_PREREQ(a, b) 0 +#endif + +#if WEBRTC_GLIBC_PREREQ(2, 16) +#include +#else +#include +#include +#include +#include +#endif + +#include "rtc_base/system/arch.h" +#include "system_wrappers/include/cpu_features_wrapper.h" + +#if defined(WEBRTC_ARCH_ARM_FAMILY) +#include + +namespace webrtc { + +uint64_t GetCPUFeaturesARM(void) { + uint64_t result = 0; + int architecture = 0; + uint64_t hwcap = 0; + const char* platform = NULL; +#if WEBRTC_GLIBC_PREREQ(2, 16) + hwcap = getauxval(AT_HWCAP); + platform = (const char*)getauxval(AT_PLATFORM); +#else + ElfW(auxv_t) auxv; + int fd = open("/proc/self/auxv", O_RDONLY); + if (fd >= 0) { + while (hwcap == 0 || platform == NULL) { + if (read(fd, &auxv, sizeof(auxv)) < (ssize_t)sizeof(auxv)) { + if (errno == EINTR) + continue; + break; + } + switch (auxv.a_type) { + case AT_HWCAP: + hwcap = auxv.a_un.a_val; + break; + case AT_PLATFORM: + platform = (const char*)auxv.a_un.a_val; + break; + } + } + close(fd); + } +#endif // WEBRTC_GLIBC_PREREQ(2, 16) +#if defined(__aarch64__) + (void)platform; + architecture = 8; + if ((hwcap & HWCAP_FP) != 0) + result |= kCPUFeatureVFPv3; + if ((hwcap & HWCAP_ASIMD) != 0) + result |= kCPUFeatureNEON; +#else + if (platform != NULL) { + /* expect a string in the form "v6l" or "v7l", etc. + */ + if (platform[0] == 'v' && '0' <= platform[1] && platform[1] <= '9' && + (platform[2] == 'l' || platform[2] == 'b')) { + architecture = platform[1] - '0'; + } + } + if ((hwcap & HWCAP_VFPv3) != 0) + result |= kCPUFeatureVFPv3; + if ((hwcap & HWCAP_NEON) != 0) + result |= kCPUFeatureNEON; +#endif + if (architecture >= 7) + result |= kCPUFeatureARMv7; + if (architecture >= 6) + result |= kCPUFeatureLDREXSTREX; + return result; +} + +} // namespace webrtc +#endif // WEBRTC_ARCH_ARM_FAMILY