Add NEON code.

2024-09-06 11:01:01 +08:00 · 2024-09-06 11:01:01 +08:00 · c6debcc62a
commit c6debcc62a
parent fe7ab957c8
11 changed files with 1312 additions and 21 deletions
--- a/Record/EchoRecord.cpp
+++ b/Record/EchoRecord.cpp
@ -10,19 +10,18 @@
 class EchoRecordPrivate {
 public:
-    EchoRecordPrivate() {
+    void initialize(int sampleRate, int channels, int period) {
        std::unique_ptr<webrtc::EchoCanceller3Factory> factory = std::make_unique<webrtc::EchoCanceller3Factory>();
-
+        echoCanceller = factory->Create(sampleRate, channels, channels);
-        echoCanceller = factory->Create(16000, 1, 1);
+        nearendBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);
-
+        farendBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);
-        // nearendBuffer = std::make_unique<webrtc::AudioBuffer>(16000, 1, 16000, 1, 16000, 1);
+        linearOutputBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);
        // farendBuffer = std::make_unique<webrtc::AudioBuffer>(16000, 1, 16000, 1, 16000, 1);
    }
    std::unique_ptr<webrtc::EchoControl> echoCanceller;
-    // std::unique_ptr<webrtc::AudioBuffer> nearendBuffer;
+    std::unique_ptr<webrtc::AudioBuffer> nearendBuffer;
-    // std::unique_ptr<webrtc::AudioBuffer> farendBuffer;
+    std::unique_ptr<webrtc::AudioBuffer> farendBuffer;
    std::unique_ptr<webrtc::AudioBuffer> linearOutputBuffer;
 };
 EchoRecordTask::EchoRecordTask() : m_d{new EchoRecordPrivate()} {
@ -56,7 +55,7 @@ void EchoRecordTask::run() {
    RkAudio::Format format;
    format.channels = m_channels;
-    format.period = 20;
+    format.period = 10;
    m_speex = std::make_shared<SpeexDsp>();
    m_speex->start(format.sampleRate, m_channels, format.period);
@ -66,6 +65,8 @@ void EchoRecordTask::run() {
    m_webRtcAecm = std::make_shared<WebRtcAecm>();
    m_webRtcAecm->start(format.sampleRate, format.channels, format.period);
    m_d->initialize(format.sampleRate, m_channels, format.period);
    m_output = std::make_shared<RkAudio::Output>();
    if (!m_output->open(sizeof(uint16_t), format.sampleRate, 2, format.period, m_dsp == Vqe)) {
        LOG(error) << "audio output open failed.";
@ -87,18 +88,16 @@ void EchoRecordTask::run() {
                                           reinterpret_cast<int16_t *>(m_outBuffer.data()), frame.frameSize);
        } else if (m_dsp == Aec3) {
            webrtc::StreamConfig config(format.sampleRate, format.channels); // 单声道
-            webrtc::AudioBuffer nearendBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1);
+            m_d->nearendBuffer->CopyFrom(reinterpret_cast<const int16_t *>(frame.data), config);
            webrtc::AudioBuffer farendBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1);
            webrtc::AudioBuffer linearOutputBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1);
            nearendBuffer.CopyFrom(reinterpret_cast<const int16_t *>(frame.data), config);
-            farendBuffer.CopyFrom(reinterpret_cast<const int16_t *>(m_farendBuffer.data()), config);
+            m_d->farendBuffer->CopyFrom(reinterpret_cast<const int16_t *>(m_farendBuffer.data()), config);
-            m_d->echoCanceller->AnalyzeRender(&farendBuffer);
+            m_d->echoCanceller->AnalyzeRender(m_d->farendBuffer.get());
-            m_d->echoCanceller->AnalyzeCapture(&nearendBuffer);
+            m_d->echoCanceller->AnalyzeCapture(m_d->nearendBuffer.get());
-            m_d->echoCanceller->ProcessCapture(&nearendBuffer, &linearOutputBuffer, /*level_change=*/false);
+            m_d->echoCanceller->ProcessCapture(m_d->nearendBuffer.get(), false);
            // m_d->echoCanceller->ProcessCapture(&nearendBuffer, &linearOutputBuffer, /*level_change=*/false);
-            linearOutputBuffer.CopyTo(config, reinterpret_cast<int16_t *>(m_outBuffer.data()));
+            m_d->nearendBuffer->CopyTo(config, reinterpret_cast<int16_t *>(m_outBuffer.data()));
        }
        if (m_channels == 2) {
--- a/VocieProcess/CMakeLists.txt
+++ b/VocieProcess/CMakeLists.txt
@ -28,25 +28,31 @@ add_library(VocieProcess
    common_audio/audio_util.cc
    common_audio/channel_buffer.h common_audio/channel_buffer.cc
    common_audio/fir_filter_neon.h common_audio/fir_filter_neon.cc
    common_audio/ring_buffer.h common_audio/ring_buffer.c
    common_audio/resampler/push_sinc_resampler.h common_audio/resampler/push_sinc_resampler.cc
-    common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler.cc
+    common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler_neon.cc
    common_audio/resampler/sinc_resampler.cc
    common_audio/signal_processing/complex_bit_reverse.c
    common_audio/signal_processing/complex_fft.c
    common_audio/signal_processing/cross_correlation_neon.c
    common_audio/signal_processing/cross_correlation.c
    common_audio/signal_processing/division_operations.c
    common_audio/signal_processing/dot_product_with_scale.h common_audio/signal_processing/dot_product_with_scale.cc
    common_audio/signal_processing/downsample_fast.c
    common_audio/signal_processing/downsample_fast_neon.c
    common_audio/signal_processing/min_max_operations.c
    common_audio/signal_processing/min_max_operations_neon.c
    common_audio/signal_processing/randomization_functions.c
    common_audio/signal_processing/real_fft.c
    common_audio/signal_processing/spl_init.c
    common_audio/signal_processing/splitting_filter.c
    common_audio/signal_processing/vector_scaling_operations.c
-    common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft.cc
+    common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc 
    common_audio/third_party/ooura/fft_size_128/ooura_fft.cc
    common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c
    rtc_base/checks.h rtc_base/checks.cc
@ -132,6 +138,7 @@ add_library(VocieProcess
    modules/audio_processing/aec3/transparent_mode.h modules/audio_processing/aec3/transparent_mode.cc
    modules/audio_processing/aecm/aecm_core.h modules/audio_processing/aecm/aecm_core.cc modules/audio_processing/aecm/aecm_core_c.cc
    modules/audio_processing/aecm/aecm_core_neon.cc
    modules/audio_processing/aecm/echo_control_mobile.h  modules/audio_processing/aecm/echo_control_mobile.cc
    modules/audio_processing/logging/apm_data_dumper.h modules/audio_processing/logging/apm_data_dumper.cc
@ -148,6 +155,7 @@ target_compile_definitions(VocieProcess
    PRIVATE NOMINMAX # <windows.h>
    PRIVATE RTC_DISABLE_LOGGING
    PUBLIC RTC_DISABLE_METRICS
    PUBLIC WEBRTC_HAS_NEON
    PUBLIC WEBRTC_APM_DEBUG_DUMP=0
    $<$<PLATFORM_ID:Windows>:WEBRTC_WIN>
    $<$<PLATFORM_ID:Linux>:WEBRTC_POSIX WEBRTC_LINUX>
--- a/VocieProcess/common_audio/fir_filter.h
+++ b/VocieProcess/common_audio/fir_filter.h
@ -0,0 +1,30 @@
 /*
 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef COMMON_AUDIO_FIR_FILTER_H_
 #define COMMON_AUDIO_FIR_FILTER_H_
 #include <string.h>
 namespace webrtc {
 // Abstract interface for a Finite Impulse Response filter that works on
 // floating-point samples. Implementations override Filter().
 class FIRFilter {
 public:
  // Virtual so an implementation can be destroyed through a FIRFilter pointer.
  virtual ~FIRFilter() = default;

  // Filters `length` samples read from `in` and writes the result to `out`.
  // The caller allocates `out`, which must hold at least `length` floats.
  virtual void Filter(const float* in, size_t length, float* out) = 0;
 };
 }  // namespace webrtc
 #endif  // COMMON_AUDIO_FIR_FILTER_H_
--- a/VocieProcess/common_audio/fir_filter_neon.cc
+++ b/VocieProcess/common_audio/fir_filter_neon.cc
@ -0,0 +1,73 @@
 /*
 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "common_audio/fir_filter_neon.h"
 #include <arm_neon.h>
 #include <string.h>
 #include "rtc_base/checks.h"
 #include "rtc_base/memory/aligned_malloc.h"
 namespace webrtc {
 // Nothing to release by hand: the destructor body is empty and the members
 // clean up after themselves.
 FIRFilterNEON::~FIRFilterNEON() = default;
 // Builds the filter from `coefficients_length` taps in `coefficients`.
 // The taps are stored reversed and zero-padded so that the tap count is a
 // multiple of four. `max_input_length` sizes the state buffer, which holds
 // the retained history followed by the samples of one Filter() call.
 FIRFilterNEON::FIRFilterNEON(const float* coefficients,
                             size_t coefficients_length,
                             size_t max_input_length)
    : // Tap count rounded up to the next multiple of four.
      coefficients_length_((coefficients_length + 3) & ~0x03),
      state_length_(coefficients_length_ - 1),
      coefficients_(static_cast<float*>(
          AlignedMalloc(coefficients_length_ * sizeof(float), 16))),
      state_(static_cast<float*>(AlignedMalloc(
          (max_input_length + state_length_) * sizeof(float), 16))) {
  // The padding taps occupy the front of the stored array and are zeroed;
  // the real taps follow them.
  const size_t pad_taps = coefficients_length_ - coefficients_length;
  float* const taps = coefficients_.get();
  memset(taps, 0, pad_taps * sizeof(float));

  // Store the taps in reverse order: the state buffer keeps the most recent
  // input sample last, so the newest sample lines up with the first tap.
  for (size_t tap = 0; tap < coefficients_length; ++tap) {
    taps[pad_taps + tap] = coefficients[coefficients_length - 1 - tap];
  }

  // Start from an all-zero history.
  const size_t state_samples = max_input_length + state_length_;
  memset(state_.get(), 0, state_samples * sizeof(float));
 }
 void FIRFilterNEON::Filter(const float* in, size_t length, float* out) {
  RTC_DCHECK_GT(length, 0);
  memcpy(&state_[state_length_], in, length * sizeof(*in));
  // Convolves the input signal `in` with the filter kernel `coefficients_`
  // taking into account the previous state.
  for (size_t i = 0; i < length; ++i) {
    float* in_ptr = &state_[i];
    float* coef_ptr = coefficients_.get();
    float32x4_t m_sum = vmovq_n_f32(0);
    float32x4_t m_in;
    for (size_t j = 0; j < coefficients_length_; j += 4) {
      m_in = vld1q_f32(in_ptr + j);
      m_sum = vmlaq_f32(m_sum, m_in, vld1q_f32(coef_ptr + j));
    }
    float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
    out[i] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
  }
  // Update current state.
  memmove(state_.get(), &state_[length], state_length_ * sizeof(state_[0]));
 }
 }  // namespace webrtc
--- a/VocieProcess/common_audio/fir_filter_neon.h
+++ b/VocieProcess/common_audio/fir_filter_neon.h
@ -0,0 +1,39 @@
 /*
 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef COMMON_AUDIO_FIR_FILTER_NEON_H_
 #define COMMON_AUDIO_FIR_FILTER_NEON_H_
 #include <memory>
 #include "common_audio/fir_filter.h"
 #include "rtc_base/memory/aligned_malloc.h"
 namespace webrtc {
 // FIRFilter implementation whose Filter() is written with ARM NEON
 // intrinsics and processes the taps four at a time.
 class FIRFilterNEON : public FIRFilter {
 public:
  // Copies `coefficients_length` taps from `coefficients` into an internal
  // buffer (reversed and zero-padded to a multiple of four taps) and
  // allocates a zeroed state buffer sized for `max_input_length` new samples
  // plus the retained history.
  FIRFilterNEON(const float* coefficients,
                size_t coefficients_length,
                size_t max_input_length);
  ~FIRFilterNEON() override;
  // Filters `length` samples from `in` into `out`, using the history kept
  // from previous calls.
  // NOTE(review): `length` is not checked against `max_input_length` here;
  // presumably callers must never exceed it - confirm.
  void Filter(const float* in, size_t length, float* out) override;
 private:
  // NOTE: the constructor's initializer list computes each member from the
  // ones declared before it, so the declaration order below must be kept.
  // Tap count after rounding up to a multiple of four.
  size_t coefficients_length_;
  // coefficients_length_ - 1: number of samples carried over between calls.
  size_t state_length_;
  // Reversed, zero-padded taps; allocated with 16-byte alignment.
  std::unique_ptr<float[], AlignedFreeDeleter> coefficients_;
  // History followed by the current input block; 16-byte aligned allocation.
  std::unique_ptr<float[], AlignedFreeDeleter> state_;
 };
 }  // namespace webrtc
 #endif  // COMMON_AUDIO_FIR_FILTER_NEON_H_
--- a/VocieProcess/common_audio/resampler/sinc_resampler_neon.cc
+++ b/VocieProcess/common_audio/resampler/sinc_resampler_neon.cc
@ -0,0 +1,48 @@
 /*
 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 // Modified from the Chromium original:
 // src/media/base/sinc_resampler.cc
 #include <arm_neon.h>
 #include "common_audio/resampler/sinc_resampler.h"
 namespace webrtc {
 float SincResampler::Convolve_NEON(const float* input_ptr,
                                   const float* k1,
                                   const float* k2,
                                   double kernel_interpolation_factor) {
  float32x4_t m_input;
  float32x4_t m_sums1 = vmovq_n_f32(0);
  float32x4_t m_sums2 = vmovq_n_f32(0);
  const float* upper = input_ptr + kKernelSize;
  for (; input_ptr < upper;) {
    m_input = vld1q_f32(input_ptr);
    input_ptr += 4;
    m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));
    k1 += 4;
    m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));
    k2 += 4;
  }
  // Linearly interpolate the two "convolutions".
  m_sums1 = vmlaq_f32(
      vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),
      m_sums2, vmovq_n_f32(kernel_interpolation_factor));
  // Sum components together.
  float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
  return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
 }
 }  // namespace webrtc
--- a/VocieProcess/common_audio/signal_processing/cross_correlation_neon.c
+++ b/VocieProcess/common_audio/signal_processing/cross_correlation_neon.c
@ -0,0 +1,88 @@
 /*
 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "common_audio/signal_processing/include/signal_processing_library.h"
 #include "rtc_base/system/arch.h"
 #include <arm_neon.h>
 static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
                                           const int16_t* vector1,
                                           const int16_t* vector2,
                                           size_t length,
                                           int scaling) {
  size_t i = 0;
  size_t len1 = length >> 3;
  size_t len2 = length & 7;
  int64x2_t sum0 = vdupq_n_s64(0);
  int64x2_t sum1 = vdupq_n_s64(0);
  for (i = len1; i > 0; i -= 1) {
    int16x8_t seq1_16x8 = vld1q_s16(vector1);
    int16x8_t seq2_16x8 = vld1q_s16(vector2);
 #if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
 #else
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
                               vget_high_s16(seq2_16x8));
 #endif
    sum0 = vpadalq_s32(sum0, tmp0);
    sum1 = vpadalq_s32(sum1, tmp1);
    vector1 += 8;
    vector2 += 8;
  }
  // Calculate the rest of the samples.
  int64_t sum_res = 0;
  for (i = len2; i > 0; i -= 1) {
    sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
    vector1++;
    vector2++;
  }
  sum0 = vaddq_s64(sum0, sum1);
 #if defined(WEBRTC_ARCH_ARM64)
  int64_t sum2 = vaddvq_s64(sum0);
  *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
 #else
  int64x1_t shift = vdup_n_s64(-scaling);
  int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
  sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
  sum2 = vshl_s64(sum2, shift);
  vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
 #endif
 }
 /* NEON version of WebRtcSpl_CrossCorrelation() for ARM32/64 platforms.
  *
  * Writes `dim_cross_correlation` values to `cross_correlation`. Value k is
  * the dot product, over `dim_seq` samples, of `seq1` with `seq2` offset by
  * k * `step_seq2` samples, shifted right by `right_shifts` bits. */
 void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
                                     const int16_t* seq1,
                                     const int16_t* seq2,
                                     size_t dim_seq,
                                     size_t dim_cross_correlation,
                                     int right_shifts,
                                     int step_seq2) {
  const int lag_count = (int)dim_cross_correlation;
  int lag;
  for (lag = 0; lag < lag_count; ++lag) {
    /* seq1 stays fixed; only the window into seq2 moves with the lag. */
    const int16_t* shifted_seq2 = seq2 + (step_seq2 * lag);
    DotProductWithScaleNeon(&cross_correlation[lag],
                            seq1,
                            shifted_seq2,
                            dim_seq,
                            right_shifts);
  }
 }
--- a/VocieProcess/common_audio/signal_processing/downsample_fast_neon.c
+++ b/VocieProcess/common_audio/signal_processing/downsample_fast_neon.c
@ -0,0 +1,224 @@
 /*
 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 #include "common_audio/signal_processing/include/signal_processing_library.h"
 #include "rtc_base/checks.h"
 // NEON intrinsics version of WebRtcSpl_DownsampleFast()
 // for ARM 32-bit/64-bit platforms.
 //
 // Filters `data_in` with the taps in `coefficients` and keeps one output for
 // every `factor` input samples. Output k is accumulated from
 // coefficients[j] * data_in[delay + k * factor - j] for all j, starting from
 // a rounding term of 2048 (0.5 in Q12), then shifted right by 12 bits and
 // saturated to 16 bits.
 //
 // Parameters:
 //   data_in, data_in_length    - input samples and their count.
 //   data_out, data_out_length  - output buffer and number of outputs to write.
 //   coefficients, coefficients_length - filter taps and their count.
 //   factor                     - downsampling step between outputs.
 //   delay                      - index in data_in of the first output sample.
 //
 // Returns 0 on success, or -1 when data_out_length or coefficients_length is
 // zero or when data_in_length is smaller than the last input index needed.
 //
 // NOTE(review): data_in is indexed with i - j starting at i == delay, so
 // presumably callers guarantee delay >= coefficients_length - 1; confirm.
 int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
                                 size_t data_in_length,
                                 int16_t* data_out,
                                 size_t data_out_length,
                                 const int16_t* __restrict coefficients,
                                 size_t coefficients_length,
                                 int factor,
                                 size_t delay) {
  // Using signed indexes to be able to compute negative i-j that
  // is used to index data_in.
  int i = 0;
  int j = 0;
  int32_t out_s32 = 0;
  // One past the input index of the last output sample.
  int endpos = delay + factor * (data_out_length - 1) + 1;
  // Outputs left over after the 8-at-a-time vector loops.
  size_t res = data_out_length & 0x7;
  // Input position at which the vector loops stop and the scalar loop begins.
  int endpos1 = endpos - factor * res;
  // Return error if any of the running conditions doesn't meet.
  if (data_out_length == 0 || coefficients_length == 0
                           || (int)data_in_length < endpos) {
    return -1;
  }
  RTC_DCHECK_GE(endpos, 0);
  RTC_DCHECK_GE(endpos1, 0);
  // First part, unroll the loop 8 times, with 3 subcases
  // (factor == 2, 4, others).
  switch (factor) {
    case 2: {
      // Each iteration produces 8 outputs, i.e. advances 16 input samples.
      for (i = delay; i < endpos1; i += 16) {
        // Round value, 0.5 in Q12.
        int32x4_t out32x4_0 = vdupq_n_s32(2048);
        int32x4_t out32x4_1 = vdupq_n_s32(2048);
 #if defined(WEBRTC_ARCH_ARM64)
        // Unroll the loop 2 times.
        for (j = 0; j < (int)coefficients_length - 1; j += 2) {
          // Loads coefficients[j] and coefficients[j + 1] as one 32-bit
          // element, so they end up in 16-bit lanes 0 and 1.
          int32x2_t coeff32 = vld1_dup_s32((int32_t*)&coefficients[j]);
          int16x4_t coeff16x4 = vreinterpret_s16_s32(coeff32);
          // De-interleaving load starting one sample earlier: val[0] holds
          // the samples for tap j + 1, val[1] the samples for tap j.
          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j - 1]);
          // Mul and accumulate low 64-bit data.
          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
          int16x4_t in16x4_1 = vget_low_s16(in16x8x2.val[1]);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 1);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_1, coeff16x4, 0);
          // Mul and accumulate high 64-bit data.
          // TODO: vget_high_s16 need extra cost on ARM64. This could be
          // replaced by vmlal_high_lane_s16. But for the interface of
          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
          // This issue need to be tracked in the future.
          int16x4_t in16x4_2 = vget_high_s16(in16x8x2.val[0]);
          int16x4_t in16x4_3 = vget_high_s16(in16x8x2.val[1]);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_2, coeff16x4, 1);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 0);
        }
        // Handles the last tap when coefficients_length is odd.
        for (; j < (int)coefficients_length; j++) {
          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
          // Mul and accumulate low 64-bit data.
          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
          // Mul and accumulate high 64-bit data.
          // TODO: vget_high_s16 need extra cost on ARM64. This could be
          // replaced by vmlal_high_lane_s16. But for the interface of
          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
          // This issue need to be tracked in the future.
          int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
        }
 #else
        // On ARMv7, the loop unrolling 2 times results in performance
        // regression.
        for (j = 0; j < (int)coefficients_length; j++) {
          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
          // De-interleaving load: val[0] holds every second sample starting
          // at data_in[i - j]; val[1] is not used.
          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
          // Mul and accumulate.
          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
          int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
        }
 #endif
        // Saturate and store the output.
        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
        data_out += 8;
      }
      break;
    }
    case 4: {
      // Each iteration produces 8 outputs, i.e. advances 32 input samples.
      for (i = delay; i < endpos1; i += 32) {
        // Round value, 0.5 in Q12.
        int32x4_t out32x4_0 = vdupq_n_s32(2048);
        int32x4_t out32x4_1 = vdupq_n_s32(2048);
        // Unroll the loop 4 times.
        for (j = 0; j < (int)coefficients_length - 3; j += 4) {
          int16x4_t coeff16x4 = vld1_s16(&coefficients[j]);
          // De-interleaving load starting three samples earlier: val[m]
          // holds the samples for tap j + 3 - m, hence the reversed lane
          // indexes (3, 2, 1, 0) below.
          int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j - 3]);
          // Mul and accumulate low 64-bit data.
          int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
          int16x4_t in16x4_2 = vget_low_s16(in16x8x4.val[1]);
          int16x4_t in16x4_4 = vget_low_s16(in16x8x4.val[2]);
          int16x4_t in16x4_6 = vget_low_s16(in16x8x4.val[3]);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 3);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_2, coeff16x4, 2);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_4, coeff16x4, 1);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_6, coeff16x4, 0);
          // Mul and accumulate high 64-bit data.
          // TODO: vget_high_s16 need extra cost on ARM64. This could be
          // replaced by vmlal_high_lane_s16. But for the interface of
          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
          // This issue need to be tracked in the future.
          int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
          int16x4_t in16x4_3 = vget_high_s16(in16x8x4.val[1]);
          int16x4_t in16x4_5 = vget_high_s16(in16x8x4.val[2]);
          int16x4_t in16x4_7 = vget_high_s16(in16x8x4.val[3]);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 3);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 2);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_5, coeff16x4, 1);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_7, coeff16x4, 0);
        }
        // Handles the taps left over when coefficients_length is not a
        // multiple of four.
        for (; j < (int)coefficients_length; j++) {
          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
          int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j]);
          // Mul and accumulate low 64-bit data.
          int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
          // Mul and accumulate high 64-bit data.
          // TODO: vget_high_s16 need extra cost on ARM64. This could be
          // replaced by vmlal_high_lane_s16. But for the interface of
          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
          // This issue need to be tracked in the future.
          int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
        }
        // Saturate and store the output.
        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
        data_out += 8;
      }
      break;
    }
    default: {
      // Generic factor: the 8 input samples for each tap are gathered one
      // lane at a time, `factor` samples apart.
      for (i = delay; i < endpos1; i += factor * 8) {
        // Round value, 0.5 in Q12.
        int32x4_t out32x4_0 = vdupq_n_s32(2048);
        int32x4_t out32x4_1 = vdupq_n_s32(2048);
        for (j = 0; j < (int)coefficients_length; j++) {
          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
          int16x4_t in16x4_0 = vld1_dup_s16(&data_in[i - j]);
          in16x4_0 = vld1_lane_s16(&data_in[i + factor - j], in16x4_0, 1);
          in16x4_0 = vld1_lane_s16(&data_in[i + factor * 2 - j], in16x4_0, 2);
          in16x4_0 = vld1_lane_s16(&data_in[i + factor * 3 - j], in16x4_0, 3);
          int16x4_t in16x4_1 = vld1_dup_s16(&data_in[i + factor * 4 - j]);
          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 5 - j], in16x4_1, 1);
          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 6 - j], in16x4_1, 2);
          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 7 - j], in16x4_1, 3);
          // Mul and accumulate.
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
        }
        // Saturate and store the output.
        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
        data_out += 8;
      }
      break;
    }
  }
  // Second part, do the rest iterations (if any).
  // `i` continues from where the vector loop stopped; one output per pass.
  for (; i < endpos; i += factor) {
    out_s32 = 2048;  // Round value, 0.5 in Q12.
    for (j = 0; j < (int)coefficients_length; j++) {
      // Presumably accumulates coefficients[j] * data_in[i - j] into out_s32;
      // see the definition of WebRtc_MulAccumW16 to confirm.
      out_s32 = WebRtc_MulAccumW16(coefficients[j], data_in[i - j], out_s32);
    }
    // Saturate and store the output.
    out_s32 >>= 12;
    *data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
  }
  return 0;
 }
--- a/VocieProcess/common_audio/signal_processing/min_max_operations_neon.c
+++ b/VocieProcess/common_audio/signal_processing/min_max_operations_neon.c
@ -0,0 +1,333 @@
 /*
 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 #include <stdlib.h>
 #include "rtc_base/checks.h"
 #include "common_audio/signal_processing/include/signal_processing_library.h"
 // Maximum absolute value of a word16 vector. NEON intrinsics version for
 // ARM 32-bit/64-bit platforms. The result is clamped to
 // WEBRTC_SPL_WORD16_MAX, so an input of -32768 yields 32767.
 int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);
  const size_t tail_count = length & 7;
  const int16_t* cursor = vector;
  const int16_t* const simd_end = vector + (length - tail_count);
  const int16_t* const vector_end = vector + length;
  uint16x8_t lane_max = vdupq_n_u16(0);
  // Vector part: eight samples per iteration.
  for (; cursor < simd_end; cursor += 8) {
    // vabsq_s16 leaves -32768 unchanged; viewing the lanes as u16 keeps that
    // sample as 32768 instead of losing it.
    const int16x8_t magnitudes = vabsq_s16(vld1q_s16(cursor));
    lane_max = vmaxq_u16(lane_max, vreinterpretq_u16_s16(magnitudes));
  }
  // Reduce the eight lanes to a single value.
 #ifdef WEBRTC_ARCH_ARM64
  int maximum = (int)vmaxvq_u16(lane_max);
 #else
  uint16x4_t half_max =
      vmax_u16(vget_low_u16(lane_max), vget_high_u16(lane_max));
  half_max = vpmax_u16(half_max, half_max);
  half_max = vpmax_u16(half_max, half_max);
  int maximum = (int)vget_lane_u16(half_max, 0);
 #endif
  // Scalar part: the (length & 7) samples the vector loop did not cover.
  for (; cursor < vector_end; ++cursor) {
    const int absolute = abs((int)(*cursor));
    if (absolute > maximum) {
      maximum = absolute;
    }
  }
  // Guard the case for abs(-32768).
  return (int16_t)(maximum > WEBRTC_SPL_WORD16_MAX ? WEBRTC_SPL_WORD16_MAX
                                                    : maximum);
 }
 // Maximum absolute value of word32 vector. NEON intrinsics version for
 // ARM 32-bit/64-bit platforms.
 //
 // Returns the largest magnitude among the `length` elements of `vector`,
 // clamped to WEBRTC_SPL_WORD32_MAX (so an input of 0x80000000 yields
 // 0x7FFFFFFF). `length` is expected to be greater than zero.
 int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length) {
  // Use uint32_t for the local variables, to accommodate the magnitude of
  // 0x80000000, which is 0x80000000.
  uint32_t absolute = 0, maximum = 0;
  size_t i = 0;
  size_t residual = length & 0x7;
  RTC_DCHECK_GT(length, 0);
  const int32_t* p_start = vector;
  uint32x4_t max32x4_0 = vdupq_n_u32(0);
  uint32x4_t max32x4_1 = vdupq_n_u32(0);
  // First part, unroll the loop 8 times.
  for (i = 0; i < length - residual; i += 8) {
    int32x4_t in32x4_0 = vld1q_s32(p_start);
    p_start += 4;
    int32x4_t in32x4_1 = vld1q_s32(p_start);
    p_start += 4;
    in32x4_0 = vabsq_s32(in32x4_0);
    in32x4_1 = vabsq_s32(in32x4_1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0));
    max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1));
  }
  uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1);
 #if defined(WEBRTC_ARCH_ARM64)
  maximum = vmaxvq_u32(max32x4);
 #else
  uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4));
  max32x2 = vpmax_u32(max32x2, max32x2);
  maximum = vget_lane_u32(max32x2, 0);
 #endif
  // Second part, do the remaining iterations (if any).
  for (i = residual; i > 0; i--) {
    // abs() has undefined behavior for the most negative int, so take the
    // magnitude with unsigned arithmetic: 0x80000000 maps to 0x80000000,
    // matching what vabsq_s32 + reinterpret produces in the vector part.
    const int32_t sample = *p_start;
    absolute = (sample < 0) ? ((uint32_t)0 - (uint32_t)sample)
                            : (uint32_t)sample;
    if (absolute > maximum) {
      maximum = absolute;
    }
    p_start++;
  }
  // Guard against the case for 0x80000000.
  maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX);
  return (int32_t)maximum;
 }
 // Largest element of a word16 vector. NEON intrinsics version for
 // ARM 32-bit/64-bit platforms. `length` is expected to be greater than zero.
 int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);
  const size_t tail_count = length & 0x7;
  const int16_t* cursor = vector;
  const int16_t* const simd_end = vector + (length - tail_count);
  const int16_t* const vector_end = vector + length;
  int16x8_t lane_max = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
  // Vector part: eight samples per iteration.
  for (; cursor < simd_end; cursor += 8) {
    lane_max = vmaxq_s16(lane_max, vld1q_s16(cursor));
  }
  // Reduce the eight lanes to a single value.
 #if defined(WEBRTC_ARCH_ARM64)
  int16_t maximum = vmaxvq_s16(lane_max);
 #else
  int16x4_t half_max =
      vmax_s16(vget_low_s16(lane_max), vget_high_s16(lane_max));
  half_max = vpmax_s16(half_max, half_max);
  half_max = vpmax_s16(half_max, half_max);
  int16_t maximum = vget_lane_s16(half_max, 0);
 #endif
  // Scalar part: the (length & 7) samples the vector loop did not cover.
  for (; cursor < vector_end; ++cursor) {
    if (*cursor > maximum) {
      maximum = *cursor;
    }
  }
  return maximum;
 }
 // Largest element of a word32 vector. NEON intrinsics version for
 // ARM 32-bit/64-bit platforms. `length` is expected to be greater than zero.
 int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);
  const size_t tail_count = length & 0x7;
  const int32_t* cursor = vector;
  const int32_t* const simd_end = vector + (length - tail_count);
  const int32_t* const vector_end = vector + length;
  int32x4_t acc_lo = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
  int32x4_t acc_hi = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
  // Vector part: eight samples per iteration, held in two accumulators.
  for (; cursor < simd_end; cursor += 8) {
    acc_lo = vmaxq_s32(acc_lo, vld1q_s32(cursor));
    acc_hi = vmaxq_s32(acc_hi, vld1q_s32(cursor + 4));
  }
  // Merge the accumulators, then reduce the four lanes to a single value.
  const int32x4_t merged = vmaxq_s32(acc_lo, acc_hi);
 #if defined(WEBRTC_ARCH_ARM64)
  int32_t maximum = vmaxvq_s32(merged);
 #else
  int32x2_t pair = vmax_s32(vget_low_s32(merged), vget_high_s32(merged));
  pair = vpmax_s32(pair, pair);
  int32_t maximum = vget_lane_s32(pair, 0);
 #endif
  // Scalar part: the (length & 7) samples the vector loop did not cover.
  for (; cursor < vector_end; ++cursor) {
    if (*cursor > maximum) {
      maximum = *cursor;
    }
  }
  return maximum;
 }
 // Smallest element of a word16 vector. NEON intrinsics version for
 // ARM 32-bit/64-bit platforms. `length` is expected to be greater than zero.
 int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);
  const size_t tail_count = length & 0x7;
  const int16_t* cursor = vector;
  const int16_t* const simd_end = vector + (length - tail_count);
  const int16_t* const vector_end = vector + length;
  int16x8_t lane_min = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
  // Vector part: eight samples per iteration.
  for (; cursor < simd_end; cursor += 8) {
    lane_min = vminq_s16(lane_min, vld1q_s16(cursor));
  }
  // Reduce the eight lanes to a single value.
 #if defined(WEBRTC_ARCH_ARM64)
  int16_t minimum = vminvq_s16(lane_min);
 #else
  int16x4_t half_min =
      vmin_s16(vget_low_s16(lane_min), vget_high_s16(lane_min));
  half_min = vpmin_s16(half_min, half_min);
  half_min = vpmin_s16(half_min, half_min);
  int16_t minimum = vget_lane_s16(half_min, 0);
 #endif
  // Scalar part: the (length & 7) samples the vector loop did not cover.
  for (; cursor < vector_end; ++cursor) {
    if (*cursor < minimum) {
      minimum = *cursor;
    }
  }
  return minimum;
 }
 // Smallest element of a word32 vector. NEON intrinsics version for
 // ARM 32-bit/64-bit platforms. `length` is expected to be greater than zero.
 int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);
  const size_t tail_count = length & 0x7;
  const int32_t* cursor = vector;
  const int32_t* const simd_end = vector + (length - tail_count);
  const int32_t* const vector_end = vector + length;
  int32x4_t acc_lo = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
  int32x4_t acc_hi = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
  // Vector part: eight samples per iteration, held in two accumulators.
  for (; cursor < simd_end; cursor += 8) {
    acc_lo = vminq_s32(acc_lo, vld1q_s32(cursor));
    acc_hi = vminq_s32(acc_hi, vld1q_s32(cursor + 4));
  }
  // Merge the accumulators, then reduce the four lanes to a single value.
  const int32x4_t merged = vminq_s32(acc_lo, acc_hi);
 #if defined(WEBRTC_ARCH_ARM64)
  int32_t minimum = vminvq_s32(merged);
 #else
  int32x2_t pair = vmin_s32(vget_low_s32(merged), vget_high_s32(merged));
  pair = vpmin_s32(pair, pair);
  int32_t minimum = vget_lane_s32(pair, 0);
 #endif
  // Scalar part: the (length & 7) samples the vector loop did not cover.
  for (; cursor < vector_end; ++cursor) {
    if (*cursor < minimum) {
      minimum = *cursor;
    }
  }
  return minimum;
 }
 // Computes the smallest and the largest element of a word16 vector in a
 // single pass and writes them to `*min_val` and `*max_val`. NEON intrinsics
 // version; `length` is expected to be greater than zero.
 void WebRtcSpl_MinMaxW16Neon(const int16_t* vector, size_t length,
                             int16_t* min_val, int16_t* max_val) {
  RTC_DCHECK_GT(length, 0);
  const size_t tail_count = length & 0x7;
  const int16_t* cursor = vector;
  const int16_t* const simd_end = vector + (length - tail_count);
  const int16_t* const vector_end = vector + length;
  int16x8_t lane_min = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
  int16x8_t lane_max = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
  // Vector part: eight samples per iteration feed both accumulators.
  for (; cursor < simd_end; cursor += 8) {
    const int16x8_t samples = vld1q_s16(cursor);
    lane_min = vminq_s16(lane_min, samples);
    lane_max = vmaxq_s16(lane_max, samples);
  }
  // Reduce each accumulator's eight lanes to a single value.
 #if defined(WEBRTC_ARCH_ARM64)
  int16_t minimum = vminvq_s16(lane_min);
  int16_t maximum = vmaxvq_s16(lane_max);
 #else
  int16x4_t half_min =
      vmin_s16(vget_low_s16(lane_min), vget_high_s16(lane_min));
  half_min = vpmin_s16(half_min, half_min);
  half_min = vpmin_s16(half_min, half_min);
  int16_t minimum = vget_lane_s16(half_min, 0);
  int16x4_t half_max =
      vmax_s16(vget_low_s16(lane_max), vget_high_s16(lane_max));
  half_max = vpmax_s16(half_max, half_max);
  half_max = vpmax_s16(half_max, half_max);
  int16_t maximum = vget_lane_s16(half_max, 0);
 #endif
  // Scalar part: the (length & 7) samples the vector loop did not cover.
  for (; cursor < vector_end; ++cursor) {
    const int16_t sample = *cursor;
    if (sample < minimum) {
      minimum = sample;
    }
    if (sample > maximum) {
      maximum = sample;
    }
  }
  *min_val = minimum;
  *max_val = maximum;
 }
--- a/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc
+++ b/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc
@ -0,0 +1,351 @@
 /*
 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 /*
 * The rdft AEC algorithm, neon version of speed-critical functions.
 *
 * Based on the sse2 version.
 */
 #include <arm_neon.h>
 #include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
 #include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
 #include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h"
 namespace webrtc {
 #if defined(WEBRTC_HAS_NEON)
 // In-place pass over a[0..127]: each of the 8 loop iterations reads 16
 // consecutive floats, combines them with add/sub butterflies, scales three of
 // the four results with twiddle factors from the rdft_wk* tables and writes
 // the 16 floats back.
 // NOTE(review): presumably the NEON counterpart of the scalar cft1st_128
 // stage of the Ooura FFT -- confirm against ooura_fft.cc.
 void cft1st_128_neon(float* a) {
  // k_swap_sign is {-1, 1, -1, 1}; multiplying by it negates the even lanes.
  // NOTE(review): the C-style cast drops the const qualifier of k_swap_sign.
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  int j, k2;
  // j walks `a` in steps of 16 floats; k2 walks the twiddle tables in steps
  // of 4 floats.
  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
    float32x4_t a00v = vld1q_f32(&a[j + 0]);
    float32x4_t a04v = vld1q_f32(&a[j + 4]);
    float32x4_t a08v = vld1q_f32(&a[j + 8]);
    float32x4_t a12v = vld1q_f32(&a[j + 12]);
    // Regroup so that each vector holds one float pair from the first 8-float
    // half and the matching pair from the second half:
    //   a01v = a[j+0], a[j+1], a[j+8],  a[j+9]
    //   a23v = a[j+2], a[j+3], a[j+10], a[j+11]
    //   a45v = a[j+4], a[j+5], a[j+12], a[j+13]
    //   a67v = a[j+6], a[j+7], a[j+14], a[j+15]
    float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
    float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
    float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
    float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
    const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
    // Sums and differences of the regrouped pairs.
    float32x4_t x0v = vaddq_f32(a01v, a23v);
    const float32x4_t x1v = vsubq_f32(a01v, a23v);
    const float32x4_t x2v = vaddq_f32(a45v, a67v);
    const float32x4_t x3v = vsubq_f32(a45v, a67v);
    // x3w is x3v with the two floats of every pair swapped.
    const float32x4_t x3w = vrev64q_f32(x3v);
    float32x4_t x0w;
    // First output: x0 + x2, no twiddle applied.
    a01v = vaddq_f32(x0v, x2v);
    // Third output: (x0 - x2) scaled as wk2r * x + wk2i * swapped(x).
    x0v = vsubq_f32(x0v, x2v);
    x0w = vrev64q_f32(x0v);
    a45v = vmulq_f32(wk2rv, x0v);
    a45v = vmlaq_f32(a45v, wk2iv, x0w);
    // Second output: (x1 + swap_sign * x3w) scaled with the wk1 tables.
    x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a23v = vmulq_f32(wk1rv, x0v);
    a23v = vmlaq_f32(a23v, wk1iv, x0w);
    // Fourth output: (x1 - swap_sign * x3w) scaled with the wk3 tables.
    x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a67v = vmulq_f32(wk3rv, x0v);
    a67v = vmlaq_f32(a67v, wk3iv, x0w);
    // Undo the regrouping so the results land at their original offsets.
    a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
    a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
    a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
    a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
    vst1q_f32(&a[j + 0], a00v);
    vst1q_f32(&a[j + 4], a04v);
    vst1q_f32(&a[j + 8], a08v);
    vst1q_f32(&a[j + 12], a12v);
  }
 }
 // In-place pass over a[0..127] done in two halves: the first loop handles
 // a[0..63] using only the cftmdl_wk1r constants, the second handles
 // a[64..127] using twiddle factors read from the rdft_wk* tables at index 4.
 // Each loop iteration touches eight float pairs spaced 8 floats apart.
 // NOTE(review): presumably the NEON counterpart of the scalar cftmdl_128
 // stage of the Ooura FFT -- confirm against ooura_fft.cc.
 void cftmdl_128_neon(float* a) {
  int j;
  const int l = 8;
  // k_swap_sign is {-1, 1, -1, 1}; multiplying by it negates the even lanes.
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);
  // First half: j = 0, 2, 4, 6.
  for (j = 0; j < l; j += 2) {
    // Each combined vector holds the pair at offset N in its low half and the
    // pair at offset N + 32 in its high half.
    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    // x3 with the two floats of every pair swapped.
    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
    // x1 + swap_sign * swapped(x3) and x1 - swap_sign * swapped(x3).
    const float32x4_t x1_x3_add =
        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    const float32x4_t x1_x3_sub =
        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    // Broadcast lane 0 (then lane 1) of the high halves of the add/sub
    // results, so that yy0 = {add0 - add1, add0 + add1, sub0 - sub1,
    // sub0 + sub1} where 0/1 index the high-half lanes.
    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
    // Scale by the cftmdl_wk1r constants loaded before the loop.
    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
    const float32x4_t xx1_rev = vrev64q_f32(xx1);
    const float32x4_t yy4_rev = vrev64q_f32(yy4);
    vst1_f32(&a[j + 0], vget_low_f32(xx0));
    vst1_f32(&a[j + 32], vget_high_f32(xx0));
    vst1_f32(&a[j + 16], vget_low_f32(xx1));
    // The pair at offset 48 is the swapped high half of xx1 with its first
    // float negated afterwards.
    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));
    a[j + 48] = -a[j + 48];
    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
    vst1_f32(&a[j + 40], vget_low_f32(yy4));
    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
  }
  // Second half: same load/butterfly pattern on a[64..127], but the three
  // non-trivial outputs are scaled with table twiddles as
  // wk*r * x + wk*i * swapped(x).
  {
    const int k = 64;
    const int k1 = 2;
    const int k2 = 2 * k1;
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
    // wk1rv is reused: it no longer holds cftmdl_wk1r from here on.
    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
    // j = 64, 66, 68, 70.
    for (j = k; j < l + k; j += 2) {
      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
      const float32x4_t x1_x3_add =
          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      const float32x4_t x1_x3_sub =
          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      // Twiddle scaling, first the wk*r term and then the wk*i term on the
      // pair-swapped copy.
      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));
      // Low halves go to offset N, high halves to offset N + 32.
      vst1_f32(&a[j + 0], vget_low_f32(xx));
      vst1_f32(&a[j + 32], vget_high_f32(xx));
      vst1_f32(&a[j + 16], vget_low_f32(xx4));
      vst1_f32(&a[j + 48], vget_high_f32(xx4));
      vst1_f32(&a[j + 8], vget_low_f32(xx12));
      vst1_f32(&a[j + 40], vget_high_f32(xx12));
      vst1_f32(&a[j + 24], vget_low_f32(xx22));
      vst1_f32(&a[j + 56], vget_high_f32(xx22));
    }
  }
 }
 // Returns `in` with its four lanes in reverse order (A B C D -> D C B A).
 __inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
  // Swapping the 64-bit halves gives C D A B; vrev64q_f32 then swaps the two
  // lanes inside each half, giving D C B A.
  return vrev64q_f32(vcombine_f32(vget_high_f32(in), vget_low_f32(in)));
 }
 // Applies, in place, the update spelled out in the scalar tail loop below to
 // the float pairs (a[j2], a[j2 + 1]) and (a[128 - j2], a[129 - j2]) for
 // j2 = 2, 4, ..., 62, with weights read from rdft_w + 32. The vector loop
 // does four j2 values per iteration; the scalar loop finishes the rest.
 // NOTE(review): presumably the forward real-FFT post-processing step of the
 // Ooura 128-point FFT -- confirm against the scalar rftfsub_128.
 void rftfsub_128_neon(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2;
  const float32x4_t mm_half = vdupq_n_f32(0.5f);
  // Vectorized code (four at once).
  // Note: commented number are indexes for the first iteration of the loop.
  // The condition lets the loop run for j2 = 2, 10, ..., 50; the scalar loop
  // below continues from j2 = 58.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
    // Load and shuffle 'a'. vld2q_f32 de-interleaves: val[0] gets the even
    // indexes, val[1] the odd ones.
    //   2,   4,   6,   8,   3,   5,   7,   9
    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
    // 120, 122, 124, 126, 121, 123, 125, 127,
    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
    // 126, 124, 122, 120
    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
    // 127, 125, 123, 121
    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
    // Calculate 'x'.
    //    xr = a[j2 + 0] - a[k2 + 0];   (2-126, 4-124, 6-122, 8-120)
    //    xi = a[j2 + 1] + a[k2 + 1];   (3+127, 5+125, 7+123, 9+121)
    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
    // Calculate product into 'y'.
    //    yr = wkr * xr - wki * xi;
    //    yi = wkr * xi + wki * xr;
    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
    const float32x4_t b_ = vmulq_f32(wki_, xi_);
    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
    const float32x4_t d_ = vmulq_f32(wki_, xr_);
    const float32x4_t yr_ = vsubq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t yi_ = vaddq_f32(c_, d_);  // 3+127, 5+125, 7+123, 9+121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] -= yi;
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] -= yi;
    // 126, 124, 122, 120,
    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
    // 127, 125, 123, 121,
    const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
    // Shuffle in right order and store.
    // 124, 126, 120, 122 and 125, 127, 121, 123 after the pair swap.
    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
    // 124, 125, 126, 127, 120, 121, 122, 123
    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
    //   2,   4,   6,   8,
    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
    //   3,   5,   7,   9,
    a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
    //   2,   3,   4,   5,   6,   7,   8,   9,
    vst2q_f32(&a[0 + j2], a_j2_p);
    // val[1] holds 120..123 and val[0] holds 124..127, hence the store order.
    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
  }
  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr - wki * xi;
    const float yi = wkr * xi + wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] -= yi;
    a[k2 + 0] += yr;
    a[k2 + 1] -= yi;
  }
 }
 // Negates a[1] and a[65] and applies, in place, the update spelled out in the
 // scalar tail loop below to the float pairs (a[j2], a[j2 + 1]) and
 // (a[128 - j2], a[129 - j2]) for j2 = 2, 4, ..., 62, with weights read from
 // rdft_w + 32. The vector loop does four j2 values per iteration; the scalar
 // loop finishes the rest.
 // NOTE(review): presumably the inverse real-FFT pre-processing step of the
 // Ooura 128-point FFT -- confirm against the scalar rftbsub_128.
 void rftbsub_128_neon(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2;
  const float32x4_t mm_half = vdupq_n_f32(0.5f);
  a[1] = -a[1];
  // Vectorized code (four at once).
  //    Note: commented number are indexes for the first iteration of the loop.
  // The condition lets the loop run for j2 = 2, 10, ..., 50; the scalar loop
  // below continues from j2 = 58.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
    // Load and shuffle 'a'. vld2q_f32 de-interleaves: val[0] gets the even
    // indexes, val[1] the odd ones.
    //   2,   4,   6,   8,   3,   5,   7,   9
    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
    // 120, 122, 124, 126, 121, 123, 125, 127,
    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
    // 126, 124, 122, 120
    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
    // 127, 125, 123, 121
    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
    // Calculate 'x'.
    //    xr = a[j2 + 0] - a[k2 + 0];   (2-126, 4-124, 6-122, 8-120)
    //    xi = a[j2 + 1] + a[k2 + 1];   (3+127, 5+125, 7+123, 9+121)
    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
    // Calculate product into 'y' (signs differ from rftfsub_128_neon).
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
    const float32x4_t b_ = vmulq_f32(wki_, xi_);
    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
    const float32x4_t d_ = vmulq_f32(wki_, xr_);
    const float32x4_t yr_ = vaddq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t yi_ = vsubq_f32(c_, d_);  // 3+127, 5+125, 7+123, 9+121,
    // Update 'a'.
    //    a[j2 + 0] = a[j2 + 0] - yr;
    //    a[j2 + 1] = yi - a[j2 + 1];
    //    a[k2 + 0] = yr + a[k2 + 0];
    //    a[k2 + 1] = yi - a[k2 + 1];
    // 126, 124, 122, 120,
    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
    // 127, 125, 123, 121,
    const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
    // Shuffle in right order and store.
    // 124, 126, 120, 122 and 125, 127, 121, 123 after the pair swap.
    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
    // 124, 125, 126, 127, 120, 121, 122, 123
    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
    //   2,   4,   6,   8,
    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
    //   3,   5,   7,   9,
    a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
    //   2,   3,   4,   5,   6,   7,   8,   9,
    vst2q_f32(&a[0 + j2], a_j2_p);
    // val[1] holds 120..123 and val[0] holds 124..127, hence the store order.
    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
  }
  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr + wki * xi;
    const float yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
 }
 #endif
 }  // namespace webrtc
--- a/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h
+++ b/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h
@ -0,0 +1,98 @@
 /*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
 #define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
 #include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
 #include "rtc_base/system/arch.h"
 #ifdef _MSC_VER /* visual c++ */
 #define ALIGN16_BEG __declspec(align(16))
 #define ALIGN16_END
 #else /* gcc or icc */
 #define ALIGN16_BEG
 #define ALIGN16_END __attribute__((aligned(16)))
 #endif
 namespace webrtc {
 // These tables used to be computed at run-time. For example, refer to:
 // https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564
 // to see the initialization code.
 #if defined(WEBRTC_ARCH_X86_FAMILY) || defined(WEBRTC_HAS_NEON)
 // Constants used by the SSE2 and NEON code paths. All tables below are
 // statically initialized and 16-byte aligned via ALIGN16_BEG/ALIGN16_END.

 // Sign pattern {-1, +1, -1, +1}: used as a multiplier to negate the even
 // lanes of a four-float vector.
 const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
 // rdft_wk1r / rdft_wk2r / rdft_wk3r: 32 entries each, every value stored
 // twice in a row so one four-float load yields two duplicated factors.
 ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
    1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f, 0.923879564f,
    0.923879564f, 0.382683456f, 0.382683456f, 0.980785251f, 0.980785251f,
    0.555570245f, 0.555570245f, 0.831469595f, 0.831469595f, 0.195090324f,
    0.195090324f, 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
    0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f, 0.956940353f,
    0.956940353f, 0.471396744f, 0.471396744f, 0.773010433f, 0.773010433f,
    0.098017141f, 0.098017141f,
 };
 ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
    1.000000000f,  1.000000000f,  -0.000000000f, -0.000000000f, 0.707106769f,
    0.707106769f,  -0.707106769f, -0.707106769f, 0.923879564f,  0.923879564f,
    -0.382683456f, -0.382683456f, 0.382683456f,  0.382683456f,  -0.923879564f,
    -0.923879564f, 0.980785251f,  0.980785251f,  -0.195090324f, -0.195090324f,
    0.555570245f,  0.555570245f,  -0.831469595f, -0.831469595f, 0.831469595f,
    0.831469595f,  -0.555570245f, -0.555570245f, 0.195090324f,  0.195090324f,
    -0.980785251f, -0.980785251f,
 };
 ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
    1.000000000f,  1.000000000f,  -0.707106769f, -0.707106769f, 0.382683456f,
    0.382683456f,  -0.923879564f, -0.923879564f, 0.831469536f,  0.831469536f,
    -0.980785251f, -0.980785251f, -0.195090353f, -0.195090353f, -0.555570245f,
    -0.555570245f, 0.956940353f,  0.956940353f,  -0.881921172f, -0.881921172f,
    0.098017156f,  0.098017156f,  -0.773010492f, -0.773010492f, 0.634393334f,
    0.634393334f,  -0.995184720f, -0.995184720f, -0.471396863f, -0.471396863f,
    -0.290284693f, -0.290284693f,
 };
 // rdft_wk1i / rdft_wk2i / rdft_wk3i: 32 entries each, stored as pairs of
 // equal magnitude and opposite sign, so the sign needed when the factor is
 // multiplied with a pair-swapped vector is already baked into the table.
 ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
    -0.000000000f, 0.000000000f,  -0.707106769f, 0.707106769f,  -0.382683456f,
    0.382683456f,  -0.923879564f, 0.923879564f,  -0.195090324f, 0.195090324f,
    -0.831469595f, 0.831469595f,  -0.555570245f, 0.555570245f,  -0.980785251f,
    0.980785251f,  -0.098017141f, 0.098017141f,  -0.773010433f, 0.773010433f,
    -0.471396744f, 0.471396744f,  -0.956940353f, 0.956940353f,  -0.290284663f,
    0.290284663f,  -0.881921291f, 0.881921291f,  -0.634393334f, 0.634393334f,
    -0.995184720f, 0.995184720f,
 };
 ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
    -0.000000000f, 0.000000000f,  -1.000000000f, 1.000000000f,  -0.707106769f,
    0.707106769f,  -0.707106769f, 0.707106769f,  -0.382683456f, 0.382683456f,
    -0.923879564f, 0.923879564f,  -0.923879564f, 0.923879564f,  -0.382683456f,
    0.382683456f,  -0.195090324f, 0.195090324f,  -0.980785251f, 0.980785251f,
    -0.831469595f, 0.831469595f,  -0.555570245f, 0.555570245f,  -0.555570245f,
    0.555570245f,  -0.831469595f, 0.831469595f,  -0.980785251f, 0.980785251f,
    -0.195090324f, 0.195090324f,
 };
 ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
    -0.000000000f, 0.000000000f,  -0.707106769f, 0.707106769f,  -0.923879564f,
    0.923879564f,  0.382683456f,  -0.382683456f, -0.555570245f, 0.555570245f,
    -0.195090353f, 0.195090353f,  -0.980785251f, 0.980785251f,  0.831469536f,
    -0.831469536f, -0.290284693f, 0.290284693f,  -0.471396863f, 0.471396863f,
    -0.995184720f, 0.995184720f,  0.634393334f,  -0.634393334f, -0.773010492f,
    0.773010492f,  0.098017156f,  -0.098017156f, -0.881921172f, 0.881921172f,
    0.956940353f,  -0.956940353f,
 };
 // Four-lane scale factor {w, w, w, -w} with w = 0.707106769f.
 ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
    0.707106769f,
    0.707106769f,
    0.707106769f,
    -0.707106769f,
 };
 #endif
 }  // namespace webrtc
 #endif  // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_