add neon code.
This commit is contained in:
parent
fe7ab957c8
commit
c6debcc62a
@ -10,19 +10,18 @@
|
|||||||
|
|
||||||
class EchoRecordPrivate {
|
class EchoRecordPrivate {
|
||||||
public:
|
public:
|
||||||
EchoRecordPrivate() {
|
void initialize(int sampleRate, int channels, int period) {
|
||||||
|
|
||||||
std::unique_ptr<webrtc::EchoCanceller3Factory> factory = std::make_unique<webrtc::EchoCanceller3Factory>();
|
std::unique_ptr<webrtc::EchoCanceller3Factory> factory = std::make_unique<webrtc::EchoCanceller3Factory>();
|
||||||
|
echoCanceller = factory->Create(sampleRate, channels, channels);
|
||||||
echoCanceller = factory->Create(16000, 1, 1);
|
nearendBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);
|
||||||
|
farendBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);
|
||||||
// nearendBuffer = std::make_unique<webrtc::AudioBuffer>(16000, 1, 16000, 1, 16000, 1);
|
linearOutputBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);
|
||||||
// farendBuffer = std::make_unique<webrtc::AudioBuffer>(16000, 1, 16000, 1, 16000, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<webrtc::EchoControl> echoCanceller;
|
std::unique_ptr<webrtc::EchoControl> echoCanceller;
|
||||||
// std::unique_ptr<webrtc::AudioBuffer> nearendBuffer;
|
std::unique_ptr<webrtc::AudioBuffer> nearendBuffer;
|
||||||
// std::unique_ptr<webrtc::AudioBuffer> farendBuffer;
|
std::unique_ptr<webrtc::AudioBuffer> farendBuffer;
|
||||||
|
std::unique_ptr<webrtc::AudioBuffer> linearOutputBuffer;
|
||||||
};
|
};
|
||||||
|
|
||||||
EchoRecordTask::EchoRecordTask() : m_d{new EchoRecordPrivate()} {
|
EchoRecordTask::EchoRecordTask() : m_d{new EchoRecordPrivate()} {
|
||||||
@ -56,7 +55,7 @@ void EchoRecordTask::run() {
|
|||||||
|
|
||||||
RkAudio::Format format;
|
RkAudio::Format format;
|
||||||
format.channels = m_channels;
|
format.channels = m_channels;
|
||||||
format.period = 20;
|
format.period = 10;
|
||||||
|
|
||||||
m_speex = std::make_shared<SpeexDsp>();
|
m_speex = std::make_shared<SpeexDsp>();
|
||||||
m_speex->start(format.sampleRate, m_channels, format.period);
|
m_speex->start(format.sampleRate, m_channels, format.period);
|
||||||
@ -66,6 +65,8 @@ void EchoRecordTask::run() {
|
|||||||
m_webRtcAecm = std::make_shared<WebRtcAecm>();
|
m_webRtcAecm = std::make_shared<WebRtcAecm>();
|
||||||
m_webRtcAecm->start(format.sampleRate, format.channels, format.period);
|
m_webRtcAecm->start(format.sampleRate, format.channels, format.period);
|
||||||
|
|
||||||
|
m_d->initialize(format.sampleRate, m_channels, format.period);
|
||||||
|
|
||||||
m_output = std::make_shared<RkAudio::Output>();
|
m_output = std::make_shared<RkAudio::Output>();
|
||||||
if (!m_output->open(sizeof(uint16_t), format.sampleRate, 2, format.period, m_dsp == Vqe)) {
|
if (!m_output->open(sizeof(uint16_t), format.sampleRate, 2, format.period, m_dsp == Vqe)) {
|
||||||
LOG(error) << "audio output open failed.";
|
LOG(error) << "audio output open failed.";
|
||||||
@ -87,18 +88,16 @@ void EchoRecordTask::run() {
|
|||||||
reinterpret_cast<int16_t *>(m_outBuffer.data()), frame.frameSize);
|
reinterpret_cast<int16_t *>(m_outBuffer.data()), frame.frameSize);
|
||||||
} else if (m_dsp == Aec3) {
|
} else if (m_dsp == Aec3) {
|
||||||
webrtc::StreamConfig config(format.sampleRate, format.channels); // 单声道
|
webrtc::StreamConfig config(format.sampleRate, format.channels); // 单声道
|
||||||
webrtc::AudioBuffer nearendBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1);
|
m_d->nearendBuffer->CopyFrom(reinterpret_cast<const int16_t *>(frame.data), config);
|
||||||
webrtc::AudioBuffer farendBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1);
|
|
||||||
webrtc::AudioBuffer linearOutputBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1);
|
|
||||||
nearendBuffer.CopyFrom(reinterpret_cast<const int16_t *>(frame.data), config);
|
|
||||||
|
|
||||||
farendBuffer.CopyFrom(reinterpret_cast<const int16_t *>(m_farendBuffer.data()), config);
|
m_d->farendBuffer->CopyFrom(reinterpret_cast<const int16_t *>(m_farendBuffer.data()), config);
|
||||||
|
|
||||||
m_d->echoCanceller->AnalyzeRender(&farendBuffer);
|
m_d->echoCanceller->AnalyzeRender(m_d->farendBuffer.get());
|
||||||
m_d->echoCanceller->AnalyzeCapture(&nearendBuffer);
|
m_d->echoCanceller->AnalyzeCapture(m_d->nearendBuffer.get());
|
||||||
m_d->echoCanceller->ProcessCapture(&nearendBuffer, &linearOutputBuffer, /*level_change=*/false);
|
m_d->echoCanceller->ProcessCapture(m_d->nearendBuffer.get(), false);
|
||||||
|
// m_d->echoCanceller->ProcessCapture(&nearendBuffer, &linearOutputBuffer, /*level_change=*/false);
|
||||||
|
|
||||||
linearOutputBuffer.CopyTo(config, reinterpret_cast<int16_t *>(m_outBuffer.data()));
|
m_d->nearendBuffer->CopyTo(config, reinterpret_cast<int16_t *>(m_outBuffer.data()));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_channels == 2) {
|
if (m_channels == 2) {
|
||||||
|
@ -28,25 +28,31 @@ add_library(VocieProcess
|
|||||||
|
|
||||||
common_audio/audio_util.cc
|
common_audio/audio_util.cc
|
||||||
common_audio/channel_buffer.h common_audio/channel_buffer.cc
|
common_audio/channel_buffer.h common_audio/channel_buffer.cc
|
||||||
|
common_audio/fir_filter_neon.h common_audio/fir_filter_neon.cc
|
||||||
common_audio/ring_buffer.h common_audio/ring_buffer.c
|
common_audio/ring_buffer.h common_audio/ring_buffer.c
|
||||||
|
|
||||||
common_audio/resampler/push_sinc_resampler.h common_audio/resampler/push_sinc_resampler.cc
|
common_audio/resampler/push_sinc_resampler.h common_audio/resampler/push_sinc_resampler.cc
|
||||||
common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler.cc
|
common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler_neon.cc
|
||||||
|
common_audio/resampler/sinc_resampler.cc
|
||||||
|
|
||||||
common_audio/signal_processing/complex_bit_reverse.c
|
common_audio/signal_processing/complex_bit_reverse.c
|
||||||
common_audio/signal_processing/complex_fft.c
|
common_audio/signal_processing/complex_fft.c
|
||||||
|
common_audio/signal_processing/cross_correlation_neon.c
|
||||||
common_audio/signal_processing/cross_correlation.c
|
common_audio/signal_processing/cross_correlation.c
|
||||||
common_audio/signal_processing/division_operations.c
|
common_audio/signal_processing/division_operations.c
|
||||||
common_audio/signal_processing/dot_product_with_scale.h common_audio/signal_processing/dot_product_with_scale.cc
|
common_audio/signal_processing/dot_product_with_scale.h common_audio/signal_processing/dot_product_with_scale.cc
|
||||||
common_audio/signal_processing/downsample_fast.c
|
common_audio/signal_processing/downsample_fast.c
|
||||||
|
common_audio/signal_processing/downsample_fast_neon.c
|
||||||
common_audio/signal_processing/min_max_operations.c
|
common_audio/signal_processing/min_max_operations.c
|
||||||
|
common_audio/signal_processing/min_max_operations_neon.c
|
||||||
common_audio/signal_processing/randomization_functions.c
|
common_audio/signal_processing/randomization_functions.c
|
||||||
common_audio/signal_processing/real_fft.c
|
common_audio/signal_processing/real_fft.c
|
||||||
common_audio/signal_processing/spl_init.c
|
common_audio/signal_processing/spl_init.c
|
||||||
common_audio/signal_processing/splitting_filter.c
|
common_audio/signal_processing/splitting_filter.c
|
||||||
common_audio/signal_processing/vector_scaling_operations.c
|
common_audio/signal_processing/vector_scaling_operations.c
|
||||||
|
|
||||||
common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft.cc
|
common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc
|
||||||
|
common_audio/third_party/ooura/fft_size_128/ooura_fft.cc
|
||||||
common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c
|
common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c
|
||||||
|
|
||||||
rtc_base/checks.h rtc_base/checks.cc
|
rtc_base/checks.h rtc_base/checks.cc
|
||||||
@ -132,6 +138,7 @@ add_library(VocieProcess
|
|||||||
modules/audio_processing/aec3/transparent_mode.h modules/audio_processing/aec3/transparent_mode.cc
|
modules/audio_processing/aec3/transparent_mode.h modules/audio_processing/aec3/transparent_mode.cc
|
||||||
|
|
||||||
modules/audio_processing/aecm/aecm_core.h modules/audio_processing/aecm/aecm_core.cc modules/audio_processing/aecm/aecm_core_c.cc
|
modules/audio_processing/aecm/aecm_core.h modules/audio_processing/aecm/aecm_core.cc modules/audio_processing/aecm/aecm_core_c.cc
|
||||||
|
modules/audio_processing/aecm/aecm_core_neon.cc
|
||||||
modules/audio_processing/aecm/echo_control_mobile.h modules/audio_processing/aecm/echo_control_mobile.cc
|
modules/audio_processing/aecm/echo_control_mobile.h modules/audio_processing/aecm/echo_control_mobile.cc
|
||||||
|
|
||||||
modules/audio_processing/logging/apm_data_dumper.h modules/audio_processing/logging/apm_data_dumper.cc
|
modules/audio_processing/logging/apm_data_dumper.h modules/audio_processing/logging/apm_data_dumper.cc
|
||||||
@ -148,6 +155,7 @@ target_compile_definitions(VocieProcess
|
|||||||
PRIVATE NOMINMAX # <windows.h>
|
PRIVATE NOMINMAX # <windows.h>
|
||||||
PRIVATE RTC_DISABLE_LOGGING
|
PRIVATE RTC_DISABLE_LOGGING
|
||||||
PUBLIC RTC_DISABLE_METRICS
|
PUBLIC RTC_DISABLE_METRICS
|
||||||
|
PUBLIC WEBRTC_HAS_NEON
|
||||||
PUBLIC WEBRTC_APM_DEBUG_DUMP=0
|
PUBLIC WEBRTC_APM_DEBUG_DUMP=0
|
||||||
$<$<PLATFORM_ID:Windows>:WEBRTC_WIN>
|
$<$<PLATFORM_ID:Windows>:WEBRTC_WIN>
|
||||||
$<$<PLATFORM_ID:Linux>:WEBRTC_POSIX WEBRTC_LINUX>
|
$<$<PLATFORM_ID:Linux>:WEBRTC_POSIX WEBRTC_LINUX>
|
||||||
|
30
VocieProcess/common_audio/fir_filter.h
Normal file
30
VocieProcess/common_audio/fir_filter.h
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef COMMON_AUDIO_FIR_FILTER_H_
|
||||||
|
#define COMMON_AUDIO_FIR_FILTER_H_
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
|
||||||
|
// Finite Impulse Response filter using floating-point arithmetic.
|
||||||
|
class FIRFilter {
 public:
  // Virtual so that a concrete filter can be destroyed through a FIRFilter
  // pointer; defaulted because the interface itself owns nothing.
  virtual ~FIRFilter() = default;

  // Filters the `in` data supplied; `length` is the number of samples.
  // `out` must be previously allocated and must have room for at least
  // `length` floats.
  virtual void Filter(const float* in, size_t length, float* out) = 0;
};
|
||||||
|
|
||||||
|
} // namespace webrtc
|
||||||
|
|
||||||
|
#endif // COMMON_AUDIO_FIR_FILTER_H_
|
73
VocieProcess/common_audio/fir_filter_neon.cc
Normal file
73
VocieProcess/common_audio/fir_filter_neon.cc
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "common_audio/fir_filter_neon.h"
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "rtc_base/checks.h"
|
||||||
|
#include "rtc_base/memory/aligned_malloc.h"
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
|
||||||
|
// Nothing to release by hand: the coefficient and state buffers are held by
// std::unique_ptr members that carry an AlignedFreeDeleter.
FIRFilterNEON::~FIRFilterNEON() = default;
|
||||||
|
|
||||||
|
// Builds the filter from `coefficients_length` taps in `coefficients`.
//
// The tap count is rounded up to a multiple of four so the inner loop of
// Filter() can always consume four floats per step. `max_input_length` sizes
// the state buffer, which holds `state_length_` samples of history followed
// by one input block.
FIRFilterNEON::FIRFilterNEON(const float* coefficients,
                             size_t coefficients_length,
                             size_t max_input_length)
    :  // Closest higher multiple of four.
      coefficients_length_((coefficients_length + 3) & ~0x03),
      state_length_(coefficients_length_ - 1),
      coefficients_(static_cast<float*>(
          AlignedMalloc(sizeof(float) * coefficients_length_, 16))),
      state_(static_cast<float*>(
          AlignedMalloc(sizeof(float) * (max_input_length + state_length_),
                        16))) {
  // Zero the first `padding` slots. The taps are stored reversed below, so
  // these leading zeros correspond to padding appended after the last
  // original coefficient. memset takes an int fill byte, hence the literal 0.
  const size_t padding = coefficients_length_ - coefficients_length;
  memset(coefficients_.get(), 0, padding * sizeof(coefficients_[0]));
  // The coefficients are reversed to compensate for the order in which the
  // input samples are acquired (most recent last).
  for (size_t i = 0; i < coefficients_length; ++i) {
    coefficients_[i + padding] = coefficients[coefficients_length - i - 1];
  }
  // Start with an all-zero history and input area.
  memset(state_.get(), 0,
         (max_input_length + state_length_) * sizeof(state_[0]));
}
|
||||||
|
|
||||||
|
void FIRFilterNEON::Filter(const float* in, size_t length, float* out) {
|
||||||
|
RTC_DCHECK_GT(length, 0);
|
||||||
|
|
||||||
|
memcpy(&state_[state_length_], in, length * sizeof(*in));
|
||||||
|
|
||||||
|
// Convolves the input signal `in` with the filter kernel `coefficients_`
|
||||||
|
// taking into account the previous state.
|
||||||
|
for (size_t i = 0; i < length; ++i) {
|
||||||
|
float* in_ptr = &state_[i];
|
||||||
|
float* coef_ptr = coefficients_.get();
|
||||||
|
|
||||||
|
float32x4_t m_sum = vmovq_n_f32(0);
|
||||||
|
float32x4_t m_in;
|
||||||
|
|
||||||
|
for (size_t j = 0; j < coefficients_length_; j += 4) {
|
||||||
|
m_in = vld1q_f32(in_ptr + j);
|
||||||
|
m_sum = vmlaq_f32(m_sum, m_in, vld1q_f32(coef_ptr + j));
|
||||||
|
}
|
||||||
|
|
||||||
|
float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
|
||||||
|
out[i] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update current state.
|
||||||
|
memmove(state_.get(), &state_[length], state_length_ * sizeof(state_[0]));
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace webrtc
|
39
VocieProcess/common_audio/fir_filter_neon.h
Normal file
39
VocieProcess/common_audio/fir_filter_neon.h
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef COMMON_AUDIO_FIR_FILTER_NEON_H_
|
||||||
|
#define COMMON_AUDIO_FIR_FILTER_NEON_H_
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
#include "common_audio/fir_filter.h"
|
||||||
|
#include "rtc_base/memory/aligned_malloc.h"
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
|
||||||
|
class FIRFilterNEON : public FIRFilter {
|
||||||
|
public:
|
||||||
|
FIRFilterNEON(const float* coefficients,
|
||||||
|
size_t coefficients_length,
|
||||||
|
size_t max_input_length);
|
||||||
|
~FIRFilterNEON() override;
|
||||||
|
|
||||||
|
void Filter(const float* in, size_t length, float* out) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
size_t coefficients_length_;
|
||||||
|
size_t state_length_;
|
||||||
|
std::unique_ptr<float[], AlignedFreeDeleter> coefficients_;
|
||||||
|
std::unique_ptr<float[], AlignedFreeDeleter> state_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace webrtc
|
||||||
|
|
||||||
|
#endif // COMMON_AUDIO_FIR_FILTER_NEON_H_
|
48
VocieProcess/common_audio/resampler/sinc_resampler_neon.cc
Normal file
48
VocieProcess/common_audio/resampler/sinc_resampler_neon.cc
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Modified from the Chromium original:
|
||||||
|
// src/media/base/sinc_resampler.cc
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
#include "common_audio/resampler/sinc_resampler.h"
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
|
||||||
|
float SincResampler::Convolve_NEON(const float* input_ptr,
|
||||||
|
const float* k1,
|
||||||
|
const float* k2,
|
||||||
|
double kernel_interpolation_factor) {
|
||||||
|
float32x4_t m_input;
|
||||||
|
float32x4_t m_sums1 = vmovq_n_f32(0);
|
||||||
|
float32x4_t m_sums2 = vmovq_n_f32(0);
|
||||||
|
|
||||||
|
const float* upper = input_ptr + kKernelSize;
|
||||||
|
for (; input_ptr < upper;) {
|
||||||
|
m_input = vld1q_f32(input_ptr);
|
||||||
|
input_ptr += 4;
|
||||||
|
m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));
|
||||||
|
k1 += 4;
|
||||||
|
m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));
|
||||||
|
k2 += 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Linearly interpolate the two "convolutions".
|
||||||
|
m_sums1 = vmlaq_f32(
|
||||||
|
vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),
|
||||||
|
m_sums2, vmovq_n_f32(kernel_interpolation_factor));
|
||||||
|
|
||||||
|
// Sum components together.
|
||||||
|
float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
|
||||||
|
return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace webrtc
|
@ -0,0 +1,88 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
||||||
|
#include "rtc_base/system/arch.h"
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
|
||||||
|
const int16_t* vector1,
|
||||||
|
const int16_t* vector2,
|
||||||
|
size_t length,
|
||||||
|
int scaling) {
|
||||||
|
size_t i = 0;
|
||||||
|
size_t len1 = length >> 3;
|
||||||
|
size_t len2 = length & 7;
|
||||||
|
int64x2_t sum0 = vdupq_n_s64(0);
|
||||||
|
int64x2_t sum1 = vdupq_n_s64(0);
|
||||||
|
|
||||||
|
for (i = len1; i > 0; i -= 1) {
|
||||||
|
int16x8_t seq1_16x8 = vld1q_s16(vector1);
|
||||||
|
int16x8_t seq2_16x8 = vld1q_s16(vector2);
|
||||||
|
#if defined(WEBRTC_ARCH_ARM64)
|
||||||
|
int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
|
||||||
|
vget_low_s16(seq2_16x8));
|
||||||
|
int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
|
||||||
|
#else
|
||||||
|
int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
|
||||||
|
vget_low_s16(seq2_16x8));
|
||||||
|
int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
|
||||||
|
vget_high_s16(seq2_16x8));
|
||||||
|
#endif
|
||||||
|
sum0 = vpadalq_s32(sum0, tmp0);
|
||||||
|
sum1 = vpadalq_s32(sum1, tmp1);
|
||||||
|
vector1 += 8;
|
||||||
|
vector2 += 8;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate the rest of the samples.
|
||||||
|
int64_t sum_res = 0;
|
||||||
|
for (i = len2; i > 0; i -= 1) {
|
||||||
|
sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
|
||||||
|
vector1++;
|
||||||
|
vector2++;
|
||||||
|
}
|
||||||
|
|
||||||
|
sum0 = vaddq_s64(sum0, sum1);
|
||||||
|
#if defined(WEBRTC_ARCH_ARM64)
|
||||||
|
int64_t sum2 = vaddvq_s64(sum0);
|
||||||
|
*cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
|
||||||
|
#else
|
||||||
|
int64x1_t shift = vdup_n_s64(-scaling);
|
||||||
|
int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
|
||||||
|
sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
|
||||||
|
sum2 = vshl_s64(sum2, shift);
|
||||||
|
vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* NEON version of WebRtcSpl_CrossCorrelation() for ARM32/64 platforms. */
|
||||||
|
// For each of the `dim_cross_correlation` outputs, stores the scaled dot
// product of `seq1` with `seq2` offset by `step_seq2 * index` samples, taken
// over `dim_seq` samples and shifted right by `right_shifts`.
void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
                                    const int16_t* seq1,
                                    const int16_t* seq2,
                                    size_t dim_seq,
                                    size_t dim_cross_correlation,
                                    int right_shifts,
                                    int step_seq2) {
  const int output_count = (int)dim_cross_correlation;
  int lag;

  for (lag = 0; lag < output_count; lag++) {
    DotProductWithScaleNeon(&cross_correlation[lag],
                            seq1,
                            seq2 + (step_seq2 * lag),
                            dim_seq,
                            right_shifts);
  }
}
|
@ -0,0 +1,224 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
||||||
|
|
||||||
|
#include "rtc_base/checks.h"
|
||||||
|
|
||||||
|
// NEON intrinsics version of WebRtcSpl_DownsampleFast()
|
||||||
|
// for ARM 32-bit/64-bit platforms.
|
||||||
|
int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
|
||||||
|
size_t data_in_length,
|
||||||
|
int16_t* data_out,
|
||||||
|
size_t data_out_length,
|
||||||
|
const int16_t* __restrict coefficients,
|
||||||
|
size_t coefficients_length,
|
||||||
|
int factor,
|
||||||
|
size_t delay) {
|
||||||
|
// Using signed indexes to be able to compute negative i-j that
|
||||||
|
// is used to index data_in.
|
||||||
|
int i = 0;
|
||||||
|
int j = 0;
|
||||||
|
int32_t out_s32 = 0;
|
||||||
|
int endpos = delay + factor * (data_out_length - 1) + 1;
|
||||||
|
size_t res = data_out_length & 0x7;
|
||||||
|
int endpos1 = endpos - factor * res;
|
||||||
|
|
||||||
|
// Return error if any of the running conditions doesn't meet.
|
||||||
|
if (data_out_length == 0 || coefficients_length == 0
|
||||||
|
|| (int)data_in_length < endpos) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
RTC_DCHECK_GE(endpos, 0);
|
||||||
|
RTC_DCHECK_GE(endpos1, 0);
|
||||||
|
|
||||||
|
// First part, unroll the loop 8 times, with 3 subcases
|
||||||
|
// (factor == 2, 4, others).
|
||||||
|
switch (factor) {
|
||||||
|
case 2: {
|
||||||
|
for (i = delay; i < endpos1; i += 16) {
|
||||||
|
// Round value, 0.5 in Q12.
|
||||||
|
int32x4_t out32x4_0 = vdupq_n_s32(2048);
|
||||||
|
int32x4_t out32x4_1 = vdupq_n_s32(2048);
|
||||||
|
|
||||||
|
#if defined(WEBRTC_ARCH_ARM64)
|
||||||
|
// Unroll the loop 2 times.
|
||||||
|
for (j = 0; j < (int)coefficients_length - 1; j += 2) {
|
||||||
|
int32x2_t coeff32 = vld1_dup_s32((int32_t*)&coefficients[j]);
|
||||||
|
int16x4_t coeff16x4 = vreinterpret_s16_s32(coeff32);
|
||||||
|
int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j - 1]);
|
||||||
|
|
||||||
|
// Mul and accumulate low 64-bit data.
|
||||||
|
int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
|
||||||
|
int16x4_t in16x4_1 = vget_low_s16(in16x8x2.val[1]);
|
||||||
|
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 1);
|
||||||
|
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_1, coeff16x4, 0);
|
||||||
|
|
||||||
|
// Mul and accumulate high 64-bit data.
|
||||||
|
// TODO: vget_high_s16 need extra cost on ARM64. This could be
|
||||||
|
// replaced by vmlal_high_lane_s16. But for the interface of
|
||||||
|
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
|
||||||
|
// This issue need to be tracked in the future.
|
||||||
|
int16x4_t in16x4_2 = vget_high_s16(in16x8x2.val[0]);
|
||||||
|
int16x4_t in16x4_3 = vget_high_s16(in16x8x2.val[1]);
|
||||||
|
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_2, coeff16x4, 1);
|
||||||
|
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; j < (int)coefficients_length; j++) {
|
||||||
|
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
|
||||||
|
int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
|
||||||
|
|
||||||
|
// Mul and accumulate low 64-bit data.
|
||||||
|
int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
|
||||||
|
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
|
||||||
|
|
||||||
|
// Mul and accumulate high 64-bit data.
|
||||||
|
// TODO: vget_high_s16 need extra cost on ARM64. This could be
|
||||||
|
// replaced by vmlal_high_lane_s16. But for the interface of
|
||||||
|
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
|
||||||
|
// This issue need to be tracked in the future.
|
||||||
|
int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
|
||||||
|
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// On ARMv7, the loop unrolling 2 times results in performance
|
||||||
|
// regression.
|
||||||
|
for (j = 0; j < (int)coefficients_length; j++) {
|
||||||
|
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
|
||||||
|
int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
|
||||||
|
|
||||||
|
// Mul and accumulate.
|
||||||
|
int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
|
||||||
|
int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
|
||||||
|
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
|
||||||
|
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Saturate and store the output.
|
||||||
|
int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
|
||||||
|
int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
|
||||||
|
vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
|
||||||
|
data_out += 8;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 4: {
|
||||||
|
for (i = delay; i < endpos1; i += 32) {
|
||||||
|
// Round value, 0.5 in Q12.
|
||||||
|
int32x4_t out32x4_0 = vdupq_n_s32(2048);
|
||||||
|
int32x4_t out32x4_1 = vdupq_n_s32(2048);
|
||||||
|
|
||||||
|
// Unroll the loop 4 times.
|
||||||
|
for (j = 0; j < (int)coefficients_length - 3; j += 4) {
|
||||||
|
int16x4_t coeff16x4 = vld1_s16(&coefficients[j]);
|
||||||
|
int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j - 3]);
|
||||||
|
|
||||||
|
// Mul and accumulate low 64-bit data.
|
||||||
|
int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
|
||||||
|
int16x4_t in16x4_2 = vget_low_s16(in16x8x4.val[1]);
|
||||||
|
int16x4_t in16x4_4 = vget_low_s16(in16x8x4.val[2]);
|
||||||
|
int16x4_t in16x4_6 = vget_low_s16(in16x8x4.val[3]);
|
||||||
|
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 3);
|
||||||
|
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_2, coeff16x4, 2);
|
||||||
|
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_4, coeff16x4, 1);
|
||||||
|
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_6, coeff16x4, 0);
|
||||||
|
|
||||||
|
// Mul and accumulate high 64-bit data.
|
||||||
|
// TODO: vget_high_s16 need extra cost on ARM64. This could be
|
||||||
|
// replaced by vmlal_high_lane_s16. But for the interface of
|
||||||
|
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
|
||||||
|
// This issue need to be tracked in the future.
|
||||||
|
int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
|
||||||
|
int16x4_t in16x4_3 = vget_high_s16(in16x8x4.val[1]);
|
||||||
|
int16x4_t in16x4_5 = vget_high_s16(in16x8x4.val[2]);
|
||||||
|
int16x4_t in16x4_7 = vget_high_s16(in16x8x4.val[3]);
|
||||||
|
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 3);
|
||||||
|
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 2);
|
||||||
|
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_5, coeff16x4, 1);
|
||||||
|
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_7, coeff16x4, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; j < (int)coefficients_length; j++) {
|
||||||
|
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
|
||||||
|
int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j]);
|
||||||
|
|
||||||
|
// Mul and accumulate low 64-bit data.
|
||||||
|
int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
|
||||||
|
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
|
||||||
|
|
||||||
|
// Mul and accumulate high 64-bit data.
|
||||||
|
// TODO: vget_high_s16 need extra cost on ARM64. This could be
|
||||||
|
// replaced by vmlal_high_lane_s16. But for the interface of
|
||||||
|
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
|
||||||
|
// This issue need to be tracked in the future.
|
||||||
|
int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
|
||||||
|
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Saturate and store the output.
|
||||||
|
int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
|
||||||
|
int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
|
||||||
|
vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
|
||||||
|
data_out += 8;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default: {
|
||||||
|
for (i = delay; i < endpos1; i += factor * 8) {
|
||||||
|
// Round value, 0.5 in Q12.
|
||||||
|
int32x4_t out32x4_0 = vdupq_n_s32(2048);
|
||||||
|
int32x4_t out32x4_1 = vdupq_n_s32(2048);
|
||||||
|
|
||||||
|
for (j = 0; j < (int)coefficients_length; j++) {
|
||||||
|
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
|
||||||
|
int16x4_t in16x4_0 = vld1_dup_s16(&data_in[i - j]);
|
||||||
|
in16x4_0 = vld1_lane_s16(&data_in[i + factor - j], in16x4_0, 1);
|
||||||
|
in16x4_0 = vld1_lane_s16(&data_in[i + factor * 2 - j], in16x4_0, 2);
|
||||||
|
in16x4_0 = vld1_lane_s16(&data_in[i + factor * 3 - j], in16x4_0, 3);
|
||||||
|
int16x4_t in16x4_1 = vld1_dup_s16(&data_in[i + factor * 4 - j]);
|
||||||
|
in16x4_1 = vld1_lane_s16(&data_in[i + factor * 5 - j], in16x4_1, 1);
|
||||||
|
in16x4_1 = vld1_lane_s16(&data_in[i + factor * 6 - j], in16x4_1, 2);
|
||||||
|
in16x4_1 = vld1_lane_s16(&data_in[i + factor * 7 - j], in16x4_1, 3);
|
||||||
|
|
||||||
|
// Mul and accumulate.
|
||||||
|
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
|
||||||
|
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Saturate and store the output.
|
||||||
|
int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
|
||||||
|
int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
|
||||||
|
vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
|
||||||
|
data_out += 8;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second part, do the rest iterations (if any).
|
||||||
|
for (; i < endpos; i += factor) {
|
||||||
|
out_s32 = 2048; // Round value, 0.5 in Q12.
|
||||||
|
|
||||||
|
for (j = 0; j < (int)coefficients_length; j++) {
|
||||||
|
out_s32 = WebRtc_MulAccumW16(coefficients[j], data_in[i - j], out_s32);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Saturate and store the output.
|
||||||
|
out_s32 >>= 12;
|
||||||
|
*data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
@ -0,0 +1,333 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include "rtc_base/checks.h"
|
||||||
|
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
||||||
|
|
||||||
|
// Maximum absolute value of word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// `length` must be greater than zero. The result is clamped to
// WEBRTC_SPL_WORD16_MAX, so an input of -32768 yields 32767.
int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);

  const size_t tail = length & 7;
  const int16_t* cursor = vector;
  const int16_t* const simd_end = vector + length - tail;
  const int16_t* const vector_end = vector + length;

  // Vector part: eight samples per iteration.
  uint16x8_t lane_max = vdupq_n_u16(0);
  for (; cursor < simd_end; cursor += 8) {
    // vabs leaves -32768 unchanged; comparing the lanes as unsigned keeps
    // that bit pattern (0x8000) ranked above every other magnitude.
    const int16x8_t magnitudes = vabsq_s16(vld1q_s16(cursor));
    lane_max = vmaxq_u16(lane_max, vreinterpretq_u16_s16(magnitudes));
  }

  // Horizontal reduction of the eight lanes to a single value.
#ifdef WEBRTC_ARCH_ARM64
  int maximum = (int)vmaxvq_u16(lane_max);
#else
  uint16x4_t half_max =
      vmax_u16(vget_low_u16(lane_max), vget_high_u16(lane_max));
  half_max = vpmax_u16(half_max, half_max);
  half_max = vpmax_u16(half_max, half_max);
  int maximum = (int)vget_lane_u16(half_max, 0);
#endif

  // Scalar part: the (length & 7) samples left over.
  for (; cursor < vector_end; ++cursor) {
    const int absolute = abs((int)(*cursor));
    if (absolute > maximum) {
      maximum = absolute;
    }
  }

  // Guard the case for abs(-32768), which does not fit in int16_t.
  return (int16_t)((maximum > WEBRTC_SPL_WORD16_MAX) ? WEBRTC_SPL_WORD16_MAX
                                                     : maximum);
}
|
||||||
|
|
||||||
|
// Maximum absolute value of word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// `length` must be greater than zero. The result is clamped to
// WEBRTC_SPL_WORD32_MAX, so an input of 0x80000000 yields 0x7FFFFFFF.
int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length) {
  // Use uint32_t for the local variables, to accommodate the magnitude of
  // 0x80000000, which is 0x80000000 and does not fit in int32_t.
  uint32_t absolute = 0, maximum = 0;
  size_t i = 0;
  size_t residual = length & 0x7;

  RTC_DCHECK_GT(length, 0);

  const int32_t* p_start = vector;
  uint32x4_t max32x4_0 = vdupq_n_u32(0);
  uint32x4_t max32x4_1 = vdupq_n_u32(0);

  // First part, unroll the loop 8 times.
  for (i = 0; i < length - residual; i += 8) {
    int32x4_t in32x4_0 = vld1q_s32(p_start);
    p_start += 4;
    int32x4_t in32x4_1 = vld1q_s32(p_start);
    p_start += 4;
    in32x4_0 = vabsq_s32(in32x4_0);
    in32x4_1 = vabsq_s32(in32x4_1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0));
    max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1));
  }

  // Reduce the two accumulators, then the four lanes, to one value.
  uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1);
#if defined(WEBRTC_ARCH_ARM64)
  maximum = vmaxvq_u32(max32x4);
#else
  uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4));
  max32x2 = vpmax_u32(max32x2, max32x2);

  maximum = vget_lane_u32(max32x2, 0);
#endif

  // Second part, do the remaining iterations (if any).
  for (i = residual; i > 0; i--) {
    const int32_t sample = *p_start;
    // Take the magnitude in unsigned arithmetic: abs(INT32_MIN) is undefined
    // behaviour, whereas 0u - (uint32_t)INT32_MIN is well defined and gives
    // 0x80000000, matching what the vector path produces.
    absolute = (sample < 0) ? (0u - (uint32_t)sample) : (uint32_t)sample;
    if (absolute > maximum) {
      maximum = absolute;
    }
    p_start++;
  }

  // Guard against the case for 0x80000000.
  maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX);

  return (int32_t)maximum;
}
|
||||||
|
|
||||||
|
// Largest element of a word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms. `length` must be greater than zero.
int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);

  const size_t tail = length & 0x7;
  const int16_t* cursor = vector;
  const int16_t* const simd_end = vector + (length - tail);

  // Vector part: eight samples per iteration.
  int16x8_t lane_max = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
  while (cursor < simd_end) {
    lane_max = vmaxq_s16(lane_max, vld1q_s16(cursor));
    cursor += 8;
  }

  // Horizontal reduction of the eight lanes to a single value.
#if defined(WEBRTC_ARCH_ARM64)
  int16_t maximum = vmaxvq_s16(lane_max);
#else
  int16x4_t half_max =
      vmax_s16(vget_low_s16(lane_max), vget_high_s16(lane_max));
  half_max = vpmax_s16(half_max, half_max);
  half_max = vpmax_s16(half_max, half_max);
  int16_t maximum = vget_lane_s16(half_max, 0);
#endif

  // Scalar part: the (length & 7) samples left over.
  for (size_t remaining = tail; remaining != 0; --remaining, ++cursor) {
    if (*cursor > maximum) {
      maximum = *cursor;
    }
  }
  return maximum;
}
|
||||||
|
|
||||||
|
// Largest element of a word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms. `length` must be greater than zero.
int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);

  const size_t tail = length & 0x7;
  const int32_t* cursor = vector;
  const int32_t* const simd_end = vector + (length - tail);

  // Vector part: eight samples per iteration, split over two accumulators
  // of four lanes each.
  int32x4_t lane_max_lo = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
  int32x4_t lane_max_hi = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
  for (; cursor < simd_end; cursor += 8) {
    const int32x4_t first_four = vld1q_s32(cursor);
    const int32x4_t second_four = vld1q_s32(cursor + 4);
    lane_max_lo = vmaxq_s32(lane_max_lo, first_four);
    lane_max_hi = vmaxq_s32(lane_max_hi, second_four);
  }

  // Merge the accumulators, then reduce the four lanes to one value.
  const int32x4_t lane_max = vmaxq_s32(lane_max_lo, lane_max_hi);
#if defined(WEBRTC_ARCH_ARM64)
  int32_t maximum = vmaxvq_s32(lane_max);
#else
  int32x2_t pair_max =
      vmax_s32(vget_low_s32(lane_max), vget_high_s32(lane_max));
  pair_max = vpmax_s32(pair_max, pair_max);
  int32_t maximum = vget_lane_s32(pair_max, 0);
#endif

  // Scalar part: the (length & 7) samples left over.
  for (size_t remaining = tail; remaining != 0; --remaining, ++cursor) {
    if (*cursor > maximum) {
      maximum = *cursor;
    }
  }
  return maximum;
}
|
||||||
|
|
||||||
|
// Smallest element of a word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms. `length` must be greater than zero.
int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);

  const size_t tail = length & 0x7;
  const int16_t* cursor = vector;
  const int16_t* const simd_end = vector + (length - tail);

  // Vector part: eight samples per iteration.
  int16x8_t lane_min = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
  while (cursor < simd_end) {
    lane_min = vminq_s16(lane_min, vld1q_s16(cursor));
    cursor += 8;
  }

  // Horizontal reduction of the eight lanes to a single value.
#if defined(WEBRTC_ARCH_ARM64)
  int16_t minimum = vminvq_s16(lane_min);
#else
  int16x4_t half_min =
      vmin_s16(vget_low_s16(lane_min), vget_high_s16(lane_min));
  half_min = vpmin_s16(half_min, half_min);
  half_min = vpmin_s16(half_min, half_min);
  int16_t minimum = vget_lane_s16(half_min, 0);
#endif

  // Scalar part: the (length & 7) samples left over.
  for (size_t remaining = tail; remaining != 0; --remaining, ++cursor) {
    if (*cursor < minimum) {
      minimum = *cursor;
    }
  }
  return minimum;
}
|
||||||
|
|
||||||
|
// Smallest element of a word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms. `length` must be greater than zero.
int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);

  const size_t tail = length & 0x7;
  const int32_t* cursor = vector;
  const int32_t* const simd_end = vector + (length - tail);

  // Vector part: eight samples per iteration, split over two accumulators
  // of four lanes each.
  int32x4_t lane_min_lo = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
  int32x4_t lane_min_hi = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
  for (; cursor < simd_end; cursor += 8) {
    const int32x4_t first_four = vld1q_s32(cursor);
    const int32x4_t second_four = vld1q_s32(cursor + 4);
    lane_min_lo = vminq_s32(lane_min_lo, first_four);
    lane_min_hi = vminq_s32(lane_min_hi, second_four);
  }

  // Merge the accumulators, then reduce the four lanes to one value.
  const int32x4_t lane_min = vminq_s32(lane_min_lo, lane_min_hi);
#if defined(WEBRTC_ARCH_ARM64)
  int32_t minimum = vminvq_s32(lane_min);
#else
  int32x2_t pair_min =
      vmin_s32(vget_low_s32(lane_min), vget_high_s32(lane_min));
  pair_min = vpmin_s32(pair_min, pair_min);
  int32_t minimum = vget_lane_s32(pair_min, 0);
#endif

  // Scalar part: the (length & 7) samples left over.
  for (size_t remaining = tail; remaining != 0; --remaining, ++cursor) {
    if (*cursor < minimum) {
      minimum = *cursor;
    }
  }
  return minimum;
}
|
||||||
|
|
||||||
|
// Finds both the minimum and maximum elements in an array of 16-bit integers
// in a single pass and writes them to `*min_val` and `*max_val`.
// `length` must be greater than zero.
void WebRtcSpl_MinMaxW16Neon(const int16_t* vector, size_t length,
                             int16_t* min_val, int16_t* max_val) {
  RTC_DCHECK_GT(length, 0);

  const size_t tail = length & 0x7;
  const int16_t* cursor = vector;
  const int16_t* const simd_end = vector + (length - tail);

  // Vector part: eight samples per iteration feed both accumulators.
  int16x8_t lane_min = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
  int16x8_t lane_max = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
  while (cursor < simd_end) {
    const int16x8_t samples = vld1q_s16(cursor);
    lane_min = vminq_s16(lane_min, samples);
    lane_max = vmaxq_s16(lane_max, samples);
    cursor += 8;
  }

  // Horizontal reduction of each accumulator to a single value.
#if defined(WEBRTC_ARCH_ARM64)
  int16_t minimum = vminvq_s16(lane_min);
  int16_t maximum = vmaxvq_s16(lane_max);
#else
  int16x4_t half_min =
      vmin_s16(vget_low_s16(lane_min), vget_high_s16(lane_min));
  half_min = vpmin_s16(half_min, half_min);
  half_min = vpmin_s16(half_min, half_min);
  int16_t minimum = vget_lane_s16(half_min, 0);

  int16x4_t half_max =
      vmax_s16(vget_low_s16(lane_max), vget_high_s16(lane_max));
  half_max = vpmax_s16(half_max, half_max);
  half_max = vpmax_s16(half_max, half_max);
  int16_t maximum = vget_lane_s16(half_max, 0);
#endif

  // Scalar part: the (length & 7) samples left over.
  for (size_t remaining = tail; remaining != 0; --remaining, ++cursor) {
    if (*cursor < minimum) {
      minimum = *cursor;
    }
    if (*cursor > maximum) {
      maximum = *cursor;
    }
  }

  *min_val = minimum;
  *max_val = maximum;
}
|
351
VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc
vendored
Normal file
351
VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc
vendored
Normal file
@ -0,0 +1,351 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The rdft AEC algorithm, neon version of speed-critical functions.
|
||||||
|
*
|
||||||
|
* Based on the sse2 version.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
|
||||||
|
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
|
||||||
|
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h"
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
|
||||||
|
#if defined(WEBRTC_HAS_NEON)
|
||||||
|
// First butterfly stage over the 128-float buffer `a`, updated in place.
// The buffer is walked in eight chunks of 16 floats; chunk number n reads its
// twiddle factors from rdft_wk{1,2,3}{r,i} starting at index 4 * n.
void cft1st_128_neon(float* a) {
  const float32x4_t sign_flip =
      vld1q_f32(reinterpret_cast<const float32_t*>(k_swap_sign));

  for (int chunk = 0; chunk < 8; ++chunk) {
    float* const p = a + 16 * chunk;
    const int tw = 4 * chunk;

    // Load 16 floats and regroup them by 64-bit pairs.
    const float32x4_t in00 = vld1q_f32(p + 0);
    const float32x4_t in04 = vld1q_f32(p + 4);
    const float32x4_t in08 = vld1q_f32(p + 8);
    const float32x4_t in12 = vld1q_f32(p + 12);
    const float32x4_t g01 = vcombine_f32(vget_low_f32(in00), vget_low_f32(in08));
    const float32x4_t g23 =
        vcombine_f32(vget_high_f32(in00), vget_high_f32(in08));
    const float32x4_t g45 = vcombine_f32(vget_low_f32(in04), vget_low_f32(in12));
    const float32x4_t g67 =
        vcombine_f32(vget_high_f32(in04), vget_high_f32(in12));

    const float32x4_t wk1r = vld1q_f32(rdft_wk1r + tw);
    const float32x4_t wk1i = vld1q_f32(rdft_wk1i + tw);
    const float32x4_t wk2r = vld1q_f32(rdft_wk2r + tw);
    const float32x4_t wk2i = vld1q_f32(rdft_wk2i + tw);
    const float32x4_t wk3r = vld1q_f32(rdft_wk3r + tw);
    const float32x4_t wk3i = vld1q_f32(rdft_wk3i + tw);

    // Sums and differences of the regrouped inputs.
    const float32x4_t x0 = vaddq_f32(g01, g23);
    const float32x4_t x1 = vsubq_f32(g01, g23);
    const float32x4_t x2 = vaddq_f32(g45, g67);
    const float32x4_t x3 = vsubq_f32(g45, g67);
    const float32x4_t x3_swapped = vrev64q_f32(x3);

    // Output group 0/1: plain sum, no twiddle.
    const float32x4_t out01 = vaddq_f32(x0, x2);

    // Output group 4/5: (x0 - x2) multiplied by the wk2 factors.
    const float32x4_t t2 = vsubq_f32(x0, x2);
    float32x4_t out45 = vmulq_f32(wk2r, t2);
    out45 = vmlaq_f32(out45, wk2i, vrev64q_f32(t2));

    // Output group 2/3: x1 + sign_flip * swap(x3), multiplied by wk1.
    const float32x4_t t1 = vmlaq_f32(x1, x3_swapped, sign_flip);
    float32x4_t out23 = vmulq_f32(wk1r, t1);
    out23 = vmlaq_f32(out23, wk1i, vrev64q_f32(t1));

    // Output group 6/7: x1 - sign_flip * swap(x3), multiplied by wk3.
    const float32x4_t t3 = vmlsq_f32(x1, x3_swapped, sign_flip);
    float32x4_t out67 = vmulq_f32(wk3r, t3);
    out67 = vmlaq_f32(out67, wk3i, vrev64q_f32(t3));

    // Undo the regrouping and write the chunk back.
    vst1q_f32(p + 0, vcombine_f32(vget_low_f32(out01), vget_low_f32(out23)));
    vst1q_f32(p + 4, vcombine_f32(vget_low_f32(out45), vget_low_f32(out67)));
    vst1q_f32(p + 8, vcombine_f32(vget_high_f32(out01), vget_high_f32(out23)));
    vst1q_f32(p + 12, vcombine_f32(vget_high_f32(out45), vget_high_f32(out67)));
  }
}
|
||||||
|
|
||||||
|
// Middle butterfly stage over the 128-float buffer `a`, updated in place.
// Two groups are processed, two floats per array at a time:
//   * offsets 0..7 relative to a[0], using the cftmdl_wk1r factors;
//   * offsets 0..7 relative to a[64], using rdft_wk{1,2,3}{r,i} at index 4.
// Within a group the eight arrays sit 8 floats apart (p, p+8, ..., p+56).
void cftmdl_128_neon(float* a) {
  const int kSpan = 8;
  const float32x4_t sign_flip =
      vld1q_f32(reinterpret_cast<const float32_t*>(k_swap_sign));

  // ---- First group: a[0..63]. ----
  {
    const float32x4_t wk1r = vld1q_f32(cftmdl_wk1r);

    for (int j = 0; j < kSpan; j += 2) {
      float* const p = a + j;

      const float32x4_t v00_32 = vcombine_f32(vld1_f32(p + 0), vld1_f32(p + 32));
      const float32x4_t v08_40 = vcombine_f32(vld1_f32(p + 8), vld1_f32(p + 40));
      const float32x4_t v16_48 =
          vcombine_f32(vld1_f32(p + 16), vld1_f32(p + 48));
      const float32x4_t v24_56 =
          vcombine_f32(vld1_f32(p + 24), vld1_f32(p + 56));

      const float32x4_t x0 = vaddq_f32(v00_32, v08_40);
      const float32x4_t x1 = vsubq_f32(v00_32, v08_40);
      const float32x4_t x2 = vaddq_f32(v16_48, v24_56);
      const float32x4_t x3 = vsubq_f32(v16_48, v24_56);
      const float32x4_t x3_swapped = vrev64q_f32(x3);

      const float32x4_t sum02 = vaddq_f32(x0, x2);
      const float32x4_t dif02 = vsubq_f32(x0, x2);
      const float32x4_t x1_plus = vmlaq_f32(x1, sign_flip, x3_swapped);
      const float32x4_t x1_minus = vmlsq_f32(x1, sign_flip, x3_swapped);

      // Broadcast each lane of the upper halves, combine them with the sign
      // pattern and scale by wk1r.
      const float32x2_t plus_hi = vget_high_f32(x1_plus);
      const float32x2_t minus_hi = vget_high_f32(x1_minus);
      const float32x4_t lane0 =
          vcombine_f32(vdup_lane_f32(plus_hi, 0), vdup_lane_f32(minus_hi, 0));
      const float32x4_t lane1 =
          vcombine_f32(vdup_lane_f32(plus_hi, 1), vdup_lane_f32(minus_hi, 1));
      const float32x4_t scaled =
          vmulq_f32(wk1r, vmlaq_f32(lane0, sign_flip, lane1));

      vst1_f32(p + 0, vget_low_f32(sum02));
      vst1_f32(p + 32, vget_high_f32(sum02));
      vst1_f32(p + 16, vget_low_f32(dif02));
      vst1_f32(p + 48, vget_high_f32(vrev64q_f32(dif02)));

      // Only the first of the two floats just written at p + 48 is negated.
      p[48] = -p[48];

      vst1_f32(p + 8, vget_low_f32(x1_plus));
      vst1_f32(p + 24, vget_low_f32(x1_minus));
      vst1_f32(p + 40, vget_low_f32(scaled));
      vst1_f32(p + 56, vget_high_f32(vrev64q_f32(scaled)));
    }
  }

  // ---- Second group: a[64..127]. ----
  {
    const int kBase = 64;
    const int kTwiddle = 4;  // 2 * k1 with k1 == 2.
    const float32x4_t wk1r = vld1q_f32(rdft_wk1r + kTwiddle);
    const float32x4_t wk1i = vld1q_f32(rdft_wk1i + kTwiddle);
    const float32x4_t wk2r = vld1q_f32(rdft_wk2r + kTwiddle);
    const float32x4_t wk2i = vld1q_f32(rdft_wk2i + kTwiddle);
    const float32x4_t wk3r = vld1q_f32(rdft_wk3r + kTwiddle);
    const float32x4_t wk3i = vld1q_f32(rdft_wk3i + kTwiddle);

    for (int j = kBase; j < kBase + kSpan; j += 2) {
      float* const p = a + j;

      const float32x4_t v00_32 = vcombine_f32(vld1_f32(p + 0), vld1_f32(p + 32));
      const float32x4_t v08_40 = vcombine_f32(vld1_f32(p + 8), vld1_f32(p + 40));
      const float32x4_t v16_48 =
          vcombine_f32(vld1_f32(p + 16), vld1_f32(p + 48));
      const float32x4_t v24_56 =
          vcombine_f32(vld1_f32(p + 24), vld1_f32(p + 56));

      const float32x4_t x0 = vaddq_f32(v00_32, v08_40);
      const float32x4_t x1 = vsubq_f32(v00_32, v08_40);
      const float32x4_t x2 = vaddq_f32(v16_48, v24_56);
      const float32x4_t x3 = vsubq_f32(v16_48, v24_56);
      const float32x4_t x3_swapped = vrev64q_f32(x3);

      const float32x4_t sum02 = vaddq_f32(x0, x2);
      const float32x4_t dif02 = vsubq_f32(x0, x2);
      const float32x4_t x1_plus = vmlaq_f32(x1, sign_flip, x3_swapped);
      const float32x4_t x1_minus = vmlsq_f32(x1, sign_flip, x3_swapped);

      // Multiply each intermediate by its twiddle pair: r * v + i * swap(v).
      float32x4_t out16_48 = vmulq_f32(wk2r, dif02);
      float32x4_t out08_40 = vmulq_f32(wk1r, x1_plus);
      float32x4_t out24_56 = vmulq_f32(wk3r, x1_minus);
      out16_48 = vmlaq_f32(out16_48, wk2i, vrev64q_f32(dif02));
      out08_40 = vmlaq_f32(out08_40, wk1i, vrev64q_f32(x1_plus));
      out24_56 = vmlaq_f32(out24_56, wk3i, vrev64q_f32(x1_minus));

      vst1_f32(p + 0, vget_low_f32(sum02));
      vst1_f32(p + 32, vget_high_f32(sum02));
      vst1_f32(p + 16, vget_low_f32(out16_48));
      vst1_f32(p + 48, vget_high_f32(out16_48));
      vst1_f32(p + 8, vget_low_f32(out08_40));
      vst1_f32(p + 40, vget_high_f32(out08_40));
      vst1_f32(p + 24, vget_low_f32(out24_56));
      vst1_f32(p + 56, vget_high_f32(out24_56));
    }
  }
}
|
||||||
|
|
||||||
|
// Returns `in` with its four lanes in reverse order (A B C D -> D C B A).
__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
  // Exchange the two 64-bit halves (A B C D -> C D A B), then swap the lanes
  // inside each half (C D A B -> D C B A).
  return vrev64q_f32(vcombine_f32(vget_high_f32(in), vget_low_f32(in)));
}
|
||||||
|
|
||||||
|
// Forward real-FFT post-processing step over the 128-float buffer `a`,
// updated in place. Each step pairs the (re, im) entry at a[j2], a[j2 + 1]
// with its mirror at a[128 - j2], a[129 - j2] and applies
//   yr = wkr * xr - wki * xi,  yi = wkr * xi + wki * xr
// with xr = a[j2] - a[k2], xi = a[j2 + 1] + a[k2 + 1], then
//   a[j2] -= yr, a[j2 + 1] -= yi, a[k2] += yr, a[k2 + 1] -= yi.
// Weights come from the table at rdft_w + 32.
void rftfsub_128_neon(float* a) {
  const float* const c = rdft_w + 32;
  const float32x4_t half = vdupq_n_f32(0.5f);
  int j1 = 1;
  int j2 = 2;

  // Vectorized part: four (re, im) pairs from each end per iteration.
  // Index comments refer to the first iteration (j1 == 1, j2 == 2).
  for (; j2 + 7 < 64; j1 += 4, j2 += 8) {
    float* const lo = a + j2;          // a[2..9]
    float* const hi = a + (122 - j2);  // a[120..127]

    // Weights: wki = c[1..4]; wkr = 0.5 - c[31, 30, 29, 28].
    const float32x4_t wki = vld1q_f32(c + j1);
    const float32x4_t wkr =
        reverse_order_f32x4(vsubq_f32(half, vld1q_f32(c + (29 - j1))));

    // De-interleave: val[0] holds even indexes, val[1] odd indexes.
    float32x4x2_t lo_pairs = vld2q_f32(lo);
    const float32x4x2_t hi_pairs = vld2q_f32(hi);
    // Mirror side reversed so lanes line up: 126, 124, 122, 120 and
    // 127, 125, 123, 121.
    const float32x4_t hi_even = reverse_order_f32x4(hi_pairs.val[0]);
    const float32x4_t hi_odd = reverse_order_f32x4(hi_pairs.val[1]);

    const float32x4_t xr = vsubq_f32(lo_pairs.val[0], hi_even);
    const float32x4_t xi = vaddq_f32(lo_pairs.val[1], hi_odd);

    const float32x4_t wkr_xr = vmulq_f32(wkr, xr);
    const float32x4_t wki_xi = vmulq_f32(wki, xi);
    const float32x4_t wkr_xi = vmulq_f32(wkr, xi);
    const float32x4_t wki_xr = vmulq_f32(wki, xr);
    const float32x4_t yr = vsubq_f32(wkr_xr, wki_xi);
    const float32x4_t yi = vaddq_f32(wkr_xi, wki_xr);

    // Mirror side: a[k2] += yr, a[k2 + 1] -= yi, then re-interleave.
    const float32x4_t hi_even_new = vrev64q_f32(vaddq_f32(hi_even, yr));
    const float32x4_t hi_odd_new = vrev64q_f32(vsubq_f32(hi_odd, yi));
    // val[0] = 124, 125, 126, 127; val[1] = 120, 121, 122, 123.
    const float32x4x2_t hi_out = vzipq_f32(hi_even_new, hi_odd_new);

    // Near side: a[j2] -= yr, a[j2 + 1] -= yi.
    lo_pairs.val[0] = vsubq_f32(lo_pairs.val[0], yr);
    lo_pairs.val[1] = vsubq_f32(lo_pairs.val[1], yi);

    vst2q_f32(lo, lo_pairs);
    vst1q_f32(hi, hi_out.val[1]);
    vst1q_f32(hi + 4, hi_out.val[0]);
  }

  // Scalar part: the pairs the vector loop did not reach.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    float* const lo = a + j2;
    float* const hi = a + (128 - j2);
    const float wkr = 0.5f - c[32 - j1];
    const float wki = c[j1];
    const float xr = lo[0] - hi[0];
    const float xi = lo[1] + hi[1];
    const float yr = wkr * xr - wki * xi;
    const float yi = wkr * xi + wki * xr;
    lo[0] -= yr;
    lo[1] -= yi;
    hi[0] += yr;
    hi[1] -= yi;
  }
}
|
||||||
|
|
||||||
|
// Backward real-FFT pre-processing step over the 128-float buffer `a`,
// updated in place. a[1] is negated first and a[65] last. In between, each
// step pairs the (re, im) entry at a[j2], a[j2 + 1] with its mirror at
// a[128 - j2], a[129 - j2] and applies
//   yr = wkr * xr + wki * xi,  yi = wkr * xi - wki * xr
// with xr = a[j2] - a[k2], xi = a[j2 + 1] + a[k2 + 1], then
//   a[j2] = a[j2] - yr, a[j2 + 1] = yi - a[j2 + 1],
//   a[k2] = yr + a[k2], a[k2 + 1] = yi - a[k2 + 1].
// Weights come from the table at rdft_w + 32.
void rftbsub_128_neon(float* a) {
  const float* const c = rdft_w + 32;
  const float32x4_t half = vdupq_n_f32(0.5f);
  int j1 = 1;
  int j2 = 2;

  a[1] = -a[1];

  // Vectorized part: four (re, im) pairs from each end per iteration.
  // Index comments refer to the first iteration (j1 == 1, j2 == 2).
  for (; j2 + 7 < 64; j1 += 4, j2 += 8) {
    float* const lo = a + j2;          // a[2..9]
    float* const hi = a + (122 - j2);  // a[120..127]

    // Weights: wki = c[1..4]; wkr = 0.5 - c[31, 30, 29, 28].
    const float32x4_t wki = vld1q_f32(c + j1);
    const float32x4_t wkr =
        reverse_order_f32x4(vsubq_f32(half, vld1q_f32(c + (29 - j1))));

    // De-interleave: val[0] holds even indexes, val[1] odd indexes.
    float32x4x2_t lo_pairs = vld2q_f32(lo);
    const float32x4x2_t hi_pairs = vld2q_f32(hi);
    // Mirror side reversed so lanes line up: 126, 124, 122, 120 and
    // 127, 125, 123, 121.
    const float32x4_t hi_even = reverse_order_f32x4(hi_pairs.val[0]);
    const float32x4_t hi_odd = reverse_order_f32x4(hi_pairs.val[1]);

    const float32x4_t xr = vsubq_f32(lo_pairs.val[0], hi_even);
    const float32x4_t xi = vaddq_f32(lo_pairs.val[1], hi_odd);

    const float32x4_t wkr_xr = vmulq_f32(wkr, xr);
    const float32x4_t wki_xi = vmulq_f32(wki, xi);
    const float32x4_t wkr_xi = vmulq_f32(wkr, xi);
    const float32x4_t wki_xr = vmulq_f32(wki, xr);
    const float32x4_t yr = vaddq_f32(wkr_xr, wki_xi);
    const float32x4_t yi = vsubq_f32(wkr_xi, wki_xr);

    // Mirror side: a[k2] = a[k2] + yr, a[k2 + 1] = yi - a[k2 + 1], then
    // re-interleave.
    const float32x4_t hi_even_new = vrev64q_f32(vaddq_f32(hi_even, yr));
    const float32x4_t hi_odd_new = vrev64q_f32(vsubq_f32(yi, hi_odd));
    // val[0] = 124, 125, 126, 127; val[1] = 120, 121, 122, 123.
    const float32x4x2_t hi_out = vzipq_f32(hi_even_new, hi_odd_new);

    // Near side: a[j2] = a[j2] - yr, a[j2 + 1] = yi - a[j2 + 1].
    lo_pairs.val[0] = vsubq_f32(lo_pairs.val[0], yr);
    lo_pairs.val[1] = vsubq_f32(yi, lo_pairs.val[1]);

    vst2q_f32(lo, lo_pairs);
    vst1q_f32(hi, hi_out.val[1]);
    vst1q_f32(hi + 4, hi_out.val[0]);
  }

  // Scalar part: the pairs the vector loop did not reach.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    float* const lo = a + j2;
    float* const hi = a + (128 - j2);
    const float wkr = 0.5f - c[32 - j1];
    const float wki = c[j1];
    const float xr = lo[0] - hi[0];
    const float xi = lo[1] + hi[1];
    const float yr = wkr * xr + wki * xi;
    const float yi = wkr * xi - wki * xr;
    lo[0] = lo[0] - yr;
    lo[1] = yi - lo[1];
    hi[0] = yr + hi[0];
    hi[1] = yi - hi[1];
  }

  a[65] = -a[65];
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
} // namespace webrtc
|
98
VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h
vendored
Normal file
98
VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h
vendored
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
|
||||||
|
#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
|
||||||
|
|
||||||
|
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
|
||||||
|
#include "rtc_base/system/arch.h"
|
||||||
|
|
||||||
|
#ifdef _MSC_VER /* visual c++ */
|
||||||
|
#define ALIGN16_BEG __declspec(align(16))
|
||||||
|
#define ALIGN16_END
|
||||||
|
#else /* gcc or icc */
|
||||||
|
#define ALIGN16_BEG
|
||||||
|
#define ALIGN16_END __attribute__((aligned(16)))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
|
||||||
|
// These tables used to be computed at run-time. For example, refer to:
|
||||||
|
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564
|
||||||
|
// to see the initialization code.
|
||||||
|
#if defined(WEBRTC_ARCH_X86_FAMILY) || defined(WEBRTC_HAS_NEON)
|
||||||
|
// Constants used by SSE2 and NEON but initialized in the C path.
|
||||||
|
// Four-float, 16-byte-aligned sign pattern {-1, +1, -1, +1}.
// NOTE(review): presumably multiplied lane-wise by the SIMD kernels to negate
// every other lane -- confirm against the SSE2/NEON users of this constant.
// The alignment macro is placed first, matching the table declarations that
// follow in this header and keeping the MSVC `__declspec` at the start of the
// declaration.
ALIGN16_BEG const float ALIGN16_END k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
|
||||||
|
|
||||||
|
// 32-entry, 16-byte-aligned float table. Every value is stored twice in
// adjacent slots; rows below are laid out four floats (16 bytes) at a time.
// NOTE(review): presumably the real parts of the first twiddle-factor set for
// the SIMD rdft kernels -- confirm against the SSE2/NEON sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
    1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f,
    0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f,
    0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f,
    0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f,
    0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
    0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f,
    0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f,
    0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f,
};
|
||||||
|
// 32-entry, 16-byte-aligned float table. Every value is stored twice in
// adjacent slots; rows below are laid out four floats (16 bytes) at a time.
// The negative zeros in the first row are intentional literals and are kept.
// NOTE(review): presumably the real parts of the second twiddle-factor set
// for the SIMD rdft kernels -- confirm against the SSE2/NEON sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
    1.000000000f,  1.000000000f,  -0.000000000f, -0.000000000f,
    0.707106769f,  0.707106769f,  -0.707106769f, -0.707106769f,
    0.923879564f,  0.923879564f,  -0.382683456f, -0.382683456f,
    0.382683456f,  0.382683456f,  -0.923879564f, -0.923879564f,
    0.980785251f,  0.980785251f,  -0.195090324f, -0.195090324f,
    0.555570245f,  0.555570245f,  -0.831469595f, -0.831469595f,
    0.831469595f,  0.831469595f,  -0.555570245f, -0.555570245f,
    0.195090324f,  0.195090324f,  -0.980785251f, -0.980785251f,
};
|
||||||
|
// 32-entry, 16-byte-aligned float table. Every value is stored twice in
// adjacent slots; rows below are laid out four floats (16 bytes) at a time.
// NOTE(review): presumably the real parts of the third twiddle-factor set for
// the SIMD rdft kernels -- confirm against the SSE2/NEON sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
    1.000000000f,  1.000000000f,  -0.707106769f, -0.707106769f,
    0.382683456f,  0.382683456f,  -0.923879564f, -0.923879564f,
    0.831469536f,  0.831469536f,  -0.980785251f, -0.980785251f,
    -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f,
    0.956940353f,  0.956940353f,  -0.881921172f, -0.881921172f,
    0.098017156f,  0.098017156f,  -0.773010492f, -0.773010492f,
    0.634393334f,  0.634393334f,  -0.995184720f, -0.995184720f,
    -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f,
};
|
||||||
|
// 32-entry, 16-byte-aligned float table. Adjacent slots hold the same
// magnitude with opposite signs (negative first, then positive); rows below
// are laid out four floats (16 bytes) at a time.
// NOTE(review): presumably the imaginary parts paired with rdft_wk1r for the
// SIMD rdft kernels -- confirm against the SSE2/NEON sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
    -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f,
    -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f,
    -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
    -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f,
    -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f,
    -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f,
};
|
||||||
|
// 32-entry, 16-byte-aligned float table. Adjacent slots hold the same
// magnitude with opposite signs (negative first, then positive); rows below
// are laid out four floats (16 bytes) at a time.
// NOTE(review): presumably the imaginary parts paired with rdft_wk2r for the
// SIMD rdft kernels -- confirm against the SSE2/NEON sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
    -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f,
    -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f,
    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
    -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f,
    -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
    -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f,
    -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f,
    -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f,
};
|
||||||
|
// 32-entry, 16-byte-aligned float table. Adjacent slots hold the same
// magnitude with opposite signs; unlike rdft_wk1i/rdft_wk2i, some pairs here
// are (positive, negative) rather than (negative, positive). Rows below are
// laid out four floats (16 bytes) at a time.
// NOTE(review): presumably the imaginary parts paired with rdft_wk3r for the
// SIMD rdft kernels -- confirm against the SSE2/NEON sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
    -0.923879564f, 0.923879564f, 0.382683456f,  -0.382683456f,
    -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f,
    -0.980785251f, 0.980785251f, 0.831469536f,  -0.831469536f,
    -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
    -0.995184720f, 0.995184720f, 0.634393334f,  -0.634393334f,
    -0.773010492f, 0.773010492f, 0.098017156f,  -0.098017156f,
    -0.881921172f, 0.881921172f, 0.956940353f,  -0.956940353f,
};
|
||||||
|
// Four-float, 16-byte-aligned constant: three copies of 0.707106769 followed
// by its negation.
// NOTE(review): presumably loaded as a single SIMD vector by the cftmdl
// kernel -- confirm against the SSE2/NEON sources.
ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
    0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
} // namespace webrtc
|
||||||
|
|
||||||
|
#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
|
Loading…
Reference in New Issue
Block a user