add neon code.

This commit is contained in:
luocai 2024-09-06 11:01:01 +08:00
parent fe7ab957c8
commit c6debcc62a
11 changed files with 1312 additions and 21 deletions

View File

@ -10,19 +10,18 @@
class EchoRecordPrivate { class EchoRecordPrivate {
public: public:
EchoRecordPrivate() { void initialize(int sampleRate, int channels, int period) {
std::unique_ptr<webrtc::EchoCanceller3Factory> factory = std::make_unique<webrtc::EchoCanceller3Factory>(); std::unique_ptr<webrtc::EchoCanceller3Factory> factory = std::make_unique<webrtc::EchoCanceller3Factory>();
echoCanceller = factory->Create(sampleRate, channels, channels);
echoCanceller = factory->Create(16000, 1, 1); nearendBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);
farendBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);
// nearendBuffer = std::make_unique<webrtc::AudioBuffer>(16000, 1, 16000, 1, 16000, 1); linearOutputBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);
// farendBuffer = std::make_unique<webrtc::AudioBuffer>(16000, 1, 16000, 1, 16000, 1);
} }
std::unique_ptr<webrtc::EchoControl> echoCanceller; std::unique_ptr<webrtc::EchoControl> echoCanceller;
// std::unique_ptr<webrtc::AudioBuffer> nearendBuffer; std::unique_ptr<webrtc::AudioBuffer> nearendBuffer;
// std::unique_ptr<webrtc::AudioBuffer> farendBuffer; std::unique_ptr<webrtc::AudioBuffer> farendBuffer;
std::unique_ptr<webrtc::AudioBuffer> linearOutputBuffer;
}; };
EchoRecordTask::EchoRecordTask() : m_d{new EchoRecordPrivate()} { EchoRecordTask::EchoRecordTask() : m_d{new EchoRecordPrivate()} {
@ -56,7 +55,7 @@ void EchoRecordTask::run() {
RkAudio::Format format; RkAudio::Format format;
format.channels = m_channels; format.channels = m_channels;
format.period = 20; format.period = 10;
m_speex = std::make_shared<SpeexDsp>(); m_speex = std::make_shared<SpeexDsp>();
m_speex->start(format.sampleRate, m_channels, format.period); m_speex->start(format.sampleRate, m_channels, format.period);
@ -66,6 +65,8 @@ void EchoRecordTask::run() {
m_webRtcAecm = std::make_shared<WebRtcAecm>(); m_webRtcAecm = std::make_shared<WebRtcAecm>();
m_webRtcAecm->start(format.sampleRate, format.channels, format.period); m_webRtcAecm->start(format.sampleRate, format.channels, format.period);
m_d->initialize(format.sampleRate, m_channels, format.period);
m_output = std::make_shared<RkAudio::Output>(); m_output = std::make_shared<RkAudio::Output>();
if (!m_output->open(sizeof(uint16_t), format.sampleRate, 2, format.period, m_dsp == Vqe)) { if (!m_output->open(sizeof(uint16_t), format.sampleRate, 2, format.period, m_dsp == Vqe)) {
LOG(error) << "audio output open failed."; LOG(error) << "audio output open failed.";
@ -87,18 +88,16 @@ void EchoRecordTask::run() {
reinterpret_cast<int16_t *>(m_outBuffer.data()), frame.frameSize); reinterpret_cast<int16_t *>(m_outBuffer.data()), frame.frameSize);
} else if (m_dsp == Aec3) { } else if (m_dsp == Aec3) {
webrtc::StreamConfig config(format.sampleRate, format.channels); // 单声道 webrtc::StreamConfig config(format.sampleRate, format.channels); // 单声道
webrtc::AudioBuffer nearendBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1); m_d->nearendBuffer->CopyFrom(reinterpret_cast<const int16_t *>(frame.data), config);
webrtc::AudioBuffer farendBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1);
webrtc::AudioBuffer linearOutputBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1);
nearendBuffer.CopyFrom(reinterpret_cast<const int16_t *>(frame.data), config);
farendBuffer.CopyFrom(reinterpret_cast<const int16_t *>(m_farendBuffer.data()), config); m_d->farendBuffer->CopyFrom(reinterpret_cast<const int16_t *>(m_farendBuffer.data()), config);
m_d->echoCanceller->AnalyzeRender(&farendBuffer); m_d->echoCanceller->AnalyzeRender(m_d->farendBuffer.get());
m_d->echoCanceller->AnalyzeCapture(&nearendBuffer); m_d->echoCanceller->AnalyzeCapture(m_d->nearendBuffer.get());
m_d->echoCanceller->ProcessCapture(&nearendBuffer, &linearOutputBuffer, /*level_change=*/false); m_d->echoCanceller->ProcessCapture(m_d->nearendBuffer.get(), false);
// m_d->echoCanceller->ProcessCapture(&nearendBuffer, &linearOutputBuffer, /*level_change=*/false);
linearOutputBuffer.CopyTo(config, reinterpret_cast<int16_t *>(m_outBuffer.data())); m_d->nearendBuffer->CopyTo(config, reinterpret_cast<int16_t *>(m_outBuffer.data()));
} }
if (m_channels == 2) { if (m_channels == 2) {

View File

@ -28,25 +28,31 @@ add_library(VocieProcess
common_audio/audio_util.cc common_audio/audio_util.cc
common_audio/channel_buffer.h common_audio/channel_buffer.cc common_audio/channel_buffer.h common_audio/channel_buffer.cc
common_audio/fir_filter_neon.h common_audio/fir_filter_neon.cc
common_audio/ring_buffer.h common_audio/ring_buffer.c common_audio/ring_buffer.h common_audio/ring_buffer.c
common_audio/resampler/push_sinc_resampler.h common_audio/resampler/push_sinc_resampler.cc common_audio/resampler/push_sinc_resampler.h common_audio/resampler/push_sinc_resampler.cc
common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler.cc common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler_neon.cc
common_audio/resampler/sinc_resampler.cc
common_audio/signal_processing/complex_bit_reverse.c common_audio/signal_processing/complex_bit_reverse.c
common_audio/signal_processing/complex_fft.c common_audio/signal_processing/complex_fft.c
common_audio/signal_processing/cross_correlation_neon.c
common_audio/signal_processing/cross_correlation.c common_audio/signal_processing/cross_correlation.c
common_audio/signal_processing/division_operations.c common_audio/signal_processing/division_operations.c
common_audio/signal_processing/dot_product_with_scale.h common_audio/signal_processing/dot_product_with_scale.cc common_audio/signal_processing/dot_product_with_scale.h common_audio/signal_processing/dot_product_with_scale.cc
common_audio/signal_processing/downsample_fast.c common_audio/signal_processing/downsample_fast.c
common_audio/signal_processing/downsample_fast_neon.c
common_audio/signal_processing/min_max_operations.c common_audio/signal_processing/min_max_operations.c
common_audio/signal_processing/min_max_operations_neon.c
common_audio/signal_processing/randomization_functions.c common_audio/signal_processing/randomization_functions.c
common_audio/signal_processing/real_fft.c common_audio/signal_processing/real_fft.c
common_audio/signal_processing/spl_init.c common_audio/signal_processing/spl_init.c
common_audio/signal_processing/splitting_filter.c common_audio/signal_processing/splitting_filter.c
common_audio/signal_processing/vector_scaling_operations.c common_audio/signal_processing/vector_scaling_operations.c
common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft.cc common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc
common_audio/third_party/ooura/fft_size_128/ooura_fft.cc
common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c
rtc_base/checks.h rtc_base/checks.cc rtc_base/checks.h rtc_base/checks.cc
@ -132,6 +138,7 @@ add_library(VocieProcess
modules/audio_processing/aec3/transparent_mode.h modules/audio_processing/aec3/transparent_mode.cc modules/audio_processing/aec3/transparent_mode.h modules/audio_processing/aec3/transparent_mode.cc
modules/audio_processing/aecm/aecm_core.h modules/audio_processing/aecm/aecm_core.cc modules/audio_processing/aecm/aecm_core_c.cc modules/audio_processing/aecm/aecm_core.h modules/audio_processing/aecm/aecm_core.cc modules/audio_processing/aecm/aecm_core_c.cc
modules/audio_processing/aecm/aecm_core_neon.cc
modules/audio_processing/aecm/echo_control_mobile.h modules/audio_processing/aecm/echo_control_mobile.cc modules/audio_processing/aecm/echo_control_mobile.h modules/audio_processing/aecm/echo_control_mobile.cc
modules/audio_processing/logging/apm_data_dumper.h modules/audio_processing/logging/apm_data_dumper.cc modules/audio_processing/logging/apm_data_dumper.h modules/audio_processing/logging/apm_data_dumper.cc
@ -148,6 +155,7 @@ target_compile_definitions(VocieProcess
PRIVATE NOMINMAX # <windows.h> PRIVATE NOMINMAX # <windows.h>
PRIVATE RTC_DISABLE_LOGGING PRIVATE RTC_DISABLE_LOGGING
PUBLIC RTC_DISABLE_METRICS PUBLIC RTC_DISABLE_METRICS
PUBLIC WEBRTC_HAS_NEON
PUBLIC WEBRTC_APM_DEBUG_DUMP=0 PUBLIC WEBRTC_APM_DEBUG_DUMP=0
$<$<PLATFORM_ID:Windows>:WEBRTC_WIN> $<$<PLATFORM_ID:Windows>:WEBRTC_WIN>
$<$<PLATFORM_ID:Linux>:WEBRTC_POSIX WEBRTC_LINUX> $<$<PLATFORM_ID:Linux>:WEBRTC_POSIX WEBRTC_LINUX>

View File

@ -0,0 +1,30 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef COMMON_AUDIO_FIR_FILTER_H_
#define COMMON_AUDIO_FIR_FILTER_H_
#include <string.h>
namespace webrtc {
// Finite Impulse Response filter using floating-point arithmetic.
class FIRFilter {
public:
virtual ~FIRFilter() {}
// Filters the `in` data supplied.
// `out` must be previously allocated and it must be at least of `length`.
virtual void Filter(const float* in, size_t length, float* out) = 0;
};
} // namespace webrtc
#endif // COMMON_AUDIO_FIR_FILTER_H_

View File

@ -0,0 +1,73 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "common_audio/fir_filter_neon.h"
#include <arm_neon.h>
#include <string.h>
#include "rtc_base/checks.h"
#include "rtc_base/memory/aligned_malloc.h"
namespace webrtc {
FIRFilterNEON::~FIRFilterNEON() {}
FIRFilterNEON::FIRFilterNEON(const float* coefficients,
size_t coefficients_length,
size_t max_input_length)
: // Closest higher multiple of four.
coefficients_length_((coefficients_length + 3) & ~0x03),
state_length_(coefficients_length_ - 1),
coefficients_(static_cast<float*>(
AlignedMalloc(sizeof(float) * coefficients_length_, 16))),
state_(static_cast<float*>(
AlignedMalloc(sizeof(float) * (max_input_length + state_length_),
16))) {
// Add zeros at the end of the coefficients.
size_t padding = coefficients_length_ - coefficients_length;
memset(coefficients_.get(), 0.f, padding * sizeof(coefficients_[0]));
// The coefficients are reversed to compensate for the order in which the
// input samples are acquired (most recent last).
for (size_t i = 0; i < coefficients_length; ++i) {
coefficients_[i + padding] = coefficients[coefficients_length - i - 1];
}
memset(state_.get(), 0.f,
(max_input_length + state_length_) * sizeof(state_[0]));
}
void FIRFilterNEON::Filter(const float* in, size_t length, float* out) {
RTC_DCHECK_GT(length, 0);
memcpy(&state_[state_length_], in, length * sizeof(*in));
// Convolves the input signal `in` with the filter kernel `coefficients_`
// taking into account the previous state.
for (size_t i = 0; i < length; ++i) {
float* in_ptr = &state_[i];
float* coef_ptr = coefficients_.get();
float32x4_t m_sum = vmovq_n_f32(0);
float32x4_t m_in;
for (size_t j = 0; j < coefficients_length_; j += 4) {
m_in = vld1q_f32(in_ptr + j);
m_sum = vmlaq_f32(m_sum, m_in, vld1q_f32(coef_ptr + j));
}
float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
out[i] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
}
// Update current state.
memmove(state_.get(), &state_[length], state_length_ * sizeof(state_[0]));
}
} // namespace webrtc

View File

@ -0,0 +1,39 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef COMMON_AUDIO_FIR_FILTER_NEON_H_
#define COMMON_AUDIO_FIR_FILTER_NEON_H_
#include <memory>
#include "common_audio/fir_filter.h"
#include "rtc_base/memory/aligned_malloc.h"
namespace webrtc {
class FIRFilterNEON : public FIRFilter {
public:
FIRFilterNEON(const float* coefficients,
size_t coefficients_length,
size_t max_input_length);
~FIRFilterNEON() override;
void Filter(const float* in, size_t length, float* out) override;
private:
size_t coefficients_length_;
size_t state_length_;
std::unique_ptr<float[], AlignedFreeDeleter> coefficients_;
std::unique_ptr<float[], AlignedFreeDeleter> state_;
};
} // namespace webrtc
#endif // COMMON_AUDIO_FIR_FILTER_NEON_H_

View File

@ -0,0 +1,48 @@
/*
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
// Modified from the Chromium original:
// src/media/base/sinc_resampler.cc
#include <arm_neon.h>
#include "common_audio/resampler/sinc_resampler.h"
namespace webrtc {
float SincResampler::Convolve_NEON(const float* input_ptr,
const float* k1,
const float* k2,
double kernel_interpolation_factor) {
float32x4_t m_input;
float32x4_t m_sums1 = vmovq_n_f32(0);
float32x4_t m_sums2 = vmovq_n_f32(0);
const float* upper = input_ptr + kKernelSize;
for (; input_ptr < upper;) {
m_input = vld1q_f32(input_ptr);
input_ptr += 4;
m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));
k1 += 4;
m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));
k2 += 4;
}
// Linearly interpolate the two "convolutions".
m_sums1 = vmlaq_f32(
vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),
m_sums2, vmovq_n_f32(kernel_interpolation_factor));
// Sum components together.
float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
}
} // namespace webrtc

View File

@ -0,0 +1,88 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "rtc_base/system/arch.h"
#include <arm_neon.h>
static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
const int16_t* vector1,
const int16_t* vector2,
size_t length,
int scaling) {
size_t i = 0;
size_t len1 = length >> 3;
size_t len2 = length & 7;
int64x2_t sum0 = vdupq_n_s64(0);
int64x2_t sum1 = vdupq_n_s64(0);
for (i = len1; i > 0; i -= 1) {
int16x8_t seq1_16x8 = vld1q_s16(vector1);
int16x8_t seq2_16x8 = vld1q_s16(vector2);
#if defined(WEBRTC_ARCH_ARM64)
int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
vget_low_s16(seq2_16x8));
int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
#else
int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
vget_low_s16(seq2_16x8));
int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
vget_high_s16(seq2_16x8));
#endif
sum0 = vpadalq_s32(sum0, tmp0);
sum1 = vpadalq_s32(sum1, tmp1);
vector1 += 8;
vector2 += 8;
}
// Calculate the rest of the samples.
int64_t sum_res = 0;
for (i = len2; i > 0; i -= 1) {
sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
vector1++;
vector2++;
}
sum0 = vaddq_s64(sum0, sum1);
#if defined(WEBRTC_ARCH_ARM64)
int64_t sum2 = vaddvq_s64(sum0);
*cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
#else
int64x1_t shift = vdup_n_s64(-scaling);
int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
sum2 = vshl_s64(sum2, shift);
vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
#endif
}
/* NEON version of WebRtcSpl_CrossCorrelation() for ARM32/64 platforms. */
void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
const int16_t* seq1,
const int16_t* seq2,
size_t dim_seq,
size_t dim_cross_correlation,
int right_shifts,
int step_seq2) {
int i = 0;
for (i = 0; i < (int)dim_cross_correlation; i++) {
const int16_t* seq1_ptr = seq1;
const int16_t* seq2_ptr = seq2 + (step_seq2 * i);
DotProductWithScaleNeon(cross_correlation,
seq1_ptr,
seq2_ptr,
dim_seq,
right_shifts);
cross_correlation++;
}
}

View File

@ -0,0 +1,224 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "rtc_base/checks.h"
// NEON intrinsics version of WebRtcSpl_DownsampleFast()
// for ARM 32-bit/64-bit platforms.
int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
size_t data_in_length,
int16_t* data_out,
size_t data_out_length,
const int16_t* __restrict coefficients,
size_t coefficients_length,
int factor,
size_t delay) {
// Using signed indexes to be able to compute negative i-j that
// is used to index data_in.
int i = 0;
int j = 0;
int32_t out_s32 = 0;
int endpos = delay + factor * (data_out_length - 1) + 1;
size_t res = data_out_length & 0x7;
int endpos1 = endpos - factor * res;
// Return error if any of the running conditions doesn't meet.
if (data_out_length == 0 || coefficients_length == 0
|| (int)data_in_length < endpos) {
return -1;
}
RTC_DCHECK_GE(endpos, 0);
RTC_DCHECK_GE(endpos1, 0);
// First part, unroll the loop 8 times, with 3 subcases
// (factor == 2, 4, others).
switch (factor) {
case 2: {
for (i = delay; i < endpos1; i += 16) {
// Round value, 0.5 in Q12.
int32x4_t out32x4_0 = vdupq_n_s32(2048);
int32x4_t out32x4_1 = vdupq_n_s32(2048);
#if defined(WEBRTC_ARCH_ARM64)
// Unroll the loop 2 times.
for (j = 0; j < (int)coefficients_length - 1; j += 2) {
int32x2_t coeff32 = vld1_dup_s32((int32_t*)&coefficients[j]);
int16x4_t coeff16x4 = vreinterpret_s16_s32(coeff32);
int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j - 1]);
// Mul and accumulate low 64-bit data.
int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
int16x4_t in16x4_1 = vget_low_s16(in16x8x2.val[1]);
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 1);
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_1, coeff16x4, 0);
// Mul and accumulate high 64-bit data.
// TODO: vget_high_s16 need extra cost on ARM64. This could be
// replaced by vmlal_high_lane_s16. But for the interface of
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
// This issue need to be tracked in the future.
int16x4_t in16x4_2 = vget_high_s16(in16x8x2.val[0]);
int16x4_t in16x4_3 = vget_high_s16(in16x8x2.val[1]);
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_2, coeff16x4, 1);
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 0);
}
for (; j < (int)coefficients_length; j++) {
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
// Mul and accumulate low 64-bit data.
int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
// Mul and accumulate high 64-bit data.
// TODO: vget_high_s16 need extra cost on ARM64. This could be
// replaced by vmlal_high_lane_s16. But for the interface of
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
// This issue need to be tracked in the future.
int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
}
#else
// On ARMv7, the loop unrolling 2 times results in performance
// regression.
for (j = 0; j < (int)coefficients_length; j++) {
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
// Mul and accumulate.
int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
}
#endif
// Saturate and store the output.
int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
data_out += 8;
}
break;
}
case 4: {
for (i = delay; i < endpos1; i += 32) {
// Round value, 0.5 in Q12.
int32x4_t out32x4_0 = vdupq_n_s32(2048);
int32x4_t out32x4_1 = vdupq_n_s32(2048);
// Unroll the loop 4 times.
for (j = 0; j < (int)coefficients_length - 3; j += 4) {
int16x4_t coeff16x4 = vld1_s16(&coefficients[j]);
int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j - 3]);
// Mul and accumulate low 64-bit data.
int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
int16x4_t in16x4_2 = vget_low_s16(in16x8x4.val[1]);
int16x4_t in16x4_4 = vget_low_s16(in16x8x4.val[2]);
int16x4_t in16x4_6 = vget_low_s16(in16x8x4.val[3]);
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 3);
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_2, coeff16x4, 2);
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_4, coeff16x4, 1);
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_6, coeff16x4, 0);
// Mul and accumulate high 64-bit data.
// TODO: vget_high_s16 need extra cost on ARM64. This could be
// replaced by vmlal_high_lane_s16. But for the interface of
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
// This issue need to be tracked in the future.
int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
int16x4_t in16x4_3 = vget_high_s16(in16x8x4.val[1]);
int16x4_t in16x4_5 = vget_high_s16(in16x8x4.val[2]);
int16x4_t in16x4_7 = vget_high_s16(in16x8x4.val[3]);
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 3);
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 2);
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_5, coeff16x4, 1);
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_7, coeff16x4, 0);
}
for (; j < (int)coefficients_length; j++) {
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j]);
// Mul and accumulate low 64-bit data.
int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
// Mul and accumulate high 64-bit data.
// TODO: vget_high_s16 need extra cost on ARM64. This could be
// replaced by vmlal_high_lane_s16. But for the interface of
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
// This issue need to be tracked in the future.
int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
}
// Saturate and store the output.
int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
data_out += 8;
}
break;
}
default: {
for (i = delay; i < endpos1; i += factor * 8) {
// Round value, 0.5 in Q12.
int32x4_t out32x4_0 = vdupq_n_s32(2048);
int32x4_t out32x4_1 = vdupq_n_s32(2048);
for (j = 0; j < (int)coefficients_length; j++) {
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
int16x4_t in16x4_0 = vld1_dup_s16(&data_in[i - j]);
in16x4_0 = vld1_lane_s16(&data_in[i + factor - j], in16x4_0, 1);
in16x4_0 = vld1_lane_s16(&data_in[i + factor * 2 - j], in16x4_0, 2);
in16x4_0 = vld1_lane_s16(&data_in[i + factor * 3 - j], in16x4_0, 3);
int16x4_t in16x4_1 = vld1_dup_s16(&data_in[i + factor * 4 - j]);
in16x4_1 = vld1_lane_s16(&data_in[i + factor * 5 - j], in16x4_1, 1);
in16x4_1 = vld1_lane_s16(&data_in[i + factor * 6 - j], in16x4_1, 2);
in16x4_1 = vld1_lane_s16(&data_in[i + factor * 7 - j], in16x4_1, 3);
// Mul and accumulate.
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
}
// Saturate and store the output.
int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
data_out += 8;
}
break;
}
}
// Second part, do the rest iterations (if any).
for (; i < endpos; i += factor) {
out_s32 = 2048; // Round value, 0.5 in Q12.
for (j = 0; j < (int)coefficients_length; j++) {
out_s32 = WebRtc_MulAccumW16(coefficients[j], data_in[i - j], out_s32);
}
// Saturate and store the output.
out_s32 >>= 12;
*data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
}
return 0;
}

View File

@ -0,0 +1,333 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include <stdlib.h>
#include "rtc_base/checks.h"
#include "common_audio/signal_processing/include/signal_processing_library.h"
// Maximum absolute value of word16 vector. C version for generic platforms.
int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length) {
int absolute = 0, maximum = 0;
RTC_DCHECK_GT(length, 0);
const int16_t* p_start = vector;
size_t rest = length & 7;
const int16_t* p_end = vector + length - rest;
int16x8_t v;
uint16x8_t max_qv;
max_qv = vdupq_n_u16(0);
while (p_start < p_end) {
v = vld1q_s16(p_start);
// Note vabs doesn't change the value of -32768.
v = vabsq_s16(v);
// Use u16 so we don't lose the value -32768.
max_qv = vmaxq_u16(max_qv, vreinterpretq_u16_s16(v));
p_start += 8;
}
#ifdef WEBRTC_ARCH_ARM64
maximum = (int)vmaxvq_u16(max_qv);
#else
uint16x4_t max_dv;
max_dv = vmax_u16(vget_low_u16(max_qv), vget_high_u16(max_qv));
max_dv = vpmax_u16(max_dv, max_dv);
max_dv = vpmax_u16(max_dv, max_dv);
maximum = (int)vget_lane_u16(max_dv, 0);
#endif
p_end = vector + length;
while (p_start < p_end) {
absolute = abs((int)(*p_start));
if (absolute > maximum) {
maximum = absolute;
}
p_start++;
}
// Guard the case for abs(-32768).
if (maximum > WEBRTC_SPL_WORD16_MAX) {
maximum = WEBRTC_SPL_WORD16_MAX;
}
return (int16_t)maximum;
}
// Maximum absolute value of word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length) {
// Use uint32_t for the local variables, to accommodate the return value
// of abs(0x80000000), which is 0x80000000.
uint32_t absolute = 0, maximum = 0;
size_t i = 0;
size_t residual = length & 0x7;
RTC_DCHECK_GT(length, 0);
const int32_t* p_start = vector;
uint32x4_t max32x4_0 = vdupq_n_u32(0);
uint32x4_t max32x4_1 = vdupq_n_u32(0);
// First part, unroll the loop 8 times.
for (i = 0; i < length - residual; i += 8) {
int32x4_t in32x4_0 = vld1q_s32(p_start);
p_start += 4;
int32x4_t in32x4_1 = vld1q_s32(p_start);
p_start += 4;
in32x4_0 = vabsq_s32(in32x4_0);
in32x4_1 = vabsq_s32(in32x4_1);
// vabs doesn't change the value of 0x80000000.
// Use u32 so we don't lose the value 0x80000000.
max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0));
max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1));
}
uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1);
#if defined(WEBRTC_ARCH_ARM64)
maximum = vmaxvq_u32(max32x4);
#else
uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4));
max32x2 = vpmax_u32(max32x2, max32x2);
maximum = vget_lane_u32(max32x2, 0);
#endif
// Second part, do the remaining iterations (if any).
for (i = residual; i > 0; i--) {
absolute = abs((int)(*p_start));
if (absolute > maximum) {
maximum = absolute;
}
p_start++;
}
// Guard against the case for 0x80000000.
maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX);
return (int32_t)maximum;
}
// Maximum value of word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length) {
int16_t maximum = WEBRTC_SPL_WORD16_MIN;
size_t i = 0;
size_t residual = length & 0x7;
RTC_DCHECK_GT(length, 0);
const int16_t* p_start = vector;
int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
// First part, unroll the loop 8 times.
for (i = 0; i < length - residual; i += 8) {
int16x8_t in16x8 = vld1q_s16(p_start);
max16x8 = vmaxq_s16(max16x8, in16x8);
p_start += 8;
}
#if defined(WEBRTC_ARCH_ARM64)
maximum = vmaxvq_s16(max16x8);
#else
int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8));
max16x4 = vpmax_s16(max16x4, max16x4);
max16x4 = vpmax_s16(max16x4, max16x4);
maximum = vget_lane_s16(max16x4, 0);
#endif
// Second part, do the remaining iterations (if any).
for (i = residual; i > 0; i--) {
if (*p_start > maximum)
maximum = *p_start;
p_start++;
}
return maximum;
}
// Maximum value of word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length) {
int32_t maximum = WEBRTC_SPL_WORD32_MIN;
size_t i = 0;
size_t residual = length & 0x7;
RTC_DCHECK_GT(length, 0);
const int32_t* p_start = vector;
int32x4_t max32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
int32x4_t max32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
// First part, unroll the loop 8 times.
for (i = 0; i < length - residual; i += 8) {
int32x4_t in32x4_0 = vld1q_s32(p_start);
p_start += 4;
int32x4_t in32x4_1 = vld1q_s32(p_start);
p_start += 4;
max32x4_0 = vmaxq_s32(max32x4_0, in32x4_0);
max32x4_1 = vmaxq_s32(max32x4_1, in32x4_1);
}
int32x4_t max32x4 = vmaxq_s32(max32x4_0, max32x4_1);
#if defined(WEBRTC_ARCH_ARM64)
maximum = vmaxvq_s32(max32x4);
#else
int32x2_t max32x2 = vmax_s32(vget_low_s32(max32x4), vget_high_s32(max32x4));
max32x2 = vpmax_s32(max32x2, max32x2);
maximum = vget_lane_s32(max32x2, 0);
#endif
// Second part, do the remaining iterations (if any).
for (i = residual; i > 0; i--) {
if (*p_start > maximum)
maximum = *p_start;
p_start++;
}
return maximum;
}
// Minimum value of word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length) {
int16_t minimum = WEBRTC_SPL_WORD16_MAX;
size_t i = 0;
size_t residual = length & 0x7;
RTC_DCHECK_GT(length, 0);
const int16_t* p_start = vector;
int16x8_t min16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
// First part, unroll the loop 8 times.
for (i = 0; i < length - residual; i += 8) {
int16x8_t in16x8 = vld1q_s16(p_start);
min16x8 = vminq_s16(min16x8, in16x8);
p_start += 8;
}
#if defined(WEBRTC_ARCH_ARM64)
minimum = vminvq_s16(min16x8);
#else
int16x4_t min16x4 = vmin_s16(vget_low_s16(min16x8), vget_high_s16(min16x8));
min16x4 = vpmin_s16(min16x4, min16x4);
min16x4 = vpmin_s16(min16x4, min16x4);
minimum = vget_lane_s16(min16x4, 0);
#endif
// Second part, do the remaining iterations (if any).
for (i = residual; i > 0; i--) {
if (*p_start < minimum)
minimum = *p_start;
p_start++;
}
return minimum;
}
// Minimum value of word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length) {
int32_t minimum = WEBRTC_SPL_WORD32_MAX;
size_t i = 0;
size_t residual = length & 0x7;
RTC_DCHECK_GT(length, 0);
const int32_t* p_start = vector;
int32x4_t min32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
int32x4_t min32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
// First part, unroll the loop 8 times.
for (i = 0; i < length - residual; i += 8) {
int32x4_t in32x4_0 = vld1q_s32(p_start);
p_start += 4;
int32x4_t in32x4_1 = vld1q_s32(p_start);
p_start += 4;
min32x4_0 = vminq_s32(min32x4_0, in32x4_0);
min32x4_1 = vminq_s32(min32x4_1, in32x4_1);
}
int32x4_t min32x4 = vminq_s32(min32x4_0, min32x4_1);
#if defined(WEBRTC_ARCH_ARM64)
minimum = vminvq_s32(min32x4);
#else
int32x2_t min32x2 = vmin_s32(vget_low_s32(min32x4), vget_high_s32(min32x4));
min32x2 = vpmin_s32(min32x2, min32x2);
minimum = vget_lane_s32(min32x2, 0);
#endif
// Second part, do the remaining iterations (if any).
for (i = residual; i > 0; i--) {
if (*p_start < minimum)
minimum = *p_start;
p_start++;
}
return minimum;
}
// Finds both the minimum and maximum elements in an array of 16-bit integers.
void WebRtcSpl_MinMaxW16Neon(const int16_t* vector, size_t length,
int16_t* min_val, int16_t* max_val) {
int16_t minimum = WEBRTC_SPL_WORD16_MAX;
int16_t maximum = WEBRTC_SPL_WORD16_MIN;
size_t i = 0;
size_t residual = length & 0x7;
RTC_DCHECK_GT(length, 0);
const int16_t* p_start = vector;
int16x8_t min16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
// First part, unroll the loop 8 times.
for (i = 0; i < length - residual; i += 8) {
int16x8_t in16x8 = vld1q_s16(p_start);
min16x8 = vminq_s16(min16x8, in16x8);
max16x8 = vmaxq_s16(max16x8, in16x8);
p_start += 8;
}
#if defined(WEBRTC_ARCH_ARM64)
minimum = vminvq_s16(min16x8);
maximum = vmaxvq_s16(max16x8);
#else
int16x4_t min16x4 = vmin_s16(vget_low_s16(min16x8), vget_high_s16(min16x8));
min16x4 = vpmin_s16(min16x4, min16x4);
min16x4 = vpmin_s16(min16x4, min16x4);
minimum = vget_lane_s16(min16x4, 0);
int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8));
max16x4 = vpmax_s16(max16x4, max16x4);
max16x4 = vpmax_s16(max16x4, max16x4);
maximum = vget_lane_s16(max16x4, 0);
#endif
// Second part, do the remaining iterations (if any).
for (i = residual; i > 0; i--) {
if (*p_start < minimum)
minimum = *p_start;
if (*p_start > maximum)
maximum = *p_start;
p_start++;
}
*min_val = minimum;
*max_val = maximum;
}

View File

@ -0,0 +1,351 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* The rdft AEC algorithm, neon version of speed-critical functions.
*
* Based on the sse2 version.
*/
#include <arm_neon.h>
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h"
namespace webrtc {
#if defined(WEBRTC_HAS_NEON)
void cft1st_128_neon(float* a) {
const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
int j, k2;
for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
float32x4_t a00v = vld1q_f32(&a[j + 0]);
float32x4_t a04v = vld1q_f32(&a[j + 4]);
float32x4_t a08v = vld1q_f32(&a[j + 8]);
float32x4_t a12v = vld1q_f32(&a[j + 12]);
float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
float32x4_t x0v = vaddq_f32(a01v, a23v);
const float32x4_t x1v = vsubq_f32(a01v, a23v);
const float32x4_t x2v = vaddq_f32(a45v, a67v);
const float32x4_t x3v = vsubq_f32(a45v, a67v);
const float32x4_t x3w = vrev64q_f32(x3v);
float32x4_t x0w;
a01v = vaddq_f32(x0v, x2v);
x0v = vsubq_f32(x0v, x2v);
x0w = vrev64q_f32(x0v);
a45v = vmulq_f32(wk2rv, x0v);
a45v = vmlaq_f32(a45v, wk2iv, x0w);
x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
x0w = vrev64q_f32(x0v);
a23v = vmulq_f32(wk1rv, x0v);
a23v = vmlaq_f32(a23v, wk1iv, x0w);
x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
x0w = vrev64q_f32(x0v);
a67v = vmulq_f32(wk3rv, x0v);
a67v = vmlaq_f32(a67v, wk3iv, x0w);
a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
vst1q_f32(&a[j + 0], a00v);
vst1q_f32(&a[j + 4], a04v);
vst1q_f32(&a[j + 8], a08v);
vst1q_f32(&a[j + 12], a12v);
}
}
void cftmdl_128_neon(float* a) {
int j;
const int l = 8;
const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);
for (j = 0; j < l; j += 2) {
const float32x2_t a_00 = vld1_f32(&a[j + 0]);
const float32x2_t a_08 = vld1_f32(&a[j + 8]);
const float32x2_t a_32 = vld1_f32(&a[j + 32]);
const float32x2_t a_40 = vld1_f32(&a[j + 40]);
const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
const float32x2_t a_16 = vld1_f32(&a[j + 16]);
const float32x2_t a_24 = vld1_f32(&a[j + 24]);
const float32x2_t a_48 = vld1_f32(&a[j + 48]);
const float32x2_t a_56 = vld1_f32(&a[j + 56]);
const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
const float32x4_t x1_x3_add =
vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
const float32x4_t x1_x3_sub =
vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
const float32x4_t xx1_rev = vrev64q_f32(xx1);
const float32x4_t yy4_rev = vrev64q_f32(yy4);
vst1_f32(&a[j + 0], vget_low_f32(xx0));
vst1_f32(&a[j + 32], vget_high_f32(xx0));
vst1_f32(&a[j + 16], vget_low_f32(xx1));
vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));
a[j + 48] = -a[j + 48];
vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
vst1_f32(&a[j + 40], vget_low_f32(yy4));
vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
}
{
const int k = 64;
const int k1 = 2;
const int k2 = 2 * k1;
const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
for (j = k; j < l + k; j += 2) {
const float32x2_t a_00 = vld1_f32(&a[j + 0]);
const float32x2_t a_08 = vld1_f32(&a[j + 8]);
const float32x2_t a_32 = vld1_f32(&a[j + 32]);
const float32x2_t a_40 = vld1_f32(&a[j + 40]);
const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
const float32x2_t a_16 = vld1_f32(&a[j + 16]);
const float32x2_t a_24 = vld1_f32(&a[j + 24]);
const float32x2_t a_48 = vld1_f32(&a[j + 48]);
const float32x2_t a_56 = vld1_f32(&a[j + 56]);
const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
const float32x4_t x1_x3_add =
vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
const float32x4_t x1_x3_sub =
vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));
vst1_f32(&a[j + 0], vget_low_f32(xx));
vst1_f32(&a[j + 32], vget_high_f32(xx));
vst1_f32(&a[j + 16], vget_low_f32(xx4));
vst1_f32(&a[j + 48], vget_high_f32(xx4));
vst1_f32(&a[j + 8], vget_low_f32(xx12));
vst1_f32(&a[j + 40], vget_high_f32(xx12));
vst1_f32(&a[j + 24], vget_low_f32(xx22));
vst1_f32(&a[j + 56], vget_high_f32(xx22));
}
}
}
__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
// A B C D -> C D A B
const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));
// C D A B -> D C B A
return vrev64q_f32(rev);
}
void rftfsub_128_neon(float* a) {
const float* c = rdft_w + 32;
int j1, j2;
const float32x4_t mm_half = vdupq_n_f32(0.5f);
// Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,
const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,
const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,
const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
// 2, 4, 6, 8, 3, 5, 7, 9
float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
// 120, 122, 124, 126, 121, 123, 125, 127,
const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
// 126, 124, 122, 120
const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
// 127, 125, 123, 121
const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
// Calculate 'x'.
const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr - wki * xi;
// yi = wkr * xi + wki * xr;
const float32x4_t a_ = vmulq_f32(wkr_, xr_);
const float32x4_t b_ = vmulq_f32(wki_, xi_);
const float32x4_t c_ = vmulq_f32(wkr_, xi_);
const float32x4_t d_ = vmulq_f32(wki_, xr_);
const float32x4_t yr_ = vsubq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const float32x4_t yi_ = vaddq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] -= yr;
// a[j2 + 1] -= yi;
// a[k2 + 0] += yr;
// a[k2 + 1] -= yi;
// 126, 124, 122, 120,
const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
// 127, 125, 123, 121,
const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
// Shuffle in right order and store.
const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
// 124, 125, 126, 127, 120, 121, 122, 123
const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
// 2, 4, 6, 8,
a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
// 3, 5, 7, 9,
a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
// 2, 3, 4, 5, 6, 7, 8, 9,
vst2q_f32(&a[0 + j2], a_j2_p);
vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
const int k2 = 128 - j2;
const int k1 = 32 - j1;
const float wkr = 0.5f - c[k1];
const float wki = c[j1];
const float xr = a[j2 + 0] - a[k2 + 0];
const float xi = a[j2 + 1] + a[k2 + 1];
const float yr = wkr * xr - wki * xi;
const float yi = wkr * xi + wki * xr;
a[j2 + 0] -= yr;
a[j2 + 1] -= yi;
a[k2 + 0] += yr;
a[k2 + 1] -= yi;
}
}
void rftbsub_128_neon(float* a) {
const float* c = rdft_w + 32;
int j1, j2;
const float32x4_t mm_half = vdupq_n_f32(0.5f);
a[1] = -a[1];
// Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,
const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,
const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,
const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
// 2, 4, 6, 8, 3, 5, 7, 9
float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
// 120, 122, 124, 126, 121, 123, 125, 127,
const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
// 126, 124, 122, 120
const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
// 127, 125, 123, 121
const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
// Calculate 'x'.
const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr - wki * xi;
// yi = wkr * xi + wki * xr;
const float32x4_t a_ = vmulq_f32(wkr_, xr_);
const float32x4_t b_ = vmulq_f32(wki_, xi_);
const float32x4_t c_ = vmulq_f32(wkr_, xi_);
const float32x4_t d_ = vmulq_f32(wki_, xr_);
const float32x4_t yr_ = vaddq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const float32x4_t yi_ = vsubq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] -= yr;
// a[j2 + 1] -= yi;
// a[k2 + 0] += yr;
// a[k2 + 1] -= yi;
// 126, 124, 122, 120,
const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
// 127, 125, 123, 121,
const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
// Shuffle in right order and store.
// 2, 3, 4, 5, 6, 7, 8, 9,
const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
// 124, 125, 126, 127, 120, 121, 122, 123
const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
// 2, 4, 6, 8,
a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
// 3, 5, 7, 9,
a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
// 2, 3, 4, 5, 6, 7, 8, 9,
vst2q_f32(&a[0 + j2], a_j2_p);
vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
const int k2 = 128 - j2;
const int k1 = 32 - j1;
const float wkr = 0.5f - c[k1];
const float wki = c[j1];
const float xr = a[j2 + 0] - a[k2 + 0];
const float xi = a[j2 + 1] + a[k2 + 1];
const float yr = wkr * xr + wki * xi;
const float yi = wkr * xi - wki * xr;
a[j2 + 0] = a[j2 + 0] - yr;
a[j2 + 1] = yi - a[j2 + 1];
a[k2 + 0] = yr + a[k2 + 0];
a[k2 + 1] = yi - a[k2 + 1];
}
a[65] = -a[65];
}
#endif
} // namespace webrtc

View File

@ -0,0 +1,98 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
#include "rtc_base/system/arch.h"
#ifdef _MSC_VER /* visual c++ */
#define ALIGN16_BEG __declspec(align(16))
#define ALIGN16_END
#else /* gcc or icc */
#define ALIGN16_BEG
#define ALIGN16_END __attribute__((aligned(16)))
#endif
namespace webrtc {
// These tables used to be computed at run-time. For example, refer to:
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564
// to see the initialization code.
#if defined(WEBRTC_ARCH_X86_FAMILY) || defined(WEBRTC_HAS_NEON)
// Constants used by SSE2 and NEON but initialized in the C path.
const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f, 0.923879564f,
0.923879564f, 0.382683456f, 0.382683456f, 0.980785251f, 0.980785251f,
0.555570245f, 0.555570245f, 0.831469595f, 0.831469595f, 0.195090324f,
0.195090324f, 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f, 0.956940353f,
0.956940353f, 0.471396744f, 0.471396744f, 0.773010433f, 0.773010433f,
0.098017141f, 0.098017141f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f, 0.707106769f,
0.707106769f, -0.707106769f, -0.707106769f, 0.923879564f, 0.923879564f,
-0.382683456f, -0.382683456f, 0.382683456f, 0.382683456f, -0.923879564f,
-0.923879564f, 0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f, 0.831469595f,
0.831469595f, -0.555570245f, -0.555570245f, 0.195090324f, 0.195090324f,
-0.980785251f, -0.980785251f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f, 0.382683456f,
0.382683456f, -0.923879564f, -0.923879564f, 0.831469536f, 0.831469536f,
-0.980785251f, -0.980785251f, -0.195090353f, -0.195090353f, -0.555570245f,
-0.555570245f, 0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f, 0.634393334f,
0.634393334f, -0.995184720f, -0.995184720f, -0.471396863f, -0.471396863f,
-0.290284693f, -0.290284693f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
-0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, -0.382683456f,
0.382683456f, -0.923879564f, 0.923879564f, -0.195090324f, 0.195090324f,
-0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, -0.980785251f,
0.980785251f, -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
-0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f, -0.290284663f,
0.290284663f, -0.881921291f, 0.881921291f, -0.634393334f, 0.634393334f,
-0.995184720f, 0.995184720f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
-0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f, -0.707106769f,
0.707106769f, -0.707106769f, 0.707106769f, -0.382683456f, 0.382683456f,
-0.923879564f, 0.923879564f, -0.923879564f, 0.923879564f, -0.382683456f,
0.382683456f, -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
-0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, -0.555570245f,
0.555570245f, -0.831469595f, 0.831469595f, -0.980785251f, 0.980785251f,
-0.195090324f, 0.195090324f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
-0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, -0.923879564f,
0.923879564f, 0.382683456f, -0.382683456f, -0.555570245f, 0.555570245f,
-0.195090353f, 0.195090353f, -0.980785251f, 0.980785251f, 0.831469536f,
-0.831469536f, -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
-0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f, -0.773010492f,
0.773010492f, 0.098017156f, -0.098017156f, -0.881921172f, 0.881921172f,
0.956940353f, -0.956940353f,
};
ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
0.707106769f,
0.707106769f,
0.707106769f,
-0.707106769f,
};
#endif
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_