Skip to content
Snippets Groups Projects
Commit fed81a77 authored by Arun Raghavan's avatar Arun Raghavan :feet:
Browse files

Allow disabling inline SSE

Should make building on i686 without SSE feasible.

Fixes: #5
parent c144c530
No related branches found
No related tags found
1 merge request!49Allow disabling inline SSE
...@@ -110,6 +110,7 @@ have_neon = false ...@@ -110,6 +110,7 @@ have_neon = false
have_mips = false have_mips = false
have_mips64 = false have_mips64 = false
have_x86 = false have_x86 = false
have_inline_sse = false
have_avx2 = false have_avx2 = false
if host_machine.cpu_family() == 'arm' if host_machine.cpu_family() == 'arm'
if cc.compiles('''#ifndef __ARM_ARCH_ISA_ARM if cc.compiles('''#ifndef __ARM_ARCH_ISA_ARM
...@@ -140,10 +141,19 @@ if host_machine.cpu_family() == 'mips64' ...@@ -140,10 +141,19 @@ if host_machine.cpu_family() == 'mips64'
endif endif
if ['x86', 'x86_64'].contains(host_machine.cpu_family()) if ['x86', 'x86_64'].contains(host_machine.cpu_family())
have_x86 = true have_x86 = true
# This is unconditionally enabled for now, actual usage is determined by # AVX2 support is unconditionally available, since all the code (compiled
# runtime CPU detection, so we're just assuming the compiler supports avx2 # with -mavx2) is in separate files from runtime detection (which should not
# be compiled with SIMD flags for cases where the CPU does not support it).
# Unfortunately, a bunch of SSE code is inline with the runtime detection,
# and we can't support that on systems that don't support SSE.
have_avx2 = true have_avx2 = true
arch_cflags += ['-DWEBRTC_ENABLE_AVX2'] arch_cflags += ['-DWEBRTC_ENABLE_AVX2']
if get_option('inline-sse')
have_inline_sse = true
else
have_inline_sse = false
arch_cflags += ['-DWAP_DISABLE_INLINE_SSE']
endif
endif endif
neon_opt = get_option('neon') neon_opt = get_option('neon')
......
...@@ -3,4 +3,7 @@ option('gnustl', type: 'feature', ...@@ -3,4 +3,7 @@ option('gnustl', type: 'feature',
description: 'Use gnustl for a c++ library implementation (only used on Android)') description: 'Use gnustl for a c++ library implementation (only used on Android)')
option('neon', type: 'combo', option('neon', type: 'combo',
choices: ['no', 'yes', 'auto', 'runtime'], choices: ['no', 'yes', 'auto', 'runtime'],
description: '') description: 'Enable NEON optimisations')
option('inline-sse', type: 'boolean',
value: true,
description: 'Enable inline SSE/SSE2 optimisations (i.e. assume CPU supports SSE/SSE2)')
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#if defined(WEBRTC_HAS_NEON) #if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#include <math.h> #include <math.h>
...@@ -88,7 +88,7 @@ void ComputeFrequencyResponse_Neon( ...@@ -88,7 +88,7 @@ void ComputeFrequencyResponse_Neon(
} }
#endif #endif
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
// Computes and stores the frequency response of the filter. // Computes and stores the frequency response of the filter.
void ComputeFrequencyResponse_Sse2( void ComputeFrequencyResponse_Sse2(
size_t num_partitions, size_t num_partitions,
...@@ -212,7 +212,7 @@ void AdaptPartitions_Neon(const RenderBuffer& render_buffer, ...@@ -212,7 +212,7 @@ void AdaptPartitions_Neon(const RenderBuffer& render_buffer,
} }
#endif #endif
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
// Adapts the filter partitions. (SSE2 variant) // Adapts the filter partitions. (SSE2 variant)
void AdaptPartitions_Sse2(const RenderBuffer& render_buffer, void AdaptPartitions_Sse2(const RenderBuffer& render_buffer,
const FftData& G, const FftData& G,
...@@ -377,7 +377,7 @@ void ApplyFilter_Neon(const RenderBuffer& render_buffer, ...@@ -377,7 +377,7 @@ void ApplyFilter_Neon(const RenderBuffer& render_buffer,
} }
#endif #endif
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
// Produces the filter output (SSE2 variant). // Produces the filter output (SSE2 variant).
void ApplyFilter_Sse2(const RenderBuffer& render_buffer, void ApplyFilter_Sse2(const RenderBuffer& render_buffer,
size_t num_partitions, size_t num_partitions,
...@@ -557,9 +557,11 @@ void AdaptiveFirFilter::Filter(const RenderBuffer& render_buffer, ...@@ -557,9 +557,11 @@ void AdaptiveFirFilter::Filter(const RenderBuffer& render_buffer,
RTC_DCHECK(S); RTC_DCHECK(S);
switch (optimization_) { switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY)
#if !defined(WAP_DISABLE_INLINE_SSE)
case Aec3Optimization::kSse2: case Aec3Optimization::kSse2:
aec3::ApplyFilter_Sse2(render_buffer, current_size_partitions_, H_, S); aec3::ApplyFilter_Sse2(render_buffer, current_size_partitions_, H_, S);
break; break;
#endif
case Aec3Optimization::kAvx2: case Aec3Optimization::kAvx2:
aec3::ApplyFilter_Avx2(render_buffer, current_size_partitions_, H_, S); aec3::ApplyFilter_Avx2(render_buffer, current_size_partitions_, H_, S);
break; break;
...@@ -601,9 +603,11 @@ void AdaptiveFirFilter::ComputeFrequencyResponse( ...@@ -601,9 +603,11 @@ void AdaptiveFirFilter::ComputeFrequencyResponse(
switch (optimization_) { switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY)
#if !defined(WAP_DISABLE_INLINE_SSE)
case Aec3Optimization::kSse2: case Aec3Optimization::kSse2:
aec3::ComputeFrequencyResponse_Sse2(current_size_partitions_, H_, H2); aec3::ComputeFrequencyResponse_Sse2(current_size_partitions_, H_, H2);
break; break;
#endif
case Aec3Optimization::kAvx2: case Aec3Optimization::kAvx2:
aec3::ComputeFrequencyResponse_Avx2(current_size_partitions_, H_, H2); aec3::ComputeFrequencyResponse_Avx2(current_size_partitions_, H_, H2);
break; break;
...@@ -626,10 +630,12 @@ void AdaptiveFirFilter::AdaptAndUpdateSize(const RenderBuffer& render_buffer, ...@@ -626,10 +630,12 @@ void AdaptiveFirFilter::AdaptAndUpdateSize(const RenderBuffer& render_buffer,
// Adapt the filter. // Adapt the filter.
switch (optimization_) { switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY)
#if !defined(WAP_DISABLE_INLINE_SSE)
case Aec3Optimization::kSse2: case Aec3Optimization::kSse2:
aec3::AdaptPartitions_Sse2(render_buffer, G, current_size_partitions_, aec3::AdaptPartitions_Sse2(render_buffer, G, current_size_partitions_,
&H_); &H_);
break; break;
#endif
case Aec3Optimization::kAvx2: case Aec3Optimization::kAvx2:
aec3::AdaptPartitions_Avx2(render_buffer, G, current_size_partitions_, aec3::AdaptPartitions_Avx2(render_buffer, G, current_size_partitions_,
&H_); &H_);
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#if defined(WEBRTC_HAS_NEON) #if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
...@@ -54,7 +54,7 @@ void ErlComputer_NEON( ...@@ -54,7 +54,7 @@ void ErlComputer_NEON(
} }
#endif #endif
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
// Computes and stores the echo return loss estimate of the filter, which is the // Computes and stores the echo return loss estimate of the filter, which is the
// sum of the partition frequency responses. // sum of the partition frequency responses.
void ErlComputer_SSE2( void ErlComputer_SSE2(
...@@ -82,9 +82,11 @@ void ComputeErl(const Aec3Optimization& optimization, ...@@ -82,9 +82,11 @@ void ComputeErl(const Aec3Optimization& optimization,
// Update the frequency response and echo return loss for the filter. // Update the frequency response and echo return loss for the filter.
switch (optimization) { switch (optimization) {
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY)
#if !defined(WAP_DISABLE_INLINE_SSE)
case Aec3Optimization::kSse2: case Aec3Optimization::kSse2:
aec3::ErlComputer_SSE2(H2, erl); aec3::ErlComputer_SSE2(H2, erl);
break; break;
#endif
case Aec3Optimization::kAvx2: case Aec3Optimization::kAvx2:
aec3::ErlComputer_AVX2(H2, erl); aec3::ErlComputer_AVX2(H2, erl);
break; break;
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
// Defines WEBRTC_ARCH_X86_FAMILY, used below. // Defines WEBRTC_ARCH_X86_FAMILY, used below.
#include "rtc_base/system/arch.h" #include "rtc_base/system/arch.h"
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#include <algorithm> #include <algorithm>
...@@ -49,6 +49,7 @@ struct FftData { ...@@ -49,6 +49,7 @@ struct FftData {
RTC_DCHECK_EQ(kFftLengthBy2Plus1, power_spectrum.size()); RTC_DCHECK_EQ(kFftLengthBy2Plus1, power_spectrum.size());
switch (optimization) { switch (optimization) {
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY)
#if !defined(WAP_DISABLE_INLINE_SSE)
case Aec3Optimization::kSse2: { case Aec3Optimization::kSse2: {
constexpr int kNumFourBinBands = kFftLengthBy2 / 4; constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
constexpr int kLimit = kNumFourBinBands * 4; constexpr int kLimit = kNumFourBinBands * 4;
...@@ -63,6 +64,7 @@ struct FftData { ...@@ -63,6 +64,7 @@ struct FftData {
power_spectrum[kFftLengthBy2] = re[kFftLengthBy2] * re[kFftLengthBy2] + power_spectrum[kFftLengthBy2] = re[kFftLengthBy2] * re[kFftLengthBy2] +
im[kFftLengthBy2] * im[kFftLengthBy2]; im[kFftLengthBy2] * im[kFftLengthBy2];
} break; } break;
#endif
case Aec3Optimization::kAvx2: case Aec3Optimization::kAvx2:
SpectrumAVX2(power_spectrum); SpectrumAVX2(power_spectrum);
break; break;
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#if defined(WEBRTC_HAS_NEON) #if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#include <algorithm> #include <algorithm>
...@@ -286,7 +286,7 @@ void MatchedFilterCore_NEON(size_t x_start_index, ...@@ -286,7 +286,7 @@ void MatchedFilterCore_NEON(size_t x_start_index,
#endif #endif
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
void MatchedFilterCore_AccumulatedError_SSE2( void MatchedFilterCore_AccumulatedError_SSE2(
size_t x_start_index, size_t x_start_index,
...@@ -695,12 +695,14 @@ void MatchedFilter::Update(const DownsampledRenderBuffer& render_buffer, ...@@ -695,12 +695,14 @@ void MatchedFilter::Update(const DownsampledRenderBuffer& render_buffer,
switch (optimization_) { switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY)
#if !defined(WAP_DISABLE_INLINE_SSE)
case Aec3Optimization::kSse2: case Aec3Optimization::kSse2:
aec3::MatchedFilterCore_SSE2( aec3::MatchedFilterCore_SSE2(
x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y, x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y,
filters_[n], &filters_updated, &error_sum, compute_pre_echo, filters_[n], &filters_updated, &error_sum, compute_pre_echo,
instantaneous_accumulated_error_, scratch_memory_); instantaneous_accumulated_error_, scratch_memory_);
break; break;
#endif
case Aec3Optimization::kAvx2: case Aec3Optimization::kAvx2:
aec3::MatchedFilterCore_AVX2( aec3::MatchedFilterCore_AVX2(
x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y, x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y,
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#if defined(WEBRTC_HAS_NEON) #if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#include <math.h> #include <math.h>
...@@ -43,7 +43,7 @@ class VectorMath { ...@@ -43,7 +43,7 @@ class VectorMath {
void SqrtAVX2(rtc::ArrayView<float> x); void SqrtAVX2(rtc::ArrayView<float> x);
void Sqrt(rtc::ArrayView<float> x) { void Sqrt(rtc::ArrayView<float> x) {
switch (optimization_) { switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
case Aec3Optimization::kSse2: { case Aec3Optimization::kSse2: {
const int x_size = static_cast<int>(x.size()); const int x_size = static_cast<int>(x.size());
const int vector_limit = x_size >> 2; const int vector_limit = x_size >> 2;
...@@ -123,7 +123,7 @@ class VectorMath { ...@@ -123,7 +123,7 @@ class VectorMath {
RTC_DCHECK_EQ(z.size(), x.size()); RTC_DCHECK_EQ(z.size(), x.size());
RTC_DCHECK_EQ(z.size(), y.size()); RTC_DCHECK_EQ(z.size(), y.size());
switch (optimization_) { switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
case Aec3Optimization::kSse2: { case Aec3Optimization::kSse2: {
const int x_size = static_cast<int>(x.size()); const int x_size = static_cast<int>(x.size());
const int vector_limit = x_size >> 2; const int vector_limit = x_size >> 2;
...@@ -174,6 +174,7 @@ class VectorMath { ...@@ -174,6 +174,7 @@ class VectorMath {
RTC_DCHECK_EQ(z.size(), x.size()); RTC_DCHECK_EQ(z.size(), x.size());
switch (optimization_) { switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY)
#if !defined(WAP_DISABLE_INLINE_SSE)
case Aec3Optimization::kSse2: { case Aec3Optimization::kSse2: {
const int x_size = static_cast<int>(x.size()); const int x_size = static_cast<int>(x.size());
const int vector_limit = x_size >> 2; const int vector_limit = x_size >> 2;
...@@ -190,6 +191,7 @@ class VectorMath { ...@@ -190,6 +191,7 @@ class VectorMath {
z[j] += x[j]; z[j] += x[j];
} }
} break; } break;
#endif
case Aec3Optimization::kAvx2: case Aec3Optimization::kAvx2:
AccumulateAVX2(x, z); AccumulateAVX2(x, z);
break; break;
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#if defined(WEBRTC_HAS_NEON) #if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#if defined(WEBRTC_ARCH_X86_FAMILY) #if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
...@@ -47,6 +47,7 @@ class VectorMath { ...@@ -47,6 +47,7 @@ class VectorMath {
if (cpu_features_.avx2) { if (cpu_features_.avx2) {
return DotProductAvx2(x, y); return DotProductAvx2(x, y);
} else if (cpu_features_.sse2) { } else if (cpu_features_.sse2) {
#if !defined(WAP_DISABLE_INLINE_SSE)
__m128 accumulator = _mm_setzero_ps(); __m128 accumulator = _mm_setzero_ps();
constexpr int kBlockSizeLog2 = 2; constexpr int kBlockSizeLog2 = 2;
constexpr int kBlockSize = 1 << kBlockSizeLog2; constexpr int kBlockSize = 1 << kBlockSizeLog2;
...@@ -72,6 +73,7 @@ class VectorMath { ...@@ -72,6 +73,7 @@ class VectorMath {
dot_product += x[i] * y[i]; dot_product += x[i] * y[i];
} }
return dot_product; return dot_product;
#endif
} }
#elif defined(WEBRTC_HAS_NEON) && defined(WEBRTC_ARCH_ARM64) #elif defined(WEBRTC_HAS_NEON) && defined(WEBRTC_ARCH_ARM64)
if (cpu_features_.neon) { if (cpu_features_.neon) {
......
...@@ -4,7 +4,7 @@ pffft_sources = [ ...@@ -4,7 +4,7 @@ pffft_sources = [
pffft_cflags = [ '-D_GNU_SOURCE' ] pffft_cflags = [ '-D_GNU_SOURCE' ]
if (have_arm and not have_neon) or (have_mips and host_machine.endian() == 'little') or have_mips64 if not have_inline_sse or (have_arm and not have_neon) or (have_mips and host_machine.endian() == 'little') or have_mips64
pffft_cflags += [ '-DPFFFT_SIMD_DISABLE' ] pffft_cflags += [ '-DPFFFT_SIMD_DISABLE' ]
endif endif
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment