From 69673c7a642ee532d43a40c9cd273c2015323d5e Mon Sep 17 00:00:00 2001 From: Christian Daniel Date: Thu, 8 Aug 2013 21:39:26 +0200 Subject: optimize interpolator with SSE --- include-gpl/dsp/interpolator.h | 71 ++++++++++++++++++++++++++++++++++++++++-- sdrbase/audio/audiooutput.cpp | 7 +++-- sdrbase/dsp/interpolator.cpp | 58 ++++++++++++++++++++++++++++------ 3 files changed, 121 insertions(+), 15 deletions(-) diff --git a/include-gpl/dsp/interpolator.h b/include-gpl/dsp/interpolator.h index bdf538e..f51f3ff 100644 --- a/include-gpl/dsp/interpolator.h +++ b/include-gpl/dsp/interpolator.h @@ -1,14 +1,19 @@ #ifndef INCLUDE_INTERPOLATOR_H #define INCLUDE_INTERPOLATOR_H +#include #include "dsp/dsptypes.h" #include "util/export.h" +#include +#include class SDRANGELOVE_API Interpolator { public: Interpolator(); + ~Interpolator(); void create(int phaseSteps, double sampleRate, double cutoff); + void free(); bool interpolate(Real* distance, const Complex& next, bool* consumed, Complex* result) { @@ -27,7 +32,10 @@ public: private: - std::vector m_taps; + float* m_taps; + float* m_alignedTaps; + float* m_taps2; + float* m_alignedTaps2; std::vector m_samples; int m_ptr; int m_phaseSteps; @@ -45,8 +53,63 @@ private: void doInterpolate(int phase, Complex* result) { +#if 1 + // beware of the ringbuffer + if(m_ptr == 0) { + // only one straight block + const float* src = (const float*)&m_samples[0]; + const __m128* filter = (const __m128*)&m_alignedTaps[phase * m_nTaps * 2]; + __m128 sum = _mm_setzero_ps(); + int todo = m_nTaps / 2; + + for(int i = 0; i < todo; i++) { + sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(src), *filter)); + src += 4; + filter += 1; + } + + // add upper half to lower half and store + _mm_storel_pi((__m64*)result, _mm_add_ps(sum, _mm_shuffle_ps(sum, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 3, 2)))); + } else { + // two blocks + const float* src = (const float*)&m_samples[m_ptr]; + const __m128* filter = (const __m128*)&m_alignedTaps[phase * m_nTaps * 2]; + __m128 sum = _mm_setzero_ps(); + + // first block + int block = m_nTaps - m_ptr; + int todo = block / 2; + if(block & 1) + todo++; + for(int i = 0; i < todo; i++) { + sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(src), *filter)); + src += 4; + filter += 1; + } + if(block & 1) { + // one sample beyond the end -> switch coefficient table + filter = (const __m128*)&m_alignedTaps2[phase * m_nTaps * 2 + todo * 4 - 4]; + } + // second block + src = (const float*)&m_samples[0]; + block = m_ptr; + todo = block / 2; + for(int i = 0; i < todo; i++) { + sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(src), *filter)); + src += 4; + filter += 1; + } + if(block & 1) { + // one sample remaining + sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)src), filter[0])); + } + + // add upper half to lower half and store + _mm_storel_pi((__m64*)result, _mm_add_ps(sum, _mm_shuffle_ps(sum, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 3, 2)))); + } +#else int sample = m_ptr; - const Real* coeff = &m_taps[phase * m_nTaps]; + const Real* coeff = &m_alignedTaps[phase * m_nTaps * 2]; Real rAcc = 0; Real iAcc = 0; @@ -54,9 +117,11 @@ private: rAcc += *coeff * m_samples[sample].real(); iAcc += *coeff * m_samples[sample].imag(); sample = (sample + 1) % m_nTaps; - coeff++; + coeff += 2; } *result = Complex(rAcc, iAcc); +#endif + } }; diff --git a/sdrbase/audio/audiooutput.cpp b/sdrbase/audio/audiooutput.cpp index 88ca634..455241a 100644 --- a/sdrbase/audio/audiooutput.cpp +++ b/sdrbase/audio/audiooutput.cpp @@ -110,6 +110,9 @@ bool AudioOutput::open(OpenMode mode) qint64 AudioOutput::readData(char* data, qint64 maxLen) { + if(maxLen == 0) + return 0; + QMutexLocker mutexLocker(&m_mutex); maxLen -= maxLen % 4; @@ -125,7 +128,7 @@ qint64 AudioOutput::readData(char* data, qint64 maxLen) // sum up a block from all fifos for(AudioFifos::iterator it = m_audioFifos.begin(); it != m_audioFifos.end(); ++it) { // use outputBuffer as temp - yes, one memcpy could be saved - uint samples = (*it)->read((quint8*)data, framesPerBuffer, 0); + uint samples = (*it)->read((quint8*)data, framesPerBuffer, 1); const qint16* src = (const qint16*)data; std::vector::iterator dst = m_mixBuffer.begin(); for(uint i = 0; i < samples; i++) { @@ -158,7 +161,7 @@ qint64 AudioOutput::readData(char* data, qint64 maxLen) *dst++ = s; } - return maxLen; + return framesPerBuffer * 4; } qint64 AudioOutput::writeData(const char* data, qint64 len) diff --git a/sdrbase/dsp/interpolator.cpp b/sdrbase/dsp/interpolator.cpp index ee441c4..781864b 100644 --- a/sdrbase/dsp/interpolator.cpp +++ b/sdrbase/dsp/interpolator.cpp @@ -1,5 +1,4 @@ #define _USE_MATH_DEFINES -#include #include #include #include "dsp/interpolator.h" @@ -13,7 +12,7 @@ static std::vector createPolyphaseLowPass( double oobAttenuationdB) { int ntaps = (int)(oobAttenuationdB * sampleRateHz / (22.0 * transitionWidthHz)); - if((ntaps % 2) == 0) + if((ntaps % 2) != 0) ntaps++; ntaps *= phaseSteps; @@ -42,12 +41,21 @@ static std::vector createPolyphaseLowPass( return taps; } -Interpolator::Interpolator() +Interpolator::Interpolator() : + m_taps(NULL), + m_alignedTaps(NULL) { } +Interpolator::~Interpolator() +{ + free(); +} + void Interpolator::create(int phaseSteps, double sampleRate, double cutoff) { + free(); + std::vector taps = createPolyphaseLowPass( phaseSteps, // number of polyphases 1.0, // gain @@ -60,23 +68,53 @@ void Interpolator::create(int phaseSteps, double sampleRate, double cutoff) m_ptr = 0; m_nTaps = taps.size() / phaseSteps; m_phaseSteps = phaseSteps; - m_taps.resize(taps.size()); - m_samples.resize(m_nTaps); - for(int i = 0; i < m_nTaps; i++) + m_samples.resize(m_nTaps + 2); + for(int i = 0; i < m_nTaps + 2; i++) m_samples[i] = 0; - // copy to filter taps + // reorder into polyphase + std::vector polyphase(taps.size()); for(int phase = 0; phase < phaseSteps; phase++) { for(int i = 0; i < m_nTaps; i++) - m_taps[phase * m_nTaps + i] = taps[i * phaseSteps + phase]; + polyphase[phase * m_nTaps + i] = taps[i * phaseSteps + phase]; } // normalize phase filters for(int phase = 0; phase < phaseSteps; phase++) { Real sum = 0; for(int i = phase * m_nTaps; i < phase * m_nTaps + m_nTaps; i++) - sum += m_taps[i]; + sum += polyphase[i]; for(int i = phase * m_nTaps; i < phase * m_nTaps + m_nTaps; i++) - m_taps[i] /= sum; + polyphase[i] /= sum; + } + + // move taps around to match sse storage requirements + m_taps = new float[2 * taps.size() + 8]; + for(int i = 0; i < 2 * taps.size() + 8; ++i) + m_taps[i] = 0; + m_alignedTaps = (float*)((((quint64)m_taps) + 15) & ~15); + for(int i = 0; i < taps.size(); ++i) { + m_alignedTaps[2 * i + 0] = polyphase[i]; + m_alignedTaps[2 * i + 1] = polyphase[i]; + } + m_taps2 = new float[2 * taps.size() + 8]; + for(int i = 0; i < 2 * taps.size() + 8; ++i) + m_taps2[i] = 0; + m_alignedTaps2 = (float*)((((quint64)m_taps2) + 15) & ~15); + for(int i = 1; i < taps.size(); ++i) { + m_alignedTaps2[2 * (i - 1) + 0] = polyphase[i]; + m_alignedTaps2[2 * (i - 1) + 1] = polyphase[i]; + } +} + +void Interpolator::free() +{ + if(m_taps != NULL) { + delete[] m_taps; + m_taps = NULL; + m_alignedTaps = NULL; + delete[] m_taps2; + m_taps2 = NULL; + m_alignedTaps2 = NULL; } } -- cgit v1.2.3