summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristian Daniel <cd@maintech.de>2013-08-08 21:39:26 +0200
committerChristian Daniel <cd@maintech.de>2013-08-08 21:39:26 +0200
commit69673c7a642ee532d43a40c9cd273c2015323d5e (patch)
tree0ea76f9f2efb2b86b5de66ed1a954da0e537c4f1
parent6fa57d6d2c791ffeb2279b2324ce61f557ce4cd5 (diff)
optimize interpolator with SSE
-rw-r--r--include-gpl/dsp/interpolator.h71
-rw-r--r--sdrbase/audio/audiooutput.cpp7
-rw-r--r--sdrbase/dsp/interpolator.cpp58
3 files changed, 121 insertions, 15 deletions
diff --git a/include-gpl/dsp/interpolator.h b/include-gpl/dsp/interpolator.h
index bdf538e..f51f3ff 100644
--- a/include-gpl/dsp/interpolator.h
+++ b/include-gpl/dsp/interpolator.h
@@ -1,14 +1,19 @@
#ifndef INCLUDE_INTERPOLATOR_H
#define INCLUDE_INTERPOLATOR_H
+#include <immintrin.h>
#include "dsp/dsptypes.h"
#include "util/export.h"
+#include <stdio.h>
+#include <unistd.h>
class SDRANGELOVE_API Interpolator {
public:
Interpolator();
+ ~Interpolator();
void create(int phaseSteps, double sampleRate, double cutoff);
+ void free();
bool interpolate(Real* distance, const Complex& next, bool* consumed, Complex* result)
{
@@ -27,7 +32,10 @@ public:
private:
- std::vector<Real> m_taps;
+ float* m_taps;
+ float* m_alignedTaps;
+ float* m_taps2;
+ float* m_alignedTaps2;
std::vector<Complex> m_samples;
int m_ptr;
int m_phaseSteps;
@@ -45,8 +53,63 @@ private:
void doInterpolate(int phase, Complex* result)
{
+#if 1
+ // beware of the ringbuffer
+ if(m_ptr == 0) {
+ // only one straight block
+ const float* src = (const float*)&m_samples[0];
+ const __m128* filter = (const __m128*)&m_alignedTaps[phase * m_nTaps * 2];
+ __m128 sum = _mm_setzero_ps();
+ int todo = m_nTaps / 2;
+
+ for(int i = 0; i < todo; i++) {
+ sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(src), *filter));
+ src += 4;
+ filter += 1;
+ }
+
+ // add upper half to lower half and store
+ _mm_storel_pi((__m64*)result, _mm_add_ps(sum, _mm_shuffle_ps(sum, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 3, 2))));
+ } else {
+ // two blocks
+ const float* src = (const float*)&m_samples[m_ptr];
+ const __m128* filter = (const __m128*)&m_alignedTaps[phase * m_nTaps * 2];
+ __m128 sum = _mm_setzero_ps();
+
+ // first block
+ int block = m_nTaps - m_ptr;
+ int todo = block / 2;
+ if(block & 1)
+ todo++;
+ for(int i = 0; i < todo; i++) {
+ sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(src), *filter));
+ src += 4;
+ filter += 1;
+ }
+ if(block & 1) {
+ // one sample beyond the end -> switch coefficient table
+ filter = (const __m128*)&m_alignedTaps2[phase * m_nTaps * 2 + todo * 4 - 4];
+ }
+ // second block
+ src = (const float*)&m_samples[0];
+ block = m_ptr;
+ todo = block / 2;
+ for(int i = 0; i < todo; i++) {
+ sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(src), *filter));
+ src += 4;
+ filter += 1;
+ }
+ if(block & 1) {
+ // one sample remaining
+ sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)src), filter[0]));
+ }
+
+ // add upper half to lower half and store
+ _mm_storel_pi((__m64*)result, _mm_add_ps(sum, _mm_shuffle_ps(sum, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 3, 2))));
+ }
+#else
int sample = m_ptr;
- const Real* coeff = &m_taps[phase * m_nTaps];
+ const Real* coeff = &m_alignedTaps[phase * m_nTaps * 2];
Real rAcc = 0;
Real iAcc = 0;
@@ -54,9 +117,11 @@ private:
rAcc += *coeff * m_samples[sample].real();
iAcc += *coeff * m_samples[sample].imag();
sample = (sample + 1) % m_nTaps;
- coeff++;
+ coeff += 2;
}
*result = Complex(rAcc, iAcc);
+#endif
+
}
};
diff --git a/sdrbase/audio/audiooutput.cpp b/sdrbase/audio/audiooutput.cpp
index 88ca634..455241a 100644
--- a/sdrbase/audio/audiooutput.cpp
+++ b/sdrbase/audio/audiooutput.cpp
@@ -110,6 +110,9 @@ bool AudioOutput::open(OpenMode mode)
qint64 AudioOutput::readData(char* data, qint64 maxLen)
{
+ if(maxLen == 0)
+ return 0;
+
QMutexLocker mutexLocker(&m_mutex);
maxLen -= maxLen % 4;
@@ -125,7 +128,7 @@ qint64 AudioOutput::readData(char* data, qint64 maxLen)
// sum up a block from all fifos
for(AudioFifos::iterator it = m_audioFifos.begin(); it != m_audioFifos.end(); ++it) {
// use outputBuffer as temp - yes, one memcpy could be saved
- uint samples = (*it)->read((quint8*)data, framesPerBuffer, 0);
+ uint samples = (*it)->read((quint8*)data, framesPerBuffer, 1);
const qint16* src = (const qint16*)data;
std::vector<qint32>::iterator dst = m_mixBuffer.begin();
for(uint i = 0; i < samples; i++) {
@@ -158,7 +161,7 @@ qint64 AudioOutput::readData(char* data, qint64 maxLen)
*dst++ = s;
}
- return maxLen;
+ return framesPerBuffer * 4;
}
qint64 AudioOutput::writeData(const char* data, qint64 len)
diff --git a/sdrbase/dsp/interpolator.cpp b/sdrbase/dsp/interpolator.cpp
index ee441c4..781864b 100644
--- a/sdrbase/dsp/interpolator.cpp
+++ b/sdrbase/dsp/interpolator.cpp
@@ -1,5 +1,4 @@
#define _USE_MATH_DEFINES
-#include <stdio.h>
#include <math.h>
#include <vector>
#include "dsp/interpolator.h"
@@ -13,7 +12,7 @@ static std::vector<Real> createPolyphaseLowPass(
double oobAttenuationdB)
{
int ntaps = (int)(oobAttenuationdB * sampleRateHz / (22.0 * transitionWidthHz));
- if((ntaps % 2) == 0)
+ if((ntaps % 2) != 0)
ntaps++;
ntaps *= phaseSteps;
@@ -42,12 +41,21 @@ static std::vector<Real> createPolyphaseLowPass(
return taps;
}
-Interpolator::Interpolator()
+Interpolator::Interpolator() :
+ m_taps(NULL),
+ m_alignedTaps(NULL)
{
}
+Interpolator::~Interpolator()
+{
+ free();
+}
+
void Interpolator::create(int phaseSteps, double sampleRate, double cutoff)
{
+ free();
+
std::vector<Real> taps = createPolyphaseLowPass(
phaseSteps, // number of polyphases
1.0, // gain
@@ -60,23 +68,53 @@ void Interpolator::create(int phaseSteps, double sampleRate, double cutoff)
m_ptr = 0;
m_nTaps = taps.size() / phaseSteps;
m_phaseSteps = phaseSteps;
- m_taps.resize(taps.size());
- m_samples.resize(m_nTaps);
- for(int i = 0; i < m_nTaps; i++)
+ m_samples.resize(m_nTaps + 2);
+ for(int i = 0; i < m_nTaps + 2; i++)
m_samples[i] = 0;
- // copy to filter taps
+ // reorder into polyphase
+ std::vector<Real> polyphase(taps.size());
for(int phase = 0; phase < phaseSteps; phase++) {
for(int i = 0; i < m_nTaps; i++)
- m_taps[phase * m_nTaps + i] = taps[i * phaseSteps + phase];
+ polyphase[phase * m_nTaps + i] = taps[i * phaseSteps + phase];
}
// normalize phase filters
for(int phase = 0; phase < phaseSteps; phase++) {
Real sum = 0;
for(int i = phase * m_nTaps; i < phase * m_nTaps + m_nTaps; i++)
- sum += m_taps[i];
+ sum += polyphase[i];
for(int i = phase * m_nTaps; i < phase * m_nTaps + m_nTaps; i++)
- m_taps[i] /= sum;
+ polyphase[i] /= sum;
+ }
+
+ // move taps around to match sse storage requirements
+ m_taps = new float[2 * taps.size() + 8];
+ for(int i = 0; i < 2 * taps.size() + 8; ++i)
+ m_taps[i] = 0;
+ m_alignedTaps = (float*)((((quint64)m_taps) + 15) & ~15);
+ for(int i = 0; i < taps.size(); ++i) {
+ m_alignedTaps[2 * i + 0] = polyphase[i];
+ m_alignedTaps[2 * i + 1] = polyphase[i];
+ }
+ m_taps2 = new float[2 * taps.size() + 8];
+ for(int i = 0; i < 2 * taps.size() + 8; ++i)
+ m_taps2[i] = 0;
+ m_alignedTaps2 = (float*)((((quint64)m_taps2) + 15) & ~15);
+ for(int i = 1; i < taps.size(); ++i) {
+ m_alignedTaps2[2 * (i - 1) + 0] = polyphase[i];
+ m_alignedTaps2[2 * (i - 1) + 1] = polyphase[i];
+ }
+}
+
+void Interpolator::free()
+{
+ if(m_taps != NULL) {
+ delete[] m_taps;
+ m_taps = NULL;
+ m_alignedTaps = NULL;
+ delete[] m_taps2;
+ m_taps2 = NULL;
+ m_alignedTaps2 = NULL;
}
}