diff --git a/src/ofdm.c b/src/ofdm.c index 41e3d1548..acec6fdda 100644 --- a/src/ofdm.c +++ b/src/ofdm.c @@ -671,6 +671,86 @@ static complex float vector_sum(complex float *a, int num_elements) { return sum; } +/* Determine if we can use vector ops below. Only for non-embedded platforms + as double can be significantly slower on those. */ +#ifndef __EMBEDDED__ +#if __GNUC__ > 4 || \ + (__GNUC__ == 4 && (__GNUC_MINOR__ > 6 || \ + (__GNUC_MINOR__ == 6 && \ + __GNUC_PATCHLEVEL__ > 0))) +#define USE_VECTOR_OPS 1 +#elif __clang_major__ > 3 || \ + (__clang_minor__ == 3 && (__clang_minor__ > 7 || \ + (__clang_minor__ == 7 && \ + __clang_patchlevel__ > 0))) +#define USE_VECTOR_OPS 1 +#endif +#else +#include "codec2_math.h" +#endif /* __EMBEDDED__ */ + +#if USE_VECTOR_OPS +typedef float float4 __attribute__ ((vector_size (16))); +#endif /* USE_VECTOR_OPS */ + +static complex float ofdm_complex_dot_product(complex float *left, complex float *right, int numSamples) +{ + complex float result; + +#if USE_VECTOR_OPS + float *leftPtr = (float*)left; + float *rightPtr = (float*)right; + float4 accumPos = { 0, 0, 0, 0 }; + float4 accumNeg = { 0, 0, 0, 0 }; + float4 accumImag = { 0, 0, 0, 0 }; + float resultReal = 0; + float resultImag = 0; + int numBlocks = numSamples >> 1; + for (int i = 0; i < numBlocks; i++) + { + /* Lay out vectors as follows: + vec1 = rx[0].a, rx[0].b, rx[1].a, rx[1].b, ... + vec2 = mvec[0].c, mvec[0].d, mvec[1].c, mvec1[1].d, ... */ + float4 vec1 = { leftPtr[0], leftPtr[1], leftPtr[2], leftPtr[3] }; + float4 vec2 = { rightPtr[0], rightPtr[1], rightPtr[2], rightPtr[3] }; + + accumPos += vec1 * vec2; + accumNeg -= vec1 * vec2; + + /* Lay out vec3 as { rx[0].b, rx[0].a, rx[1].b, rx[0].b, ... }. + Multiply vec3 by vec2 to get us bc, ad, bc, ad + and add to second accumulator. */ + float4 vec3 = { leftPtr[1], leftPtr[0], leftPtr[3], leftPtr[2] }; + accumImag += vec3 * vec2; + + /* Shift pointers forward by 4 (2 complex floats). */ + leftPtr += 4; rightPtr += 4; + } + + /* dot product: (a + bi)(c + di) = (ac - bd) + i(bc + ad) */ + resultReal = accumPos[0] + accumNeg[1] + accumPos[2] + accumNeg[3]; + resultImag = accumImag[0] + accumImag[1] + accumImag[2] + accumImag[3]; + result = resultReal + I * resultImag; + + /* Add remaining values to corr that couldn't be vectorized above. */ + for (int i = numBlocks << 1; i < numSamples; i++) + { + result += left[i] * right[i]; + } +#elif __EMBEDDED__ + float resultReal = 0, resultImag = 0; + codec2_complex_dot_product_f32((COMP*)left, (COMP*)right, numSamples, &resultReal, &resultImag); + result = resultReal + I * resultImag; +#else + for (int i = 0; i < numSamples; i++) + { + result += left[i] * right[i]; + } +#endif /* USE_VECTOR_OPS */ + + return result; +} + /* * Correlates the OFDM pilot symbol samples with a window of received @@ -748,10 +828,9 @@ static int est_timing(struct OFDM *ofdm, complex float *rx, int length, #ifdef __EMBEDDED__ #ifdef __REAL__ - // Note: this code untested float re,im; - codec2_dot_product_f32(&rx_real[i], wvec_pilot_real, ofdm->samplespersymbol, &re); + codec2_dot_product_f32(&rx_real[i], wvec_pilot_real, ofdm->samplespersymbol, &re); codec2_dot_product_f32(&rx_real[i], wvec_pilot_imag, ofdm->samplespersymbol, &im); corr_st = re + im * I; @@ -769,12 +848,8 @@ static int est_timing(struct OFDM *ofdm, complex float *rx, int length, corr_en = re + im * I; #endif #else - for (j = 0; j < ofdm->samplespersymbol; j++) { - int ind = i + j; - - corr_st = corr_st + (rx[ind ] * wvec_pilot[j]); - corr_en = corr_en + (rx[ind + ofdm->samplesperframe] * wvec_pilot[j]); - } + corr_st = ofdm_complex_dot_product(&rx[i], wvec_pilot, ofdm->samplespersymbol); + corr_en = ofdm_complex_dot_product(&rx[i + ofdm->samplesperframe], wvec_pilot, ofdm->samplespersymbol); #endif // __EMBEDDED__ corr[i] = (cabsf(corr_st) + cabsf(corr_en)) * av_level; } @@ -1140,12 +1215,11 @@ static float est_timing_and_freq(struct OFDM *ofdm, complex float mvec[Npsam]; for(int i=0; i max_corr) { max_corr = cabsf(corr); *t_est = t;