/* Demodulation.c */

#include "Demodulation.h"
#include "Biquad.h"
#include "drv/rts-logger.h"
#include "util/inlineMath.h"
#include <x86intrin.h>


/* section3_decim_coeffs
 * Globally visible, writable table of 13 filter coefficients.
 * NOTE(review): 13 = 1 + 3 * 4, so this is presumably one overall gain
 * followed by four coefficients for each of three biquad sections, i.e. the
 * layout expected by the *_section3 filters - confirm against Biquad.h.
 ************************************************************************/
double section3_decim_coeffs[13] ={ 0.00162185538923,  0.73342532779703,
                                -0.02862365091314, 1.44110125961504,
                                1.67905228090487,  0.77657563380963,
                                -0.08304311675394, 0.18851328163424,
                                0.32889453107067,  0.83213081484618,
                                -0.12573495191273, 0.0940911979108501,
                                0.13622543115194 };


/* M_PI
 * Fallback definition of pi for builds whose headers do not provide M_PI.
 ************************************************************************/
#ifndef M_PI
#define M_PI 3.1415926535897932385
#endif

// When building with __KERNEL__ defined, memset and fabs are declared here
// by hand instead of relying on a header to declare them.
// NOTE(review): presumably <string.h>/<math.h> are unavailable in that
// build and the definitions come from elsewhere - confirm.
#ifdef __KERNEL__
void *memset(void *s, int c, size_t n);
double fabs(double x);
#endif //__KERNEL__

/* cordic
 * Advances a phasor by one angle step using a complex multiplication:
 *   (rotation[0] + j*rotation[1]) *= (anglexy[0] + j*anglexy[1])
 * rotation - pair {real, imaginary}, updated in place
 * anglexy  - pair {cosine, sine} of the step angle, read only
 * All four operands are read before anything is written back.
 ************************************************************************/
inline static
void cordic_step(double* rotation, const double* anglexy)
{
	const double step_cos = anglexy[0];
	const double step_sin = anglexy[1];
	const double re = rotation[0];
	const double im = rotation[1];

	rotation[0] = step_cos * re - step_sin * im;
	rotation[1] = step_sin * re + step_cos * im;
}

/* avx_stride_struct_init
 * Resets a stride demodulator state to its start values.
 * self - state to initialize; a null pointer is ignored
 * rate - call rate used to scale frequencies into phase steps; when it is
 *        not positive, two_pi_over_rate keeps the zero written by memset
 * After the call the phasor is (1, 0) and the angle step is (1, 0), i.e.
 * no rotation until a frequency is set.
 ************************************************************************/
void avx_stride_struct_init(avx_stride_struct* self, double rate)
{
	if (!self)
	{
		return;
	}

	// Clear everything first (history, counters, stored frequency)
	memset(self, 0, sizeof(*self));

	// Unit phasor and identity angle step
	self->anglexy[0] = 1.0;
	self->anglexy[1] = 0.0;
	self->rotation[0] = 1.0;
	self->rotation[1] = 0.0;

	if (rate > 0.0)
	{
		self->two_pi_over_rate = 2 * M_PI / rate;
	}
}

/* avx_stride_struct_set_frequency
 * Stores a new demodulation frequency and recomputes the angle step.
 * self - state to update; not checked for null
 * freq - new frequency; the step angle is freq * two_pi_over_rate
 * anglexy[0] receives the cosine and anglexy[1] the sine of that angle.
 ************************************************************************/
void avx_stride_struct_set_frequency(avx_stride_struct* self, double freq)
{
	double* const step_cos = &self->anglexy[0];
	double* const step_sin = &self->anglexy[1];

	self->freqhist = freq;
	// sincos writes the sine through its 2nd and the cosine through its 3rd argument
	sincos(freq * self->two_pi_over_rate, step_sin, step_cos);
}

/* avx_stride_struct_cordic
 * Advances the phasor of a stride demodulator by one angle step.
 * self - state whose rotation pair is multiplied by its anglexy pair;
 *        not checked for null
 ************************************************************************/
void avx_stride_struct_cordic(avx_stride_struct* self)
{
	double* const phasor = self->rotation;
	const double* const step = self->anglexy;

	cordic_step(phasor, step);
}


/* avx_rotation_struct_init
 * Resets a multi-channel rotation demodulator state to its start values.
 * init - state to initialize; a null pointer is ignored
 * rate - call rate used to scale frequencies into phase steps; when it is
 *        not positive, two_pi_over_rate keeps the zero written by memset
 * After the call each of the MAX_STRIDE channels holds the unit phasor
 * (1, 0) and the identity angle step (1, 0).
 ************************************************************************/
void avx_rotation_struct_init(avx_rotation_struct* init, double rate)
{
	size_t i;

	// Same guard as avx_stride_struct_init: never hand a null pointer to memset
	if (!init) return;

	memset(init, 0, sizeof(avx_rotation_struct));
	if (rate > 0.0) init->two_pi_over_rate = 2 * M_PI / rate;

	// Channels are stored as interleaved pairs: element 2k is the
	// real/cosine part, element 2k + 1 the imaginary/sine part
	for (i = 0; i < 2 * MAX_STRIDE; i += 2)
	{
		init->rotation[i + 0] = 1.0;
		init->rotation[i + 1] = 0.0;
		init->anglexy[i + 0] = 1.0;
		init->anglexy[i + 1] = 0.0;
	}
}

/* avx_rotation_struct_set_frequency
 * Stores a new frequency for one channel and recomputes its angle step.
 * self - state to update; not checked for null
 * freq - new frequency; the step angle is freq * two_pi_over_rate
 * idx  - channel number; values >= MAX_STRIDE are ignored
 * The channel's cosine goes to anglexy[2 * idx], its sine to
 * anglexy[2 * idx + 1]. Both are logged as integers scaled by 1e6.
 ************************************************************************/
void avx_rotation_struct_set_frequency(avx_rotation_struct* self, double freq, unsigned int idx)
{
	double* step;

	if (idx >= MAX_STRIDE)
	{
		return;
	}

	// step[0] is the cosine slot, step[1] the sine slot of this channel
	step = self->anglexy + 2 * idx;
	self->freqhist[idx] = freq;
	sincos(freq * self->two_pi_over_rate, step + 1, step);
	RTSLOG_DEBUG("sin=%d cos=%d\n", (int)(1000000.0*step[1]), (int)(1000000.0*step[0]));
}


/* demodulation_decimation_stride8_section3_std
 * Scalar variant of the stride demodulator: multiplies each of the `stride`
 * input samples by the single phasor held in hist->rotation, passes the
 * interleaved products to biquad_stride2_std with 3 sections and finally
 * advances the phasor by one angle step.
 *   inp    - input samples, inp[0] .. inp[stride - 1] are read
 *   freq   - requested frequency; compared with the stored one only on
 *            every 16th call
 *   out    - forwarded unchanged to biquad_stride2_std
 *   coeff  - forwarded unchanged to biquad_stride2_std
 *   hist   - phasor, angle step, pause counter and filter history
 *   stride - number of samples; must be 8, 16, 24 or 32, otherwise the
 *            function returns without doing anything
 ************************************************************************/
void demodulation_decimation_stride8_section3_std(const double* inp, double freq,
	double* out, const double* coeff, avx_stride_struct* hist, size_t stride)
{
#define stride8_mm64 8
#define stride_mm64_max 32
	// number of sections are 3
#define sections3_intrinsic 3

	double iq[2 * stride_mm64_max];
	double re;
	double im;
	size_t n;

	// Supported strides are the multiples of 8 that fit into iq
	if (!((stride >= stride8_mm64) && (stride <= stride_mm64_max) && (stride % stride8_mm64 == 0)))
	{
		return;
	}

	// Look for a changed frequency, but only on every 16th call
	hist->pause += 1;
	if (hist->pause >= 16)
	{
		hist->pause = 0;
		if (fabs(freq - hist->freqhist) > 1E-6)
		{
			avx_stride_struct_set_frequency(hist, freq);
		}
	}

	// The same phasor is applied to every sample of this call
	re = hist->rotation[0];
	im = hist->rotation[1];
	for (n = 0; n < stride; ++n)
	{
		const double sample = inp[n];
		double* pair = iq + 2 * n;

		pair[0] = re * sample;
		pair[1] = im * sample;
	}
	biquad_stride2_std(iq, out, coeff, hist->hist, sections3_intrinsic, 2 * stride, 1, 0);

	// One angle step per call
	avx_stride_struct_cordic(hist);
}

/* demodulation_decimation_rotation8_section3_std
 * Scalar variant of the rotation demodulator: multiplies one input sample
 * by `stride` independent phasors, advances every phasor by its own angle
 * step and passes the interleaved products to biquad_stride2_std with
 * 3 sections.
 *   inp    - the input sample shared by all channels
 *   freq   - requested frequency per channel; freq[hist->idx] is read
 *   out    - forwarded unchanged to biquad_stride2_std
 *   coeff  - forwarded unchanged to biquad_stride2_std
 *   hist   - per-channel phasors and angle steps, counters, filter history
 *   stride - number of channels; must be 8, 16, 24 or 32, otherwise the
 *            function returns without doing anything
 ************************************************************************/
void demodulation_decimation_rotation8_section3_std(double inp, const double* freq,
	double* out, const double* coeff, avx_rotation_struct* hist, size_t stride)
{
#define stride8_mm64 8
#define stride_mm64_max 32
	// number of sections are 3
#define sections3_intrinsic 3

	double iq[2 * stride_mm64_max];
	size_t ch;

	// Supported strides are the multiples of 8 that fit into iq
	if (!((stride >= stride8_mm64) && (stride <= stride_mm64_max) && (stride % stride8_mm64 == 0)))
	{
		return;
	}

	// Frequency changes are picked up one channel at a time: every 16th
	// call inspects channel hist->idx and then moves on to the next one
	hist->pause += 1;
	if (hist->pause >= 16)
	{
		hist->pause = 0;
		if (hist->idx >= stride)
		{
			hist->idx = 0;
		}
		if (fabs(freq[hist->idx] - hist->freqhist[hist->idx]) > 1E-6)
		{
			RTSLOG_DEBUG("setting frequencies for id %d\n", hist->idx);
			avx_rotation_struct_set_frequency(hist, freq[hist->idx], hist->idx);
		}
		++hist->idx;
	}

	// Mix the sample with each channel's phasor, then step that phasor
	for (ch = 0; ch < stride; ++ch)
	{
		double* phasor = hist->rotation + 2 * ch;
		double* pair = iq + 2 * ch;

		pair[0] = phasor[0] * inp;
		pair[1] = phasor[1] * inp;
		cordic_step(phasor, hist->anglexy + 2 * ch);
	}
	biquad_stride2_std(iq, out, coeff, hist->hist, sections3_intrinsic, 2 * stride, 1, 0);
}


#if USE_SSE3
/* demodulation_decimation_stride8_section3_sse3
 * 128-bit vector variant of the stride demodulator: multiplies each of the
 * `stride` input samples by the single phasor held in hist->rotation, passes
 * the interleaved products to biquad_stride2_section3_sse3 (or to
 * biquad_stride8_section3_sse3 when the compiler defines the AVX512
 * F/CD/DQ/BW macros) and finally advances the phasor by one angle step.
 *   inp    - input samples, inp[0] .. inp[stride - 1] are read
 *   freq   - requested frequency; compared with the stored one only on
 *            every 16th call
 *   out    - forwarded unchanged to the biquad routine
 *   coeff  - forwarded unchanged to the biquad routine
 *   hist   - phasor, angle step, pause counter and filter history
 *   stride - number of samples; must be 8, 16, 24 or 32, otherwise the
 *            function returns without doing anything
 ************************************************************************/
void demodulation_decimation_stride8_section3_sse3(const double* inp, double freq,
	double* out, const double* coeff, avx_stride_struct* hist, size_t stride)
{
#define stride_mm128 2
#define stride_mm128_q(i) (	 (i == 0) ? 0 * stride_mm128: \
							 (i == 1) ? 1 * stride_mm128 : \
							 (i == 2) ? 2 * stride_mm128 : \
							 (i == 3) ? 3 * stride_mm128 : \
							 (i == 4) ? 4 * stride_mm128 : \
							 (i == 5) ? 5 * stride_mm128 : \
							 (i == 6) ? 6 * stride_mm128 : \
									    7 * stride_mm128)
#define stride8_mm128 8
#define stride_mm128_max 32


	// Supported strides are the multiples of 8 that fit into x
	if ((stride < stride8_mm128) || (stride % stride8_mm128 != 0) || (stride > stride_mm128_max)) return;


	// Check if new frequency (only on every 16th call)
	if (++hist->pause >= 16)
	{
		hist->pause = 0;
		if (fabs(freq - hist->freqhist) > 1E-6) {
			avx_stride_struct_set_frequency(hist, freq);
		}
	}

	// Load rotation: the pair (rotation[0], rotation[1]) as one vector
	// NOTE(review): dereferencing the cast pointer presumably requires
	// hist->rotation to be 16-byte aligned - confirm the struct declaration
	__m128d rotation = *(__m128d*)hist->rotation;
	alignas(16) double x[2 * stride_mm128_max];

	// loop over stride, 8 samples per iteration; every store writes the pair
	// (rotation[0] * inp[k], rotation[1] * inp[k]) to x[2k], x[2k + 1].
	// stride - stride8_mm128 cannot wrap because stride >= 8 was checked above
        size_t i = 0;
	for (i = 0; i <=  stride - stride8_mm128 ; i = i + stride8_mm128)
	{
		_mm_store_pd(x + 2 * i + stride_mm128_q(0), _mm_mul_pd(rotation, _mm_set1_pd(inp[i + 0])));
		_mm_store_pd(x + 2 * i + stride_mm128_q(1), _mm_mul_pd(rotation, _mm_set1_pd(inp[i + 1])));
		_mm_store_pd(x + 2 * i + stride_mm128_q(2), _mm_mul_pd(rotation, _mm_set1_pd(inp[i + 2])));
		_mm_store_pd(x + 2 * i + stride_mm128_q(3), _mm_mul_pd(rotation, _mm_set1_pd(inp[i + 3])));
		_mm_store_pd(x + 2 * i + stride_mm128_q(4), _mm_mul_pd(rotation, _mm_set1_pd(inp[i + 4])));
		_mm_store_pd(x + 2 * i + stride_mm128_q(5), _mm_mul_pd(rotation, _mm_set1_pd(inp[i + 5])));
		_mm_store_pd(x + 2 * i + stride_mm128_q(6), _mm_mul_pd(rotation, _mm_set1_pd(inp[i + 6])));
		_mm_store_pd(x + 2 * i + stride_mm128_q(7), _mm_mul_pd(rotation, _mm_set1_pd(inp[i + 7])));
	}
	// filter the 2 * stride interleaved values
#if	defined(__AVX512F__) && defined(__AVX512CD__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
	biquad_stride8_section3_sse3(x, out, coeff, hist->hist, 2 * stride, 1, 0);
#else
	biquad_stride2_section3_sse3(x, out, coeff, hist->hist, 2 * stride, 1, 0);
#endif
	// one angle step per call
	avx_stride_struct_cordic(hist);


}

/* demodulation_decimation_rotation8_section3_sse3
 * 128-bit vector variant of the rotation demodulator: multiplies one input
 * sample by `stride` independent phasors, advances every phasor by its own
 * angle step and passes the interleaved products to
 * biquad_stride2_section3_sse3 (or to biquad_stride8_section3_sse3 when the
 * compiler defines the AVX512 F/CD/DQ/BW macros).
 *   inp    - the input sample shared by all channels
 *   freq   - requested frequency per channel; freq[hist->idx] is read
 *   out    - forwarded unchanged to the biquad routine
 *   coeff  - forwarded unchanged to the biquad routine
 *   hist   - per-channel phasors and angle steps, counters, filter history
 *   stride - number of channels; must be 8, 16, 24 or 32, otherwise the
 *            function returns without doing anything
 * hist->rotation and hist->anglexy are accessed with _mm_load_pd /
 * _mm_store_pd, the aligned forms of the 128-bit load and store.
 ************************************************************************/
void demodulation_decimation_rotation8_section3_sse3(double inp, const double* freq,
	double* out, const double* coeff, avx_rotation_struct* hist, size_t stride)
{
#define stride_mm128 2
#define stride_mm128_q(i) (	 (i == 0) ? 0 * stride_mm128: \
							 (i == 1) ? 1 * stride_mm128 : \
							 (i == 2) ? 2 * stride_mm128 : \
							 (i == 3) ? 3 * stride_mm128 : \
							 (i == 4) ? 4 * stride_mm128 : \
							 (i == 5) ? 5 * stride_mm128 : \
							 (i == 6) ? 6 * stride_mm128 : \
									    7 * stride_mm128)
#define stride8_mm128 8
#define stride16_mm128 16
#define stride_mm128_max 32

	// Supported strides are the multiples of 8 that fit into x
	if ((stride < stride8_mm128) || (stride % stride8_mm128 != 0) || (stride > stride_mm128_max)) return;

	// loop over stride to check for a frequency change
	// recalculate cordic angle if necessary
	// but do only one every 16th cycle
	// (channel hist->idx is inspected, then the index moves to the next one)
	if (++hist->pause >= 16)
	{
		hist->pause = 0;
		if (hist->idx >= stride) hist->idx = 0;
		if (fabs(freq[hist->idx] - hist->freqhist[hist->idx]) > 1E-6)
		{
			avx_rotation_struct_set_frequency(hist, freq[hist->idx], hist->idx);
		}
		++hist->idx;
	}
	// Load input: the sample replicated into both vector lanes
	__m128d input = _mm_set1_pd(inp);
	alignas(16) double x[2 * stride_mm128_max];

	// loop over stride: rotate input and apply cordic
	// i counts doubles, not channels: 8 doubles = 4 channel pairs per iteration
        size_t i;
	for (i = 0; i <= 2 * stride - stride8_mm128; i = i + stride8_mm128)
	{
		// rN = (re, im) phasor of a channel, corN = (cos, sin) of its angle step
		const __m128d r1 = _mm_load_pd(hist->rotation + i + stride_mm128_q(0));
		const __m128d r2 = _mm_load_pd(hist->rotation + i + stride_mm128_q(1));
		const __m128d r3 = _mm_load_pd(hist->rotation + i + stride_mm128_q(2));
		const __m128d r4 = _mm_load_pd(hist->rotation + i + stride_mm128_q(3));
		const __m128d cor1 = _mm_load_pd(hist->anglexy + i + stride_mm128_q(0));
		const __m128d cor2 = _mm_load_pd(hist->anglexy + i + stride_mm128_q(1));
		const __m128d cor3 = _mm_load_pd(hist->anglexy + i + stride_mm128_q(2));
		const __m128d cor4 = _mm_load_pd(hist->anglexy + i + stride_mm128_q(3));

		// mixer output uses the phasor from before this call's update
		_mm_store_pd(x + i + stride_mm128_q(0), _mm_mul_pd(input, r1));
		_mm_store_pd(x + i + stride_mm128_q(1), _mm_mul_pd(input, r2));
		_mm_store_pd(x + i + stride_mm128_q(2), _mm_mul_pd(input, r3));
		_mm_store_pd(x + i + stride_mm128_q(3), _mm_mul_pd(input, r4));

		// tempN = (im, im) * (sin, cos) = (im * sin, im * cos)
		__m128d temp1 = _mm_mul_pd(_mm_unpackhi_pd(r1, r1), _mm_shuffle_pd(cor1, cor1, 0x01));
		__m128d temp2 = _mm_mul_pd(_mm_unpackhi_pd(r2, r2), _mm_shuffle_pd(cor2, cor2, 0x01));
		__m128d temp3 = _mm_mul_pd(_mm_unpackhi_pd(r3, r3), _mm_shuffle_pd(cor3, cor3, 0x01));
		__m128d temp4 = _mm_mul_pd(_mm_unpackhi_pd(r4, r4), _mm_shuffle_pd(cor4, cor4, 0x01));
		// fmaddsub subtracts in the low lane and adds in the high lane:
		// (re * cos - im * sin, re * sin + im * cos), the same update as cordic_step
		temp1 = _mm_fmaddsub_pd(_mm_unpacklo_pd(r1, r1), cor1, temp1);
		temp2 = _mm_fmaddsub_pd(_mm_unpacklo_pd(r2, r2), cor2, temp2);
		temp3 = _mm_fmaddsub_pd(_mm_unpacklo_pd(r3, r3), cor3, temp3);
		temp4 = _mm_fmaddsub_pd(_mm_unpacklo_pd(r4, r4), cor4, temp4);
		_mm_store_pd(hist->rotation + i + stride_mm128_q(0), temp1);
		_mm_store_pd(hist->rotation + i + stride_mm128_q(1), temp2);
		_mm_store_pd(hist->rotation + i + stride_mm128_q(2), temp3);
		_mm_store_pd(hist->rotation + i + stride_mm128_q(3), temp4);
	}
	// filter
#if	defined(__AVX512F__) && defined(__AVX512CD__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
	biquad_stride8_section3_sse3(x, out, coeff, hist->hist, 2 * stride, 1, 0);
#else
	biquad_stride2_section3_sse3(x, out, coeff, hist->hist, 2 * stride, 1, 0);
#endif
}
#endif //USE_SSE3


#if USE_AVX2
/* demodulation_decimation_stride8_section3_avx2
 * 256-bit vector variant of the stride demodulator: multiplies each of the
 * `stride` input samples by the single phasor held in hist->rotation, passes
 * the interleaved products to biquad_stride4_section3_avx2 (or to
 * biquad_stride16_section3_avx2 when the compiler defines the AVX512
 * F/CD/DQ/BW macros) and finally advances the phasor by one angle step.
 *   inp    - input samples, inp[0] .. inp[stride - 1] are read with
 *            unaligned loads
 *   freq   - requested frequency; compared with the stored one only on
 *            every 16th call
 *   out    - forwarded unchanged to the biquad routine
 *   coeff  - forwarded unchanged to the biquad routine
 *   hist   - phasor, angle step, pause counter and filter history
 *   stride - number of samples; must be 8, 16, 24 or 32, otherwise the
 *            function returns without doing anything
 ************************************************************************/

void demodulation_decimation_stride8_section3_avx2(const double* inp, double freq, 
	double* out, const double* coeff, avx_stride_struct* hist, size_t stride)
{
#define stride_mm256 4
#define stride_mm256_q(i) (	 (i == 0) ? 0 * stride_mm256: \
									 (i == 1) ? 1 * stride_mm256 : \
									 (i == 2) ? 2 * stride_mm256 : \
									 (i == 3) ? 3 * stride_mm256 : \
									 (i == 4) ? 4 * stride_mm256 : \
									 (i == 5) ? 5 * stride_mm256 : \
									 (i == 6) ? 6 * stride_mm256 : \
									 7 * stride_mm256)
#define stride8_mm256 8
#define stride_mm256_max 32

	// Supported strides are the multiples of 8 that fit into x
	if ((stride < stride8_mm256) || (stride % stride8_mm256 != 0) || (stride > stride_mm256_max)) return;

	// Check if new frequency (only on every 16th call)
	if (++hist->pause >= 16)
	{
		hist->pause = 0;
		if (fabs(freq - hist->freqhist) > 1E-6) {
			avx_stride_struct_set_frequency(hist, freq);
		}
	}

	// Load rotation: the pair (rotation[0], rotation[1]) repeated in both
	// 128-bit halves, i.e. (re, im, re, im)
	__m256d rotation = _mm256_broadcast_pd((__m128d*)hist->rotation);
	alignas(32) double x[2 * stride_mm256_max];

	// loop over stride, 8 samples per iteration.
	// stride - stride8_mm256 cannot wrap because stride >= 8 was checked above
        size_t i;
	for (i = 0; i <= stride - stride8_mm256; i = i + stride8_mm256)
	{
		__m256d i1 = _mm256_loadu_pd(inp + i + stride_mm256_q(0));
		__m256d i2 = _mm256_loadu_pd(inp + i + stride_mm256_q(1));
		// 0x50 selects elements (0, 0, 1, 1) and 0xFA selects (2, 2, 3, 3), so
		// every sample is duplicated and meets one (re, im) pair of rotation
		_mm256_store_pd(x + 2 * i + stride_mm256_q(0), _mm256_mul_pd(rotation, _mm256_permute4x64_pd(i1, 0x50)));
		_mm256_store_pd(x + 2 * i + stride_mm256_q(1), _mm256_mul_pd(rotation, _mm256_permute4x64_pd(i1, 0xFA)));
		_mm256_store_pd(x + 2 * i + stride_mm256_q(2), _mm256_mul_pd(rotation, _mm256_permute4x64_pd(i2, 0x50)));
		_mm256_store_pd(x + 2 * i + stride_mm256_q(3), _mm256_mul_pd(rotation, _mm256_permute4x64_pd(i2, 0xFA)));
	}
	// filter the 2 * stride interleaved values
#if	defined(__AVX512F__) && defined(__AVX512CD__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
	biquad_stride16_section3_avx2(x, out, coeff, hist->hist, 2 * stride, 1, 0);
#else
	biquad_stride4_section3_avx2(x, out, coeff, hist->hist, 2 * stride, 1, 0);
#endif
	// one angle step per call
	avx_stride_struct_cordic(hist);
}

/* demodulation_decimation_rotation8_section3_avx2
 * 256-bit vector variant of the rotation demodulator: multiplies one input
 * sample by `stride` independent phasors, advances every phasor by its own
 * angle step and passes the interleaved products to
 * biquad_stride4_section3_avx2 (or to biquad_stride16_section3_avx2 when the
 * compiler defines the AVX512 F/CD/DQ/BW macros).
 *   inp    - the input sample shared by all channels
 *   freq   - requested frequency per channel; freq[hist->idx] is read
 *   out    - forwarded unchanged to the biquad routine
 *   coeff  - forwarded unchanged to the biquad routine
 *   hist   - per-channel phasors and angle steps, counters, filter history
 *   stride - number of channels; must be 8, 16, 24 or 32, otherwise the
 *            function returns without doing anything
 * hist->rotation and hist->anglexy are accessed with _mm256_load_pd /
 * _mm256_store_pd, the aligned forms of the 256-bit load and store.
 ************************************************************************/
void demodulation_decimation_rotation8_section3_avx2(double inp, const double* freq, 
	double* out, const double* coeff, avx_rotation_struct* hist, size_t stride)
{
#define stride_mm256 4
#define stride_mm256_q(i) (	 (i == 0) ? 0 * stride_mm256: \
									 (i == 1) ? 1 * stride_mm256 : \
									 (i == 2) ? 2 * stride_mm256 : \
									 (i == 3) ? 3 * stride_mm256 : \
									 (i == 4) ? 4 * stride_mm256 : \
									 (i == 5) ? 5 * stride_mm256 : \
									 (i == 6) ? 6 * stride_mm256 : \
									 7 * stride_mm256)
#define stride8_mm256 8
#define stride16_mm256 16
#define stride_mm256_max 32

	// Supported strides are the multiples of 8 that fit into x
	if ((stride < stride8_mm256) || (stride % stride8_mm256 != 0) || (stride > stride_mm256_max)) return;

	// loop over stride to check for a frequency change
	// recalculate cordic angle if necessary
	// but do only one every 16th cycle
	// (channel hist->idx is inspected, then the index moves to the next one)
	if (++hist->pause >= 16)
	{
		hist->pause = 0;
		if (hist->idx >= stride) hist->idx = 0;
		if (fabs(freq[hist->idx] - hist->freqhist[hist->idx]) > 1E-6)
		{
			avx_rotation_struct_set_frequency(hist, freq[hist->idx], hist->idx);
		}
		++hist->idx;
	}
	// Load input: the sample replicated into all four vector lanes
	__m256d input = _mm256_set1_pd(inp);
	alignas(32) double x[2 * stride_mm256_max];

	// loop over stride: rotate input and apply cordic
	// i counts doubles, not channels: 8 doubles = 4 channel pairs per iteration
        size_t i;
	for (i = 0; i <= 2 * stride - stride8_mm256; i = i + stride8_mm256)
	{
		// rN holds two (re, im) phasors, corN the matching (cos, sin) steps
		const __m256d r1 = _mm256_load_pd(hist->rotation + i + stride_mm256_q(0));
		const __m256d r2 = _mm256_load_pd(hist->rotation + i + stride_mm256_q(1));
		const __m256d cor1 = _mm256_load_pd(hist->anglexy + i + stride_mm256_q(0));
		const __m256d cor2 = _mm256_load_pd(hist->anglexy + i + stride_mm256_q(1));

		// mixer output uses the phasors from before this call's update
		_mm256_store_pd(x + i + stride_mm256_q(0), _mm256_mul_pd(input, r1));
		_mm256_store_pd(x + i + stride_mm256_q(1), _mm256_mul_pd(input, r2));

		// 0x0F duplicates im, 0x05 swaps cos and sin within each pair:
		// tempN = (im * sin, im * cos) per channel
		__m256d temp1 = _mm256_mul_pd(_mm256_permute_pd(r1, 0x0F), _mm256_permute_pd(cor1, 0x05));
		__m256d temp2 = _mm256_mul_pd(_mm256_permute_pd(r2, 0x0F), _mm256_permute_pd(cor2, 0x05));
		// 0x00 duplicates re; fmaddsub subtracts in even and adds in odd lanes:
		// (re * cos - im * sin, re * sin + im * cos), the same update as cordic_step
		temp1 = _mm256_fmaddsub_pd(_mm256_permute_pd(r1, 0x00), cor1, temp1);
		temp2 = _mm256_fmaddsub_pd(_mm256_permute_pd(r2, 0x00), cor2, temp2);
		_mm256_store_pd(hist->rotation + i + stride_mm256_q(0), temp1);
		_mm256_store_pd(hist->rotation + i + stride_mm256_q(1), temp2);
	}
	// filter
#if	defined(__AVX512F__) && defined(__AVX512CD__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
	biquad_stride16_section3_avx2(x, out, coeff, hist->hist, 2 * stride, 1, 0);
#else
	biquad_stride4_section3_avx2(x, out, coeff, hist->hist, 2 * stride, 1, 0);
#endif
}
#endif // USE_AVX2


#if USE_AVX512
/* demodulation_decimation_stride8_section3_avx512
 * 512-bit vector variant of the stride demodulator: multiplies each of the
 * `stride` input samples by the single phasor held in hist->rotation, passes
 * the interleaved products to biquad_stride16_section3_avx512 and finally
 * advances the phasor by one angle step.
 *   inp    - input samples, inp[0] .. inp[stride - 1] are read
 *   freq   - requested frequency; compared with the stored one only on
 *            every 16th call
 *   out    - forwarded unchanged to the biquad routine
 *   coeff  - forwarded unchanged to the biquad routine
 *   hist   - phasor, angle step, pause counter and filter history
 *   stride - number of samples; must be 8, 16, 24 or 32, otherwise the
 *            function returns without doing anything
 ************************************************************************/

void demodulation_decimation_stride8_section3_avx512(const double* inp, double freq, 
	double* out, const double* coeff, avx_stride_struct* hist, size_t stride)
{
#define stride_mm512 8
#define stride_mm512_q(i) (	 (i == 0) ? 0 * stride_mm512: \
									 (i == 1) ? 1 * stride_mm512 : \
									 (i == 2) ? 2 * stride_mm512 : \
									 (i == 3) ? 3 * stride_mm512 : \
									 (i == 4) ? 4 * stride_mm512 : \
									 (i == 5) ? 5 * stride_mm512 : \
									 (i == 6) ? 6 * stride_mm512 : \
									 7 * stride_mm512)
#define stride8_mm512 8
#define stride_mm512_max 32

	// Supported strides are the multiples of 8 that fit into x
	if ((stride < stride8_mm512) || (stride % stride8_mm512 != 0) || (stride > stride_mm512_max)) return;

	// Check if new frequency (only on every 16th call)
	if (++hist->pause >= 16)
	{
		hist->pause = 0;
		if (fabs(freq - hist->freqhist) > 1E-6) {
			avx_stride_struct_set_frequency(hist, freq);
		}
	}


	// Load rotation: the pair (rotation[0], rotation[1]) repeated four times
	// NOTE(review): dereferencing the cast pointer presumably requires
	// hist->rotation to be 16-byte aligned - confirm the struct declaration
	__m512d rotation = _mm512_broadcast_f64x2(*(__m128d*)hist->rotation);
	alignas(64) double x[2 * stride_mm512_max];

	// loop over stride, 8 samples per iteration; every sample is written twice
	// into the vector so that it meets one (re, im) pair of rotation.
	// stride - stride8_mm512 cannot wrap because stride >= 8 was checked above
        size_t i;
	for (i = 0; i <= stride - stride8_mm512; i = i + stride8_mm512)
	{
		_mm512_store_pd(x + 2 * i + stride_mm512_q(0), _mm512_mul_pd(_mm512_setr_pd(inp[i + 0], inp[i + 0], inp[i + 1], inp[i + 1], inp[i + 2], inp[i + 2], inp[i + 3], inp[i + 3]), rotation));
		_mm512_store_pd(x + 2 * i + stride_mm512_q(1), _mm512_mul_pd(_mm512_setr_pd(inp[i + 4], inp[i + 4], inp[i + 5], inp[i + 5], inp[i + 6], inp[i + 6], inp[i + 7], inp[i + 7]), rotation));
	}
	// filter the 2 * stride interleaved values, then take one angle step
	biquad_stride16_section3_avx512(x, out, coeff, hist->hist, 2 * stride, 1, 0);
	avx_stride_struct_cordic(hist);
}

/* demodulation_decimation_rotation8_section3_avx512
 * 512-bit vector variant of the rotation demodulator: multiplies one input
 * sample by `stride` independent phasors, advances every phasor by its own
 * angle step and passes the interleaved products to
 * biquad_stride16_section3_avx512.
 *   inp    - the input sample shared by all channels
 *   freq   - requested frequency per channel; freq[hist->idx] is read
 *   out    - forwarded unchanged to the biquad routine
 *   coeff  - forwarded unchanged to the biquad routine
 *   hist   - per-channel phasors and angle steps, counters, filter history
 *   stride - number of channels; must be 8, 16, 24 or 32, otherwise the
 *            function returns without doing anything
 * hist->rotation and hist->anglexy are accessed with _mm512_load_pd /
 * _mm512_store_pd, the aligned forms of the 512-bit load and store.
 ************************************************************************/
void demodulation_decimation_rotation8_section3_avx512(double inp, const double* freq, 
	double* out, const double* coeff, avx_rotation_struct* hist, size_t stride)
{
#define stride_mm512 8
#define stride_mm512_q(i) (	 (i == 0) ? 0 * stride_mm512: \
									 (i == 1) ? 1 * stride_mm512 : \
									 (i == 2) ? 2 * stride_mm512 : \
									 (i == 3) ? 3 * stride_mm512 : \
									 (i == 4) ? 4 * stride_mm512 : \
									 (i == 5) ? 5 * stride_mm512 : \
									 (i == 6) ? 6 * stride_mm512 : \
									 7 * stride_mm512)
#define stride8_mm512 8
#define stride_mm512_max 32

	// Supported strides are the multiples of 8 that fit into x
	if ((stride < stride8_mm512) || (stride % stride8_mm512 != 0) || (stride > stride_mm512_max)) return;

	// loop over stride to check for a frequency change
	// recalculate cordic angle if necessary
	// but do only one every 16th cycle
	// (channel hist->idx is inspected, then the index moves to the next one)
	if (++hist->pause >= 16)
	{
		hist->pause = 0;
		if (hist->idx >= stride) hist->idx = 0;
		if (fabs(freq[hist->idx] - hist->freqhist[hist->idx]) > 1E-6)
		{
			avx_rotation_struct_set_frequency(hist, freq[hist->idx], hist->idx);
		}
		++hist->idx;
	}
	// Load input: the sample replicated into all eight vector lanes
	__m512d input = _mm512_set1_pd(inp);
	alignas(64) double x[2 * stride_mm512_max];

	// loop over stride: rotate input and apply cordic
	// i counts doubles, not channels: 8 doubles = 4 channel pairs per iteration
        size_t i;
	for (i = 0; i <= 2 * stride - stride8_mm512; i = i + stride8_mm512)
	{
		// r1 holds four (re, im) phasors, cor1 the matching (cos, sin) steps
		const __m512d r1 = _mm512_load_pd(hist->rotation + i);
		const __m512d cor1 = _mm512_load_pd(hist->anglexy + i);

		// apply rotation (uses the phasors from before this call's update)
		_mm512_store_pd(x + i, _mm512_mul_pd(input, r1));

		// cordic
		// 0xFF duplicates im, 0x55 swaps cos and sin within each pair:
		// temp1 = (im * sin, im * cos) per channel
		__m512d temp1 = _mm512_mul_pd(_mm512_permute_pd(r1, 0xFF), _mm512_permute_pd(cor1, 0x55));
		// 0x00 duplicates re; fmaddsub subtracts in even and adds in odd lanes:
		// (re * cos - im * sin, re * sin + im * cos), the same update as cordic_step
		temp1 = _mm512_fmaddsub_pd(_mm512_permute_pd(r1, 0x00), cor1, temp1);
		_mm512_store_pd(hist->rotation + i, temp1);
	}
	// filter
	biquad_stride16_section3_avx512(x, out, coeff, hist->hist, 2 * stride, 1, 0);
}
#endif // USE_AVX512

/* VEC_MESSAGE
 * Text describing the implementation chosen by the dispatchers below.
 * Priority: AVX512, then AVX2, then SSE3, else the scalar code.
 ************************************************************************/
#if USE_AVX512
#define VEC_MESSAGE   "Using AVX512 demodulation"
#elif USE_AVX2
#define VEC_MESSAGE "Using AVX2 demodulation"
#elif USE_SSE3
#define VEC_MESSAGE "Using SSE3 demodulation"
#else
#define VEC_MESSAGE "Using non-vectorized demodulation"
#endif

/* demodulation_decimation_stride8_section3
 * Compile-time dispatcher for the stride demodulator. All arguments are
 * forwarded unchanged to the variant selected by USE_AVX512 / USE_AVX2 /
 * USE_SSE3, falling back to the scalar *_std variant.
 ************************************************************************/
void demodulation_decimation_stride8_section3(
	const double* inp,
	double freq,
	double* out,
	const double* coeff,
	avx_stride_struct* hist,
	size_t stride)
{
#if USE_AVX512
	demodulation_decimation_stride8_section3_avx512(inp, freq, out, coeff, hist, stride);
#elif USE_AVX2
	demodulation_decimation_stride8_section3_avx2(inp, freq, out, coeff, hist, stride);
#elif USE_SSE3
	demodulation_decimation_stride8_section3_sse3(inp, freq, out, coeff, hist, stride);
#else
	demodulation_decimation_stride8_section3_std(inp, freq, out, coeff, hist, stride);
#endif
}


/* demodulation_decimation_rotation8_section3
 * Compile-time dispatcher for the rotation demodulator. All arguments are
 * forwarded unchanged to the variant selected by USE_AVX512 / USE_AVX2 /
 * USE_SSE3 (checked in that order), falling back to the scalar *_std
 * variant.
 ************************************************************************/
void demodulation_decimation_rotation8_section3(
	double inp,
	const double* freq,
	double* out,
	const double* coeff,
	avx_rotation_struct* hist,
	size_t stride)
{
#if USE_AVX512
	demodulation_decimation_rotation8_section3_avx512(inp, freq, out, coeff, hist, stride);
#elif USE_AVX2
	demodulation_decimation_rotation8_section3_avx2(inp, freq, out, coeff, hist, stride);
#elif USE_SSE3
	demodulation_decimation_rotation8_section3_sse3(inp, freq, out, coeff, hist, stride);
#else
	demodulation_decimation_rotation8_section3_std(inp, freq, out, coeff, hist, stride);
#endif
}

/* demodulation_init
 * Logs which demodulation implementation was selected at compile time
 * (VEC_MESSAGE, defined alongside demodulation_decimation_stride8_section3).
 * Always returns 0.
 ************************************************************************/
int demodulation_init(void)
{
    // End the message with a newline, as the other RTSLOG_* calls in this
    // file do, so that it cannot run into the next log entry
    RTSLOG_INFO("%s\n", VEC_MESSAGE);
    return 0;
}