use core::arch::x86_64::*;
use num_complex::Complex;
use crate::{common::FftNum, FftDirection};
use crate::array_utils;
use crate::array_utils::workaround_transmute_mut;
use crate::array_utils::DoubleBuf;
use crate::common::{fft_error_inplace, fft_error_outofplace};
use crate::twiddles;
use crate::{Direction, Fft, Length};
use super::sse_common::{assert_f32, assert_f64};
use super::sse_utils::*;
use super::sse_vector::{SseArrayMut};
use super::sse_butterflies::{parallel_fft2_interleaved_f32, solo_fft2_f64};
/// SSE butterfly for a size-7 FFT over `f32` data.
///
/// Each `__m128` register holds two complex f32 values, so the "parallel"
/// code paths process two independent size-7 FFTs at once.
pub struct SseF32Butterfly7<T> {
    direction: FftDirection,
    // Ties the generic scalar type `T` to the struct without storing a `T`.
    _phantom: std::marker::PhantomData<T>,
    // Helper for 90-degree rotation (multiplication by +/-i) of packed complex values.
    rotate: Rotate90F32,
    // twiddle{k}re / twiddle{k}im: real and imaginary parts of the k-th
    // size-7 twiddle factor (see `twiddles::compute_twiddle`), each
    // broadcast to all four f32 lanes in `new`.
    twiddle1re: __m128,
    twiddle1im: __m128,
    twiddle2re: __m128,
    twiddle2im: __m128,
    twiddle3re: __m128,
    twiddle3im: __m128,
}
// Macro-generated trait plumbing for the size-7 f32 butterfly (presumably the
// `Fft`/`Length`/`Direction` impls and buffer-iteration drivers; the macros are
// defined elsewhere in this module tree). The closure extracts the direction.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly7, 7, |this: &SseF32Butterfly7<_>| this
    .direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly7, 7, |this: &SseF32Butterfly7<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly7<T> {
    /// Creates a new size-7 butterfly for the given FFT direction.
    ///
    /// Precomputes the three size-7 twiddle factors and broadcasts each
    /// real/imaginary part across all four f32 lanes.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let rotate = Rotate90F32::new(true);
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 7, direction);
        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 7, direction);
        let tw3: Complex<f32> = twiddles::compute_twiddle(3, 7, direction);
        // Broadcast each scalar with `_mm_load1_ps`, matching the convention
        // used by the other f32 butterflies in this file (sizes 11 and 13).
        // Equivalent to the previous `_mm_set_ps(x, x, x, x)` calls.
        let twiddle1re = unsafe { _mm_load1_ps(&tw1.re) };
        let twiddle1im = unsafe { _mm_load1_ps(&tw1.im) };
        let twiddle2re = unsafe { _mm_load1_ps(&tw2.re) };
        let twiddle2im = unsafe { _mm_load1_ps(&tw2.im) };
        let twiddle3re = unsafe { _mm_load1_ps(&tw3.re) };
        let twiddle3im = unsafe { _mm_load1_ps(&tw3.im) };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle1re,
            twiddle1im,
            twiddle2re,
            twiddle2im,
            twiddle3re,
            twiddle3im,
        }
    }

    /// Computes a single size-7 FFT in place: loads one complex value into the
    /// low half of each register, runs the two-at-a-time kernel, and stores
    /// only the low halves back.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let values = read_partial1_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6});
        let out = self.perform_parallel_fft_direct(values);
        write_partial_lo_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6});
    }

    /// Computes two interleaved size-7 FFTs in place. The 14 input values are
    /// repacked so that each register pairs element k of the first transform
    /// with element k of the second, then unpacked back after the kernel.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12});
        let values = [
            extract_lo_hi_f32(input_packed[0], input_packed[3]),
            extract_hi_lo_f32(input_packed[0], input_packed[4]),
            extract_lo_hi_f32(input_packed[1], input_packed[4]),
            extract_hi_lo_f32(input_packed[1], input_packed[5]),
            extract_lo_hi_f32(input_packed[2], input_packed[5]),
            extract_hi_lo_f32(input_packed[2], input_packed[6]),
            extract_lo_hi_f32(input_packed[3], input_packed[6]),
        ];
        let out = self.perform_parallel_fft_direct(values);
        let out_packed = [
            extract_lo_lo_f32(out[0], out[1]),
            extract_lo_lo_f32(out[2], out[3]),
            extract_lo_lo_f32(out[4], out[5]),
            extract_lo_hi_f32(out[6], out[0]),
            extract_hi_hi_f32(out[1], out[2]),
            extract_hi_hi_f32(out[3], out[4]),
            extract_hi_hi_f32(out[5], out[6]),
        ];
        write_complex_to_array_strided!(out_packed, buffer, 2, {0,1,2,3,4,5,6});
    }

    /// The size-7 FFT kernel, applied to both complex lanes at once.
    ///
    /// Structure: pair each input k with its mirror 7-k via size-2 butterflies,
    /// form real-twiddle combinations (t_a*) and imaginary-twiddle combinations
    /// (t_b*), rotate the latter by 90 degrees, and recombine with size-2
    /// butterflies. The twiddle index/sign patterns in the t_a*/t_b* rows
    /// follow from twiddle symmetry (k*j mod 7, cosine even / sine odd).
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 7]) -> [__m128; 7] {
        // Sums (x{k}p{7-k}) feed the real-twiddle terms; differences
        // (x{k}m{7-k}) feed the imaginary-twiddle terms.
        let [x1p6, x1m6] = parallel_fft2_interleaved_f32(values[1], values[6]);
        let [x2p5, x2m5] = parallel_fft2_interleaved_f32(values[2], values[5]);
        let [x3p4, x3m4] = parallel_fft2_interleaved_f32(values[3], values[4]);
        let t_a1_1 = _mm_mul_ps(self.twiddle1re, x1p6);
        let t_a1_2 = _mm_mul_ps(self.twiddle2re, x2p5);
        let t_a1_3 = _mm_mul_ps(self.twiddle3re, x3p4);
        let t_a2_1 = _mm_mul_ps(self.twiddle2re, x1p6);
        let t_a2_2 = _mm_mul_ps(self.twiddle3re, x2p5);
        let t_a2_3 = _mm_mul_ps(self.twiddle1re, x3p4);
        let t_a3_1 = _mm_mul_ps(self.twiddle3re, x1p6);
        let t_a3_2 = _mm_mul_ps(self.twiddle1re, x2p5);
        let t_a3_3 = _mm_mul_ps(self.twiddle2re, x3p4);
        let t_b1_1 = _mm_mul_ps(self.twiddle1im, x1m6);
        let t_b1_2 = _mm_mul_ps(self.twiddle2im, x2m5);
        let t_b1_3 = _mm_mul_ps(self.twiddle3im, x3m4);
        let t_b2_1 = _mm_mul_ps(self.twiddle2im, x1m6);
        let t_b2_2 = _mm_mul_ps(self.twiddle3im, x2m5);
        let t_b2_3 = _mm_mul_ps(self.twiddle1im, x3m4);
        let t_b3_1 = _mm_mul_ps(self.twiddle3im, x1m6);
        let t_b3_2 = _mm_mul_ps(self.twiddle1im, x2m5);
        let t_b3_3 = _mm_mul_ps(self.twiddle2im, x3m4);
        let x0 = values[0];
        let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3);
        let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3);
        let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3);
        let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3);
        let t_b2 = calc_f32!(t_b2_1 - t_b2_2 - t_b2_3);
        let t_b3 = calc_f32!(t_b3_1 - t_b3_2 + t_b3_3);
        // Multiply the imaginary-twiddle sums by +/-i before recombination.
        let t_b1_rot = self.rotate.rotate_both(t_b1);
        let t_b2_rot = self.rotate.rotate_both(t_b2);
        let t_b3_rot = self.rotate.rotate_both(t_b3);
        // y0 is the plain sum of all seven inputs.
        let y0 = calc_f32!(x0 + x1p6 + x2p5 + x3p4);
        let [y1, y6] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot);
        let [y2, y5] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot);
        let [y3, y4] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot);
        [y0, y1, y2, y3, y4, y5, y6]
    }
}
/// SSE butterfly for a size-7 FFT over `f64` data.
///
/// Each `__m128d` register holds exactly one complex f64 value, so this
/// struct processes one transform at a time.
pub struct SseF64Butterfly7<T> {
    direction: FftDirection,
    // Ties the generic scalar type `T` to the struct without storing a `T`.
    _phantom: std::marker::PhantomData<T>,
    // Helper for 90-degree rotation (multiplication by +/-i) of a packed complex value.
    rotate: Rotate90F64,
    // twiddle{k}re / twiddle{k}im: real and imaginary parts of the k-th
    // size-7 twiddle factor, broadcast into both f64 lanes in `new`.
    twiddle1re: __m128d,
    twiddle1im: __m128d,
    twiddle2re: __m128d,
    twiddle2im: __m128d,
    twiddle3re: __m128d,
    twiddle3im: __m128d,
}
// Macro-generated trait plumbing for the size-7 f64 butterfly (see the macro
// definitions elsewhere in this module tree). The closure extracts the direction.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly7, 7, |this: &SseF64Butterfly7<_>| this
    .direction);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly7, 7, |this: &SseF64Butterfly7<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly7<T> {
    /// Creates a new size-7 f64 butterfly for the given FFT direction,
    /// precomputing the three twiddle factors with each re/im part broadcast
    /// into both f64 lanes.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let rotate = Rotate90F64::new(true);
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 7, direction);
        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 7, direction);
        let tw3: Complex<f64> = twiddles::compute_twiddle(3, 7, direction);
        let twiddle1re = unsafe { _mm_set_pd(tw1.re, tw1.re) };
        let twiddle1im = unsafe { _mm_set_pd(tw1.im, tw1.im) };
        let twiddle2re = unsafe { _mm_set_pd(tw2.re, tw2.re) };
        let twiddle2im = unsafe { _mm_set_pd(tw2.im, tw2.im) };
        let twiddle3re = unsafe { _mm_set_pd(tw3.re, tw3.re) };
        let twiddle3im = unsafe { _mm_set_pd(tw3.im, tw3.im) };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle1re,
            twiddle1im,
            twiddle2re,
            twiddle2im,
            twiddle3re,
            twiddle3im,
        }
    }

    /// Computes a size-7 FFT in place on 7 contiguous complex f64 values.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6});
        let out = self.perform_fft_direct(values);
        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6});
    }

    /// The size-7 FFT kernel for one complex value per register.
    ///
    /// Pairs each input k with its mirror 7-k via size-2 butterflies, forms
    /// real-twiddle (t_a*) and imaginary-twiddle (t_b*) combinations, rotates
    /// the latter by 90 degrees, and recombines. The twiddle index/sign
    /// patterns follow from twiddle symmetry (k*j mod 7, cosine even / sine odd).
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 7]) -> [__m128d; 7] {
        // Sums feed the real-twiddle terms; differences feed the imaginary-twiddle terms.
        let [x1p6, x1m6] = solo_fft2_f64(values[1], values[6]);
        let [x2p5, x2m5] = solo_fft2_f64(values[2], values[5]);
        let [x3p4, x3m4] = solo_fft2_f64(values[3], values[4]);
        let t_a1_1 = _mm_mul_pd(self.twiddle1re, x1p6);
        let t_a1_2 = _mm_mul_pd(self.twiddle2re, x2p5);
        let t_a1_3 = _mm_mul_pd(self.twiddle3re, x3p4);
        let t_a2_1 = _mm_mul_pd(self.twiddle2re, x1p6);
        let t_a2_2 = _mm_mul_pd(self.twiddle3re, x2p5);
        let t_a2_3 = _mm_mul_pd(self.twiddle1re, x3p4);
        let t_a3_1 = _mm_mul_pd(self.twiddle3re, x1p6);
        let t_a3_2 = _mm_mul_pd(self.twiddle1re, x2p5);
        let t_a3_3 = _mm_mul_pd(self.twiddle2re, x3p4);
        let t_b1_1 = _mm_mul_pd(self.twiddle1im, x1m6);
        let t_b1_2 = _mm_mul_pd(self.twiddle2im, x2m5);
        let t_b1_3 = _mm_mul_pd(self.twiddle3im, x3m4);
        let t_b2_1 = _mm_mul_pd(self.twiddle2im, x1m6);
        let t_b2_2 = _mm_mul_pd(self.twiddle3im, x2m5);
        let t_b2_3 = _mm_mul_pd(self.twiddle1im, x3m4);
        let t_b3_1 = _mm_mul_pd(self.twiddle3im, x1m6);
        let t_b3_2 = _mm_mul_pd(self.twiddle1im, x2m5);
        let t_b3_3 = _mm_mul_pd(self.twiddle2im, x3m4);
        let x0 = values[0];
        let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3);
        let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3);
        let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3);
        let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3);
        let t_b2 = calc_f64!(t_b2_1 - t_b2_2 - t_b2_3);
        let t_b3 = calc_f64!(t_b3_1 - t_b3_2 + t_b3_3);
        // Multiply the imaginary-twiddle sums by +/-i before recombination.
        let t_b1_rot = self.rotate.rotate(t_b1);
        let t_b2_rot = self.rotate.rotate(t_b2);
        let t_b3_rot = self.rotate.rotate(t_b3);
        // y0 is the plain sum of all seven inputs.
        let y0 = calc_f64!(x0 + x1p6 + x2p5 + x3p4);
        let [y1, y6] = solo_fft2_f64(t_a1, t_b1_rot);
        let [y2, y5] = solo_fft2_f64(t_a2, t_b2_rot);
        let [y3, y4] = solo_fft2_f64(t_a3, t_b3_rot);
        [y0, y1, y2, y3, y4, y5, y6]
    }
}
/// SSE butterfly for a size-11 FFT over `f32` data (two transforms per pass,
/// one complex f32 per 64-bit half of each `__m128`).
pub struct SseF32Butterfly11<T> {
    direction: FftDirection,
    // Ties the generic scalar type `T` to the struct without storing a `T`.
    _phantom: std::marker::PhantomData<T>,
    // Helper for 90-degree rotation (multiplication by +/-i) of packed complex values.
    rotate: Rotate90F32,
    // twiddle{k}re / twiddle{k}im: real and imaginary parts of the k-th
    // size-11 twiddle factor, broadcast to all four f32 lanes in `new`.
    twiddle1re: __m128,
    twiddle1im: __m128,
    twiddle2re: __m128,
    twiddle2im: __m128,
    twiddle3re: __m128,
    twiddle3im: __m128,
    twiddle4re: __m128,
    twiddle4im: __m128,
    twiddle5re: __m128,
    twiddle5im: __m128,
}
// Macro-generated trait plumbing for the size-11 f32 butterfly (see the macro
// definitions elsewhere in this module tree). The closure extracts the direction.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly11, 11, |this: &SseF32Butterfly11<_>| this
    .direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly11, 11, |this: &SseF32Butterfly11<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly11<T> {
    /// Creates a new size-11 f32 butterfly for the given FFT direction,
    /// precomputing the five twiddle factors with each re/im part broadcast
    /// to all four f32 lanes.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let rotate = Rotate90F32::new(true);
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 11, direction);
        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 11, direction);
        let tw3: Complex<f32> = twiddles::compute_twiddle(3, 11, direction);
        let tw4: Complex<f32> = twiddles::compute_twiddle(4, 11, direction);
        let tw5: Complex<f32> = twiddles::compute_twiddle(5, 11, direction);
        let twiddle1re = unsafe { _mm_load1_ps(&tw1.re) };
        let twiddle1im = unsafe { _mm_load1_ps(&tw1.im) };
        let twiddle2re = unsafe { _mm_load1_ps(&tw2.re) };
        let twiddle2im = unsafe { _mm_load1_ps(&tw2.im) };
        let twiddle3re = unsafe { _mm_load1_ps(&tw3.re) };
        let twiddle3im = unsafe { _mm_load1_ps(&tw3.im) };
        let twiddle4re = unsafe { _mm_load1_ps(&tw4.re) };
        let twiddle4im = unsafe { _mm_load1_ps(&tw4.im) };
        let twiddle5re = unsafe { _mm_load1_ps(&tw5.re) };
        let twiddle5im = unsafe { _mm_load1_ps(&tw5.im) };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle1re,
            twiddle1im,
            twiddle2re,
            twiddle2im,
            twiddle3re,
            twiddle3im,
            twiddle4re,
            twiddle4im,
            twiddle5re,
            twiddle5im,
        }
    }

    /// Computes a single size-11 FFT in place: loads one complex value into
    /// the low half of each register, runs the two-at-a-time kernel, and
    /// stores only the low halves back.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let values = read_partial1_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
        let out = self.perform_parallel_fft_direct(values);
        write_partial_lo_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
    }

    /// Computes two interleaved size-11 FFTs in place, repacking the 22
    /// inputs so each register pairs element k of both transforms.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20});
        let values = [
            extract_lo_hi_f32(input_packed[0], input_packed[5]),
            extract_hi_lo_f32(input_packed[0], input_packed[6]),
            extract_lo_hi_f32(input_packed[1], input_packed[6]),
            extract_hi_lo_f32(input_packed[1], input_packed[7]),
            extract_lo_hi_f32(input_packed[2], input_packed[7]),
            extract_hi_lo_f32(input_packed[2], input_packed[8]),
            extract_lo_hi_f32(input_packed[3], input_packed[8]),
            extract_hi_lo_f32(input_packed[3], input_packed[9]),
            extract_lo_hi_f32(input_packed[4], input_packed[9]),
            extract_hi_lo_f32(input_packed[4], input_packed[10]),
            extract_lo_hi_f32(input_packed[5], input_packed[10]),
        ];
        let out = self.perform_parallel_fft_direct(values);
        let out_packed = [
            extract_lo_lo_f32(out[0], out[1]),
            extract_lo_lo_f32(out[2], out[3]),
            extract_lo_lo_f32(out[4], out[5]),
            extract_lo_lo_f32(out[6], out[7]),
            extract_lo_lo_f32(out[8], out[9]),
            extract_lo_hi_f32(out[10], out[0]),
            extract_hi_hi_f32(out[1], out[2]),
            extract_hi_hi_f32(out[3], out[4]),
            extract_hi_hi_f32(out[5], out[6]),
            extract_hi_hi_f32(out[7], out[8]),
            extract_hi_hi_f32(out[9], out[10]),
        ];
        write_complex_to_array_strided!(out_packed, buffer, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
    }

    /// The size-11 FFT kernel, applied to both complex lanes at once.
    ///
    /// Pairs each input k with its mirror 11-k via size-2 butterflies, forms
    /// real-twiddle (t_a*) and imaginary-twiddle (t_b*) combinations, rotates
    /// the latter by 90 degrees, and recombines. The twiddle index/sign
    /// patterns follow from twiddle symmetry (k*j mod 11, cosine even / sine odd).
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 11]) -> [__m128; 11] {
        // Sums feed the real-twiddle terms; differences feed the imaginary-twiddle terms.
        let [x1p10, x1m10] = parallel_fft2_interleaved_f32(values[1], values[10]);
        let [x2p9, x2m9] = parallel_fft2_interleaved_f32(values[2], values[9]);
        let [x3p8, x3m8] = parallel_fft2_interleaved_f32(values[3], values[8]);
        let [x4p7, x4m7] = parallel_fft2_interleaved_f32(values[4], values[7]);
        let [x5p6, x5m6] = parallel_fft2_interleaved_f32(values[5], values[6]);
        let t_a1_1 = _mm_mul_ps(self.twiddle1re, x1p10);
        let t_a1_2 = _mm_mul_ps(self.twiddle2re, x2p9);
        let t_a1_3 = _mm_mul_ps(self.twiddle3re, x3p8);
        let t_a1_4 = _mm_mul_ps(self.twiddle4re, x4p7);
        let t_a1_5 = _mm_mul_ps(self.twiddle5re, x5p6);
        let t_a2_1 = _mm_mul_ps(self.twiddle2re, x1p10);
        let t_a2_2 = _mm_mul_ps(self.twiddle4re, x2p9);
        let t_a2_3 = _mm_mul_ps(self.twiddle5re, x3p8);
        let t_a2_4 = _mm_mul_ps(self.twiddle3re, x4p7);
        let t_a2_5 = _mm_mul_ps(self.twiddle1re, x5p6);
        let t_a3_1 = _mm_mul_ps(self.twiddle3re, x1p10);
        let t_a3_2 = _mm_mul_ps(self.twiddle5re, x2p9);
        let t_a3_3 = _mm_mul_ps(self.twiddle2re, x3p8);
        let t_a3_4 = _mm_mul_ps(self.twiddle1re, x4p7);
        let t_a3_5 = _mm_mul_ps(self.twiddle4re, x5p6);
        let t_a4_1 = _mm_mul_ps(self.twiddle4re, x1p10);
        let t_a4_2 = _mm_mul_ps(self.twiddle3re, x2p9);
        let t_a4_3 = _mm_mul_ps(self.twiddle1re, x3p8);
        let t_a4_4 = _mm_mul_ps(self.twiddle5re, x4p7);
        let t_a4_5 = _mm_mul_ps(self.twiddle2re, x5p6);
        let t_a5_1 = _mm_mul_ps(self.twiddle5re, x1p10);
        let t_a5_2 = _mm_mul_ps(self.twiddle1re, x2p9);
        let t_a5_3 = _mm_mul_ps(self.twiddle4re, x3p8);
        let t_a5_4 = _mm_mul_ps(self.twiddle2re, x4p7);
        let t_a5_5 = _mm_mul_ps(self.twiddle3re, x5p6);
        let t_b1_1 = _mm_mul_ps(self.twiddle1im, x1m10);
        let t_b1_2 = _mm_mul_ps(self.twiddle2im, x2m9);
        let t_b1_3 = _mm_mul_ps(self.twiddle3im, x3m8);
        let t_b1_4 = _mm_mul_ps(self.twiddle4im, x4m7);
        let t_b1_5 = _mm_mul_ps(self.twiddle5im, x5m6);
        let t_b2_1 = _mm_mul_ps(self.twiddle2im, x1m10);
        let t_b2_2 = _mm_mul_ps(self.twiddle4im, x2m9);
        let t_b2_3 = _mm_mul_ps(self.twiddle5im, x3m8);
        let t_b2_4 = _mm_mul_ps(self.twiddle3im, x4m7);
        let t_b2_5 = _mm_mul_ps(self.twiddle1im, x5m6);
        let t_b3_1 = _mm_mul_ps(self.twiddle3im, x1m10);
        let t_b3_2 = _mm_mul_ps(self.twiddle5im, x2m9);
        let t_b3_3 = _mm_mul_ps(self.twiddle2im, x3m8);
        let t_b3_4 = _mm_mul_ps(self.twiddle1im, x4m7);
        let t_b3_5 = _mm_mul_ps(self.twiddle4im, x5m6);
        let t_b4_1 = _mm_mul_ps(self.twiddle4im, x1m10);
        let t_b4_2 = _mm_mul_ps(self.twiddle3im, x2m9);
        let t_b4_3 = _mm_mul_ps(self.twiddle1im, x3m8);
        let t_b4_4 = _mm_mul_ps(self.twiddle5im, x4m7);
        let t_b4_5 = _mm_mul_ps(self.twiddle2im, x5m6);
        let t_b5_1 = _mm_mul_ps(self.twiddle5im, x1m10);
        let t_b5_2 = _mm_mul_ps(self.twiddle1im, x2m9);
        let t_b5_3 = _mm_mul_ps(self.twiddle4im, x3m8);
        let t_b5_4 = _mm_mul_ps(self.twiddle2im, x4m7);
        let t_b5_5 = _mm_mul_ps(self.twiddle3im, x5m6);
        let x0 = values[0];
        let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5);
        let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5);
        let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5);
        let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5);
        let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5);
        let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5);
        let t_b2 = calc_f32!(t_b2_1 + t_b2_2 - t_b2_3 - t_b2_4 - t_b2_5);
        let t_b3 = calc_f32!(t_b3_1 - t_b3_2 - t_b3_3 + t_b3_4 + t_b3_5);
        let t_b4 = calc_f32!(t_b4_1 - t_b4_2 + t_b4_3 + t_b4_4 - t_b4_5);
        let t_b5 = calc_f32!(t_b5_1 - t_b5_2 + t_b5_3 - t_b5_4 + t_b5_5);
        // Multiply the imaginary-twiddle sums by +/-i before recombination.
        let t_b1_rot = self.rotate.rotate_both(t_b1);
        let t_b2_rot = self.rotate.rotate_both(t_b2);
        let t_b3_rot = self.rotate.rotate_both(t_b3);
        let t_b4_rot = self.rotate.rotate_both(t_b4);
        let t_b5_rot = self.rotate.rotate_both(t_b5);
        // y0 is the plain sum of all eleven inputs.
        let y0 = calc_f32!(x0 + x1p10 + x2p9 + x3p8 + x4p7 + x5p6);
        let [y1, y10] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot);
        let [y2, y9] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot);
        let [y3, y8] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot);
        let [y4, y7] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot);
        let [y5, y6] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot);
        [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10]
    }
}
/// SSE butterfly for a size-11 FFT over `f64` data (one complex f64 per
/// `__m128d`, one transform at a time).
pub struct SseF64Butterfly11<T> {
    direction: FftDirection,
    // Ties the generic scalar type `T` to the struct without storing a `T`.
    _phantom: std::marker::PhantomData<T>,
    // Helper for 90-degree rotation (multiplication by +/-i) of a packed complex value.
    rotate: Rotate90F64,
    // twiddle{k}re / twiddle{k}im: real and imaginary parts of the k-th
    // size-11 twiddle factor, broadcast into both f64 lanes in `new`.
    twiddle1re: __m128d,
    twiddle1im: __m128d,
    twiddle2re: __m128d,
    twiddle2im: __m128d,
    twiddle3re: __m128d,
    twiddle3im: __m128d,
    twiddle4re: __m128d,
    twiddle4im: __m128d,
    twiddle5re: __m128d,
    twiddle5im: __m128d,
}
// Macro-generated trait plumbing for the size-11 f64 butterfly (see the macro
// definitions elsewhere in this module tree). The closure extracts the direction.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly11, 11, |this: &SseF64Butterfly11<_>| this
    .direction);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly11, 11, |this: &SseF64Butterfly11<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly11<T> {
    /// Creates a new size-11 f64 butterfly for the given FFT direction,
    /// precomputing the five twiddle factors with each re/im part broadcast
    /// into both f64 lanes.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let rotate = Rotate90F64::new(true);
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 11, direction);
        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 11, direction);
        let tw3: Complex<f64> = twiddles::compute_twiddle(3, 11, direction);
        let tw4: Complex<f64> = twiddles::compute_twiddle(4, 11, direction);
        let tw5: Complex<f64> = twiddles::compute_twiddle(5, 11, direction);
        let twiddle1re = unsafe { _mm_set_pd(tw1.re, tw1.re) };
        let twiddle1im = unsafe { _mm_set_pd(tw1.im, tw1.im) };
        let twiddle2re = unsafe { _mm_set_pd(tw2.re, tw2.re) };
        let twiddle2im = unsafe { _mm_set_pd(tw2.im, tw2.im) };
        let twiddle3re = unsafe { _mm_set_pd(tw3.re, tw3.re) };
        let twiddle3im = unsafe { _mm_set_pd(tw3.im, tw3.im) };
        let twiddle4re = unsafe { _mm_set_pd(tw4.re, tw4.re) };
        let twiddle4im = unsafe { _mm_set_pd(tw4.im, tw4.im) };
        let twiddle5re = unsafe { _mm_set_pd(tw5.re, tw5.re) };
        let twiddle5im = unsafe { _mm_set_pd(tw5.im, tw5.im) };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle1re,
            twiddle1im,
            twiddle2re,
            twiddle2im,
            twiddle3re,
            twiddle3im,
            twiddle4re,
            twiddle4im,
            twiddle5re,
            twiddle5im,
        }
    }

    /// Computes a size-11 FFT in place on 11 contiguous complex f64 values.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
        let out = self.perform_fft_direct(values);
        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
    }

    /// The size-11 FFT kernel for one complex value per register.
    ///
    /// Pairs each input k with its mirror 11-k via size-2 butterflies, forms
    /// real-twiddle (t_a*) and imaginary-twiddle (t_b*) combinations, rotates
    /// the latter by 90 degrees, and recombines. The twiddle index/sign
    /// patterns follow from twiddle symmetry (k*j mod 11, cosine even / sine odd).
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 11]) -> [__m128d; 11] {
        // Sums feed the real-twiddle terms; differences feed the imaginary-twiddle terms.
        let [x1p10, x1m10] = solo_fft2_f64(values[1], values[10]);
        let [x2p9, x2m9] = solo_fft2_f64(values[2], values[9]);
        let [x3p8, x3m8] = solo_fft2_f64(values[3], values[8]);
        let [x4p7, x4m7] = solo_fft2_f64(values[4], values[7]);
        let [x5p6, x5m6] = solo_fft2_f64(values[5], values[6]);
        let t_a1_1 = _mm_mul_pd(self.twiddle1re, x1p10);
        let t_a1_2 = _mm_mul_pd(self.twiddle2re, x2p9);
        let t_a1_3 = _mm_mul_pd(self.twiddle3re, x3p8);
        let t_a1_4 = _mm_mul_pd(self.twiddle4re, x4p7);
        let t_a1_5 = _mm_mul_pd(self.twiddle5re, x5p6);
        let t_a2_1 = _mm_mul_pd(self.twiddle2re, x1p10);
        let t_a2_2 = _mm_mul_pd(self.twiddle4re, x2p9);
        let t_a2_3 = _mm_mul_pd(self.twiddle5re, x3p8);
        let t_a2_4 = _mm_mul_pd(self.twiddle3re, x4p7);
        let t_a2_5 = _mm_mul_pd(self.twiddle1re, x5p6);
        let t_a3_1 = _mm_mul_pd(self.twiddle3re, x1p10);
        let t_a3_2 = _mm_mul_pd(self.twiddle5re, x2p9);
        let t_a3_3 = _mm_mul_pd(self.twiddle2re, x3p8);
        let t_a3_4 = _mm_mul_pd(self.twiddle1re, x4p7);
        let t_a3_5 = _mm_mul_pd(self.twiddle4re, x5p6);
        let t_a4_1 = _mm_mul_pd(self.twiddle4re, x1p10);
        let t_a4_2 = _mm_mul_pd(self.twiddle3re, x2p9);
        let t_a4_3 = _mm_mul_pd(self.twiddle1re, x3p8);
        let t_a4_4 = _mm_mul_pd(self.twiddle5re, x4p7);
        let t_a4_5 = _mm_mul_pd(self.twiddle2re, x5p6);
        let t_a5_1 = _mm_mul_pd(self.twiddle5re, x1p10);
        let t_a5_2 = _mm_mul_pd(self.twiddle1re, x2p9);
        let t_a5_3 = _mm_mul_pd(self.twiddle4re, x3p8);
        let t_a5_4 = _mm_mul_pd(self.twiddle2re, x4p7);
        let t_a5_5 = _mm_mul_pd(self.twiddle3re, x5p6);
        let t_b1_1 = _mm_mul_pd(self.twiddle1im, x1m10);
        let t_b1_2 = _mm_mul_pd(self.twiddle2im, x2m9);
        let t_b1_3 = _mm_mul_pd(self.twiddle3im, x3m8);
        let t_b1_4 = _mm_mul_pd(self.twiddle4im, x4m7);
        let t_b1_5 = _mm_mul_pd(self.twiddle5im, x5m6);
        let t_b2_1 = _mm_mul_pd(self.twiddle2im, x1m10);
        let t_b2_2 = _mm_mul_pd(self.twiddle4im, x2m9);
        let t_b2_3 = _mm_mul_pd(self.twiddle5im, x3m8);
        let t_b2_4 = _mm_mul_pd(self.twiddle3im, x4m7);
        let t_b2_5 = _mm_mul_pd(self.twiddle1im, x5m6);
        let t_b3_1 = _mm_mul_pd(self.twiddle3im, x1m10);
        let t_b3_2 = _mm_mul_pd(self.twiddle5im, x2m9);
        let t_b3_3 = _mm_mul_pd(self.twiddle2im, x3m8);
        let t_b3_4 = _mm_mul_pd(self.twiddle1im, x4m7);
        let t_b3_5 = _mm_mul_pd(self.twiddle4im, x5m6);
        let t_b4_1 = _mm_mul_pd(self.twiddle4im, x1m10);
        let t_b4_2 = _mm_mul_pd(self.twiddle3im, x2m9);
        let t_b4_3 = _mm_mul_pd(self.twiddle1im, x3m8);
        let t_b4_4 = _mm_mul_pd(self.twiddle5im, x4m7);
        let t_b4_5 = _mm_mul_pd(self.twiddle2im, x5m6);
        let t_b5_1 = _mm_mul_pd(self.twiddle5im, x1m10);
        let t_b5_2 = _mm_mul_pd(self.twiddle1im, x2m9);
        let t_b5_3 = _mm_mul_pd(self.twiddle4im, x3m8);
        let t_b5_4 = _mm_mul_pd(self.twiddle2im, x4m7);
        let t_b5_5 = _mm_mul_pd(self.twiddle3im, x5m6);
        let x0 = values[0];
        let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5);
        let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5);
        let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5);
        let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5);
        let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5);
        let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5);
        let t_b2 = calc_f64!(t_b2_1 + t_b2_2 - t_b2_3 - t_b2_4 - t_b2_5);
        let t_b3 = calc_f64!(t_b3_1 - t_b3_2 - t_b3_3 + t_b3_4 + t_b3_5);
        let t_b4 = calc_f64!(t_b4_1 - t_b4_2 + t_b4_3 + t_b4_4 - t_b4_5);
        let t_b5 = calc_f64!(t_b5_1 - t_b5_2 + t_b5_3 - t_b5_4 + t_b5_5);
        // Multiply the imaginary-twiddle sums by +/-i before recombination.
        let t_b1_rot = self.rotate.rotate(t_b1);
        let t_b2_rot = self.rotate.rotate(t_b2);
        let t_b3_rot = self.rotate.rotate(t_b3);
        let t_b4_rot = self.rotate.rotate(t_b4);
        let t_b5_rot = self.rotate.rotate(t_b5);
        // y0 is the plain sum of all eleven inputs.
        let y0 = calc_f64!(x0 + x1p10 + x2p9 + x3p8 + x4p7 + x5p6);
        let [y1, y10] = solo_fft2_f64(t_a1, t_b1_rot);
        let [y2, y9] = solo_fft2_f64(t_a2, t_b2_rot);
        let [y3, y8] = solo_fft2_f64(t_a3, t_b3_rot);
        let [y4, y7] = solo_fft2_f64(t_a4, t_b4_rot);
        let [y5, y6] = solo_fft2_f64(t_a5, t_b5_rot);
        [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10]
    }
}
/// SSE butterfly for a size-13 FFT over `f32` data (two transforms per pass,
/// one complex f32 per 64-bit half of each `__m128`).
pub struct SseF32Butterfly13<T> {
    direction: FftDirection,
    // Ties the generic scalar type `T` to the struct without storing a `T`.
    _phantom: std::marker::PhantomData<T>,
    // Helper for 90-degree rotation (multiplication by +/-i) of packed complex values.
    rotate: Rotate90F32,
    // twiddle{k}re / twiddle{k}im: real and imaginary parts of the k-th
    // size-13 twiddle factor, broadcast to all four f32 lanes in `new`.
    twiddle1re: __m128,
    twiddle1im: __m128,
    twiddle2re: __m128,
    twiddle2im: __m128,
    twiddle3re: __m128,
    twiddle3im: __m128,
    twiddle4re: __m128,
    twiddle4im: __m128,
    twiddle5re: __m128,
    twiddle5im: __m128,
    twiddle6re: __m128,
    twiddle6im: __m128,
}
// Macro-generated trait plumbing for the size-13 f32 butterfly (see the macro
// definitions elsewhere in this module tree). The closure extracts the direction.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly13, 13, |this: &SseF32Butterfly13<_>| this
    .direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly13, 13, |this: &SseF32Butterfly13<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly13<T> {
    /// Creates a new size-13 f32 butterfly for the given FFT direction,
    /// precomputing the six twiddle factors with each re/im part broadcast
    /// to all four f32 lanes.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let rotate = Rotate90F32::new(true);
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 13, direction);
        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 13, direction);
        let tw3: Complex<f32> = twiddles::compute_twiddle(3, 13, direction);
        let tw4: Complex<f32> = twiddles::compute_twiddle(4, 13, direction);
        let tw5: Complex<f32> = twiddles::compute_twiddle(5, 13, direction);
        let tw6: Complex<f32> = twiddles::compute_twiddle(6, 13, direction);
        let twiddle1re = unsafe { _mm_load1_ps(&tw1.re) };
        let twiddle1im = unsafe { _mm_load1_ps(&tw1.im) };
        let twiddle2re = unsafe { _mm_load1_ps(&tw2.re) };
        let twiddle2im = unsafe { _mm_load1_ps(&tw2.im) };
        let twiddle3re = unsafe { _mm_load1_ps(&tw3.re) };
        let twiddle3im = unsafe { _mm_load1_ps(&tw3.im) };
        let twiddle4re = unsafe { _mm_load1_ps(&tw4.re) };
        let twiddle4im = unsafe { _mm_load1_ps(&tw4.im) };
        let twiddle5re = unsafe { _mm_load1_ps(&tw5.re) };
        let twiddle5im = unsafe { _mm_load1_ps(&tw5.im) };
        let twiddle6re = unsafe { _mm_load1_ps(&tw6.re) };
        let twiddle6im = unsafe { _mm_load1_ps(&tw6.im) };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle1re,
            twiddle1im,
            twiddle2re,
            twiddle2im,
            twiddle3re,
            twiddle3im,
            twiddle4re,
            twiddle4im,
            twiddle5re,
            twiddle5im,
            twiddle6re,
            twiddle6im,
        }
    }

    /// Computes a single size-13 FFT in place: loads one complex value into
    /// the low half of each register, runs the two-at-a-time kernel, and
    /// stores only the low halves back.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let values = read_partial1_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
        let out = self.perform_parallel_fft_direct(values);
        write_partial_lo_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    }

    /// Computes two interleaved size-13 FFTs in place, repacking the 26
    /// inputs so each register pairs element k of both transforms.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24});
        let values = [
            extract_lo_hi_f32(input_packed[0], input_packed[6]),
            extract_hi_lo_f32(input_packed[0], input_packed[7]),
            extract_lo_hi_f32(input_packed[1], input_packed[7]),
            extract_hi_lo_f32(input_packed[1], input_packed[8]),
            extract_lo_hi_f32(input_packed[2], input_packed[8]),
            extract_hi_lo_f32(input_packed[2], input_packed[9]),
            extract_lo_hi_f32(input_packed[3], input_packed[9]),
            extract_hi_lo_f32(input_packed[3], input_packed[10]),
            extract_lo_hi_f32(input_packed[4], input_packed[10]),
            extract_hi_lo_f32(input_packed[4], input_packed[11]),
            extract_lo_hi_f32(input_packed[5], input_packed[11]),
            extract_hi_lo_f32(input_packed[5], input_packed[12]),
            extract_lo_hi_f32(input_packed[6], input_packed[12]),
        ];
        let out = self.perform_parallel_fft_direct(values);
        let out_packed = [
            extract_lo_lo_f32(out[0], out[1]),
            extract_lo_lo_f32(out[2], out[3]),
            extract_lo_lo_f32(out[4], out[5]),
            extract_lo_lo_f32(out[6], out[7]),
            extract_lo_lo_f32(out[8], out[9]),
            extract_lo_lo_f32(out[10], out[11]),
            extract_lo_hi_f32(out[12], out[0]),
            extract_hi_hi_f32(out[1], out[2]),
            extract_hi_hi_f32(out[3], out[4]),
            extract_hi_hi_f32(out[5], out[6]),
            extract_hi_hi_f32(out[7], out[8]),
            extract_hi_hi_f32(out[9], out[10]),
            extract_hi_hi_f32(out[11], out[12]),
        ];
        write_complex_to_array_strided!(out_packed, buffer, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    }

    /// The size-13 FFT kernel, applied to both complex lanes at once.
    ///
    /// Pairs each input k with its mirror 13-k via size-2 butterflies, forms
    /// real-twiddle (t_a*) and imaginary-twiddle (t_b*) combinations, rotates
    /// the latter by 90 degrees, and recombines. The twiddle index/sign
    /// patterns follow from twiddle symmetry (k*j mod 13, cosine even / sine odd).
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 13]) -> [__m128; 13] {
        // Sums feed the real-twiddle terms; differences feed the imaginary-twiddle terms.
        let [x1p12, x1m12] = parallel_fft2_interleaved_f32(values[1], values[12]);
        let [x2p11, x2m11] = parallel_fft2_interleaved_f32(values[2], values[11]);
        let [x3p10, x3m10] = parallel_fft2_interleaved_f32(values[3], values[10]);
        let [x4p9, x4m9] = parallel_fft2_interleaved_f32(values[4], values[9]);
        let [x5p8, x5m8] = parallel_fft2_interleaved_f32(values[5], values[8]);
        let [x6p7, x6m7] = parallel_fft2_interleaved_f32(values[6], values[7]);
        let t_a1_1 = _mm_mul_ps(self.twiddle1re, x1p12);
        let t_a1_2 = _mm_mul_ps(self.twiddle2re, x2p11);
        let t_a1_3 = _mm_mul_ps(self.twiddle3re, x3p10);
        let t_a1_4 = _mm_mul_ps(self.twiddle4re, x4p9);
        let t_a1_5 = _mm_mul_ps(self.twiddle5re, x5p8);
        let t_a1_6 = _mm_mul_ps(self.twiddle6re, x6p7);
        let t_a2_1 = _mm_mul_ps(self.twiddle2re, x1p12);
        let t_a2_2 = _mm_mul_ps(self.twiddle4re, x2p11);
        let t_a2_3 = _mm_mul_ps(self.twiddle6re, x3p10);
        let t_a2_4 = _mm_mul_ps(self.twiddle5re, x4p9);
        let t_a2_5 = _mm_mul_ps(self.twiddle3re, x5p8);
        let t_a2_6 = _mm_mul_ps(self.twiddle1re, x6p7);
        let t_a3_1 = _mm_mul_ps(self.twiddle3re, x1p12);
        let t_a3_2 = _mm_mul_ps(self.twiddle6re, x2p11);
        let t_a3_3 = _mm_mul_ps(self.twiddle4re, x3p10);
        let t_a3_4 = _mm_mul_ps(self.twiddle1re, x4p9);
        let t_a3_5 = _mm_mul_ps(self.twiddle2re, x5p8);
        let t_a3_6 = _mm_mul_ps(self.twiddle5re, x6p7);
        let t_a4_1 = _mm_mul_ps(self.twiddle4re, x1p12);
        let t_a4_2 = _mm_mul_ps(self.twiddle5re, x2p11);
        let t_a4_3 = _mm_mul_ps(self.twiddle1re, x3p10);
        let t_a4_4 = _mm_mul_ps(self.twiddle3re, x4p9);
        let t_a4_5 = _mm_mul_ps(self.twiddle6re, x5p8);
        let t_a4_6 = _mm_mul_ps(self.twiddle2re, x6p7);
        let t_a5_1 = _mm_mul_ps(self.twiddle5re, x1p12);
        let t_a5_2 = _mm_mul_ps(self.twiddle3re, x2p11);
        let t_a5_3 = _mm_mul_ps(self.twiddle2re, x3p10);
        let t_a5_4 = _mm_mul_ps(self.twiddle6re, x4p9);
        let t_a5_5 = _mm_mul_ps(self.twiddle1re, x5p8);
        let t_a5_6 = _mm_mul_ps(self.twiddle4re, x6p7);
        let t_a6_1 = _mm_mul_ps(self.twiddle6re, x1p12);
        let t_a6_2 = _mm_mul_ps(self.twiddle1re, x2p11);
        let t_a6_3 = _mm_mul_ps(self.twiddle5re, x3p10);
        let t_a6_4 = _mm_mul_ps(self.twiddle2re, x4p9);
        let t_a6_5 = _mm_mul_ps(self.twiddle4re, x5p8);
        let t_a6_6 = _mm_mul_ps(self.twiddle3re, x6p7);
        let t_b1_1 = _mm_mul_ps(self.twiddle1im, x1m12);
        let t_b1_2 = _mm_mul_ps(self.twiddle2im, x2m11);
        let t_b1_3 = _mm_mul_ps(self.twiddle3im, x3m10);
        let t_b1_4 = _mm_mul_ps(self.twiddle4im, x4m9);
        let t_b1_5 = _mm_mul_ps(self.twiddle5im, x5m8);
        let t_b1_6 = _mm_mul_ps(self.twiddle6im, x6m7);
        let t_b2_1 = _mm_mul_ps(self.twiddle2im, x1m12);
        let t_b2_2 = _mm_mul_ps(self.twiddle4im, x2m11);
        let t_b2_3 = _mm_mul_ps(self.twiddle6im, x3m10);
        let t_b2_4 = _mm_mul_ps(self.twiddle5im, x4m9);
        let t_b2_5 = _mm_mul_ps(self.twiddle3im, x5m8);
        let t_b2_6 = _mm_mul_ps(self.twiddle1im, x6m7);
        let t_b3_1 = _mm_mul_ps(self.twiddle3im, x1m12);
        let t_b3_2 = _mm_mul_ps(self.twiddle6im, x2m11);
        let t_b3_3 = _mm_mul_ps(self.twiddle4im, x3m10);
        let t_b3_4 = _mm_mul_ps(self.twiddle1im, x4m9);
        let t_b3_5 = _mm_mul_ps(self.twiddle2im, x5m8);
        let t_b3_6 = _mm_mul_ps(self.twiddle5im, x6m7);
        let t_b4_1 = _mm_mul_ps(self.twiddle4im, x1m12);
        let t_b4_2 = _mm_mul_ps(self.twiddle5im, x2m11);
        let t_b4_3 = _mm_mul_ps(self.twiddle1im, x3m10);
        let t_b4_4 = _mm_mul_ps(self.twiddle3im, x4m9);
        let t_b4_5 = _mm_mul_ps(self.twiddle6im, x5m8);
        let t_b4_6 = _mm_mul_ps(self.twiddle2im, x6m7);
        let t_b5_1 = _mm_mul_ps(self.twiddle5im, x1m12);
        let t_b5_2 = _mm_mul_ps(self.twiddle3im, x2m11);
        let t_b5_3 = _mm_mul_ps(self.twiddle2im, x3m10);
        let t_b5_4 = _mm_mul_ps(self.twiddle6im, x4m9);
        let t_b5_5 = _mm_mul_ps(self.twiddle1im, x5m8);
        let t_b5_6 = _mm_mul_ps(self.twiddle4im, x6m7);
        let t_b6_1 = _mm_mul_ps(self.twiddle6im, x1m12);
        let t_b6_2 = _mm_mul_ps(self.twiddle1im, x2m11);
        let t_b6_3 = _mm_mul_ps(self.twiddle5im, x3m10);
        let t_b6_4 = _mm_mul_ps(self.twiddle2im, x4m9);
        let t_b6_5 = _mm_mul_ps(self.twiddle4im, x5m8);
        let t_b6_6 = _mm_mul_ps(self.twiddle3im, x6m7);
        let x0 = values[0];
        let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6);
        let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6);
        let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6);
        let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6);
        let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6);
        let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6);
        let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6);
        let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 - t_b2_4 - t_b2_5 - t_b2_6);
        let t_b3 = calc_f32!(t_b3_1 + t_b3_2 - t_b3_3 - t_b3_4 + t_b3_5 + t_b3_6);
        let t_b4 = calc_f32!(t_b4_1 - t_b4_2 - t_b4_3 + t_b4_4 - t_b4_5 - t_b4_6);
        let t_b5 = calc_f32!(t_b5_1 - t_b5_2 + t_b5_3 - t_b5_4 - t_b5_5 + t_b5_6);
        let t_b6 = calc_f32!(t_b6_1 - t_b6_2 + t_b6_3 - t_b6_4 + t_b6_5 - t_b6_6);
        // Multiply the imaginary-twiddle sums by +/-i before recombination.
        let t_b1_rot = self.rotate.rotate_both(t_b1);
        let t_b2_rot = self.rotate.rotate_both(t_b2);
        let t_b3_rot = self.rotate.rotate_both(t_b3);
        let t_b4_rot = self.rotate.rotate_both(t_b4);
        let t_b5_rot = self.rotate.rotate_both(t_b5);
        let t_b6_rot = self.rotate.rotate_both(t_b6);
        // y0 is the plain sum of all thirteen inputs.
        let y0 = calc_f32!(x0 + x1p12 + x2p11 + x3p10 + x4p9 + x5p8 + x6p7);
        let [y1, y12] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot);
        let [y2, y11] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot);
        let [y3, y10] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot);
        let [y4, y9] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot);
        let [y5, y8] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot);
        let [y6, y7] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot);
        [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12]
    }
}
/// SSE butterfly for a size-13 FFT over f64. One complex f64 fits in each
/// `__m128d`, so this path transforms a single FFT at a time.
pub struct SseF64Butterfly13<T> {
direction: FftDirection,
_phantom: std::marker::PhantomData<T>,
// 90-degree rotation helper (multiplication by ±i) — see `Rotate90F64`;
// applied to the odd-symmetric accumulators in `perform_fft_direct`.
rotate: Rotate90F64,
// Twiddle factors 1..=6 for N=13. Each register holds one twiddle's re
// (or im) component duplicated in both f64 lanes (set in `new`); the
// transform pairs input k with input 13-k, so only half are stored.
twiddle1re: __m128d,
twiddle1im: __m128d,
twiddle2re: __m128d,
twiddle2im: __m128d,
twiddle3re: __m128d,
twiddle3im: __m128d,
twiddle4re: __m128d,
twiddle4im: __m128d,
twiddle5re: __m128d,
twiddle5im: __m128d,
twiddle6re: __m128d,
twiddle6im: __m128d,
}
// Macro-generated trait plumbing for this butterfly (FFT length 13 plus a
// direction accessor); the macros are defined elsewhere in this crate.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly13, 13, |this: &SseF64Butterfly13<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly13, 13, |this: &SseF64Butterfly13<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly13<T> {
/// Builds the butterfly, precomputing twiddle factors 1..=6 for N=13.
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
// This butterfly only supports T == f64.
assert_f64::<T>();
let rotate = Rotate90F64::new(true);
// Only twiddles 1..=6 are stored; the algorithm below pairs input k
// with input 13-k, which covers the remaining indices.
let tw1: Complex<f64> = twiddles::compute_twiddle(1, 13, direction);
let tw2: Complex<f64> = twiddles::compute_twiddle(2, 13, direction);
let tw3: Complex<f64> = twiddles::compute_twiddle(3, 13, direction);
let tw4: Complex<f64> = twiddles::compute_twiddle(4, 13, direction);
let tw5: Complex<f64> = twiddles::compute_twiddle(5, 13, direction);
let tw6: Complex<f64> = twiddles::compute_twiddle(6, 13, direction);
// Broadcast each twiddle component into both f64 lanes of a register.
let twiddle1re = unsafe { _mm_set_pd(tw1.re, tw1.re) };
let twiddle1im = unsafe { _mm_set_pd(tw1.im, tw1.im) };
let twiddle2re = unsafe { _mm_set_pd(tw2.re, tw2.re) };
let twiddle2im = unsafe { _mm_set_pd(tw2.im, tw2.im) };
let twiddle3re = unsafe { _mm_set_pd(tw3.re, tw3.re) };
let twiddle3im = unsafe { _mm_set_pd(tw3.im, tw3.im) };
let twiddle4re = unsafe { _mm_set_pd(tw4.re, tw4.re) };
let twiddle4im = unsafe { _mm_set_pd(tw4.im, tw4.im) };
let twiddle5re = unsafe { _mm_set_pd(tw5.re, tw5.re) };
let twiddle5im = unsafe { _mm_set_pd(tw5.im, tw5.im) };
let twiddle6re = unsafe { _mm_set_pd(tw6.re, tw6.re) };
let twiddle6im = unsafe { _mm_set_pd(tw6.im, tw6.im) };
Self {
direction,
_phantom: std::marker::PhantomData,
rotate,
twiddle1re,
twiddle1im,
twiddle2re,
twiddle2im,
twiddle3re,
twiddle3im,
twiddle4re,
twiddle4im,
twiddle5re,
twiddle5im,
twiddle6re,
twiddle6im,
}
}
/// Loads 13 complex f64 values from `buffer`, transforms them in registers,
/// and writes the 13 results back to the same positions.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
let out = self.perform_fft_direct(values);
write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
}
/// Size-13 FFT on values already in registers, one complex f64 per `__m128d`.
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 13]) -> [__m128d; 13] {
// Pair input k with its mirror 13-k: xKpJ = sum, xKmJ = difference.
let [x1p12, x1m12] = solo_fft2_f64(values[1], values[12]);
let [x2p11, x2m11] = solo_fft2_f64(values[2], values[11]);
let [x3p10, x3m10] = solo_fft2_f64(values[3], values[10]);
let [x4p9, x4m9] = solo_fft2_f64(values[4], values[9]);
let [x5p8, x5m8] = solo_fft2_f64(values[5], values[8]);
let [x6p7, x6m7] = solo_fft2_f64(values[6], values[7]);
// t_aK_J: twiddle real component (index K*J folded mod 13 into 1..=6)
// multiplied by pair sum J.
let t_a1_1 = _mm_mul_pd(self.twiddle1re, x1p12);
let t_a1_2 = _mm_mul_pd(self.twiddle2re, x2p11);
let t_a1_3 = _mm_mul_pd(self.twiddle3re, x3p10);
let t_a1_4 = _mm_mul_pd(self.twiddle4re, x4p9);
let t_a1_5 = _mm_mul_pd(self.twiddle5re, x5p8);
let t_a1_6 = _mm_mul_pd(self.twiddle6re, x6p7);
let t_a2_1 = _mm_mul_pd(self.twiddle2re, x1p12);
let t_a2_2 = _mm_mul_pd(self.twiddle4re, x2p11);
let t_a2_3 = _mm_mul_pd(self.twiddle6re, x3p10);
let t_a2_4 = _mm_mul_pd(self.twiddle5re, x4p9);
let t_a2_5 = _mm_mul_pd(self.twiddle3re, x5p8);
let t_a2_6 = _mm_mul_pd(self.twiddle1re, x6p7);
let t_a3_1 = _mm_mul_pd(self.twiddle3re, x1p12);
let t_a3_2 = _mm_mul_pd(self.twiddle6re, x2p11);
let t_a3_3 = _mm_mul_pd(self.twiddle4re, x3p10);
let t_a3_4 = _mm_mul_pd(self.twiddle1re, x4p9);
let t_a3_5 = _mm_mul_pd(self.twiddle2re, x5p8);
let t_a3_6 = _mm_mul_pd(self.twiddle5re, x6p7);
let t_a4_1 = _mm_mul_pd(self.twiddle4re, x1p12);
let t_a4_2 = _mm_mul_pd(self.twiddle5re, x2p11);
let t_a4_3 = _mm_mul_pd(self.twiddle1re, x3p10);
let t_a4_4 = _mm_mul_pd(self.twiddle3re, x4p9);
let t_a4_5 = _mm_mul_pd(self.twiddle6re, x5p8);
let t_a4_6 = _mm_mul_pd(self.twiddle2re, x6p7);
let t_a5_1 = _mm_mul_pd(self.twiddle5re, x1p12);
let t_a5_2 = _mm_mul_pd(self.twiddle3re, x2p11);
let t_a5_3 = _mm_mul_pd(self.twiddle2re, x3p10);
let t_a5_4 = _mm_mul_pd(self.twiddle6re, x4p9);
let t_a5_5 = _mm_mul_pd(self.twiddle1re, x5p8);
let t_a5_6 = _mm_mul_pd(self.twiddle4re, x6p7);
let t_a6_1 = _mm_mul_pd(self.twiddle6re, x1p12);
let t_a6_2 = _mm_mul_pd(self.twiddle1re, x2p11);
let t_a6_3 = _mm_mul_pd(self.twiddle5re, x3p10);
let t_a6_4 = _mm_mul_pd(self.twiddle2re, x4p9);
let t_a6_5 = _mm_mul_pd(self.twiddle4re, x5p8);
let t_a6_6 = _mm_mul_pd(self.twiddle3re, x6p7);
// t_bK_J: twiddle imaginary component multiplied by pair difference J.
let t_b1_1 = _mm_mul_pd(self.twiddle1im, x1m12);
let t_b1_2 = _mm_mul_pd(self.twiddle2im, x2m11);
let t_b1_3 = _mm_mul_pd(self.twiddle3im, x3m10);
let t_b1_4 = _mm_mul_pd(self.twiddle4im, x4m9);
let t_b1_5 = _mm_mul_pd(self.twiddle5im, x5m8);
let t_b1_6 = _mm_mul_pd(self.twiddle6im, x6m7);
let t_b2_1 = _mm_mul_pd(self.twiddle2im, x1m12);
let t_b2_2 = _mm_mul_pd(self.twiddle4im, x2m11);
let t_b2_3 = _mm_mul_pd(self.twiddle6im, x3m10);
let t_b2_4 = _mm_mul_pd(self.twiddle5im, x4m9);
let t_b2_5 = _mm_mul_pd(self.twiddle3im, x5m8);
let t_b2_6 = _mm_mul_pd(self.twiddle1im, x6m7);
let t_b3_1 = _mm_mul_pd(self.twiddle3im, x1m12);
let t_b3_2 = _mm_mul_pd(self.twiddle6im, x2m11);
let t_b3_3 = _mm_mul_pd(self.twiddle4im, x3m10);
let t_b3_4 = _mm_mul_pd(self.twiddle1im, x4m9);
let t_b3_5 = _mm_mul_pd(self.twiddle2im, x5m8);
let t_b3_6 = _mm_mul_pd(self.twiddle5im, x6m7);
let t_b4_1 = _mm_mul_pd(self.twiddle4im, x1m12);
let t_b4_2 = _mm_mul_pd(self.twiddle5im, x2m11);
let t_b4_3 = _mm_mul_pd(self.twiddle1im, x3m10);
let t_b4_4 = _mm_mul_pd(self.twiddle3im, x4m9);
let t_b4_5 = _mm_mul_pd(self.twiddle6im, x5m8);
let t_b4_6 = _mm_mul_pd(self.twiddle2im, x6m7);
let t_b5_1 = _mm_mul_pd(self.twiddle5im, x1m12);
let t_b5_2 = _mm_mul_pd(self.twiddle3im, x2m11);
let t_b5_3 = _mm_mul_pd(self.twiddle2im, x3m10);
let t_b5_4 = _mm_mul_pd(self.twiddle6im, x4m9);
let t_b5_5 = _mm_mul_pd(self.twiddle1im, x5m8);
let t_b5_6 = _mm_mul_pd(self.twiddle4im, x6m7);
let t_b6_1 = _mm_mul_pd(self.twiddle6im, x1m12);
let t_b6_2 = _mm_mul_pd(self.twiddle1im, x2m11);
let t_b6_3 = _mm_mul_pd(self.twiddle5im, x3m10);
let t_b6_4 = _mm_mul_pd(self.twiddle2im, x4m9);
let t_b6_5 = _mm_mul_pd(self.twiddle4im, x5m8);
let t_b6_6 = _mm_mul_pd(self.twiddle3im, x6m7);
let x0 = values[0];
// Even-symmetric accumulations (real twiddle components plus DC term).
let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6);
let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6);
let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6);
let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6);
let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6);
let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6);
// Odd-symmetric accumulations; the +/- signs come from folding the
// twiddle indices K*J modulo 13 back into the stored range 1..=6.
let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6);
let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 - t_b2_4 - t_b2_5 - t_b2_6);
let t_b3 = calc_f64!(t_b3_1 + t_b3_2 - t_b3_3 - t_b3_4 + t_b3_5 + t_b3_6);
let t_b4 = calc_f64!(t_b4_1 - t_b4_2 - t_b4_3 + t_b4_4 - t_b4_5 - t_b4_6);
let t_b5 = calc_f64!(t_b5_1 - t_b5_2 + t_b5_3 - t_b5_4 - t_b5_5 + t_b5_6);
let t_b6 = calc_f64!(t_b6_1 - t_b6_2 + t_b6_3 - t_b6_4 + t_b6_5 - t_b6_6);
// Rotate the odd part by 90 degrees (multiply by ±i).
let t_b1_rot = self.rotate.rotate(t_b1);
let t_b2_rot = self.rotate.rotate(t_b2);
let t_b3_rot = self.rotate.rotate(t_b3);
let t_b4_rot = self.rotate.rotate(t_b4);
let t_b5_rot = self.rotate.rotate(t_b5);
let t_b6_rot = self.rotate.rotate(t_b6);
// Output 0 is the plain sum of all thirteen inputs.
let y0 = calc_f64!(x0 + x1p12 + x2p11 + x3p10 + x4p9 + x5p8 + x6p7);
// Final 2-point butterflies produce the output pairs (k, 13-k).
let [y1, y12] = solo_fft2_f64(t_a1, t_b1_rot);
let [y2, y11] = solo_fft2_f64(t_a2, t_b2_rot);
let [y3, y10] = solo_fft2_f64(t_a3, t_b3_rot);
let [y4, y9] = solo_fft2_f64(t_a4, t_b4_rot);
let [y5, y8] = solo_fft2_f64(t_a5, t_b5_rot);
let [y6, y7] = solo_fft2_f64(t_a6, t_b6_rot);
[y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12]
}
}
/// SSE butterfly for a size-17 FFT over f32. An `__m128` holds two complex
/// f32 values, so the "parallel" code paths transform two FFTs at once.
pub struct SseF32Butterfly17<T> {
direction: FftDirection,
_phantom: std::marker::PhantomData<T>,
// 90-degree rotation helper (multiplication by ±i) — see `Rotate90F32`;
// applied to the odd-symmetric accumulators in the FFT kernel.
rotate: Rotate90F32,
// Twiddle factors 1..=8 for N=17. Each register holds one twiddle's re
// (or im) component broadcast across all four f32 lanes (set in `new`);
// the transform pairs input k with input 17-k, so only half are stored.
twiddle1re: __m128,
twiddle1im: __m128,
twiddle2re: __m128,
twiddle2im: __m128,
twiddle3re: __m128,
twiddle3im: __m128,
twiddle4re: __m128,
twiddle4im: __m128,
twiddle5re: __m128,
twiddle5im: __m128,
twiddle6re: __m128,
twiddle6im: __m128,
twiddle7re: __m128,
twiddle7im: __m128,
twiddle8re: __m128,
twiddle8im: __m128,
}
// Macro-generated trait plumbing for this butterfly (FFT length 17 plus a
// direction accessor); the macros are defined elsewhere in this crate.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly17, 17, |this: &SseF32Butterfly17<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly17, 17, |this: &SseF32Butterfly17<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly17<T> {
/// Builds the butterfly, precomputing twiddle factors 1..=8 for N=17.
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
// This butterfly only supports T == f32.
assert_f32::<T>();
let rotate = Rotate90F32::new(true);
// Only twiddles 1..=8 are stored; the algorithm below pairs input k
// with input 17-k, which covers the remaining indices.
let tw1: Complex<f32> = twiddles::compute_twiddle(1, 17, direction);
let tw2: Complex<f32> = twiddles::compute_twiddle(2, 17, direction);
let tw3: Complex<f32> = twiddles::compute_twiddle(3, 17, direction);
let tw4: Complex<f32> = twiddles::compute_twiddle(4, 17, direction);
let tw5: Complex<f32> = twiddles::compute_twiddle(5, 17, direction);
let tw6: Complex<f32> = twiddles::compute_twiddle(6, 17, direction);
let tw7: Complex<f32> = twiddles::compute_twiddle(7, 17, direction);
let tw8: Complex<f32> = twiddles::compute_twiddle(8, 17, direction);
// Broadcast each twiddle component into all four f32 lanes. `_mm_set_ps`
// replaces the previous `_mm_load1_ps(&...)`: both broadcast the same
// scalar, but this form matches the other f32 butterflies in this file
// and avoids the by-reference memory-load intrinsic.
let twiddle1re = unsafe { _mm_set_ps(tw1.re, tw1.re, tw1.re, tw1.re) };
let twiddle1im = unsafe { _mm_set_ps(tw1.im, tw1.im, tw1.im, tw1.im) };
let twiddle2re = unsafe { _mm_set_ps(tw2.re, tw2.re, tw2.re, tw2.re) };
let twiddle2im = unsafe { _mm_set_ps(tw2.im, tw2.im, tw2.im, tw2.im) };
let twiddle3re = unsafe { _mm_set_ps(tw3.re, tw3.re, tw3.re, tw3.re) };
let twiddle3im = unsafe { _mm_set_ps(tw3.im, tw3.im, tw3.im, tw3.im) };
let twiddle4re = unsafe { _mm_set_ps(tw4.re, tw4.re, tw4.re, tw4.re) };
let twiddle4im = unsafe { _mm_set_ps(tw4.im, tw4.im, tw4.im, tw4.im) };
let twiddle5re = unsafe { _mm_set_ps(tw5.re, tw5.re, tw5.re, tw5.re) };
let twiddle5im = unsafe { _mm_set_ps(tw5.im, tw5.im, tw5.im, tw5.im) };
let twiddle6re = unsafe { _mm_set_ps(tw6.re, tw6.re, tw6.re, tw6.re) };
let twiddle6im = unsafe { _mm_set_ps(tw6.im, tw6.im, tw6.im, tw6.im) };
let twiddle7re = unsafe { _mm_set_ps(tw7.re, tw7.re, tw7.re, tw7.re) };
let twiddle7im = unsafe { _mm_set_ps(tw7.im, tw7.im, tw7.im, tw7.im) };
let twiddle8re = unsafe { _mm_set_ps(tw8.re, tw8.re, tw8.re, tw8.re) };
let twiddle8im = unsafe { _mm_set_ps(tw8.im, tw8.im, tw8.im, tw8.im) };
Self {
direction,
_phantom: std::marker::PhantomData,
rotate,
twiddle1re,
twiddle1im,
twiddle2re,
twiddle2im,
twiddle3re,
twiddle3im,
twiddle4re,
twiddle4im,
twiddle5re,
twiddle5im,
twiddle6re,
twiddle6im,
twiddle7re,
twiddle7im,
twiddle8re,
twiddle8im,
}
}
/// Computes a single size-17 FFT: loads each input into the low complex
/// lane of a register, runs the parallel kernel, and writes back only the
/// low lane of each output.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let values = read_partial1_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
let out = self.perform_parallel_fft_direct(values);
write_partial_lo_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
/// Computes two size-17 FFTs at once over 34 interleaved complex values:
/// deinterleaves them into paired lanes, runs the kernel, and interleaves
/// the outputs back into the buffer.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32});
// Gather element k of FFT 0 and element k of FFT 1 into one register.
let values = [
extract_lo_hi_f32(input_packed[0], input_packed[8]),
extract_hi_lo_f32(input_packed[0], input_packed[9]),
extract_lo_hi_f32(input_packed[1], input_packed[9]),
extract_hi_lo_f32(input_packed[1], input_packed[10]),
extract_lo_hi_f32(input_packed[2], input_packed[10]),
extract_hi_lo_f32(input_packed[2], input_packed[11]),
extract_lo_hi_f32(input_packed[3], input_packed[11]),
extract_hi_lo_f32(input_packed[3], input_packed[12]),
extract_lo_hi_f32(input_packed[4], input_packed[12]),
extract_hi_lo_f32(input_packed[4], input_packed[13]),
extract_lo_hi_f32(input_packed[5], input_packed[13]),
extract_hi_lo_f32(input_packed[5], input_packed[14]),
extract_lo_hi_f32(input_packed[6], input_packed[14]),
extract_hi_lo_f32(input_packed[6], input_packed[15]),
extract_lo_hi_f32(input_packed[7], input_packed[15]),
extract_hi_lo_f32(input_packed[7], input_packed[16]),
extract_lo_hi_f32(input_packed[8], input_packed[16]),
];
let out = self.perform_parallel_fft_direct(values);
// Re-interleave: low lanes (FFT 0) first, then high lanes (FFT 1).
let out_packed = [
extract_lo_lo_f32(out[0], out[1]),
extract_lo_lo_f32(out[2], out[3]),
extract_lo_lo_f32(out[4], out[5]),
extract_lo_lo_f32(out[6], out[7]),
extract_lo_lo_f32(out[8], out[9]),
extract_lo_lo_f32(out[10], out[11]),
extract_lo_lo_f32(out[12], out[13]),
extract_lo_lo_f32(out[14], out[15]),
extract_lo_hi_f32(out[16], out[0]),
extract_hi_hi_f32(out[1], out[2]),
extract_hi_hi_f32(out[3], out[4]),
extract_hi_hi_f32(out[5], out[6]),
extract_hi_hi_f32(out[7], out[8]),
extract_hi_hi_f32(out[9], out[10]),
extract_hi_hi_f32(out[11], out[12]),
extract_hi_hi_f32(out[13], out[14]),
extract_hi_hi_f32(out[15], out[16]),
];
write_complex_to_array_strided!(out_packed, buffer, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
/// Two size-17 FFTs in parallel, one complex f32 per 64-bit register half.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 17]) -> [__m128; 17] {
// Pair input k with its mirror 17-k: xKpJ = sum, xKmJ = difference.
let [x1p16, x1m16] = parallel_fft2_interleaved_f32(values[1], values[16]);
let [x2p15, x2m15] = parallel_fft2_interleaved_f32(values[2], values[15]);
let [x3p14, x3m14] = parallel_fft2_interleaved_f32(values[3], values[14]);
let [x4p13, x4m13] = parallel_fft2_interleaved_f32(values[4], values[13]);
let [x5p12, x5m12] = parallel_fft2_interleaved_f32(values[5], values[12]);
let [x6p11, x6m11] = parallel_fft2_interleaved_f32(values[6], values[11]);
let [x7p10, x7m10] = parallel_fft2_interleaved_f32(values[7], values[10]);
let [x8p9, x8m9] = parallel_fft2_interleaved_f32(values[8], values[9]);
// t_aK_J: twiddle real component (index K*J folded mod 17 into 1..=8)
// multiplied by pair sum J.
let t_a1_1 = _mm_mul_ps(self.twiddle1re, x1p16);
let t_a1_2 = _mm_mul_ps(self.twiddle2re, x2p15);
let t_a1_3 = _mm_mul_ps(self.twiddle3re, x3p14);
let t_a1_4 = _mm_mul_ps(self.twiddle4re, x4p13);
let t_a1_5 = _mm_mul_ps(self.twiddle5re, x5p12);
let t_a1_6 = _mm_mul_ps(self.twiddle6re, x6p11);
let t_a1_7 = _mm_mul_ps(self.twiddle7re, x7p10);
let t_a1_8 = _mm_mul_ps(self.twiddle8re, x8p9);
let t_a2_1 = _mm_mul_ps(self.twiddle2re, x1p16);
let t_a2_2 = _mm_mul_ps(self.twiddle4re, x2p15);
let t_a2_3 = _mm_mul_ps(self.twiddle6re, x3p14);
let t_a2_4 = _mm_mul_ps(self.twiddle8re, x4p13);
let t_a2_5 = _mm_mul_ps(self.twiddle7re, x5p12);
let t_a2_6 = _mm_mul_ps(self.twiddle5re, x6p11);
let t_a2_7 = _mm_mul_ps(self.twiddle3re, x7p10);
let t_a2_8 = _mm_mul_ps(self.twiddle1re, x8p9);
let t_a3_1 = _mm_mul_ps(self.twiddle3re, x1p16);
let t_a3_2 = _mm_mul_ps(self.twiddle6re, x2p15);
let t_a3_3 = _mm_mul_ps(self.twiddle8re, x3p14);
let t_a3_4 = _mm_mul_ps(self.twiddle5re, x4p13);
let t_a3_5 = _mm_mul_ps(self.twiddle2re, x5p12);
let t_a3_6 = _mm_mul_ps(self.twiddle1re, x6p11);
let t_a3_7 = _mm_mul_ps(self.twiddle4re, x7p10);
let t_a3_8 = _mm_mul_ps(self.twiddle7re, x8p9);
let t_a4_1 = _mm_mul_ps(self.twiddle4re, x1p16);
let t_a4_2 = _mm_mul_ps(self.twiddle8re, x2p15);
let t_a4_3 = _mm_mul_ps(self.twiddle5re, x3p14);
let t_a4_4 = _mm_mul_ps(self.twiddle1re, x4p13);
let t_a4_5 = _mm_mul_ps(self.twiddle3re, x5p12);
let t_a4_6 = _mm_mul_ps(self.twiddle7re, x6p11);
let t_a4_7 = _mm_mul_ps(self.twiddle6re, x7p10);
let t_a4_8 = _mm_mul_ps(self.twiddle2re, x8p9);
let t_a5_1 = _mm_mul_ps(self.twiddle5re, x1p16);
let t_a5_2 = _mm_mul_ps(self.twiddle7re, x2p15);
let t_a5_3 = _mm_mul_ps(self.twiddle2re, x3p14);
let t_a5_4 = _mm_mul_ps(self.twiddle3re, x4p13);
let t_a5_5 = _mm_mul_ps(self.twiddle8re, x5p12);
let t_a5_6 = _mm_mul_ps(self.twiddle4re, x6p11);
let t_a5_7 = _mm_mul_ps(self.twiddle1re, x7p10);
let t_a5_8 = _mm_mul_ps(self.twiddle6re, x8p9);
let t_a6_1 = _mm_mul_ps(self.twiddle6re, x1p16);
let t_a6_2 = _mm_mul_ps(self.twiddle5re, x2p15);
let t_a6_3 = _mm_mul_ps(self.twiddle1re, x3p14);
let t_a6_4 = _mm_mul_ps(self.twiddle7re, x4p13);
let t_a6_5 = _mm_mul_ps(self.twiddle4re, x5p12);
let t_a6_6 = _mm_mul_ps(self.twiddle2re, x6p11);
let t_a6_7 = _mm_mul_ps(self.twiddle8re, x7p10);
let t_a6_8 = _mm_mul_ps(self.twiddle3re, x8p9);
let t_a7_1 = _mm_mul_ps(self.twiddle7re, x1p16);
let t_a7_2 = _mm_mul_ps(self.twiddle3re, x2p15);
let t_a7_3 = _mm_mul_ps(self.twiddle4re, x3p14);
let t_a7_4 = _mm_mul_ps(self.twiddle6re, x4p13);
let t_a7_5 = _mm_mul_ps(self.twiddle1re, x5p12);
let t_a7_6 = _mm_mul_ps(self.twiddle8re, x6p11);
let t_a7_7 = _mm_mul_ps(self.twiddle2re, x7p10);
let t_a7_8 = _mm_mul_ps(self.twiddle5re, x8p9);
let t_a8_1 = _mm_mul_ps(self.twiddle8re, x1p16);
let t_a8_2 = _mm_mul_ps(self.twiddle1re, x2p15);
let t_a8_3 = _mm_mul_ps(self.twiddle7re, x3p14);
let t_a8_4 = _mm_mul_ps(self.twiddle2re, x4p13);
let t_a8_5 = _mm_mul_ps(self.twiddle6re, x5p12);
let t_a8_6 = _mm_mul_ps(self.twiddle3re, x6p11);
let t_a8_7 = _mm_mul_ps(self.twiddle5re, x7p10);
let t_a8_8 = _mm_mul_ps(self.twiddle4re, x8p9);
// t_bK_J: twiddle imaginary component multiplied by pair difference J.
let t_b1_1 = _mm_mul_ps(self.twiddle1im, x1m16);
let t_b1_2 = _mm_mul_ps(self.twiddle2im, x2m15);
let t_b1_3 = _mm_mul_ps(self.twiddle3im, x3m14);
let t_b1_4 = _mm_mul_ps(self.twiddle4im, x4m13);
let t_b1_5 = _mm_mul_ps(self.twiddle5im, x5m12);
let t_b1_6 = _mm_mul_ps(self.twiddle6im, x6m11);
let t_b1_7 = _mm_mul_ps(self.twiddle7im, x7m10);
let t_b1_8 = _mm_mul_ps(self.twiddle8im, x8m9);
let t_b2_1 = _mm_mul_ps(self.twiddle2im, x1m16);
let t_b2_2 = _mm_mul_ps(self.twiddle4im, x2m15);
let t_b2_3 = _mm_mul_ps(self.twiddle6im, x3m14);
let t_b2_4 = _mm_mul_ps(self.twiddle8im, x4m13);
let t_b2_5 = _mm_mul_ps(self.twiddle7im, x5m12);
let t_b2_6 = _mm_mul_ps(self.twiddle5im, x6m11);
let t_b2_7 = _mm_mul_ps(self.twiddle3im, x7m10);
let t_b2_8 = _mm_mul_ps(self.twiddle1im, x8m9);
let t_b3_1 = _mm_mul_ps(self.twiddle3im, x1m16);
let t_b3_2 = _mm_mul_ps(self.twiddle6im, x2m15);
let t_b3_3 = _mm_mul_ps(self.twiddle8im, x3m14);
let t_b3_4 = _mm_mul_ps(self.twiddle5im, x4m13);
let t_b3_5 = _mm_mul_ps(self.twiddle2im, x5m12);
let t_b3_6 = _mm_mul_ps(self.twiddle1im, x6m11);
let t_b3_7 = _mm_mul_ps(self.twiddle4im, x7m10);
let t_b3_8 = _mm_mul_ps(self.twiddle7im, x8m9);
let t_b4_1 = _mm_mul_ps(self.twiddle4im, x1m16);
let t_b4_2 = _mm_mul_ps(self.twiddle8im, x2m15);
let t_b4_3 = _mm_mul_ps(self.twiddle5im, x3m14);
let t_b4_4 = _mm_mul_ps(self.twiddle1im, x4m13);
let t_b4_5 = _mm_mul_ps(self.twiddle3im, x5m12);
let t_b4_6 = _mm_mul_ps(self.twiddle7im, x6m11);
let t_b4_7 = _mm_mul_ps(self.twiddle6im, x7m10);
let t_b4_8 = _mm_mul_ps(self.twiddle2im, x8m9);
let t_b5_1 = _mm_mul_ps(self.twiddle5im, x1m16);
let t_b5_2 = _mm_mul_ps(self.twiddle7im, x2m15);
let t_b5_3 = _mm_mul_ps(self.twiddle2im, x3m14);
let t_b5_4 = _mm_mul_ps(self.twiddle3im, x4m13);
let t_b5_5 = _mm_mul_ps(self.twiddle8im, x5m12);
let t_b5_6 = _mm_mul_ps(self.twiddle4im, x6m11);
let t_b5_7 = _mm_mul_ps(self.twiddle1im, x7m10);
let t_b5_8 = _mm_mul_ps(self.twiddle6im, x8m9);
let t_b6_1 = _mm_mul_ps(self.twiddle6im, x1m16);
let t_b6_2 = _mm_mul_ps(self.twiddle5im, x2m15);
let t_b6_3 = _mm_mul_ps(self.twiddle1im, x3m14);
let t_b6_4 = _mm_mul_ps(self.twiddle7im, x4m13);
let t_b6_5 = _mm_mul_ps(self.twiddle4im, x5m12);
let t_b6_6 = _mm_mul_ps(self.twiddle2im, x6m11);
let t_b6_7 = _mm_mul_ps(self.twiddle8im, x7m10);
let t_b6_8 = _mm_mul_ps(self.twiddle3im, x8m9);
let t_b7_1 = _mm_mul_ps(self.twiddle7im, x1m16);
let t_b7_2 = _mm_mul_ps(self.twiddle3im, x2m15);
let t_b7_3 = _mm_mul_ps(self.twiddle4im, x3m14);
let t_b7_4 = _mm_mul_ps(self.twiddle6im, x4m13);
let t_b7_5 = _mm_mul_ps(self.twiddle1im, x5m12);
let t_b7_6 = _mm_mul_ps(self.twiddle8im, x6m11);
let t_b7_7 = _mm_mul_ps(self.twiddle2im, x7m10);
let t_b7_8 = _mm_mul_ps(self.twiddle5im, x8m9);
let t_b8_1 = _mm_mul_ps(self.twiddle8im, x1m16);
let t_b8_2 = _mm_mul_ps(self.twiddle1im, x2m15);
let t_b8_3 = _mm_mul_ps(self.twiddle7im, x3m14);
let t_b8_4 = _mm_mul_ps(self.twiddle2im, x4m13);
let t_b8_5 = _mm_mul_ps(self.twiddle6im, x5m12);
let t_b8_6 = _mm_mul_ps(self.twiddle3im, x6m11);
let t_b8_7 = _mm_mul_ps(self.twiddle5im, x7m10);
let t_b8_8 = _mm_mul_ps(self.twiddle4im, x8m9);
let x0 = values[0];
// Even-symmetric accumulations (real twiddle components plus DC term).
let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8);
let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8);
let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8);
let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8);
let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8);
let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8);
let t_a7 = calc_f32!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8);
let t_a8 = calc_f32!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8);
// Odd-symmetric accumulations; the +/- signs come from folding the
// twiddle indices K*J modulo 17 back into the stored range 1..=8.
let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8);
let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 - t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8);
let t_b3 = calc_f32!(t_b3_1 + t_b3_2 - t_b3_3 - t_b3_4 - t_b3_5 + t_b3_6 + t_b3_7 + t_b3_8);
let t_b4 = calc_f32!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 + t_b4_5 + t_b4_6 - t_b4_7 - t_b4_8);
let t_b5 = calc_f32!(t_b5_1 - t_b5_2 - t_b5_3 + t_b5_4 + t_b5_5 - t_b5_6 + t_b5_7 + t_b5_8);
let t_b6 = calc_f32!(t_b6_1 - t_b6_2 + t_b6_3 + t_b6_4 - t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8);
let t_b7 = calc_f32!(t_b7_1 - t_b7_2 + t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 + t_b7_8);
let t_b8 = calc_f32!(t_b8_1 - t_b8_2 + t_b8_3 - t_b8_4 + t_b8_5 - t_b8_6 + t_b8_7 - t_b8_8);
// Rotate the odd part by 90 degrees (multiply by ±i) in both lanes.
let t_b1_rot = self.rotate.rotate_both(t_b1);
let t_b2_rot = self.rotate.rotate_both(t_b2);
let t_b3_rot = self.rotate.rotate_both(t_b3);
let t_b4_rot = self.rotate.rotate_both(t_b4);
let t_b5_rot = self.rotate.rotate_both(t_b5);
let t_b6_rot = self.rotate.rotate_both(t_b6);
let t_b7_rot = self.rotate.rotate_both(t_b7);
let t_b8_rot = self.rotate.rotate_both(t_b8);
// Output 0 is the plain sum of all seventeen inputs.
let y0 = calc_f32!(x0 + x1p16 + x2p15 + x3p14 + x4p13 + x5p12 + x6p11 + x7p10 + x8p9);
// Final 2-point butterflies produce the output pairs (k, 17-k).
let [y1, y16] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot);
let [y2, y15] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot);
let [y3, y14] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot);
let [y4, y13] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot);
let [y5, y12] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot);
let [y6, y11] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot);
let [y7, y10] = parallel_fft2_interleaved_f32(t_a7, t_b7_rot);
let [y8, y9] = parallel_fft2_interleaved_f32(t_a8, t_b8_rot);
[y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16]
}
}
/// SSE butterfly for a size-17 FFT over f64. One complex f64 fits in each
/// `__m128d`, so this path transforms a single FFT at a time.
pub struct SseF64Butterfly17<T> {
direction: FftDirection,
_phantom: std::marker::PhantomData<T>,
// 90-degree rotation helper (multiplication by ±i) — see `Rotate90F64`;
// applied to the odd-symmetric accumulators in the FFT kernel.
rotate: Rotate90F64,
// Twiddle factors 1..=8 for N=17; each register holds the twiddle's re
// (or im) component duplicated in both f64 lanes (set in `new`); the
// transform pairs input k with input 17-k, so only half are stored.
twiddle1re: __m128d,
twiddle1im: __m128d,
twiddle2re: __m128d,
twiddle2im: __m128d,
twiddle3re: __m128d,
twiddle3im: __m128d,
twiddle4re: __m128d,
twiddle4im: __m128d,
twiddle5re: __m128d,
twiddle5im: __m128d,
twiddle6re: __m128d,
twiddle6im: __m128d,
twiddle7re: __m128d,
twiddle7im: __m128d,
twiddle8re: __m128d,
twiddle8im: __m128d,
}
// Macro-generated trait plumbing for this butterfly (FFT length 17 plus a
// direction accessor); the macros are defined elsewhere in this crate.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly17, 17, |this: &SseF64Butterfly17<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly17, 17, |this: &SseF64Butterfly17<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly17<T> {
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
    assert_f64::<T>();
    // Twiddle factors 1..=8 for a size-17 FFT in the requested direction;
    // the transform pairs input k with input 17-k, so half the set suffices.
    let w1: Complex<f64> = twiddles::compute_twiddle(1, 17, direction);
    let w2: Complex<f64> = twiddles::compute_twiddle(2, 17, direction);
    let w3: Complex<f64> = twiddles::compute_twiddle(3, 17, direction);
    let w4: Complex<f64> = twiddles::compute_twiddle(4, 17, direction);
    let w5: Complex<f64> = twiddles::compute_twiddle(5, 17, direction);
    let w6: Complex<f64> = twiddles::compute_twiddle(6, 17, direction);
    let w7: Complex<f64> = twiddles::compute_twiddle(7, 17, direction);
    let w8: Complex<f64> = twiddles::compute_twiddle(8, 17, direction);
    // SAFETY: `_mm_set1_pd(a)` is a register-only broadcast producing the
    // same value as the `_mm_set_pd(a, a)` calls used elsewhere in this
    // module, with the same SSE requirement.
    unsafe {
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate: Rotate90F64::new(true),
            twiddle1re: _mm_set1_pd(w1.re),
            twiddle1im: _mm_set1_pd(w1.im),
            twiddle2re: _mm_set1_pd(w2.re),
            twiddle2im: _mm_set1_pd(w2.im),
            twiddle3re: _mm_set1_pd(w3.re),
            twiddle3im: _mm_set1_pd(w3.im),
            twiddle4re: _mm_set1_pd(w4.re),
            twiddle4im: _mm_set1_pd(w4.im),
            twiddle5re: _mm_set1_pd(w5.re),
            twiddle5im: _mm_set1_pd(w5.im),
            twiddle6re: _mm_set1_pd(w6.re),
            twiddle6im: _mm_set1_pd(w6.im),
            twiddle7re: _mm_set1_pd(w7.re),
            twiddle7im: _mm_set1_pd(w7.im),
            twiddle8re: _mm_set1_pd(w8.re),
            twiddle8im: _mm_set1_pd(w8.im),
        }
    }
}
/// Loads 17 complex f64 values from `buffer`, transforms them in registers,
/// and writes the 17 results back to the same positions.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
let out = self.perform_fft_direct(values);
write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 17]) -> [__m128d; 17] {
let [x1p16, x1m16] = solo_fft2_f64(values[1], values[16]);
let [x2p15, x2m15] = solo_fft2_f64(values[2], values[15]);
let [x3p14, x3m14] = solo_fft2_f64(values[3], values[14]);
let [x4p13, x4m13] = solo_fft2_f64(values[4], values[13]);
let [x5p12, x5m12] = solo_fft2_f64(values[5], values[12]);
let [x6p11, x6m11] = solo_fft2_f64(values[6], values[11]);
let [x7p10, x7m10] = solo_fft2_f64(values[7], values[10]);
let [x8p9, x8m9] = solo_fft2_f64(values[8], values[9]);
let t_a1_1 = _mm_mul_pd(self.twiddle1re, x1p16);
let t_a1_2 = _mm_mul_pd(self.twiddle2re, x2p15);
let t_a1_3 = _mm_mul_pd(self.twiddle3re, x3p14);
let t_a1_4 = _mm_mul_pd(self.twiddle4re, x4p13);
let t_a1_5 = _mm_mul_pd(self.twiddle5re, x5p12);
let t_a1_6 = _mm_mul_pd(self.twiddle6re, x6p11);
let t_a1_7 = _mm_mul_pd(self.twiddle7re, x7p10);
let t_a1_8 = _mm_mul_pd(self.twiddle8re, x8p9);
let t_a2_1 = _mm_mul_pd(self.twiddle2re, x1p16);
let t_a2_2 = _mm_mul_pd(self.twiddle4re, x2p15);
let t_a2_3 = _mm_mul_pd(self.twiddle6re, x3p14);
let t_a2_4 = _mm_mul_pd(self.twiddle8re, x4p13);
let t_a2_5 = _mm_mul_pd(self.twiddle7re, x5p12);
let t_a2_6 = _mm_mul_pd(self.twiddle5re, x6p11);
let t_a2_7 = _mm_mul_pd(self.twiddle3re, x7p10);
let t_a2_8 = _mm_mul_pd(self.twiddle1re, x8p9);
let t_a3_1 = _mm_mul_pd(self.twiddle3re, x1p16);
let t_a3_2 = _mm_mul_pd(self.twiddle6re, x2p15);
let t_a3_3 = _mm_mul_pd(self.twiddle8re, x3p14);
let t_a3_4 = _mm_mul_pd(self.twiddle5re, x4p13);
let t_a3_5 = _mm_mul_pd(self.twiddle2re, x5p12);
let t_a3_6 = _mm_mul_pd(self.twiddle1re, x6p11);
let t_a3_7 = _mm_mul_pd(self.twiddle4re, x7p10);
let t_a3_8 = _mm_mul_pd(self.twiddle7re, x8p9);
let t_a4_1 = _mm_mul_pd(self.twiddle4re, x1p16);
let t_a4_2 = _mm_mul_pd(self.twiddle8re, x2p15);
let t_a4_3 = _mm_mul_pd(self.twiddle5re, x3p14);
let t_a4_4 = _mm_mul_pd(self.twiddle1re, x4p13);
let t_a4_5 = _mm_mul_pd(self.twiddle3re, x5p12);
let t_a4_6 = _mm_mul_pd(self.twiddle7re, x6p11);
let t_a4_7 = _mm_mul_pd(self.twiddle6re, x7p10);
let t_a4_8 = _mm_mul_pd(self.twiddle2re, x8p9);
let t_a5_1 = _mm_mul_pd(self.twiddle5re, x1p16);
let t_a5_2 = _mm_mul_pd(self.twiddle7re, x2p15);
let t_a5_3 = _mm_mul_pd(self.twiddle2re, x3p14);
let t_a5_4 = _mm_mul_pd(self.twiddle3re, x4p13);
let t_a5_5 = _mm_mul_pd(self.twiddle8re, x5p12);
let t_a5_6 = _mm_mul_pd(self.twiddle4re, x6p11);
let t_a5_7 = _mm_mul_pd(self.twiddle1re, x7p10);
let t_a5_8 = _mm_mul_pd(self.twiddle6re, x8p9);
let t_a6_1 = _mm_mul_pd(self.twiddle6re, x1p16);
let t_a6_2 = _mm_mul_pd(self.twiddle5re, x2p15);
let t_a6_3 = _mm_mul_pd(self.twiddle1re, x3p14);
let t_a6_4 = _mm_mul_pd(self.twiddle7re, x4p13);
let t_a6_5 = _mm_mul_pd(self.twiddle4re, x5p12);
let t_a6_6 = _mm_mul_pd(self.twiddle2re, x6p11);
let t_a6_7 = _mm_mul_pd(self.twiddle8re, x7p10);
let t_a6_8 = _mm_mul_pd(self.twiddle3re, x8p9);
let t_a7_1 = _mm_mul_pd(self.twiddle7re, x1p16);
let t_a7_2 = _mm_mul_pd(self.twiddle3re, x2p15);
let t_a7_3 = _mm_mul_pd(self.twiddle4re, x3p14);
let t_a7_4 = _mm_mul_pd(self.twiddle6re, x4p13);
let t_a7_5 = _mm_mul_pd(self.twiddle1re, x5p12);
let t_a7_6 = _mm_mul_pd(self.twiddle8re, x6p11);
let t_a7_7 = _mm_mul_pd(self.twiddle2re, x7p10);
let t_a7_8 = _mm_mul_pd(self.twiddle5re, x8p9);
let t_a8_1 = _mm_mul_pd(self.twiddle8re, x1p16);
let t_a8_2 = _mm_mul_pd(self.twiddle1re, x2p15);
let t_a8_3 = _mm_mul_pd(self.twiddle7re, x3p14);
let t_a8_4 = _mm_mul_pd(self.twiddle2re, x4p13);
let t_a8_5 = _mm_mul_pd(self.twiddle6re, x5p12);
let t_a8_6 = _mm_mul_pd(self.twiddle3re, x6p11);
let t_a8_7 = _mm_mul_pd(self.twiddle5re, x7p10);
let t_a8_8 = _mm_mul_pd(self.twiddle4re, x8p9);
let t_b1_1 = _mm_mul_pd(self.twiddle1im, x1m16);
let t_b1_2 = _mm_mul_pd(self.twiddle2im, x2m15);
let t_b1_3 = _mm_mul_pd(self.twiddle3im, x3m14);
let t_b1_4 = _mm_mul_pd(self.twiddle4im, x4m13);
let t_b1_5 = _mm_mul_pd(self.twiddle5im, x5m12);
let t_b1_6 = _mm_mul_pd(self.twiddle6im, x6m11);
let t_b1_7 = _mm_mul_pd(self.twiddle7im, x7m10);
let t_b1_8 = _mm_mul_pd(self.twiddle8im, x8m9);
let t_b2_1 = _mm_mul_pd(self.twiddle2im, x1m16);
let t_b2_2 = _mm_mul_pd(self.twiddle4im, x2m15);
let t_b2_3 = _mm_mul_pd(self.twiddle6im, x3m14);
let t_b2_4 = _mm_mul_pd(self.twiddle8im, x4m13);
let t_b2_5 = _mm_mul_pd(self.twiddle7im, x5m12);
let t_b2_6 = _mm_mul_pd(self.twiddle5im, x6m11);
let t_b2_7 = _mm_mul_pd(self.twiddle3im, x7m10);
let t_b2_8 = _mm_mul_pd(self.twiddle1im, x8m9);
let t_b3_1 = _mm_mul_pd(self.twiddle3im, x1m16);
let t_b3_2 = _mm_mul_pd(self.twiddle6im, x2m15);
let t_b3_3 = _mm_mul_pd(self.twiddle8im, x3m14);
let t_b3_4 = _mm_mul_pd(self.twiddle5im, x4m13);
let t_b3_5 = _mm_mul_pd(self.twiddle2im, x5m12);
let t_b3_6 = _mm_mul_pd(self.twiddle1im, x6m11);
let t_b3_7 = _mm_mul_pd(self.twiddle4im, x7m10);
let t_b3_8 = _mm_mul_pd(self.twiddle7im, x8m9);
let t_b4_1 = _mm_mul_pd(self.twiddle4im, x1m16);
let t_b4_2 = _mm_mul_pd(self.twiddle8im, x2m15);
let t_b4_3 = _mm_mul_pd(self.twiddle5im, x3m14);
let t_b4_4 = _mm_mul_pd(self.twiddle1im, x4m13);
let t_b4_5 = _mm_mul_pd(self.twiddle3im, x5m12);
let t_b4_6 = _mm_mul_pd(self.twiddle7im, x6m11);
let t_b4_7 = _mm_mul_pd(self.twiddle6im, x7m10);
let t_b4_8 = _mm_mul_pd(self.twiddle2im, x8m9);
let t_b5_1 = _mm_mul_pd(self.twiddle5im, x1m16);
let t_b5_2 = _mm_mul_pd(self.twiddle7im, x2m15);
let t_b5_3 = _mm_mul_pd(self.twiddle2im, x3m14);
let t_b5_4 = _mm_mul_pd(self.twiddle3im, x4m13);
let t_b5_5 = _mm_mul_pd(self.twiddle8im, x5m12);
let t_b5_6 = _mm_mul_pd(self.twiddle4im, x6m11);
let t_b5_7 = _mm_mul_pd(self.twiddle1im, x7m10);
let t_b5_8 = _mm_mul_pd(self.twiddle6im, x8m9);
let t_b6_1 = _mm_mul_pd(self.twiddle6im, x1m16);
let t_b6_2 = _mm_mul_pd(self.twiddle5im, x2m15);
let t_b6_3 = _mm_mul_pd(self.twiddle1im, x3m14);
let t_b6_4 = _mm_mul_pd(self.twiddle7im, x4m13);
let t_b6_5 = _mm_mul_pd(self.twiddle4im, x5m12);
let t_b6_6 = _mm_mul_pd(self.twiddle2im, x6m11);
let t_b6_7 = _mm_mul_pd(self.twiddle8im, x7m10);
let t_b6_8 = _mm_mul_pd(self.twiddle3im, x8m9);
let t_b7_1 = _mm_mul_pd(self.twiddle7im, x1m16);
let t_b7_2 = _mm_mul_pd(self.twiddle3im, x2m15);
let t_b7_3 = _mm_mul_pd(self.twiddle4im, x3m14);
let t_b7_4 = _mm_mul_pd(self.twiddle6im, x4m13);
let t_b7_5 = _mm_mul_pd(self.twiddle1im, x5m12);
let t_b7_6 = _mm_mul_pd(self.twiddle8im, x6m11);
let t_b7_7 = _mm_mul_pd(self.twiddle2im, x7m10);
let t_b7_8 = _mm_mul_pd(self.twiddle5im, x8m9);
let t_b8_1 = _mm_mul_pd(self.twiddle8im, x1m16);
let t_b8_2 = _mm_mul_pd(self.twiddle1im, x2m15);
let t_b8_3 = _mm_mul_pd(self.twiddle7im, x3m14);
let t_b8_4 = _mm_mul_pd(self.twiddle2im, x4m13);
let t_b8_5 = _mm_mul_pd(self.twiddle6im, x5m12);
let t_b8_6 = _mm_mul_pd(self.twiddle3im, x6m11);
let t_b8_7 = _mm_mul_pd(self.twiddle5im, x7m10);
let t_b8_8 = _mm_mul_pd(self.twiddle4im, x8m9);
let x0 = values[0];
let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8);
let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8);
let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8);
let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8);
let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8);
let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8);
let t_a7 = calc_f64!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8);
let t_a8 = calc_f64!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8);
let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8);
let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 - t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8);
let t_b3 = calc_f64!(t_b3_1 + t_b3_2 - t_b3_3 - t_b3_4 - t_b3_5 + t_b3_6 + t_b3_7 + t_b3_8);
let t_b4 = calc_f64!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 + t_b4_5 + t_b4_6 - t_b4_7 - t_b4_8);
let t_b5 = calc_f64!(t_b5_1 - t_b5_2 - t_b5_3 + t_b5_4 + t_b5_5 - t_b5_6 + t_b5_7 + t_b5_8);
let t_b6 = calc_f64!(t_b6_1 - t_b6_2 + t_b6_3 + t_b6_4 - t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8);
let t_b7 = calc_f64!(t_b7_1 - t_b7_2 + t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 + t_b7_8);
let t_b8 = calc_f64!(t_b8_1 - t_b8_2 + t_b8_3 - t_b8_4 + t_b8_5 - t_b8_6 + t_b8_7 - t_b8_8);
let t_b1_rot = self.rotate.rotate(t_b1);
let t_b2_rot = self.rotate.rotate(t_b2);
let t_b3_rot = self.rotate.rotate(t_b3);
let t_b4_rot = self.rotate.rotate(t_b4);
let t_b5_rot = self.rotate.rotate(t_b5);
let t_b6_rot = self.rotate.rotate(t_b6);
let t_b7_rot = self.rotate.rotate(t_b7);
let t_b8_rot = self.rotate.rotate(t_b8);
let y0 = calc_f64!(x0 + x1p16 + x2p15 + x3p14 + x4p13 + x5p12 + x6p11 + x7p10 + x8p9);
let [y1, y16] = solo_fft2_f64(t_a1, t_b1_rot);
let [y2, y15] = solo_fft2_f64(t_a2, t_b2_rot);
let [y3, y14] = solo_fft2_f64(t_a3, t_b3_rot);
let [y4, y13] = solo_fft2_f64(t_a4, t_b4_rot);
let [y5, y12] = solo_fft2_f64(t_a5, t_b5_rot);
let [y6, y11] = solo_fft2_f64(t_a6, t_b6_rot);
let [y7, y10] = solo_fft2_f64(t_a7, t_b7_rot);
let [y8, y9] = solo_fft2_f64(t_a8, t_b8_rot);
[y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16]
}
}
/// Size-19 FFT butterfly for f32 samples, implemented with SSE intrinsics.
///
/// Each `__m128` twiddle field holds a single twiddle component (re or im)
/// broadcast across all four f32 lanes, so one register can be applied to two
/// interleaved complex values at once (see `perform_parallel_fft_contiguous`).
pub struct SseF32Butterfly19<T> {
// Forward or inverse transform; exposed via the boilerplate trait impls below.
direction: FftDirection,
// Ties the public type parameter `T` to this type; `new` asserts T == f32.
_phantom: std::marker::PhantomData<T>,
// Helper that rotates complex values by 90 degrees (multiplication by +/-i);
// used on the imaginary-part accumulators in the direct kernel.
rotate: Rotate90F32,
// Real/imaginary parts of the twiddle factors for harmonics k = 1..=9 of a
// length-19 FFT (see `twiddles::compute_twiddle`), lane-broadcast in `new`.
twiddle1re: __m128,
twiddle1im: __m128,
twiddle2re: __m128,
twiddle2im: __m128,
twiddle3re: __m128,
twiddle3im: __m128,
twiddle4re: __m128,
twiddle4im: __m128,
twiddle5re: __m128,
twiddle5im: __m128,
twiddle6re: __m128,
twiddle6im: __m128,
twiddle7re: __m128,
twiddle7im: __m128,
twiddle8re: __m128,
twiddle8im: __m128,
twiddle9re: __m128,
twiddle9im: __m128,
}
// Generate the shared butterfly plumbing (Fft/Length/Direction trait impls and
// buffer-length checks) for this size-19 f32 butterfly; the closure tells the
// generated code how to read the FFT direction from an instance.
// NOTE(review): macro bodies are defined elsewhere in this module tree.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly19, 19, |this: &SseF32Butterfly19<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly19, 19, |this: &SseF32Butterfly19<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly19<T> {
/// Builds the butterfly: computes the nine length-19 twiddle factors for the
/// requested direction and broadcasts each re/im component into all four
/// lanes of a `__m128`.
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let rotate = Rotate90F32::new(true);
let tw1: Complex<f32> = twiddles::compute_twiddle(1, 19, direction);
let tw2: Complex<f32> = twiddles::compute_twiddle(2, 19, direction);
let tw3: Complex<f32> = twiddles::compute_twiddle(3, 19, direction);
let tw4: Complex<f32> = twiddles::compute_twiddle(4, 19, direction);
let tw5: Complex<f32> = twiddles::compute_twiddle(5, 19, direction);
let tw6: Complex<f32> = twiddles::compute_twiddle(6, 19, direction);
let tw7: Complex<f32> = twiddles::compute_twiddle(7, 19, direction);
let tw8: Complex<f32> = twiddles::compute_twiddle(8, 19, direction);
let tw9: Complex<f32> = twiddles::compute_twiddle(9, 19, direction);
// NOTE(review): this constructor broadcasts with `_mm_load1_ps`, while the
// sibling butterflies in this file use `_mm_set_ps`/`_mm_set_pd` for the
// same job. Both yield all-lanes-equal registers, so behavior is fine, but
// the style is inconsistent with the rest of the file.
let twiddle1re = unsafe { _mm_load1_ps(&tw1.re) };
let twiddle1im = unsafe { _mm_load1_ps(&tw1.im) };
let twiddle2re = unsafe { _mm_load1_ps(&tw2.re) };
let twiddle2im = unsafe { _mm_load1_ps(&tw2.im) };
let twiddle3re = unsafe { _mm_load1_ps(&tw3.re) };
let twiddle3im = unsafe { _mm_load1_ps(&tw3.im) };
let twiddle4re = unsafe { _mm_load1_ps(&tw4.re) };
let twiddle4im = unsafe { _mm_load1_ps(&tw4.im) };
let twiddle5re = unsafe { _mm_load1_ps(&tw5.re) };
let twiddle5im = unsafe { _mm_load1_ps(&tw5.im) };
let twiddle6re = unsafe { _mm_load1_ps(&tw6.re) };
let twiddle6im = unsafe { _mm_load1_ps(&tw6.im) };
let twiddle7re = unsafe { _mm_load1_ps(&tw7.re) };
let twiddle7im = unsafe { _mm_load1_ps(&tw7.im) };
let twiddle8re = unsafe { _mm_load1_ps(&tw8.re) };
let twiddle8im = unsafe { _mm_load1_ps(&tw8.im) };
let twiddle9re = unsafe { _mm_load1_ps(&tw9.re) };
let twiddle9im = unsafe { _mm_load1_ps(&tw9.im) };
Self {
direction,
_phantom: std::marker::PhantomData,
rotate,
twiddle1re,
twiddle1im,
twiddle2re,
twiddle2im,
twiddle3re,
twiddle3im,
twiddle4re,
twiddle4im,
twiddle5re,
twiddle5im,
twiddle6re,
twiddle6im,
twiddle7re,
twiddle7im,
twiddle8re,
twiddle8im,
twiddle9re,
twiddle9im,
}
}
/// Computes a single size-19 FFT in place over `buffer`.
///
/// Loads each complex value into one register (the `partial1` read —
/// presumably only the low half of each lane pair is meaningful; verify
/// against the macro definitions), runs the two-FFT kernel, and writes back
/// only the low complex of each result.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let values = read_partial1_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
let out = self.perform_parallel_fft_direct(values);
write_partial_lo_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
}
/// Computes two interleaved size-19 FFTs in place over `buffer` (38 complex
/// f32 values: FFT A at even indices, FFT B at odd indices).
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
// Each packed register holds two adjacent complex values; the extract
// shuffles regroup them so values[k] holds element k of both FFTs.
let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36});
let values = [
extract_lo_hi_f32(input_packed[0], input_packed[9]),
extract_hi_lo_f32(input_packed[0], input_packed[10]),
extract_lo_hi_f32(input_packed[1], input_packed[10]),
extract_hi_lo_f32(input_packed[1], input_packed[11]),
extract_lo_hi_f32(input_packed[2], input_packed[11]),
extract_hi_lo_f32(input_packed[2], input_packed[12]),
extract_lo_hi_f32(input_packed[3], input_packed[12]),
extract_hi_lo_f32(input_packed[3], input_packed[13]),
extract_lo_hi_f32(input_packed[4], input_packed[13]),
extract_hi_lo_f32(input_packed[4], input_packed[14]),
extract_lo_hi_f32(input_packed[5], input_packed[14]),
extract_hi_lo_f32(input_packed[5], input_packed[15]),
extract_lo_hi_f32(input_packed[6], input_packed[15]),
extract_hi_lo_f32(input_packed[6], input_packed[16]),
extract_lo_hi_f32(input_packed[7], input_packed[16]),
extract_hi_lo_f32(input_packed[7], input_packed[17]),
extract_lo_hi_f32(input_packed[8], input_packed[17]),
extract_hi_lo_f32(input_packed[8], input_packed[18]),
extract_lo_hi_f32(input_packed[9], input_packed[18]),
];
let out = self.perform_parallel_fft_direct(values);
// Inverse shuffle: repack per-element results into the interleaved layout.
let out_packed = [
extract_lo_lo_f32(out[0], out[1]),
extract_lo_lo_f32(out[2], out[3]),
extract_lo_lo_f32(out[4], out[5]),
extract_lo_lo_f32(out[6], out[7]),
extract_lo_lo_f32(out[8], out[9]),
extract_lo_lo_f32(out[10], out[11]),
extract_lo_lo_f32(out[12], out[13]),
extract_lo_lo_f32(out[14], out[15]),
extract_lo_lo_f32(out[16], out[17]),
extract_lo_hi_f32(out[18], out[0]),
extract_hi_hi_f32(out[1], out[2]),
extract_hi_hi_f32(out[3], out[4]),
extract_hi_hi_f32(out[5], out[6]),
extract_hi_hi_f32(out[7], out[8]),
extract_hi_hi_f32(out[9], out[10]),
extract_hi_hi_f32(out[11], out[12]),
extract_hi_hi_f32(out[13], out[14]),
extract_hi_hi_f32(out[15], out[16]),
extract_hi_hi_f32(out[17], out[18]),
];
write_complex_to_array_strided!(out_packed, buffer, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
}
/// Direct size-19 DFT kernel (two FFTs per register).
///
/// Exploits DFT symmetry: inputs are paired as (x[k], x[19-k]) with a
/// 2-point butterfly, then twiddle-real-weighted sums (`t_a*`) and
/// twiddle-imag-weighted sums (`t_b*`) are accumulated separately. The
/// `t_b*` sums are rotated 90 degrees and a final 2-point butterfly yields
/// each output pair (y[k], y[19-k]).
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 19]) -> [__m128; 19] {
// Pairwise sums/differences of mirrored inputs: xNpM = x[N]+x[M], xNmM = x[N]-x[M].
let [x1p18, x1m18] = parallel_fft2_interleaved_f32(values[1], values[18]);
let [x2p17, x2m17] = parallel_fft2_interleaved_f32(values[2], values[17]);
let [x3p16, x3m16] = parallel_fft2_interleaved_f32(values[3], values[16]);
let [x4p15, x4m15] = parallel_fft2_interleaved_f32(values[4], values[15]);
let [x5p14, x5m14] = parallel_fft2_interleaved_f32(values[5], values[14]);
let [x6p13, x6m13] = parallel_fft2_interleaved_f32(values[6], values[13]);
let [x7p12, x7m12] = parallel_fft2_interleaved_f32(values[7], values[12]);
let [x8p11, x8m11] = parallel_fft2_interleaved_f32(values[8], values[11]);
let [x9p10, x9m10] = parallel_fft2_interleaved_f32(values[9], values[10]);
// t_aJ_K: real-twiddle product for output row J, input pair K. The twiddle
// index used for (J, K) is J*K reduced into 1..=9 (mod 19, mirrored).
// NOTE(review): these index tables look machine-generated — verify against
// the scalar Butterfly19 before hand-editing anything below.
let t_a1_1 = _mm_mul_ps(self.twiddle1re, x1p18);
let t_a1_2 = _mm_mul_ps(self.twiddle2re, x2p17);
let t_a1_3 = _mm_mul_ps(self.twiddle3re, x3p16);
let t_a1_4 = _mm_mul_ps(self.twiddle4re, x4p15);
let t_a1_5 = _mm_mul_ps(self.twiddle5re, x5p14);
let t_a1_6 = _mm_mul_ps(self.twiddle6re, x6p13);
let t_a1_7 = _mm_mul_ps(self.twiddle7re, x7p12);
let t_a1_8 = _mm_mul_ps(self.twiddle8re, x8p11);
let t_a1_9 = _mm_mul_ps(self.twiddle9re, x9p10);
let t_a2_1 = _mm_mul_ps(self.twiddle2re, x1p18);
let t_a2_2 = _mm_mul_ps(self.twiddle4re, x2p17);
let t_a2_3 = _mm_mul_ps(self.twiddle6re, x3p16);
let t_a2_4 = _mm_mul_ps(self.twiddle8re, x4p15);
let t_a2_5 = _mm_mul_ps(self.twiddle9re, x5p14);
let t_a2_6 = _mm_mul_ps(self.twiddle7re, x6p13);
let t_a2_7 = _mm_mul_ps(self.twiddle5re, x7p12);
let t_a2_8 = _mm_mul_ps(self.twiddle3re, x8p11);
let t_a2_9 = _mm_mul_ps(self.twiddle1re, x9p10);
let t_a3_1 = _mm_mul_ps(self.twiddle3re, x1p18);
let t_a3_2 = _mm_mul_ps(self.twiddle6re, x2p17);
let t_a3_3 = _mm_mul_ps(self.twiddle9re, x3p16);
let t_a3_4 = _mm_mul_ps(self.twiddle7re, x4p15);
let t_a3_5 = _mm_mul_ps(self.twiddle4re, x5p14);
let t_a3_6 = _mm_mul_ps(self.twiddle1re, x6p13);
let t_a3_7 = _mm_mul_ps(self.twiddle2re, x7p12);
let t_a3_8 = _mm_mul_ps(self.twiddle5re, x8p11);
let t_a3_9 = _mm_mul_ps(self.twiddle8re, x9p10);
let t_a4_1 = _mm_mul_ps(self.twiddle4re, x1p18);
let t_a4_2 = _mm_mul_ps(self.twiddle8re, x2p17);
let t_a4_3 = _mm_mul_ps(self.twiddle7re, x3p16);
let t_a4_4 = _mm_mul_ps(self.twiddle3re, x4p15);
let t_a4_5 = _mm_mul_ps(self.twiddle1re, x5p14);
let t_a4_6 = _mm_mul_ps(self.twiddle5re, x6p13);
let t_a4_7 = _mm_mul_ps(self.twiddle9re, x7p12);
let t_a4_8 = _mm_mul_ps(self.twiddle6re, x8p11);
let t_a4_9 = _mm_mul_ps(self.twiddle2re, x9p10);
let t_a5_1 = _mm_mul_ps(self.twiddle5re, x1p18);
let t_a5_2 = _mm_mul_ps(self.twiddle9re, x2p17);
let t_a5_3 = _mm_mul_ps(self.twiddle4re, x3p16);
let t_a5_4 = _mm_mul_ps(self.twiddle1re, x4p15);
let t_a5_5 = _mm_mul_ps(self.twiddle6re, x5p14);
let t_a5_6 = _mm_mul_ps(self.twiddle8re, x6p13);
let t_a5_7 = _mm_mul_ps(self.twiddle3re, x7p12);
let t_a5_8 = _mm_mul_ps(self.twiddle2re, x8p11);
let t_a5_9 = _mm_mul_ps(self.twiddle7re, x9p10);
let t_a6_1 = _mm_mul_ps(self.twiddle6re, x1p18);
let t_a6_2 = _mm_mul_ps(self.twiddle7re, x2p17);
let t_a6_3 = _mm_mul_ps(self.twiddle1re, x3p16);
let t_a6_4 = _mm_mul_ps(self.twiddle5re, x4p15);
let t_a6_5 = _mm_mul_ps(self.twiddle8re, x5p14);
let t_a6_6 = _mm_mul_ps(self.twiddle2re, x6p13);
let t_a6_7 = _mm_mul_ps(self.twiddle4re, x7p12);
let t_a6_8 = _mm_mul_ps(self.twiddle9re, x8p11);
let t_a6_9 = _mm_mul_ps(self.twiddle3re, x9p10);
let t_a7_1 = _mm_mul_ps(self.twiddle7re, x1p18);
let t_a7_2 = _mm_mul_ps(self.twiddle5re, x2p17);
let t_a7_3 = _mm_mul_ps(self.twiddle2re, x3p16);
let t_a7_4 = _mm_mul_ps(self.twiddle9re, x4p15);
let t_a7_5 = _mm_mul_ps(self.twiddle3re, x5p14);
let t_a7_6 = _mm_mul_ps(self.twiddle4re, x6p13);
let t_a7_7 = _mm_mul_ps(self.twiddle8re, x7p12);
let t_a7_8 = _mm_mul_ps(self.twiddle1re, x8p11);
let t_a7_9 = _mm_mul_ps(self.twiddle6re, x9p10);
let t_a8_1 = _mm_mul_ps(self.twiddle8re, x1p18);
let t_a8_2 = _mm_mul_ps(self.twiddle3re, x2p17);
let t_a8_3 = _mm_mul_ps(self.twiddle5re, x3p16);
let t_a8_4 = _mm_mul_ps(self.twiddle6re, x4p15);
let t_a8_5 = _mm_mul_ps(self.twiddle2re, x5p14);
let t_a8_6 = _mm_mul_ps(self.twiddle9re, x6p13);
let t_a8_7 = _mm_mul_ps(self.twiddle1re, x7p12);
let t_a8_8 = _mm_mul_ps(self.twiddle7re, x8p11);
let t_a8_9 = _mm_mul_ps(self.twiddle4re, x9p10);
let t_a9_1 = _mm_mul_ps(self.twiddle9re, x1p18);
let t_a9_2 = _mm_mul_ps(self.twiddle1re, x2p17);
let t_a9_3 = _mm_mul_ps(self.twiddle8re, x3p16);
let t_a9_4 = _mm_mul_ps(self.twiddle2re, x4p15);
let t_a9_5 = _mm_mul_ps(self.twiddle7re, x5p14);
let t_a9_6 = _mm_mul_ps(self.twiddle3re, x6p13);
let t_a9_7 = _mm_mul_ps(self.twiddle6re, x7p12);
let t_a9_8 = _mm_mul_ps(self.twiddle4re, x8p11);
let t_a9_9 = _mm_mul_ps(self.twiddle5re, x9p10);
// t_bJ_K: imaginary-twiddle product for output row J, applied to the
// pairwise differences; same index table as the t_a rows.
let t_b1_1 = _mm_mul_ps(self.twiddle1im, x1m18);
let t_b1_2 = _mm_mul_ps(self.twiddle2im, x2m17);
let t_b1_3 = _mm_mul_ps(self.twiddle3im, x3m16);
let t_b1_4 = _mm_mul_ps(self.twiddle4im, x4m15);
let t_b1_5 = _mm_mul_ps(self.twiddle5im, x5m14);
let t_b1_6 = _mm_mul_ps(self.twiddle6im, x6m13);
let t_b1_7 = _mm_mul_ps(self.twiddle7im, x7m12);
let t_b1_8 = _mm_mul_ps(self.twiddle8im, x8m11);
let t_b1_9 = _mm_mul_ps(self.twiddle9im, x9m10);
let t_b2_1 = _mm_mul_ps(self.twiddle2im, x1m18);
let t_b2_2 = _mm_mul_ps(self.twiddle4im, x2m17);
let t_b2_3 = _mm_mul_ps(self.twiddle6im, x3m16);
let t_b2_4 = _mm_mul_ps(self.twiddle8im, x4m15);
let t_b2_5 = _mm_mul_ps(self.twiddle9im, x5m14);
let t_b2_6 = _mm_mul_ps(self.twiddle7im, x6m13);
let t_b2_7 = _mm_mul_ps(self.twiddle5im, x7m12);
let t_b2_8 = _mm_mul_ps(self.twiddle3im, x8m11);
let t_b2_9 = _mm_mul_ps(self.twiddle1im, x9m10);
let t_b3_1 = _mm_mul_ps(self.twiddle3im, x1m18);
let t_b3_2 = _mm_mul_ps(self.twiddle6im, x2m17);
let t_b3_3 = _mm_mul_ps(self.twiddle9im, x3m16);
let t_b3_4 = _mm_mul_ps(self.twiddle7im, x4m15);
let t_b3_5 = _mm_mul_ps(self.twiddle4im, x5m14);
let t_b3_6 = _mm_mul_ps(self.twiddle1im, x6m13);
let t_b3_7 = _mm_mul_ps(self.twiddle2im, x7m12);
let t_b3_8 = _mm_mul_ps(self.twiddle5im, x8m11);
let t_b3_9 = _mm_mul_ps(self.twiddle8im, x9m10);
let t_b4_1 = _mm_mul_ps(self.twiddle4im, x1m18);
let t_b4_2 = _mm_mul_ps(self.twiddle8im, x2m17);
let t_b4_3 = _mm_mul_ps(self.twiddle7im, x3m16);
let t_b4_4 = _mm_mul_ps(self.twiddle3im, x4m15);
let t_b4_5 = _mm_mul_ps(self.twiddle1im, x5m14);
let t_b4_6 = _mm_mul_ps(self.twiddle5im, x6m13);
let t_b4_7 = _mm_mul_ps(self.twiddle9im, x7m12);
let t_b4_8 = _mm_mul_ps(self.twiddle6im, x8m11);
let t_b4_9 = _mm_mul_ps(self.twiddle2im, x9m10);
let t_b5_1 = _mm_mul_ps(self.twiddle5im, x1m18);
let t_b5_2 = _mm_mul_ps(self.twiddle9im, x2m17);
let t_b5_3 = _mm_mul_ps(self.twiddle4im, x3m16);
let t_b5_4 = _mm_mul_ps(self.twiddle1im, x4m15);
let t_b5_5 = _mm_mul_ps(self.twiddle6im, x5m14);
let t_b5_6 = _mm_mul_ps(self.twiddle8im, x6m13);
let t_b5_7 = _mm_mul_ps(self.twiddle3im, x7m12);
let t_b5_8 = _mm_mul_ps(self.twiddle2im, x8m11);
let t_b5_9 = _mm_mul_ps(self.twiddle7im, x9m10);
let t_b6_1 = _mm_mul_ps(self.twiddle6im, x1m18);
let t_b6_2 = _mm_mul_ps(self.twiddle7im, x2m17);
let t_b6_3 = _mm_mul_ps(self.twiddle1im, x3m16);
let t_b6_4 = _mm_mul_ps(self.twiddle5im, x4m15);
let t_b6_5 = _mm_mul_ps(self.twiddle8im, x5m14);
let t_b6_6 = _mm_mul_ps(self.twiddle2im, x6m13);
let t_b6_7 = _mm_mul_ps(self.twiddle4im, x7m12);
let t_b6_8 = _mm_mul_ps(self.twiddle9im, x8m11);
let t_b6_9 = _mm_mul_ps(self.twiddle3im, x9m10);
let t_b7_1 = _mm_mul_ps(self.twiddle7im, x1m18);
let t_b7_2 = _mm_mul_ps(self.twiddle5im, x2m17);
let t_b7_3 = _mm_mul_ps(self.twiddle2im, x3m16);
let t_b7_4 = _mm_mul_ps(self.twiddle9im, x4m15);
let t_b7_5 = _mm_mul_ps(self.twiddle3im, x5m14);
let t_b7_6 = _mm_mul_ps(self.twiddle4im, x6m13);
let t_b7_7 = _mm_mul_ps(self.twiddle8im, x7m12);
let t_b7_8 = _mm_mul_ps(self.twiddle1im, x8m11);
let t_b7_9 = _mm_mul_ps(self.twiddle6im, x9m10);
let t_b8_1 = _mm_mul_ps(self.twiddle8im, x1m18);
let t_b8_2 = _mm_mul_ps(self.twiddle3im, x2m17);
let t_b8_3 = _mm_mul_ps(self.twiddle5im, x3m16);
let t_b8_4 = _mm_mul_ps(self.twiddle6im, x4m15);
let t_b8_5 = _mm_mul_ps(self.twiddle2im, x5m14);
let t_b8_6 = _mm_mul_ps(self.twiddle9im, x6m13);
let t_b8_7 = _mm_mul_ps(self.twiddle1im, x7m12);
let t_b8_8 = _mm_mul_ps(self.twiddle7im, x8m11);
let t_b8_9 = _mm_mul_ps(self.twiddle4im, x9m10);
let t_b9_1 = _mm_mul_ps(self.twiddle9im, x1m18);
let t_b9_2 = _mm_mul_ps(self.twiddle1im, x2m17);
let t_b9_3 = _mm_mul_ps(self.twiddle8im, x3m16);
let t_b9_4 = _mm_mul_ps(self.twiddle2im, x4m15);
let t_b9_5 = _mm_mul_ps(self.twiddle7im, x5m14);
let t_b9_6 = _mm_mul_ps(self.twiddle3im, x6m13);
let t_b9_7 = _mm_mul_ps(self.twiddle6im, x7m12);
let t_b9_8 = _mm_mul_ps(self.twiddle4im, x8m11);
let t_b9_9 = _mm_mul_ps(self.twiddle5im, x9m10);
let x0 = values[0];
// Real-part accumulators: DC term plus all cosine-weighted contributions.
let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9);
let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9);
let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9);
let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9);
let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9);
let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9);
let t_a7 = calc_f32!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9);
let t_a8 = calc_f32!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9);
let t_a9 = calc_f32!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9);
// Imaginary-part accumulators. The +/- pattern is assumed to encode the
// sign flips from folding twiddle indices j*k mod 19 into 1..=9 —
// NOTE(review): verify against the scalar Butterfly19 before editing.
let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9);
let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 - t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8 - t_b2_9);
let t_b3 = calc_f32!(t_b3_1 + t_b3_2 + t_b3_3 - t_b3_4 - t_b3_5 - t_b3_6 + t_b3_7 + t_b3_8 + t_b3_9);
let t_b4 = calc_f32!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 + t_b4_5 + t_b4_6 + t_b4_7 - t_b4_8 - t_b4_9);
let t_b5 = calc_f32!(t_b5_1 - t_b5_2 - t_b5_3 + t_b5_4 + t_b5_5 - t_b5_6 - t_b5_7 + t_b5_8 + t_b5_9);
let t_b6 = calc_f32!(t_b6_1 - t_b6_2 - t_b6_3 + t_b6_4 - t_b6_5 - t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9);
let t_b7 = calc_f32!(t_b7_1 - t_b7_2 + t_b7_3 + t_b7_4 - t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9);
let t_b8 = calc_f32!(t_b8_1 - t_b8_2 + t_b8_3 - t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 - t_b8_9);
let t_b9 = calc_f32!(t_b9_1 - t_b9_2 + t_b9_3 - t_b9_4 + t_b9_5 - t_b9_6 + t_b9_7 - t_b9_8 + t_b9_9);
// Turn each imaginary accumulator into a complex contribution (multiply by +/-i).
let t_b1_rot = self.rotate.rotate_both(t_b1);
let t_b2_rot = self.rotate.rotate_both(t_b2);
let t_b3_rot = self.rotate.rotate_both(t_b3);
let t_b4_rot = self.rotate.rotate_both(t_b4);
let t_b5_rot = self.rotate.rotate_both(t_b5);
let t_b6_rot = self.rotate.rotate_both(t_b6);
let t_b7_rot = self.rotate.rotate_both(t_b7);
let t_b8_rot = self.rotate.rotate_both(t_b8);
let t_b9_rot = self.rotate.rotate_both(t_b9);
// y0 is the plain sum of all inputs; the remaining outputs come in mirrored
// pairs from a final 2-point butterfly of the a/b accumulators.
let y0 = calc_f32!(x0 + x1p18 + x2p17 + x3p16 + x4p15 + x5p14 + x6p13 + x7p12 + x8p11 + x9p10);
let [y1, y18] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot);
let [y2, y17] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot);
let [y3, y16] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot);
let [y4, y15] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot);
let [y5, y14] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot);
let [y6, y13] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot);
let [y7, y12] = parallel_fft2_interleaved_f32(t_a7, t_b7_rot);
let [y8, y11] = parallel_fft2_interleaved_f32(t_a8, t_b8_rot);
let [y9, y10] = parallel_fft2_interleaved_f32(t_a9, t_b9_rot);
[y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18]
}
}
/// Size-19 FFT butterfly for f64 samples, implemented with SSE2 intrinsics.
///
/// Each `__m128d` twiddle field holds one twiddle component (re or im)
/// duplicated into both f64 lanes; one register carries one complex value in
/// the compute kernels.
pub struct SseF64Butterfly19<T> {
// Forward or inverse transform; exposed via the boilerplate trait impls below.
direction: FftDirection,
// Ties the public type parameter `T` to this type; `new` asserts T == f64.
_phantom: std::marker::PhantomData<T>,
// Helper that rotates complex values by 90 degrees (multiplication by +/-i).
rotate: Rotate90F64,
// Real/imaginary parts of the twiddle factors for harmonics k = 1..=9 of a
// length-19 FFT (see `twiddles::compute_twiddle`), lane-duplicated in `new`.
twiddle1re: __m128d,
twiddle1im: __m128d,
twiddle2re: __m128d,
twiddle2im: __m128d,
twiddle3re: __m128d,
twiddle3im: __m128d,
twiddle4re: __m128d,
twiddle4im: __m128d,
twiddle5re: __m128d,
twiddle5im: __m128d,
twiddle6re: __m128d,
twiddle6im: __m128d,
twiddle7re: __m128d,
twiddle7im: __m128d,
twiddle8re: __m128d,
twiddle8im: __m128d,
twiddle9re: __m128d,
twiddle9im: __m128d,
}
// Generate the shared butterfly plumbing (Fft/Length/Direction trait impls and
// buffer-length checks) for this size-19 f64 butterfly; the closure tells the
// generated code how to read the FFT direction from an instance.
// NOTE(review): macro bodies are defined elsewhere in this module tree.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly19, 19, |this: &SseF64Butterfly19<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly19, 19, |this: &SseF64Butterfly19<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly19<T> {
#[inline(always)]
/// Builds the size-19 f64 butterfly for the given transform direction.
///
/// Precomputes the nine length-19 twiddle factors and stores each re/im
/// component duplicated into both lanes of a `__m128d`, ready for the
/// lane-parallel multiplies in the compute kernels.
pub fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
// Duplicate one f64 into both lanes of a register (same result as
// `_mm_set_pd(v, v)`).
let splat = |v: f64| unsafe { _mm_set1_pd(v) };
// k-th twiddle factor of a length-19 FFT in the requested direction.
let twiddle = |k| -> Complex<f64> { twiddles::compute_twiddle(k, 19, direction) };
let (tw1, tw2, tw3) = (twiddle(1), twiddle(2), twiddle(3));
let (tw4, tw5, tw6) = (twiddle(4), twiddle(5), twiddle(6));
let (tw7, tw8, tw9) = (twiddle(7), twiddle(8), twiddle(9));
Self {
direction,
_phantom: std::marker::PhantomData,
rotate: Rotate90F64::new(true),
twiddle1re: splat(tw1.re),
twiddle1im: splat(tw1.im),
twiddle2re: splat(tw2.re),
twiddle2im: splat(tw2.im),
twiddle3re: splat(tw3.re),
twiddle3im: splat(tw3.im),
twiddle4re: splat(tw4.re),
twiddle4im: splat(tw4.im),
twiddle5re: splat(tw5.re),
twiddle5im: splat(tw5.im),
twiddle6re: splat(tw6.re),
twiddle6im: splat(tw6.im),
twiddle7re: splat(tw7.re),
twiddle7im: splat(tw7.im),
twiddle8re: splat(tw8.re),
twiddle8im: splat(tw8.im),
twiddle9re: splat(tw9.re),
twiddle9im: splat(tw9.im),
}
}
#[inline(always)]
/// Computes a single size-19 FFT in place over `buffer` (one complex f64 per
/// `__m128d` register): load all 19 values, run the direct kernel, store back.
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
let out = self.perform_fft_direct(values);
write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
}
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 19]) -> [__m128d; 19] {
let [x1p18, x1m18] = solo_fft2_f64(values[1], values[18]);
let [x2p17, x2m17] = solo_fft2_f64(values[2], values[17]);
let [x3p16, x3m16] = solo_fft2_f64(values[3], values[16]);
let [x4p15, x4m15] = solo_fft2_f64(values[4], values[15]);
let [x5p14, x5m14] = solo_fft2_f64(values[5], values[14]);
let [x6p13, x6m13] = solo_fft2_f64(values[6], values[13]);
let [x7p12, x7m12] = solo_fft2_f64(values[7], values[12]);
let [x8p11, x8m11] = solo_fft2_f64(values[8], values[11]);
let [x9p10, x9m10] = solo_fft2_f64(values[9], values[10]);
let t_a1_1 = _mm_mul_pd(self.twiddle1re, x1p18);
let t_a1_2 = _mm_mul_pd(self.twiddle2re, x2p17);
let t_a1_3 = _mm_mul_pd(self.twiddle3re, x3p16);
let t_a1_4 = _mm_mul_pd(self.twiddle4re, x4p15);
let t_a1_5 = _mm_mul_pd(self.twiddle5re, x5p14);
let t_a1_6 = _mm_mul_pd(self.twiddle6re, x6p13);
let t_a1_7 = _mm_mul_pd(self.twiddle7re, x7p12);
let t_a1_8 = _mm_mul_pd(self.twiddle8re, x8p11);
let t_a1_9 = _mm_mul_pd(self.twiddle9re, x9p10);
let t_a2_1 = _mm_mul_pd(self.twiddle2re, x1p18);
let t_a2_2 = _mm_mul_pd(self.twiddle4re, x2p17);
let t_a2_3 = _mm_mul_pd(self.twiddle6re, x3p16);
let t_a2_4 = _mm_mul_pd(self.twiddle8re, x4p15);
let t_a2_5 = _mm_mul_pd(self.twiddle9re, x5p14);
let t_a2_6 = _mm_mul_pd(self.twiddle7re, x6p13);
let t_a2_7 = _mm_mul_pd(self.twiddle5re, x7p12);
let t_a2_8 = _mm_mul_pd(self.twiddle3re, x8p11);
let t_a2_9 = _mm_mul_pd(self.twiddle1re, x9p10);
let t_a3_1 = _mm_mul_pd(self.twiddle3re, x1p18);
let t_a3_2 = _mm_mul_pd(self.twiddle6re, x2p17);
let t_a3_3 = _mm_mul_pd(self.twiddle9re, x3p16);
let t_a3_4 = _mm_mul_pd(self.twiddle7re, x4p15);
let t_a3_5 = _mm_mul_pd(self.twiddle4re, x5p14);
let t_a3_6 = _mm_mul_pd(self.twiddle1re, x6p13);
let t_a3_7 = _mm_mul_pd(self.twiddle2re, x7p12);
let t_a3_8 = _mm_mul_pd(self.twiddle5re, x8p11);
let t_a3_9 = _mm_mul_pd(self.twiddle8re, x9p10);
let t_a4_1 = _mm_mul_pd(self.twiddle4re, x1p18);
let t_a4_2 = _mm_mul_pd(self.twiddle8re, x2p17);
let t_a4_3 = _mm_mul_pd(self.twiddle7re, x3p16);
let t_a4_4 = _mm_mul_pd(self.twiddle3re, x4p15);
let t_a4_5 = _mm_mul_pd(self.twiddle1re, x5p14);
let t_a4_6 = _mm_mul_pd(self.twiddle5re, x6p13);
let t_a4_7 = _mm_mul_pd(self.twiddle9re, x7p12);
let t_a4_8 = _mm_mul_pd(self.twiddle6re, x8p11);
let t_a4_9 = _mm_mul_pd(self.twiddle2re, x9p10);
let t_a5_1 = _mm_mul_pd(self.twiddle5re, x1p18);
let t_a5_2 = _mm_mul_pd(self.twiddle9re, x2p17);
let t_a5_3 = _mm_mul_pd(self.twiddle4re, x3p16);
let t_a5_4 = _mm_mul_pd(self.twiddle1re, x4p15);
let t_a5_5 = _mm_mul_pd(self.twiddle6re, x5p14);
let t_a5_6 = _mm_mul_pd(self.twiddle8re, x6p13);
let t_a5_7 = _mm_mul_pd(self.twiddle3re, x7p12);
let t_a5_8 = _mm_mul_pd(self.twiddle2re, x8p11);
let t_a5_9 = _mm_mul_pd(self.twiddle7re, x9p10);
let t_a6_1 = _mm_mul_pd(self.twiddle6re, x1p18);
let t_a6_2 = _mm_mul_pd(self.twiddle7re, x2p17);
let t_a6_3 = _mm_mul_pd(self.twiddle1re, x3p16);
let t_a6_4 = _mm_mul_pd(self.twiddle5re, x4p15);
let t_a6_5 = _mm_mul_pd(self.twiddle8re, x5p14);
let t_a6_6 = _mm_mul_pd(self.twiddle2re, x6p13);
let t_a6_7 = _mm_mul_pd(self.twiddle4re, x7p12);
let t_a6_8 = _mm_mul_pd(self.twiddle9re, x8p11);
let t_a6_9 = _mm_mul_pd(self.twiddle3re, x9p10);
let t_a7_1 = _mm_mul_pd(self.twiddle7re, x1p18);
let t_a7_2 = _mm_mul_pd(self.twiddle5re, x2p17);
let t_a7_3 = _mm_mul_pd(self.twiddle2re, x3p16);
let t_a7_4 = _mm_mul_pd(self.twiddle9re, x4p15);
let t_a7_5 = _mm_mul_pd(self.twiddle3re, x5p14);
let t_a7_6 = _mm_mul_pd(self.twiddle4re, x6p13);
let t_a7_7 = _mm_mul_pd(self.twiddle8re, x7p12);
let t_a7_8 = _mm_mul_pd(self.twiddle1re, x8p11);
let t_a7_9 = _mm_mul_pd(self.twiddle6re, x9p10);
let t_a8_1 = _mm_mul_pd(self.twiddle8re, x1p18);
let t_a8_2 = _mm_mul_pd(self.twiddle3re, x2p17);
let t_a8_3 = _mm_mul_pd(self.twiddle5re, x3p16);
let t_a8_4 = _mm_mul_pd(self.twiddle6re, x4p15);
let t_a8_5 = _mm_mul_pd(self.twiddle2re, x5p14);
let t_a8_6 = _mm_mul_pd(self.twiddle9re, x6p13);
let t_a8_7 = _mm_mul_pd(self.twiddle1re, x7p12);
let t_a8_8 = _mm_mul_pd(self.twiddle7re, x8p11);
let t_a8_9 = _mm_mul_pd(self.twiddle4re, x9p10);
let t_a9_1 = _mm_mul_pd(self.twiddle9re, x1p18);
let t_a9_2 = _mm_mul_pd(self.twiddle1re, x2p17);
let t_a9_3 = _mm_mul_pd(self.twiddle8re, x3p16);
let t_a9_4 = _mm_mul_pd(self.twiddle2re, x4p15);
let t_a9_5 = _mm_mul_pd(self.twiddle7re, x5p14);
let t_a9_6 = _mm_mul_pd(self.twiddle3re, x6p13);
let t_a9_7 = _mm_mul_pd(self.twiddle6re, x7p12);
let t_a9_8 = _mm_mul_pd(self.twiddle4re, x8p11);
let t_a9_9 = _mm_mul_pd(self.twiddle5re, x9p10);
let t_b1_1 = _mm_mul_pd(self.twiddle1im, x1m18);
let t_b1_2 = _mm_mul_pd(self.twiddle2im, x2m17);
let t_b1_3 = _mm_mul_pd(self.twiddle3im, x3m16);
let t_b1_4 = _mm_mul_pd(self.twiddle4im, x4m15);
let t_b1_5 = _mm_mul_pd(self.twiddle5im, x5m14);
let t_b1_6 = _mm_mul_pd(self.twiddle6im, x6m13);
let t_b1_7 = _mm_mul_pd(self.twiddle7im, x7m12);
let t_b1_8 = _mm_mul_pd(self.twiddle8im, x8m11);
let t_b1_9 = _mm_mul_pd(self.twiddle9im, x9m10);
let t_b2_1 = _mm_mul_pd(self.twiddle2im, x1m18);
let t_b2_2 = _mm_mul_pd(self.twiddle4im, x2m17);
let t_b2_3 = _mm_mul_pd(self.twiddle6im, x3m16);
let t_b2_4 = _mm_mul_pd(self.twiddle8im, x4m15);
let t_b2_5 = _mm_mul_pd(self.twiddle9im, x5m14);
let t_b2_6 = _mm_mul_pd(self.twiddle7im, x6m13);
let t_b2_7 = _mm_mul_pd(self.twiddle5im, x7m12);
let t_b2_8 = _mm_mul_pd(self.twiddle3im, x8m11);
let t_b2_9 = _mm_mul_pd(self.twiddle1im, x9m10);
let t_b3_1 = _mm_mul_pd(self.twiddle3im, x1m18);
let t_b3_2 = _mm_mul_pd(self.twiddle6im, x2m17);
let t_b3_3 = _mm_mul_pd(self.twiddle9im, x3m16);
let t_b3_4 = _mm_mul_pd(self.twiddle7im, x4m15);
let t_b3_5 = _mm_mul_pd(self.twiddle4im, x5m14);
let t_b3_6 = _mm_mul_pd(self.twiddle1im, x6m13);
let t_b3_7 = _mm_mul_pd(self.twiddle2im, x7m12);
let t_b3_8 = _mm_mul_pd(self.twiddle5im, x8m11);
let t_b3_9 = _mm_mul_pd(self.twiddle8im, x9m10);
let t_b4_1 = _mm_mul_pd(self.twiddle4im, x1m18);
let t_b4_2 = _mm_mul_pd(self.twiddle8im, x2m17);
let t_b4_3 = _mm_mul_pd(self.twiddle7im, x3m16);
let t_b4_4 = _mm_mul_pd(self.twiddle3im, x4m15);
let t_b4_5 = _mm_mul_pd(self.twiddle1im, x5m14);
let t_b4_6 = _mm_mul_pd(self.twiddle5im, x6m13);
let t_b4_7 = _mm_mul_pd(self.twiddle9im, x7m12);
let t_b4_8 = _mm_mul_pd(self.twiddle6im, x8m11);
let t_b4_9 = _mm_mul_pd(self.twiddle2im, x9m10);
let t_b5_1 = _mm_mul_pd(self.twiddle5im, x1m18);
let t_b5_2 = _mm_mul_pd(self.twiddle9im, x2m17);
let t_b5_3 = _mm_mul_pd(self.twiddle4im, x3m16);
let t_b5_4 = _mm_mul_pd(self.twiddle1im, x4m15);
let t_b5_5 = _mm_mul_pd(self.twiddle6im, x5m14);
let t_b5_6 = _mm_mul_pd(self.twiddle8im, x6m13);
let t_b5_7 = _mm_mul_pd(self.twiddle3im, x7m12);
let t_b5_8 = _mm_mul_pd(self.twiddle2im, x8m11);
let t_b5_9 = _mm_mul_pd(self.twiddle7im, x9m10);
let t_b6_1 = _mm_mul_pd(self.twiddle6im, x1m18);
let t_b6_2 = _mm_mul_pd(self.twiddle7im, x2m17);
let t_b6_3 = _mm_mul_pd(self.twiddle1im, x3m16);
let t_b6_4 = _mm_mul_pd(self.twiddle5im, x4m15);
let t_b6_5 = _mm_mul_pd(self.twiddle8im, x5m14);
let t_b6_6 = _mm_mul_pd(self.twiddle2im, x6m13);
let t_b6_7 = _mm_mul_pd(self.twiddle4im, x7m12);
let t_b6_8 = _mm_mul_pd(self.twiddle9im, x8m11);
let t_b6_9 = _mm_mul_pd(self.twiddle3im, x9m10);
let t_b7_1 = _mm_mul_pd(self.twiddle7im, x1m18);
let t_b7_2 = _mm_mul_pd(self.twiddle5im, x2m17);
let t_b7_3 = _mm_mul_pd(self.twiddle2im, x3m16);
let t_b7_4 = _mm_mul_pd(self.twiddle9im, x4m15);
let t_b7_5 = _mm_mul_pd(self.twiddle3im, x5m14);
let t_b7_6 = _mm_mul_pd(self.twiddle4im, x6m13);
let t_b7_7 = _mm_mul_pd(self.twiddle8im, x7m12);
let t_b7_8 = _mm_mul_pd(self.twiddle1im, x8m11);
let t_b7_9 = _mm_mul_pd(self.twiddle6im, x9m10);
let t_b8_1 = _mm_mul_pd(self.twiddle8im, x1m18);
let t_b8_2 = _mm_mul_pd(self.twiddle3im, x2m17);
let t_b8_3 = _mm_mul_pd(self.twiddle5im, x3m16);
let t_b8_4 = _mm_mul_pd(self.twiddle6im, x4m15);
let t_b8_5 = _mm_mul_pd(self.twiddle2im, x5m14);
let t_b8_6 = _mm_mul_pd(self.twiddle9im, x6m13);
let t_b8_7 = _mm_mul_pd(self.twiddle1im, x7m12);
let t_b8_8 = _mm_mul_pd(self.twiddle7im, x8m11);
let t_b8_9 = _mm_mul_pd(self.twiddle4im, x9m10);
let t_b9_1 = _mm_mul_pd(self.twiddle9im, x1m18);
let t_b9_2 = _mm_mul_pd(self.twiddle1im, x2m17);
let t_b9_3 = _mm_mul_pd(self.twiddle8im, x3m16);
let t_b9_4 = _mm_mul_pd(self.twiddle2im, x4m15);
let t_b9_5 = _mm_mul_pd(self.twiddle7im, x5m14);
let t_b9_6 = _mm_mul_pd(self.twiddle3im, x6m13);
let t_b9_7 = _mm_mul_pd(self.twiddle6im, x7m12);
let t_b9_8 = _mm_mul_pd(self.twiddle4im, x8m11);
let t_b9_9 = _mm_mul_pd(self.twiddle5im, x9m10);
let x0 = values[0];
let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9);
let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9);
let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9);
let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9);
let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9);
let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9);
let t_a7 = calc_f64!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9);
let t_a8 = calc_f64!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9);
let t_a9 = calc_f64!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9);
let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9);
let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 - t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8 - t_b2_9);
let t_b3 = calc_f64!(t_b3_1 + t_b3_2 + t_b3_3 - t_b3_4 - t_b3_5 - t_b3_6 + t_b3_7 + t_b3_8 + t_b3_9);
let t_b4 = calc_f64!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 + t_b4_5 + t_b4_6 + t_b4_7 - t_b4_8 - t_b4_9);
let t_b5 = calc_f64!(t_b5_1 - t_b5_2 - t_b5_3 + t_b5_4 + t_b5_5 - t_b5_6 - t_b5_7 + t_b5_8 + t_b5_9);
let t_b6 = calc_f64!(t_b6_1 - t_b6_2 - t_b6_3 + t_b6_4 - t_b6_5 - t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9);
let t_b7 = calc_f64!(t_b7_1 - t_b7_2 + t_b7_3 + t_b7_4 - t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9);
let t_b8 = calc_f64!(t_b8_1 - t_b8_2 + t_b8_3 - t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 - t_b8_9);
let t_b9 = calc_f64!(t_b9_1 - t_b9_2 + t_b9_3 - t_b9_4 + t_b9_5 - t_b9_6 + t_b9_7 - t_b9_8 + t_b9_9);
let t_b1_rot = self.rotate.rotate(t_b1);
let t_b2_rot = self.rotate.rotate(t_b2);
let t_b3_rot = self.rotate.rotate(t_b3);
let t_b4_rot = self.rotate.rotate(t_b4);
let t_b5_rot = self.rotate.rotate(t_b5);
let t_b6_rot = self.rotate.rotate(t_b6);
let t_b7_rot = self.rotate.rotate(t_b7);
let t_b8_rot = self.rotate.rotate(t_b8);
let t_b9_rot = self.rotate.rotate(t_b9);
let y0 = calc_f64!(x0 + x1p18 + x2p17 + x3p16 + x4p15 + x5p14 + x6p13 + x7p12 + x8p11 + x9p10);
let [y1, y18] = solo_fft2_f64(t_a1, t_b1_rot);
let [y2, y17] = solo_fft2_f64(t_a2, t_b2_rot);
let [y3, y16] = solo_fft2_f64(t_a3, t_b3_rot);
let [y4, y15] = solo_fft2_f64(t_a4, t_b4_rot);
let [y5, y14] = solo_fft2_f64(t_a5, t_b5_rot);
let [y6, y13] = solo_fft2_f64(t_a6, t_b6_rot);
let [y7, y12] = solo_fft2_f64(t_a7, t_b7_rot);
let [y8, y11] = solo_fft2_f64(t_a8, t_b8_rot);
let [y9, y10] = solo_fft2_f64(t_a9, t_b9_rot);
[y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18]
}
}
/// SSE-accelerated 23-point FFT butterfly for f32.
///
/// Stores precomputed twiddle factors 1..=11 with the real and imaginary
/// parts kept in separate `__m128` registers; each register holds the same
/// scalar splatted across all four f32 lanes (see `new`), so one register
/// serves two interleaved complex values at once.
/// `rotate` performs the multiply-by-i step used in the final recombination.
pub struct SseF32Butterfly23<T> {
direction: FftDirection,
_phantom: std::marker::PhantomData<T>,
rotate: Rotate90F32,
twiddle1re: __m128,
twiddle1im: __m128,
twiddle2re: __m128,
twiddle2im: __m128,
twiddle3re: __m128,
twiddle3im: __m128,
twiddle4re: __m128,
twiddle4im: __m128,
twiddle5re: __m128,
twiddle5im: __m128,
twiddle6re: __m128,
twiddle6im: __m128,
twiddle7re: __m128,
twiddle7im: __m128,
twiddle8re: __m128,
twiddle8im: __m128,
twiddle9re: __m128,
twiddle9im: __m128,
twiddle10re: __m128,
twiddle10im: __m128,
twiddle11re: __m128,
twiddle11im: __m128,
}
// Boilerplate macros: presumably generate the `Fft`/`Length`/`Direction` trait
// plumbing for a fixed-size (len 23) butterfly — see the macro definitions for
// the expansion. NOTE(review): macro bodies are not visible in this file.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly23, 23, |this: &SseF32Butterfly23<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly23, 23, |this: &SseF32Butterfly23<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly23<T> {
/// Creates the butterfly, precomputing twiddle factors 1..=11 for a
/// 23-point FFT in the requested direction. Each factor's re and im parts
/// are splatted across all four f32 lanes of an `__m128` via `_mm_load1_ps`.
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let rotate = Rotate90F32::new(true);
let tw1: Complex<f32> = twiddles::compute_twiddle(1, 23, direction);
let tw2: Complex<f32> = twiddles::compute_twiddle(2, 23, direction);
let tw3: Complex<f32> = twiddles::compute_twiddle(3, 23, direction);
let tw4: Complex<f32> = twiddles::compute_twiddle(4, 23, direction);
let tw5: Complex<f32> = twiddles::compute_twiddle(5, 23, direction);
let tw6: Complex<f32> = twiddles::compute_twiddle(6, 23, direction);
let tw7: Complex<f32> = twiddles::compute_twiddle(7, 23, direction);
let tw8: Complex<f32> = twiddles::compute_twiddle(8, 23, direction);
let tw9: Complex<f32> = twiddles::compute_twiddle(9, 23, direction);
let tw10: Complex<f32> = twiddles::compute_twiddle(10, 23, direction);
let tw11: Complex<f32> = twiddles::compute_twiddle(11, 23, direction);
// Splat each scalar to all four lanes.
let twiddle1re = unsafe { _mm_load1_ps(&tw1.re) };
let twiddle1im = unsafe { _mm_load1_ps(&tw1.im) };
let twiddle2re = unsafe { _mm_load1_ps(&tw2.re) };
let twiddle2im = unsafe { _mm_load1_ps(&tw2.im) };
let twiddle3re = unsafe { _mm_load1_ps(&tw3.re) };
let twiddle3im = unsafe { _mm_load1_ps(&tw3.im) };
let twiddle4re = unsafe { _mm_load1_ps(&tw4.re) };
let twiddle4im = unsafe { _mm_load1_ps(&tw4.im) };
let twiddle5re = unsafe { _mm_load1_ps(&tw5.re) };
let twiddle5im = unsafe { _mm_load1_ps(&tw5.im) };
let twiddle6re = unsafe { _mm_load1_ps(&tw6.re) };
let twiddle6im = unsafe { _mm_load1_ps(&tw6.im) };
let twiddle7re = unsafe { _mm_load1_ps(&tw7.re) };
let twiddle7im = unsafe { _mm_load1_ps(&tw7.im) };
let twiddle8re = unsafe { _mm_load1_ps(&tw8.re) };
let twiddle8im = unsafe { _mm_load1_ps(&tw8.im) };
let twiddle9re = unsafe { _mm_load1_ps(&tw9.re) };
let twiddle9im = unsafe { _mm_load1_ps(&tw9.im) };
let twiddle10re = unsafe { _mm_load1_ps(&tw10.re) };
let twiddle10im = unsafe { _mm_load1_ps(&tw10.im) };
let twiddle11re = unsafe { _mm_load1_ps(&tw11.re) };
let twiddle11im = unsafe { _mm_load1_ps(&tw11.im) };
Self {
direction,
_phantom: std::marker::PhantomData,
rotate,
twiddle1re,
twiddle1im,
twiddle2re,
twiddle2im,
twiddle3re,
twiddle3im,
twiddle4re,
twiddle4im,
twiddle5re,
twiddle5im,
twiddle6re,
twiddle6im,
twiddle7re,
twiddle7im,
twiddle8re,
twiddle8im,
twiddle9re,
twiddle9im,
twiddle10re,
twiddle10im,
twiddle11re,
twiddle11im,
}
}
/// Performs a single 23-point FFT on `buffer` in place.
/// Loads one complex value per vector (partial load), runs the parallel
/// kernel with the upper lanes unused, and stores only the lower half of
/// each result back.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let values = read_partial1_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22});
let out = self.perform_parallel_fft_direct(values);
write_partial_lo_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22});
}
/// Performs two interleaved 23-point FFTs on `buffer` (46 complex f32
/// values) in place. The lo/hi shuffles below regroup the packed input so
/// that element i of the first transform and element i of the second share
/// one `__m128`, then the inverse shuffle restores interleaved layout on
/// output.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44});
let values = [
extract_lo_hi_f32(input_packed[0], input_packed[11]),
extract_hi_lo_f32(input_packed[0], input_packed[12]),
extract_lo_hi_f32(input_packed[1], input_packed[12]),
extract_hi_lo_f32(input_packed[1], input_packed[13]),
extract_lo_hi_f32(input_packed[2], input_packed[13]),
extract_hi_lo_f32(input_packed[2], input_packed[14]),
extract_lo_hi_f32(input_packed[3], input_packed[14]),
extract_hi_lo_f32(input_packed[3], input_packed[15]),
extract_lo_hi_f32(input_packed[4], input_packed[15]),
extract_hi_lo_f32(input_packed[4], input_packed[16]),
extract_lo_hi_f32(input_packed[5], input_packed[16]),
extract_hi_lo_f32(input_packed[5], input_packed[17]),
extract_lo_hi_f32(input_packed[6], input_packed[17]),
extract_hi_lo_f32(input_packed[6], input_packed[18]),
extract_lo_hi_f32(input_packed[7], input_packed[18]),
extract_hi_lo_f32(input_packed[7], input_packed[19]),
extract_lo_hi_f32(input_packed[8], input_packed[19]),
extract_hi_lo_f32(input_packed[8], input_packed[20]),
extract_lo_hi_f32(input_packed[9], input_packed[20]),
extract_hi_lo_f32(input_packed[9], input_packed[21]),
extract_lo_hi_f32(input_packed[10], input_packed[21]),
extract_hi_lo_f32(input_packed[10], input_packed[22]),
extract_lo_hi_f32(input_packed[11], input_packed[22]),
];
let out = self.perform_parallel_fft_direct(values);
// Inverse shuffle: repack the two transforms back into interleaved order.
let out_packed = [
extract_lo_lo_f32(out[0], out[1]),
extract_lo_lo_f32(out[2], out[3]),
extract_lo_lo_f32(out[4], out[5]),
extract_lo_lo_f32(out[6], out[7]),
extract_lo_lo_f32(out[8], out[9]),
extract_lo_lo_f32(out[10], out[11]),
extract_lo_lo_f32(out[12], out[13]),
extract_lo_lo_f32(out[14], out[15]),
extract_lo_lo_f32(out[16], out[17]),
extract_lo_lo_f32(out[18], out[19]),
extract_lo_lo_f32(out[20], out[21]),
extract_lo_hi_f32(out[22], out[0]),
extract_hi_hi_f32(out[1], out[2]),
extract_hi_hi_f32(out[3], out[4]),
extract_hi_hi_f32(out[5], out[6]),
extract_hi_hi_f32(out[7], out[8]),
extract_hi_hi_f32(out[9], out[10]),
extract_hi_hi_f32(out[11], out[12]),
extract_hi_hi_f32(out[13], out[14]),
extract_hi_hi_f32(out[15], out[16]),
extract_hi_hi_f32(out[17], out[18]),
extract_hi_hi_f32(out[19], out[20]),
extract_hi_hi_f32(out[21], out[22]),
];
write_complex_to_array_strided!(out_packed, buffer, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22});
}
/// Core 23-point FFT kernel, two transforms in parallel (one per f32 pair).
///
/// Structure: pair x[k] with x[23-k] via radix-2 butterflies into sums
/// (x{k}p{23-k}) and differences (x{k}m{23-k}); weight the sums by twiddle
/// real parts (t_a*) and the differences by twiddle imaginary parts (t_b*);
/// accumulate, rotate the t_b sums by 90° (multiply by i), and recombine
/// with radix-2 butterflies into outputs y[n] / y[23-n].
///
/// NOTE(review): the twiddle indices and the +/- sign patterns in the t_b
/// accumulations encode k*n mod 23 arithmetic and appear machine-generated —
/// do not hand-edit individual terms.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 23]) -> [__m128; 23] {
// Radix-2 butterflies pairing x[k] with x[23-k].
let [x1p22, x1m22] = parallel_fft2_interleaved_f32(values[1], values[22]);
let [x2p21, x2m21] = parallel_fft2_interleaved_f32(values[2], values[21]);
let [x3p20, x3m20] = parallel_fft2_interleaved_f32(values[3], values[20]);
let [x4p19, x4m19] = parallel_fft2_interleaved_f32(values[4], values[19]);
let [x5p18, x5m18] = parallel_fft2_interleaved_f32(values[5], values[18]);
let [x6p17, x6m17] = parallel_fft2_interleaved_f32(values[6], values[17]);
let [x7p16, x7m16] = parallel_fft2_interleaved_f32(values[7], values[16]);
let [x8p15, x8m15] = parallel_fft2_interleaved_f32(values[8], values[15]);
let [x9p14, x9m14] = parallel_fft2_interleaved_f32(values[9], values[14]);
let [x10p13, x10m13] = parallel_fft2_interleaved_f32(values[10], values[13]);
let [x11p12, x11m12] = parallel_fft2_interleaved_f32(values[11], values[12]);
// t_a{n}_{k}: twiddle real part times the pair sum x[k]+x[23-k].
let t_a1_1 = _mm_mul_ps(self.twiddle1re, x1p22);
let t_a1_2 = _mm_mul_ps(self.twiddle2re, x2p21);
let t_a1_3 = _mm_mul_ps(self.twiddle3re, x3p20);
let t_a1_4 = _mm_mul_ps(self.twiddle4re, x4p19);
let t_a1_5 = _mm_mul_ps(self.twiddle5re, x5p18);
let t_a1_6 = _mm_mul_ps(self.twiddle6re, x6p17);
let t_a1_7 = _mm_mul_ps(self.twiddle7re, x7p16);
let t_a1_8 = _mm_mul_ps(self.twiddle8re, x8p15);
let t_a1_9 = _mm_mul_ps(self.twiddle9re, x9p14);
let t_a1_10 = _mm_mul_ps(self.twiddle10re, x10p13);
let t_a1_11 = _mm_mul_ps(self.twiddle11re, x11p12);
let t_a2_1 = _mm_mul_ps(self.twiddle2re, x1p22);
let t_a2_2 = _mm_mul_ps(self.twiddle4re, x2p21);
let t_a2_3 = _mm_mul_ps(self.twiddle6re, x3p20);
let t_a2_4 = _mm_mul_ps(self.twiddle8re, x4p19);
let t_a2_5 = _mm_mul_ps(self.twiddle10re, x5p18);
let t_a2_6 = _mm_mul_ps(self.twiddle11re, x6p17);
let t_a2_7 = _mm_mul_ps(self.twiddle9re, x7p16);
let t_a2_8 = _mm_mul_ps(self.twiddle7re, x8p15);
let t_a2_9 = _mm_mul_ps(self.twiddle5re, x9p14);
let t_a2_10 = _mm_mul_ps(self.twiddle3re, x10p13);
let t_a2_11 = _mm_mul_ps(self.twiddle1re, x11p12);
let t_a3_1 = _mm_mul_ps(self.twiddle3re, x1p22);
let t_a3_2 = _mm_mul_ps(self.twiddle6re, x2p21);
let t_a3_3 = _mm_mul_ps(self.twiddle9re, x3p20);
let t_a3_4 = _mm_mul_ps(self.twiddle11re, x4p19);
let t_a3_5 = _mm_mul_ps(self.twiddle8re, x5p18);
let t_a3_6 = _mm_mul_ps(self.twiddle5re, x6p17);
let t_a3_7 = _mm_mul_ps(self.twiddle2re, x7p16);
let t_a3_8 = _mm_mul_ps(self.twiddle1re, x8p15);
let t_a3_9 = _mm_mul_ps(self.twiddle4re, x9p14);
let t_a3_10 = _mm_mul_ps(self.twiddle7re, x10p13);
let t_a3_11 = _mm_mul_ps(self.twiddle10re, x11p12);
let t_a4_1 = _mm_mul_ps(self.twiddle4re, x1p22);
let t_a4_2 = _mm_mul_ps(self.twiddle8re, x2p21);
let t_a4_3 = _mm_mul_ps(self.twiddle11re, x3p20);
let t_a4_4 = _mm_mul_ps(self.twiddle7re, x4p19);
let t_a4_5 = _mm_mul_ps(self.twiddle3re, x5p18);
let t_a4_6 = _mm_mul_ps(self.twiddle1re, x6p17);
let t_a4_7 = _mm_mul_ps(self.twiddle5re, x7p16);
let t_a4_8 = _mm_mul_ps(self.twiddle9re, x8p15);
let t_a4_9 = _mm_mul_ps(self.twiddle10re, x9p14);
let t_a4_10 = _mm_mul_ps(self.twiddle6re, x10p13);
let t_a4_11 = _mm_mul_ps(self.twiddle2re, x11p12);
let t_a5_1 = _mm_mul_ps(self.twiddle5re, x1p22);
let t_a5_2 = _mm_mul_ps(self.twiddle10re, x2p21);
let t_a5_3 = _mm_mul_ps(self.twiddle8re, x3p20);
let t_a5_4 = _mm_mul_ps(self.twiddle3re, x4p19);
let t_a5_5 = _mm_mul_ps(self.twiddle2re, x5p18);
let t_a5_6 = _mm_mul_ps(self.twiddle7re, x6p17);
let t_a5_7 = _mm_mul_ps(self.twiddle11re, x7p16);
let t_a5_8 = _mm_mul_ps(self.twiddle6re, x8p15);
let t_a5_9 = _mm_mul_ps(self.twiddle1re, x9p14);
let t_a5_10 = _mm_mul_ps(self.twiddle4re, x10p13);
let t_a5_11 = _mm_mul_ps(self.twiddle9re, x11p12);
let t_a6_1 = _mm_mul_ps(self.twiddle6re, x1p22);
let t_a6_2 = _mm_mul_ps(self.twiddle11re, x2p21);
let t_a6_3 = _mm_mul_ps(self.twiddle5re, x3p20);
let t_a6_4 = _mm_mul_ps(self.twiddle1re, x4p19);
let t_a6_5 = _mm_mul_ps(self.twiddle7re, x5p18);
let t_a6_6 = _mm_mul_ps(self.twiddle10re, x6p17);
let t_a6_7 = _mm_mul_ps(self.twiddle4re, x7p16);
let t_a6_8 = _mm_mul_ps(self.twiddle2re, x8p15);
let t_a6_9 = _mm_mul_ps(self.twiddle8re, x9p14);
let t_a6_10 = _mm_mul_ps(self.twiddle9re, x10p13);
let t_a6_11 = _mm_mul_ps(self.twiddle3re, x11p12);
let t_a7_1 = _mm_mul_ps(self.twiddle7re, x1p22);
let t_a7_2 = _mm_mul_ps(self.twiddle9re, x2p21);
let t_a7_3 = _mm_mul_ps(self.twiddle2re, x3p20);
let t_a7_4 = _mm_mul_ps(self.twiddle5re, x4p19);
let t_a7_5 = _mm_mul_ps(self.twiddle11re, x5p18);
let t_a7_6 = _mm_mul_ps(self.twiddle4re, x6p17);
let t_a7_7 = _mm_mul_ps(self.twiddle3re, x7p16);
let t_a7_8 = _mm_mul_ps(self.twiddle10re, x8p15);
let t_a7_9 = _mm_mul_ps(self.twiddle6re, x9p14);
let t_a7_10 = _mm_mul_ps(self.twiddle1re, x10p13);
let t_a7_11 = _mm_mul_ps(self.twiddle8re, x11p12);
let t_a8_1 = _mm_mul_ps(self.twiddle8re, x1p22);
let t_a8_2 = _mm_mul_ps(self.twiddle7re, x2p21);
let t_a8_3 = _mm_mul_ps(self.twiddle1re, x3p20);
let t_a8_4 = _mm_mul_ps(self.twiddle9re, x4p19);
let t_a8_5 = _mm_mul_ps(self.twiddle6re, x5p18);
let t_a8_6 = _mm_mul_ps(self.twiddle2re, x6p17);
let t_a8_7 = _mm_mul_ps(self.twiddle10re, x7p16);
let t_a8_8 = _mm_mul_ps(self.twiddle5re, x8p15);
let t_a8_9 = _mm_mul_ps(self.twiddle3re, x9p14);
let t_a8_10 = _mm_mul_ps(self.twiddle11re, x10p13);
let t_a8_11 = _mm_mul_ps(self.twiddle4re, x11p12);
let t_a9_1 = _mm_mul_ps(self.twiddle9re, x1p22);
let t_a9_2 = _mm_mul_ps(self.twiddle5re, x2p21);
let t_a9_3 = _mm_mul_ps(self.twiddle4re, x3p20);
let t_a9_4 = _mm_mul_ps(self.twiddle10re, x4p19);
let t_a9_5 = _mm_mul_ps(self.twiddle1re, x5p18);
let t_a9_6 = _mm_mul_ps(self.twiddle8re, x6p17);
let t_a9_7 = _mm_mul_ps(self.twiddle6re, x7p16);
let t_a9_8 = _mm_mul_ps(self.twiddle3re, x8p15);
let t_a9_9 = _mm_mul_ps(self.twiddle11re, x9p14);
let t_a9_10 = _mm_mul_ps(self.twiddle2re, x10p13);
let t_a9_11 = _mm_mul_ps(self.twiddle7re, x11p12);
let t_a10_1 = _mm_mul_ps(self.twiddle10re, x1p22);
let t_a10_2 = _mm_mul_ps(self.twiddle3re, x2p21);
let t_a10_3 = _mm_mul_ps(self.twiddle7re, x3p20);
let t_a10_4 = _mm_mul_ps(self.twiddle6re, x4p19);
let t_a10_5 = _mm_mul_ps(self.twiddle4re, x5p18);
let t_a10_6 = _mm_mul_ps(self.twiddle9re, x6p17);
let t_a10_7 = _mm_mul_ps(self.twiddle1re, x7p16);
let t_a10_8 = _mm_mul_ps(self.twiddle11re, x8p15);
let t_a10_9 = _mm_mul_ps(self.twiddle2re, x9p14);
let t_a10_10 = _mm_mul_ps(self.twiddle8re, x10p13);
let t_a10_11 = _mm_mul_ps(self.twiddle5re, x11p12);
let t_a11_1 = _mm_mul_ps(self.twiddle11re, x1p22);
let t_a11_2 = _mm_mul_ps(self.twiddle1re, x2p21);
let t_a11_3 = _mm_mul_ps(self.twiddle10re, x3p20);
let t_a11_4 = _mm_mul_ps(self.twiddle2re, x4p19);
let t_a11_5 = _mm_mul_ps(self.twiddle9re, x5p18);
let t_a11_6 = _mm_mul_ps(self.twiddle3re, x6p17);
let t_a11_7 = _mm_mul_ps(self.twiddle8re, x7p16);
let t_a11_8 = _mm_mul_ps(self.twiddle4re, x8p15);
let t_a11_9 = _mm_mul_ps(self.twiddle7re, x9p14);
let t_a11_10 = _mm_mul_ps(self.twiddle5re, x10p13);
let t_a11_11 = _mm_mul_ps(self.twiddle6re, x11p12);
// t_b{n}_{k}: twiddle imaginary part times the pair difference x[k]-x[23-k].
let t_b1_1 = _mm_mul_ps(self.twiddle1im, x1m22);
let t_b1_2 = _mm_mul_ps(self.twiddle2im, x2m21);
let t_b1_3 = _mm_mul_ps(self.twiddle3im, x3m20);
let t_b1_4 = _mm_mul_ps(self.twiddle4im, x4m19);
let t_b1_5 = _mm_mul_ps(self.twiddle5im, x5m18);
let t_b1_6 = _mm_mul_ps(self.twiddle6im, x6m17);
let t_b1_7 = _mm_mul_ps(self.twiddle7im, x7m16);
let t_b1_8 = _mm_mul_ps(self.twiddle8im, x8m15);
let t_b1_9 = _mm_mul_ps(self.twiddle9im, x9m14);
let t_b1_10 = _mm_mul_ps(self.twiddle10im, x10m13);
let t_b1_11 = _mm_mul_ps(self.twiddle11im, x11m12);
let t_b2_1 = _mm_mul_ps(self.twiddle2im, x1m22);
let t_b2_2 = _mm_mul_ps(self.twiddle4im, x2m21);
let t_b2_3 = _mm_mul_ps(self.twiddle6im, x3m20);
let t_b2_4 = _mm_mul_ps(self.twiddle8im, x4m19);
let t_b2_5 = _mm_mul_ps(self.twiddle10im, x5m18);
let t_b2_6 = _mm_mul_ps(self.twiddle11im, x6m17);
let t_b2_7 = _mm_mul_ps(self.twiddle9im, x7m16);
let t_b2_8 = _mm_mul_ps(self.twiddle7im, x8m15);
let t_b2_9 = _mm_mul_ps(self.twiddle5im, x9m14);
let t_b2_10 = _mm_mul_ps(self.twiddle3im, x10m13);
let t_b2_11 = _mm_mul_ps(self.twiddle1im, x11m12);
let t_b3_1 = _mm_mul_ps(self.twiddle3im, x1m22);
let t_b3_2 = _mm_mul_ps(self.twiddle6im, x2m21);
let t_b3_3 = _mm_mul_ps(self.twiddle9im, x3m20);
let t_b3_4 = _mm_mul_ps(self.twiddle11im, x4m19);
let t_b3_5 = _mm_mul_ps(self.twiddle8im, x5m18);
let t_b3_6 = _mm_mul_ps(self.twiddle5im, x6m17);
let t_b3_7 = _mm_mul_ps(self.twiddle2im, x7m16);
let t_b3_8 = _mm_mul_ps(self.twiddle1im, x8m15);
let t_b3_9 = _mm_mul_ps(self.twiddle4im, x9m14);
let t_b3_10 = _mm_mul_ps(self.twiddle7im, x10m13);
let t_b3_11 = _mm_mul_ps(self.twiddle10im, x11m12);
let t_b4_1 = _mm_mul_ps(self.twiddle4im, x1m22);
let t_b4_2 = _mm_mul_ps(self.twiddle8im, x2m21);
let t_b4_3 = _mm_mul_ps(self.twiddle11im, x3m20);
let t_b4_4 = _mm_mul_ps(self.twiddle7im, x4m19);
let t_b4_5 = _mm_mul_ps(self.twiddle3im, x5m18);
let t_b4_6 = _mm_mul_ps(self.twiddle1im, x6m17);
let t_b4_7 = _mm_mul_ps(self.twiddle5im, x7m16);
let t_b4_8 = _mm_mul_ps(self.twiddle9im, x8m15);
let t_b4_9 = _mm_mul_ps(self.twiddle10im, x9m14);
let t_b4_10 = _mm_mul_ps(self.twiddle6im, x10m13);
let t_b4_11 = _mm_mul_ps(self.twiddle2im, x11m12);
let t_b5_1 = _mm_mul_ps(self.twiddle5im, x1m22);
let t_b5_2 = _mm_mul_ps(self.twiddle10im, x2m21);
let t_b5_3 = _mm_mul_ps(self.twiddle8im, x3m20);
let t_b5_4 = _mm_mul_ps(self.twiddle3im, x4m19);
let t_b5_5 = _mm_mul_ps(self.twiddle2im, x5m18);
let t_b5_6 = _mm_mul_ps(self.twiddle7im, x6m17);
let t_b5_7 = _mm_mul_ps(self.twiddle11im, x7m16);
let t_b5_8 = _mm_mul_ps(self.twiddle6im, x8m15);
let t_b5_9 = _mm_mul_ps(self.twiddle1im, x9m14);
let t_b5_10 = _mm_mul_ps(self.twiddle4im, x10m13);
let t_b5_11 = _mm_mul_ps(self.twiddle9im, x11m12);
let t_b6_1 = _mm_mul_ps(self.twiddle6im, x1m22);
let t_b6_2 = _mm_mul_ps(self.twiddle11im, x2m21);
let t_b6_3 = _mm_mul_ps(self.twiddle5im, x3m20);
let t_b6_4 = _mm_mul_ps(self.twiddle1im, x4m19);
let t_b6_5 = _mm_mul_ps(self.twiddle7im, x5m18);
let t_b6_6 = _mm_mul_ps(self.twiddle10im, x6m17);
let t_b6_7 = _mm_mul_ps(self.twiddle4im, x7m16);
let t_b6_8 = _mm_mul_ps(self.twiddle2im, x8m15);
let t_b6_9 = _mm_mul_ps(self.twiddle8im, x9m14);
let t_b6_10 = _mm_mul_ps(self.twiddle9im, x10m13);
let t_b6_11 = _mm_mul_ps(self.twiddle3im, x11m12);
let t_b7_1 = _mm_mul_ps(self.twiddle7im, x1m22);
let t_b7_2 = _mm_mul_ps(self.twiddle9im, x2m21);
let t_b7_3 = _mm_mul_ps(self.twiddle2im, x3m20);
let t_b7_4 = _mm_mul_ps(self.twiddle5im, x4m19);
let t_b7_5 = _mm_mul_ps(self.twiddle11im, x5m18);
let t_b7_6 = _mm_mul_ps(self.twiddle4im, x6m17);
let t_b7_7 = _mm_mul_ps(self.twiddle3im, x7m16);
let t_b7_8 = _mm_mul_ps(self.twiddle10im, x8m15);
let t_b7_9 = _mm_mul_ps(self.twiddle6im, x9m14);
let t_b7_10 = _mm_mul_ps(self.twiddle1im, x10m13);
let t_b7_11 = _mm_mul_ps(self.twiddle8im, x11m12);
let t_b8_1 = _mm_mul_ps(self.twiddle8im, x1m22);
let t_b8_2 = _mm_mul_ps(self.twiddle7im, x2m21);
let t_b8_3 = _mm_mul_ps(self.twiddle1im, x3m20);
let t_b8_4 = _mm_mul_ps(self.twiddle9im, x4m19);
let t_b8_5 = _mm_mul_ps(self.twiddle6im, x5m18);
let t_b8_6 = _mm_mul_ps(self.twiddle2im, x6m17);
let t_b8_7 = _mm_mul_ps(self.twiddle10im, x7m16);
let t_b8_8 = _mm_mul_ps(self.twiddle5im, x8m15);
let t_b8_9 = _mm_mul_ps(self.twiddle3im, x9m14);
let t_b8_10 = _mm_mul_ps(self.twiddle11im, x10m13);
let t_b8_11 = _mm_mul_ps(self.twiddle4im, x11m12);
let t_b9_1 = _mm_mul_ps(self.twiddle9im, x1m22);
let t_b9_2 = _mm_mul_ps(self.twiddle5im, x2m21);
let t_b9_3 = _mm_mul_ps(self.twiddle4im, x3m20);
let t_b9_4 = _mm_mul_ps(self.twiddle10im, x4m19);
let t_b9_5 = _mm_mul_ps(self.twiddle1im, x5m18);
let t_b9_6 = _mm_mul_ps(self.twiddle8im, x6m17);
let t_b9_7 = _mm_mul_ps(self.twiddle6im, x7m16);
let t_b9_8 = _mm_mul_ps(self.twiddle3im, x8m15);
let t_b9_9 = _mm_mul_ps(self.twiddle11im, x9m14);
let t_b9_10 = _mm_mul_ps(self.twiddle2im, x10m13);
let t_b9_11 = _mm_mul_ps(self.twiddle7im, x11m12);
let t_b10_1 = _mm_mul_ps(self.twiddle10im, x1m22);
let t_b10_2 = _mm_mul_ps(self.twiddle3im, x2m21);
let t_b10_3 = _mm_mul_ps(self.twiddle7im, x3m20);
let t_b10_4 = _mm_mul_ps(self.twiddle6im, x4m19);
let t_b10_5 = _mm_mul_ps(self.twiddle4im, x5m18);
let t_b10_6 = _mm_mul_ps(self.twiddle9im, x6m17);
let t_b10_7 = _mm_mul_ps(self.twiddle1im, x7m16);
let t_b10_8 = _mm_mul_ps(self.twiddle11im, x8m15);
let t_b10_9 = _mm_mul_ps(self.twiddle2im, x9m14);
let t_b10_10 = _mm_mul_ps(self.twiddle8im, x10m13);
let t_b10_11 = _mm_mul_ps(self.twiddle5im, x11m12);
let t_b11_1 = _mm_mul_ps(self.twiddle11im, x1m22);
let t_b11_2 = _mm_mul_ps(self.twiddle1im, x2m21);
let t_b11_3 = _mm_mul_ps(self.twiddle10im, x3m20);
let t_b11_4 = _mm_mul_ps(self.twiddle2im, x4m19);
let t_b11_5 = _mm_mul_ps(self.twiddle9im, x5m18);
let t_b11_6 = _mm_mul_ps(self.twiddle3im, x6m17);
let t_b11_7 = _mm_mul_ps(self.twiddle8im, x7m16);
let t_b11_8 = _mm_mul_ps(self.twiddle4im, x8m15);
let t_b11_9 = _mm_mul_ps(self.twiddle7im, x9m14);
let t_b11_10 = _mm_mul_ps(self.twiddle5im, x10m13);
let t_b11_11 = _mm_mul_ps(self.twiddle6im, x11m12);
let x0 = values[0];
// Accumulate the weighted sums. The t_a sums all add; the t_b sums carry
// generated +/- sign patterns (from the twiddle-index wrap-around mod 23).
let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11);
let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11);
let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11);
let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11);
let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11);
let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11);
let t_a7 = calc_f32!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11);
let t_a8 = calc_f32!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11);
let t_a9 = calc_f32!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11);
let t_a10 = calc_f32!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + t_a10_11);
let t_a11 = calc_f32!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + t_a11_11);
let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11);
let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8 - t_b2_9 - t_b2_10 - t_b2_11);
let t_b3 = calc_f32!(t_b3_1 + t_b3_2 + t_b3_3 - t_b3_4 - t_b3_5 - t_b3_6 - t_b3_7 + t_b3_8 + t_b3_9 + t_b3_10 + t_b3_11);
let t_b4 = calc_f32!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 - t_b4_5 + t_b4_6 + t_b4_7 + t_b4_8 - t_b4_9 - t_b4_10 - t_b4_11);
let t_b5 = calc_f32!(t_b5_1 + t_b5_2 - t_b5_3 - t_b5_4 + t_b5_5 + t_b5_6 - t_b5_7 - t_b5_8 - t_b5_9 + t_b5_10 + t_b5_11);
let t_b6 = calc_f32!(t_b6_1 - t_b6_2 - t_b6_3 + t_b6_4 + t_b6_5 - t_b6_6 - t_b6_7 + t_b6_8 + t_b6_9 - t_b6_10 - t_b6_11);
let t_b7 = calc_f32!(t_b7_1 - t_b7_2 - t_b7_3 + t_b7_4 - t_b7_5 - t_b7_6 + t_b7_7 + t_b7_8 - t_b7_9 + t_b7_10 + t_b7_11);
let t_b8 = calc_f32!(t_b8_1 - t_b8_2 + t_b8_3 + t_b8_4 - t_b8_5 + t_b8_6 + t_b8_7 - t_b8_8 + t_b8_9 + t_b8_10 - t_b8_11);
let t_b9 = calc_f32!(t_b9_1 - t_b9_2 + t_b9_3 - t_b9_4 - t_b9_5 + t_b9_6 - t_b9_7 + t_b9_8 - t_b9_9 - t_b9_10 + t_b9_11);
let t_b10 = calc_f32!(t_b10_1 - t_b10_2 + t_b10_3 - t_b10_4 + t_b10_5 - t_b10_6 + t_b10_7 + t_b10_8 - t_b10_9 + t_b10_10 - t_b10_11);
let t_b11 = calc_f32!(t_b11_1 - t_b11_2 + t_b11_3 - t_b11_4 + t_b11_5 - t_b11_6 + t_b11_7 - t_b11_8 + t_b11_9 - t_b11_10 + t_b11_11);
// Multiply each imaginary-part sum by i (90° rotation of both lanes).
let t_b1_rot = self.rotate.rotate_both(t_b1);
let t_b2_rot = self.rotate.rotate_both(t_b2);
let t_b3_rot = self.rotate.rotate_both(t_b3);
let t_b4_rot = self.rotate.rotate_both(t_b4);
let t_b5_rot = self.rotate.rotate_both(t_b5);
let t_b6_rot = self.rotate.rotate_both(t_b6);
let t_b7_rot = self.rotate.rotate_both(t_b7);
let t_b8_rot = self.rotate.rotate_both(t_b8);
let t_b9_rot = self.rotate.rotate_both(t_b9);
let t_b10_rot = self.rotate.rotate_both(t_b10);
let t_b11_rot = self.rotate.rotate_both(t_b11);
// y0 is the plain sum of all inputs (DC term).
let y0 = calc_f32!(x0 + x1p22 + x2p21 + x3p20 + x4p19 + x5p18 + x6p17 + x7p16 + x8p15 + x9p14 + x10p13 + x11p12);
// Final radix-2 recombination produces y[n] and y[23-n] together.
let [y1, y22] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot);
let [y2, y21] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot);
let [y3, y20] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot);
let [y4, y19] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot);
let [y5, y18] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot);
let [y6, y17] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot);
let [y7, y16] = parallel_fft2_interleaved_f32(t_a7, t_b7_rot);
let [y8, y15] = parallel_fft2_interleaved_f32(t_a8, t_b8_rot);
let [y9, y14] = parallel_fft2_interleaved_f32(t_a9, t_b9_rot);
let [y10, y13] = parallel_fft2_interleaved_f32(t_a10, t_b10_rot);
let [y11, y12] = parallel_fft2_interleaved_f32(t_a11, t_b11_rot);
[y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22]
}
}
/// SSE-accelerated 23-point FFT butterfly for f64.
///
/// Stores precomputed twiddle factors 1..=11 with the real and imaginary
/// parts kept in separate `__m128d` registers; each register holds the same
/// scalar in both f64 lanes (see `new`), matching one complex value per vector.
/// `rotate` performs the multiply-by-i step used in the final recombination.
pub struct SseF64Butterfly23<T> {
direction: FftDirection,
_phantom: std::marker::PhantomData<T>,
rotate: Rotate90F64,
twiddle1re: __m128d,
twiddle1im: __m128d,
twiddle2re: __m128d,
twiddle2im: __m128d,
twiddle3re: __m128d,
twiddle3im: __m128d,
twiddle4re: __m128d,
twiddle4im: __m128d,
twiddle5re: __m128d,
twiddle5im: __m128d,
twiddle6re: __m128d,
twiddle6im: __m128d,
twiddle7re: __m128d,
twiddle7im: __m128d,
twiddle8re: __m128d,
twiddle8im: __m128d,
twiddle9re: __m128d,
twiddle9im: __m128d,
twiddle10re: __m128d,
twiddle10im: __m128d,
twiddle11re: __m128d,
twiddle11im: __m128d,
}
// Boilerplate macros: presumably generate the `Fft`/`Length`/`Direction` trait
// plumbing for the fixed-size (len 23) f64 butterfly — see the macro
// definitions for the expansion. NOTE(review): macro bodies are not visible here.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly23, 23, |this: &SseF64Butterfly23<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly23, 23, |this: &SseF64Butterfly23<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly23<T> {
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
let rotate = Rotate90F64::new(true);
let tw1: Complex<f64> = twiddles::compute_twiddle(1, 23, direction);
let tw2: Complex<f64> = twiddles::compute_twiddle(2, 23, direction);
let tw3: Complex<f64> = twiddles::compute_twiddle(3, 23, direction);
let tw4: Complex<f64> = twiddles::compute_twiddle(4, 23, direction);
let tw5: Complex<f64> = twiddles::compute_twiddle(5, 23, direction);
let tw6: Complex<f64> = twiddles::compute_twiddle(6, 23, direction);
let tw7: Complex<f64> = twiddles::compute_twiddle(7, 23, direction);
let tw8: Complex<f64> = twiddles::compute_twiddle(8, 23, direction);
let tw9: Complex<f64> = twiddles::compute_twiddle(9, 23, direction);
let tw10: Complex<f64> = twiddles::compute_twiddle(10, 23, direction);
let tw11: Complex<f64> = twiddles::compute_twiddle(11, 23, direction);
let twiddle1re = unsafe { _mm_set_pd(tw1.re, tw1.re) };
let twiddle1im = unsafe { _mm_set_pd(tw1.im, tw1.im) };
let twiddle2re = unsafe { _mm_set_pd(tw2.re, tw2.re) };
let twiddle2im = unsafe { _mm_set_pd(tw2.im, tw2.im) };
let twiddle3re = unsafe { _mm_set_pd(tw3.re, tw3.re) };
let twiddle3im = unsafe { _mm_set_pd(tw3.im, tw3.im) };
let twiddle4re = unsafe { _mm_set_pd(tw4.re, tw4.re) };
let twiddle4im = unsafe { _mm_set_pd(tw4.im, tw4.im) };
let twiddle5re = unsafe { _mm_set_pd(tw5.re, tw5.re) };
let twiddle5im = unsafe { _mm_set_pd(tw5.im, tw5.im) };
let twiddle6re = unsafe { _mm_set_pd(tw6.re, tw6.re) };
let twiddle6im = unsafe { _mm_set_pd(tw6.im, tw6.im) };
let twiddle7re = unsafe { _mm_set_pd(tw7.re, tw7.re) };
let twiddle7im = unsafe { _mm_set_pd(tw7.im, tw7.im) };
let twiddle8re = unsafe { _mm_set_pd(tw8.re, tw8.re) };
let twiddle8im = unsafe { _mm_set_pd(tw8.im, tw8.im) };
let twiddle9re = unsafe { _mm_set_pd(tw9.re, tw9.re) };
let twiddle9im = unsafe { _mm_set_pd(tw9.im, tw9.im) };
let twiddle10re = unsafe { _mm_set_pd(tw10.re, tw10.re) };
let twiddle10im = unsafe { _mm_set_pd(tw10.im, tw10.im) };
let twiddle11re = unsafe { _mm_set_pd(tw11.re, tw11.re) };
let twiddle11im = unsafe { _mm_set_pd(tw11.im, tw11.im) };
Self {
direction,
_phantom: std::marker::PhantomData,
rotate,
twiddle1re,
twiddle1im,
twiddle2re,
twiddle2im,
twiddle3re,
twiddle3im,
twiddle4re,
twiddle4im,
twiddle5re,
twiddle5im,
twiddle6re,
twiddle6im,
twiddle7re,
twiddle7im,
twiddle8re,
twiddle8im,
twiddle9re,
twiddle9im,
twiddle10re,
twiddle10im,
twiddle11re,
twiddle11im,
}
}
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22});
let out = self.perform_fft_direct(values);
write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22});
}
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 23]) -> [__m128d; 23] {
let [x1p22, x1m22] = solo_fft2_f64(values[1], values[22]);
let [x2p21, x2m21] = solo_fft2_f64(values[2], values[21]);
let [x3p20, x3m20] = solo_fft2_f64(values[3], values[20]);
let [x4p19, x4m19] = solo_fft2_f64(values[4], values[19]);
let [x5p18, x5m18] = solo_fft2_f64(values[5], values[18]);
let [x6p17, x6m17] = solo_fft2_f64(values[6], values[17]);
let [x7p16, x7m16] = solo_fft2_f64(values[7], values[16]);
let [x8p15, x8m15] = solo_fft2_f64(values[8], values[15]);
let [x9p14, x9m14] = solo_fft2_f64(values[9], values[14]);
let [x10p13, x10m13] = solo_fft2_f64(values[10], values[13]);
let [x11p12, x11m12] = solo_fft2_f64(values[11], values[12]);
let t_a1_1 = _mm_mul_pd(self.twiddle1re, x1p22);
let t_a1_2 = _mm_mul_pd(self.twiddle2re, x2p21);
let t_a1_3 = _mm_mul_pd(self.twiddle3re, x3p20);
let t_a1_4 = _mm_mul_pd(self.twiddle4re, x4p19);
let t_a1_5 = _mm_mul_pd(self.twiddle5re, x5p18);
let t_a1_6 = _mm_mul_pd(self.twiddle6re, x6p17);
let t_a1_7 = _mm_mul_pd(self.twiddle7re, x7p16);
let t_a1_8 = _mm_mul_pd(self.twiddle8re, x8p15);
let t_a1_9 = _mm_mul_pd(self.twiddle9re, x9p14);
let t_a1_10 = _mm_mul_pd(self.twiddle10re, x10p13);
let t_a1_11 = _mm_mul_pd(self.twiddle11re, x11p12);
let t_a2_1 = _mm_mul_pd(self.twiddle2re, x1p22);
let t_a2_2 = _mm_mul_pd(self.twiddle4re, x2p21);
let t_a2_3 = _mm_mul_pd(self.twiddle6re, x3p20);
let t_a2_4 = _mm_mul_pd(self.twiddle8re, x4p19);
let t_a2_5 = _mm_mul_pd(self.twiddle10re, x5p18);
let t_a2_6 = _mm_mul_pd(self.twiddle11re, x6p17);
let t_a2_7 = _mm_mul_pd(self.twiddle9re, x7p16);
let t_a2_8 = _mm_mul_pd(self.twiddle7re, x8p15);
let t_a2_9 = _mm_mul_pd(self.twiddle5re, x9p14);
let t_a2_10 = _mm_mul_pd(self.twiddle3re, x10p13);
let t_a2_11 = _mm_mul_pd(self.twiddle1re, x11p12);
let t_a3_1 = _mm_mul_pd(self.twiddle3re, x1p22);
let t_a3_2 = _mm_mul_pd(self.twiddle6re, x2p21);
let t_a3_3 = _mm_mul_pd(self.twiddle9re, x3p20);
let t_a3_4 = _mm_mul_pd(self.twiddle11re, x4p19);
let t_a3_5 = _mm_mul_pd(self.twiddle8re, x5p18);
let t_a3_6 = _mm_mul_pd(self.twiddle5re, x6p17);
let t_a3_7 = _mm_mul_pd(self.twiddle2re, x7p16);
let t_a3_8 = _mm_mul_pd(self.twiddle1re, x8p15);
let t_a3_9 = _mm_mul_pd(self.twiddle4re, x9p14);
let t_a3_10 = _mm_mul_pd(self.twiddle7re, x10p13);
let t_a3_11 = _mm_mul_pd(self.twiddle10re, x11p12);
let t_a4_1 = _mm_mul_pd(self.twiddle4re, x1p22);
let t_a4_2 = _mm_mul_pd(self.twiddle8re, x2p21);
let t_a4_3 = _mm_mul_pd(self.twiddle11re, x3p20);
let t_a4_4 = _mm_mul_pd(self.twiddle7re, x4p19);
let t_a4_5 = _mm_mul_pd(self.twiddle3re, x5p18);
let t_a4_6 = _mm_mul_pd(self.twiddle1re, x6p17);
let t_a4_7 = _mm_mul_pd(self.twiddle5re, x7p16);
let t_a4_8 = _mm_mul_pd(self.twiddle9re, x8p15);
let t_a4_9 = _mm_mul_pd(self.twiddle10re, x9p14);
let t_a4_10 = _mm_mul_pd(self.twiddle6re, x10p13);
let t_a4_11 = _mm_mul_pd(self.twiddle2re, x11p12);
let t_a5_1 = _mm_mul_pd(self.twiddle5re, x1p22);
let t_a5_2 = _mm_mul_pd(self.twiddle10re, x2p21);
let t_a5_3 = _mm_mul_pd(self.twiddle8re, x3p20);
let t_a5_4 = _mm_mul_pd(self.twiddle3re, x4p19);
let t_a5_5 = _mm_mul_pd(self.twiddle2re, x5p18);
let t_a5_6 = _mm_mul_pd(self.twiddle7re, x6p17);
let t_a5_7 = _mm_mul_pd(self.twiddle11re, x7p16);
let t_a5_8 = _mm_mul_pd(self.twiddle6re, x8p15);
let t_a5_9 = _mm_mul_pd(self.twiddle1re, x9p14);
let t_a5_10 = _mm_mul_pd(self.twiddle4re, x10p13);
let t_a5_11 = _mm_mul_pd(self.twiddle9re, x11p12);
let t_a6_1 = _mm_mul_pd(self.twiddle6re, x1p22);
let t_a6_2 = _mm_mul_pd(self.twiddle11re, x2p21);
let t_a6_3 = _mm_mul_pd(self.twiddle5re, x3p20);
let t_a6_4 = _mm_mul_pd(self.twiddle1re, x4p19);
let t_a6_5 = _mm_mul_pd(self.twiddle7re, x5p18);
let t_a6_6 = _mm_mul_pd(self.twiddle10re, x6p17);
let t_a6_7 = _mm_mul_pd(self.twiddle4re, x7p16);
let t_a6_8 = _mm_mul_pd(self.twiddle2re, x8p15);
let t_a6_9 = _mm_mul_pd(self.twiddle8re, x9p14);
let t_a6_10 = _mm_mul_pd(self.twiddle9re, x10p13);
let t_a6_11 = _mm_mul_pd(self.twiddle3re, x11p12);
let t_a7_1 = _mm_mul_pd(self.twiddle7re, x1p22);
let t_a7_2 = _mm_mul_pd(self.twiddle9re, x2p21);
let t_a7_3 = _mm_mul_pd(self.twiddle2re, x3p20);
let t_a7_4 = _mm_mul_pd(self.twiddle5re, x4p19);
let t_a7_5 = _mm_mul_pd(self.twiddle11re, x5p18);
let t_a7_6 = _mm_mul_pd(self.twiddle4re, x6p17);
let t_a7_7 = _mm_mul_pd(self.twiddle3re, x7p16);
let t_a7_8 = _mm_mul_pd(self.twiddle10re, x8p15);
let t_a7_9 = _mm_mul_pd(self.twiddle6re, x9p14);
let t_a7_10 = _mm_mul_pd(self.twiddle1re, x10p13);
let t_a7_11 = _mm_mul_pd(self.twiddle8re, x11p12);
let t_a8_1 = _mm_mul_pd(self.twiddle8re, x1p22);
let t_a8_2 = _mm_mul_pd(self.twiddle7re, x2p21);
let t_a8_3 = _mm_mul_pd(self.twiddle1re, x3p20);
let t_a8_4 = _mm_mul_pd(self.twiddle9re, x4p19);
let t_a8_5 = _mm_mul_pd(self.twiddle6re, x5p18);
let t_a8_6 = _mm_mul_pd(self.twiddle2re, x6p17);
let t_a8_7 = _mm_mul_pd(self.twiddle10re, x7p16);
let t_a8_8 = _mm_mul_pd(self.twiddle5re, x8p15);
let t_a8_9 = _mm_mul_pd(self.twiddle3re, x9p14);
let t_a8_10 = _mm_mul_pd(self.twiddle11re, x10p13);
let t_a8_11 = _mm_mul_pd(self.twiddle4re, x11p12);
let t_a9_1 = _mm_mul_pd(self.twiddle9re, x1p22);
let t_a9_2 = _mm_mul_pd(self.twiddle5re, x2p21);
let t_a9_3 = _mm_mul_pd(self.twiddle4re, x3p20);
let t_a9_4 = _mm_mul_pd(self.twiddle10re, x4p19);
let t_a9_5 = _mm_mul_pd(self.twiddle1re, x5p18);
let t_a9_6 = _mm_mul_pd(self.twiddle8re, x6p17);
let t_a9_7 = _mm_mul_pd(self.twiddle6re, x7p16);
let t_a9_8 = _mm_mul_pd(self.twiddle3re, x8p15);
let t_a9_9 = _mm_mul_pd(self.twiddle11re, x9p14);
let t_a9_10 = _mm_mul_pd(self.twiddle2re, x10p13);
let t_a9_11 = _mm_mul_pd(self.twiddle7re, x11p12);
let t_a10_1 = _mm_mul_pd(self.twiddle10re, x1p22);
let t_a10_2 = _mm_mul_pd(self.twiddle3re, x2p21);
let t_a10_3 = _mm_mul_pd(self.twiddle7re, x3p20);
let t_a10_4 = _mm_mul_pd(self.twiddle6re, x4p19);
let t_a10_5 = _mm_mul_pd(self.twiddle4re, x5p18);
let t_a10_6 = _mm_mul_pd(self.twiddle9re, x6p17);
let t_a10_7 = _mm_mul_pd(self.twiddle1re, x7p16);
let t_a10_8 = _mm_mul_pd(self.twiddle11re, x8p15);
let t_a10_9 = _mm_mul_pd(self.twiddle2re, x9p14);
let t_a10_10 = _mm_mul_pd(self.twiddle8re, x10p13);
let t_a10_11 = _mm_mul_pd(self.twiddle5re, x11p12);
let t_a11_1 = _mm_mul_pd(self.twiddle11re, x1p22);
let t_a11_2 = _mm_mul_pd(self.twiddle1re, x2p21);
let t_a11_3 = _mm_mul_pd(self.twiddle10re, x3p20);
let t_a11_4 = _mm_mul_pd(self.twiddle2re, x4p19);
let t_a11_5 = _mm_mul_pd(self.twiddle9re, x5p18);
let t_a11_6 = _mm_mul_pd(self.twiddle3re, x6p17);
let t_a11_7 = _mm_mul_pd(self.twiddle8re, x7p16);
let t_a11_8 = _mm_mul_pd(self.twiddle4re, x8p15);
let t_a11_9 = _mm_mul_pd(self.twiddle7re, x9p14);
let t_a11_10 = _mm_mul_pd(self.twiddle5re, x10p13);
let t_a11_11 = _mm_mul_pd(self.twiddle6re, x11p12);
let t_b1_1 = _mm_mul_pd(self.twiddle1im, x1m22);
let t_b1_2 = _mm_mul_pd(self.twiddle2im, x2m21);
let t_b1_3 = _mm_mul_pd(self.twiddle3im, x3m20);
let t_b1_4 = _mm_mul_pd(self.twiddle4im, x4m19);
let t_b1_5 = _mm_mul_pd(self.twiddle5im, x5m18);
let t_b1_6 = _mm_mul_pd(self.twiddle6im, x6m17);
let t_b1_7 = _mm_mul_pd(self.twiddle7im, x7m16);
let t_b1_8 = _mm_mul_pd(self.twiddle8im, x8m15);
let t_b1_9 = _mm_mul_pd(self.twiddle9im, x9m14);
let t_b1_10 = _mm_mul_pd(self.twiddle10im, x10m13);
let t_b1_11 = _mm_mul_pd(self.twiddle11im, x11m12);
let t_b2_1 = _mm_mul_pd(self.twiddle2im, x1m22);
let t_b2_2 = _mm_mul_pd(self.twiddle4im, x2m21);
let t_b2_3 = _mm_mul_pd(self.twiddle6im, x3m20);
let t_b2_4 = _mm_mul_pd(self.twiddle8im, x4m19);
let t_b2_5 = _mm_mul_pd(self.twiddle10im, x5m18);
let t_b2_6 = _mm_mul_pd(self.twiddle11im, x6m17);
let t_b2_7 = _mm_mul_pd(self.twiddle9im, x7m16);
let t_b2_8 = _mm_mul_pd(self.twiddle7im, x8m15);
let t_b2_9 = _mm_mul_pd(self.twiddle5im, x9m14);
let t_b2_10 = _mm_mul_pd(self.twiddle3im, x10m13);
let t_b2_11 = _mm_mul_pd(self.twiddle1im, x11m12);
let t_b3_1 = _mm_mul_pd(self.twiddle3im, x1m22);
let t_b3_2 = _mm_mul_pd(self.twiddle6im, x2m21);
let t_b3_3 = _mm_mul_pd(self.twiddle9im, x3m20);
let t_b3_4 = _mm_mul_pd(self.twiddle11im, x4m19);
let t_b3_5 = _mm_mul_pd(self.twiddle8im, x5m18);
let t_b3_6 = _mm_mul_pd(self.twiddle5im, x6m17);
let t_b3_7 = _mm_mul_pd(self.twiddle2im, x7m16);
let t_b3_8 = _mm_mul_pd(self.twiddle1im, x8m15);
let t_b3_9 = _mm_mul_pd(self.twiddle4im, x9m14);
let t_b3_10 = _mm_mul_pd(self.twiddle7im, x10m13);
let t_b3_11 = _mm_mul_pd(self.twiddle10im, x11m12);
let t_b4_1 = _mm_mul_pd(self.twiddle4im, x1m22);
let t_b4_2 = _mm_mul_pd(self.twiddle8im, x2m21);
let t_b4_3 = _mm_mul_pd(self.twiddle11im, x3m20);
let t_b4_4 = _mm_mul_pd(self.twiddle7im, x4m19);
let t_b4_5 = _mm_mul_pd(self.twiddle3im, x5m18);
let t_b4_6 = _mm_mul_pd(self.twiddle1im, x6m17);
let t_b4_7 = _mm_mul_pd(self.twiddle5im, x7m16);
let t_b4_8 = _mm_mul_pd(self.twiddle9im, x8m15);
let t_b4_9 = _mm_mul_pd(self.twiddle10im, x9m14);
let t_b4_10 = _mm_mul_pd(self.twiddle6im, x10m13);
let t_b4_11 = _mm_mul_pd(self.twiddle2im, x11m12);
let t_b5_1 = _mm_mul_pd(self.twiddle5im, x1m22);
let t_b5_2 = _mm_mul_pd(self.twiddle10im, x2m21);
let t_b5_3 = _mm_mul_pd(self.twiddle8im, x3m20);
let t_b5_4 = _mm_mul_pd(self.twiddle3im, x4m19);
let t_b5_5 = _mm_mul_pd(self.twiddle2im, x5m18);
let t_b5_6 = _mm_mul_pd(self.twiddle7im, x6m17);
let t_b5_7 = _mm_mul_pd(self.twiddle11im, x7m16);
let t_b5_8 = _mm_mul_pd(self.twiddle6im, x8m15);
let t_b5_9 = _mm_mul_pd(self.twiddle1im, x9m14);
let t_b5_10 = _mm_mul_pd(self.twiddle4im, x10m13);
let t_b5_11 = _mm_mul_pd(self.twiddle9im, x11m12);
let t_b6_1 = _mm_mul_pd(self.twiddle6im, x1m22);
let t_b6_2 = _mm_mul_pd(self.twiddle11im, x2m21);
let t_b6_3 = _mm_mul_pd(self.twiddle5im, x3m20);
let t_b6_4 = _mm_mul_pd(self.twiddle1im, x4m19);
let t_b6_5 = _mm_mul_pd(self.twiddle7im, x5m18);
let t_b6_6 = _mm_mul_pd(self.twiddle10im, x6m17);
let t_b6_7 = _mm_mul_pd(self.twiddle4im, x7m16);
let t_b6_8 = _mm_mul_pd(self.twiddle2im, x8m15);
let t_b6_9 = _mm_mul_pd(self.twiddle8im, x9m14);
let t_b6_10 = _mm_mul_pd(self.twiddle9im, x10m13);
let t_b6_11 = _mm_mul_pd(self.twiddle3im, x11m12);
let t_b7_1 = _mm_mul_pd(self.twiddle7im, x1m22);
let t_b7_2 = _mm_mul_pd(self.twiddle9im, x2m21);
let t_b7_3 = _mm_mul_pd(self.twiddle2im, x3m20);
let t_b7_4 = _mm_mul_pd(self.twiddle5im, x4m19);
let t_b7_5 = _mm_mul_pd(self.twiddle11im, x5m18);
let t_b7_6 = _mm_mul_pd(self.twiddle4im, x6m17);
let t_b7_7 = _mm_mul_pd(self.twiddle3im, x7m16);
let t_b7_8 = _mm_mul_pd(self.twiddle10im, x8m15);
let t_b7_9 = _mm_mul_pd(self.twiddle6im, x9m14);
let t_b7_10 = _mm_mul_pd(self.twiddle1im, x10m13);
let t_b7_11 = _mm_mul_pd(self.twiddle8im, x11m12);
let t_b8_1 = _mm_mul_pd(self.twiddle8im, x1m22);
let t_b8_2 = _mm_mul_pd(self.twiddle7im, x2m21);
let t_b8_3 = _mm_mul_pd(self.twiddle1im, x3m20);
let t_b8_4 = _mm_mul_pd(self.twiddle9im, x4m19);
let t_b8_5 = _mm_mul_pd(self.twiddle6im, x5m18);
let t_b8_6 = _mm_mul_pd(self.twiddle2im, x6m17);
let t_b8_7 = _mm_mul_pd(self.twiddle10im, x7m16);
let t_b8_8 = _mm_mul_pd(self.twiddle5im, x8m15);
let t_b8_9 = _mm_mul_pd(self.twiddle3im, x9m14);
let t_b8_10 = _mm_mul_pd(self.twiddle11im, x10m13);
let t_b8_11 = _mm_mul_pd(self.twiddle4im, x11m12);
let t_b9_1 = _mm_mul_pd(self.twiddle9im, x1m22);
let t_b9_2 = _mm_mul_pd(self.twiddle5im, x2m21);
let t_b9_3 = _mm_mul_pd(self.twiddle4im, x3m20);
let t_b9_4 = _mm_mul_pd(self.twiddle10im, x4m19);
let t_b9_5 = _mm_mul_pd(self.twiddle1im, x5m18);
let t_b9_6 = _mm_mul_pd(self.twiddle8im, x6m17);
let t_b9_7 = _mm_mul_pd(self.twiddle6im, x7m16);
let t_b9_8 = _mm_mul_pd(self.twiddle3im, x8m15);
let t_b9_9 = _mm_mul_pd(self.twiddle11im, x9m14);
let t_b9_10 = _mm_mul_pd(self.twiddle2im, x10m13);
let t_b9_11 = _mm_mul_pd(self.twiddle7im, x11m12);
let t_b10_1 = _mm_mul_pd(self.twiddle10im, x1m22);
let t_b10_2 = _mm_mul_pd(self.twiddle3im, x2m21);
let t_b10_3 = _mm_mul_pd(self.twiddle7im, x3m20);
let t_b10_4 = _mm_mul_pd(self.twiddle6im, x4m19);
let t_b10_5 = _mm_mul_pd(self.twiddle4im, x5m18);
let t_b10_6 = _mm_mul_pd(self.twiddle9im, x6m17);
let t_b10_7 = _mm_mul_pd(self.twiddle1im, x7m16);
let t_b10_8 = _mm_mul_pd(self.twiddle11im, x8m15);
let t_b10_9 = _mm_mul_pd(self.twiddle2im, x9m14);
let t_b10_10 = _mm_mul_pd(self.twiddle8im, x10m13);
let t_b10_11 = _mm_mul_pd(self.twiddle5im, x11m12);
let t_b11_1 = _mm_mul_pd(self.twiddle11im, x1m22);
let t_b11_2 = _mm_mul_pd(self.twiddle1im, x2m21);
let t_b11_3 = _mm_mul_pd(self.twiddle10im, x3m20);
let t_b11_4 = _mm_mul_pd(self.twiddle2im, x4m19);
let t_b11_5 = _mm_mul_pd(self.twiddle9im, x5m18);
let t_b11_6 = _mm_mul_pd(self.twiddle3im, x6m17);
let t_b11_7 = _mm_mul_pd(self.twiddle8im, x7m16);
let t_b11_8 = _mm_mul_pd(self.twiddle4im, x8m15);
let t_b11_9 = _mm_mul_pd(self.twiddle7im, x9m14);
let t_b11_10 = _mm_mul_pd(self.twiddle5im, x10m13);
let t_b11_11 = _mm_mul_pd(self.twiddle6im, x11m12);
let x0 = values[0];
let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11);
let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11);
let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11);
let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11);
let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11);
let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11);
let t_a7 = calc_f64!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11);
let t_a8 = calc_f64!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11);
let t_a9 = calc_f64!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11);
let t_a10 = calc_f64!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + t_a10_11);
let t_a11 = calc_f64!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + t_a11_11);
let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11);
let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8 - t_b2_9 - t_b2_10 - t_b2_11);
let t_b3 = calc_f64!(t_b3_1 + t_b3_2 + t_b3_3 - t_b3_4 - t_b3_5 - t_b3_6 - t_b3_7 + t_b3_8 + t_b3_9 + t_b3_10 + t_b3_11);
let t_b4 = calc_f64!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 - t_b4_5 + t_b4_6 + t_b4_7 + t_b4_8 - t_b4_9 - t_b4_10 - t_b4_11);
let t_b5 = calc_f64!(t_b5_1 + t_b5_2 - t_b5_3 - t_b5_4 + t_b5_5 + t_b5_6 - t_b5_7 - t_b5_8 - t_b5_9 + t_b5_10 + t_b5_11);
let t_b6 = calc_f64!(t_b6_1 - t_b6_2 - t_b6_3 + t_b6_4 + t_b6_5 - t_b6_6 - t_b6_7 + t_b6_8 + t_b6_9 - t_b6_10 - t_b6_11);
let t_b7 = calc_f64!(t_b7_1 - t_b7_2 - t_b7_3 + t_b7_4 - t_b7_5 - t_b7_6 + t_b7_7 + t_b7_8 - t_b7_9 + t_b7_10 + t_b7_11);
let t_b8 = calc_f64!(t_b8_1 - t_b8_2 + t_b8_3 + t_b8_4 - t_b8_5 + t_b8_6 + t_b8_7 - t_b8_8 + t_b8_9 + t_b8_10 - t_b8_11);
let t_b9 = calc_f64!(t_b9_1 - t_b9_2 + t_b9_3 - t_b9_4 - t_b9_5 + t_b9_6 - t_b9_7 + t_b9_8 - t_b9_9 - t_b9_10 + t_b9_11);
let t_b10 = calc_f64!(t_b10_1 - t_b10_2 + t_b10_3 - t_b10_4 + t_b10_5 - t_b10_6 + t_b10_7 + t_b10_8 - t_b10_9 + t_b10_10 - t_b10_11);
let t_b11 = calc_f64!(t_b11_1 - t_b11_2 + t_b11_3 - t_b11_4 + t_b11_5 - t_b11_6 + t_b11_7 - t_b11_8 + t_b11_9 - t_b11_10 + t_b11_11);
let t_b1_rot = self.rotate.rotate(t_b1);
let t_b2_rot = self.rotate.rotate(t_b2);
let t_b3_rot = self.rotate.rotate(t_b3);
let t_b4_rot = self.rotate.rotate(t_b4);
let t_b5_rot = self.rotate.rotate(t_b5);
let t_b6_rot = self.rotate.rotate(t_b6);
let t_b7_rot = self.rotate.rotate(t_b7);
let t_b8_rot = self.rotate.rotate(t_b8);
let t_b9_rot = self.rotate.rotate(t_b9);
let t_b10_rot = self.rotate.rotate(t_b10);
let t_b11_rot = self.rotate.rotate(t_b11);
let y0 = calc_f64!(x0 + x1p22 + x2p21 + x3p20 + x4p19 + x5p18 + x6p17 + x7p16 + x8p15 + x9p14 + x10p13 + x11p12);
let [y1, y22] = solo_fft2_f64(t_a1, t_b1_rot);
let [y2, y21] = solo_fft2_f64(t_a2, t_b2_rot);
let [y3, y20] = solo_fft2_f64(t_a3, t_b3_rot);
let [y4, y19] = solo_fft2_f64(t_a4, t_b4_rot);
let [y5, y18] = solo_fft2_f64(t_a5, t_b5_rot);
let [y6, y17] = solo_fft2_f64(t_a6, t_b6_rot);
let [y7, y16] = solo_fft2_f64(t_a7, t_b7_rot);
let [y8, y15] = solo_fft2_f64(t_a8, t_b8_rot);
let [y9, y14] = solo_fft2_f64(t_a9, t_b9_rot);
let [y10, y13] = solo_fft2_f64(t_a10, t_b10_rot);
let [y11, y12] = solo_fft2_f64(t_a11, t_b11_rot);
[y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22]
}
}
/// Length-29 FFT butterfly computed with SSE (`__m128`) intrinsics on f32 data.
///
/// Each stored twiddle component is a single scalar broadcast across all four
/// f32 lanes of a `__m128` (see `new`), so one vector multiply applies the
/// factor to two interleaved complex values at once. Only twiddle factors
/// 1..=14 are kept; the direct kernel combines them with sign patterns in its
/// `t_a*`/`t_b*` accumulations to cover all 29 outputs.
pub struct SseF32Butterfly29<T> {
direction: FftDirection,
// Zero-sized marker tying the public type parameter `T` to this otherwise
// concrete struct; `new` checks T == f32 via `assert_f32`.
_phantom: std::marker::PhantomData<T>,
// Rotation-by-90-degrees helper, applied to the odd-symmetry sums
// (`t_b*_rot`) in the direct kernel.
rotate: Rotate90F32,
// Broadcast real/imaginary parts of twiddle factor k for a length-29 FFT
// (k = 1..=14), i.e. compute_twiddle(k, 29, direction) splat into all lanes.
twiddle1re: __m128,
twiddle1im: __m128,
twiddle2re: __m128,
twiddle2im: __m128,
twiddle3re: __m128,
twiddle3im: __m128,
twiddle4re: __m128,
twiddle4im: __m128,
twiddle5re: __m128,
twiddle5im: __m128,
twiddle6re: __m128,
twiddle6im: __m128,
twiddle7re: __m128,
twiddle7im: __m128,
twiddle8re: __m128,
twiddle8im: __m128,
twiddle9re: __m128,
twiddle9im: __m128,
twiddle10re: __m128,
twiddle10im: __m128,
twiddle11re: __m128,
twiddle11im: __m128,
twiddle12re: __m128,
twiddle12im: __m128,
twiddle13re: __m128,
twiddle13im: __m128,
twiddle14re: __m128,
twiddle14im: __m128,
}
// Crate-local boilerplate macros (defined elsewhere in this module tree) that
// hook SseF32Butterfly29 into the common Fft/Length/Direction plumbing.
// `29` is the FFT length handled by this butterfly and the closure extracts
// the stored direction for the generated trait impls.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly29, 29, |this: &SseF32Butterfly29<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly29, 29, |this: &SseF32Butterfly29<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly29<T> {
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let rotate = Rotate90F32::new(true);
let tw1: Complex<f32> = twiddles::compute_twiddle(1, 29, direction);
let tw2: Complex<f32> = twiddles::compute_twiddle(2, 29, direction);
let tw3: Complex<f32> = twiddles::compute_twiddle(3, 29, direction);
let tw4: Complex<f32> = twiddles::compute_twiddle(4, 29, direction);
let tw5: Complex<f32> = twiddles::compute_twiddle(5, 29, direction);
let tw6: Complex<f32> = twiddles::compute_twiddle(6, 29, direction);
let tw7: Complex<f32> = twiddles::compute_twiddle(7, 29, direction);
let tw8: Complex<f32> = twiddles::compute_twiddle(8, 29, direction);
let tw9: Complex<f32> = twiddles::compute_twiddle(9, 29, direction);
let tw10: Complex<f32> = twiddles::compute_twiddle(10, 29, direction);
let tw11: Complex<f32> = twiddles::compute_twiddle(11, 29, direction);
let tw12: Complex<f32> = twiddles::compute_twiddle(12, 29, direction);
let tw13: Complex<f32> = twiddles::compute_twiddle(13, 29, direction);
let tw14: Complex<f32> = twiddles::compute_twiddle(14, 29, direction);
let twiddle1re = unsafe { _mm_load1_ps(&tw1.re) };
let twiddle1im = unsafe { _mm_load1_ps(&tw1.im) };
let twiddle2re = unsafe { _mm_load1_ps(&tw2.re) };
let twiddle2im = unsafe { _mm_load1_ps(&tw2.im) };
let twiddle3re = unsafe { _mm_load1_ps(&tw3.re) };
let twiddle3im = unsafe { _mm_load1_ps(&tw3.im) };
let twiddle4re = unsafe { _mm_load1_ps(&tw4.re) };
let twiddle4im = unsafe { _mm_load1_ps(&tw4.im) };
let twiddle5re = unsafe { _mm_load1_ps(&tw5.re) };
let twiddle5im = unsafe { _mm_load1_ps(&tw5.im) };
let twiddle6re = unsafe { _mm_load1_ps(&tw6.re) };
let twiddle6im = unsafe { _mm_load1_ps(&tw6.im) };
let twiddle7re = unsafe { _mm_load1_ps(&tw7.re) };
let twiddle7im = unsafe { _mm_load1_ps(&tw7.im) };
let twiddle8re = unsafe { _mm_load1_ps(&tw8.re) };
let twiddle8im = unsafe { _mm_load1_ps(&tw8.im) };
let twiddle9re = unsafe { _mm_load1_ps(&tw9.re) };
let twiddle9im = unsafe { _mm_load1_ps(&tw9.im) };
let twiddle10re = unsafe { _mm_load1_ps(&tw10.re) };
let twiddle10im = unsafe { _mm_load1_ps(&tw10.im) };
let twiddle11re = unsafe { _mm_load1_ps(&tw11.re) };
let twiddle11im = unsafe { _mm_load1_ps(&tw11.im) };
let twiddle12re = unsafe { _mm_load1_ps(&tw12.re) };
let twiddle12im = unsafe { _mm_load1_ps(&tw12.im) };
let twiddle13re = unsafe { _mm_load1_ps(&tw13.re) };
let twiddle13im = unsafe { _mm_load1_ps(&tw13.im) };
let twiddle14re = unsafe { _mm_load1_ps(&tw14.re) };
let twiddle14im = unsafe { _mm_load1_ps(&tw14.im) };
Self {
direction,
_phantom: std::marker::PhantomData,
rotate,
twiddle1re,
twiddle1im,
twiddle2re,
twiddle2im,
twiddle3re,
twiddle3im,
twiddle4re,
twiddle4im,
twiddle5re,
twiddle5im,
twiddle6re,
twiddle6im,
twiddle7re,
twiddle7im,
twiddle8re,
twiddle8im,
twiddle9re,
twiddle9im,
twiddle10re,
twiddle10im,
twiddle11re,
twiddle11im,
twiddle12re,
twiddle12im,
twiddle13re,
twiddle13im,
twiddle14re,
twiddle14im,
}
}
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
// Single-FFT path reusing the dual-FFT kernel: load one complex f32 into
// each of the 29 vectors (partial load), run the parallel kernel, then
// store back only the low complex lane of each result
// (`write_partial_lo_...`); the other lane's work is discarded.
let values = read_partial1_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28});
let out = self.perform_parallel_fft_direct(values);
write_partial_lo_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28});
}
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
// Computes two length-29 FFTs at once. The buffer holds 58 complex f32
// values; each `__m128` read below packs two consecutive complex values.
let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56});
// Deinterleave: values[i] pairs element i of the first FFT (buffer[0..29])
// with element i of the second FFT (buffer[29..58]) in one vector, so the
// direct kernel processes both transforms lane-parallel.
let values = [
extract_lo_hi_f32(input_packed[0], input_packed[14]),
extract_hi_lo_f32(input_packed[0], input_packed[15]),
extract_lo_hi_f32(input_packed[1], input_packed[15]),
extract_hi_lo_f32(input_packed[1], input_packed[16]),
extract_lo_hi_f32(input_packed[2], input_packed[16]),
extract_hi_lo_f32(input_packed[2], input_packed[17]),
extract_lo_hi_f32(input_packed[3], input_packed[17]),
extract_hi_lo_f32(input_packed[3], input_packed[18]),
extract_lo_hi_f32(input_packed[4], input_packed[18]),
extract_hi_lo_f32(input_packed[4], input_packed[19]),
extract_lo_hi_f32(input_packed[5], input_packed[19]),
extract_hi_lo_f32(input_packed[5], input_packed[20]),
extract_lo_hi_f32(input_packed[6], input_packed[20]),
extract_hi_lo_f32(input_packed[6], input_packed[21]),
extract_lo_hi_f32(input_packed[7], input_packed[21]),
extract_hi_lo_f32(input_packed[7], input_packed[22]),
extract_lo_hi_f32(input_packed[8], input_packed[22]),
extract_hi_lo_f32(input_packed[8], input_packed[23]),
extract_lo_hi_f32(input_packed[9], input_packed[23]),
extract_hi_lo_f32(input_packed[9], input_packed[24]),
extract_lo_hi_f32(input_packed[10], input_packed[24]),
extract_hi_lo_f32(input_packed[10], input_packed[25]),
extract_lo_hi_f32(input_packed[11], input_packed[25]),
extract_hi_lo_f32(input_packed[11], input_packed[26]),
extract_lo_hi_f32(input_packed[12], input_packed[26]),
extract_hi_lo_f32(input_packed[12], input_packed[27]),
extract_lo_hi_f32(input_packed[13], input_packed[27]),
extract_hi_lo_f32(input_packed[13], input_packed[28]),
extract_lo_hi_f32(input_packed[14], input_packed[28]),
];
let out = self.perform_parallel_fft_direct(values);
// Re-interleave: gather all low lanes (first FFT's 29 outputs) followed by
// all high lanes (second FFT's 29 outputs) back into the packed layout.
let out_packed = [
extract_lo_lo_f32(out[0], out[1]),
extract_lo_lo_f32(out[2], out[3]),
extract_lo_lo_f32(out[4], out[5]),
extract_lo_lo_f32(out[6], out[7]),
extract_lo_lo_f32(out[8], out[9]),
extract_lo_lo_f32(out[10], out[11]),
extract_lo_lo_f32(out[12], out[13]),
extract_lo_lo_f32(out[14], out[15]),
extract_lo_lo_f32(out[16], out[17]),
extract_lo_lo_f32(out[18], out[19]),
extract_lo_lo_f32(out[20], out[21]),
extract_lo_lo_f32(out[22], out[23]),
extract_lo_lo_f32(out[24], out[25]),
extract_lo_lo_f32(out[26], out[27]),
extract_lo_hi_f32(out[28], out[0]),
extract_hi_hi_f32(out[1], out[2]),
extract_hi_hi_f32(out[3], out[4]),
extract_hi_hi_f32(out[5], out[6]),
extract_hi_hi_f32(out[7], out[8]),
extract_hi_hi_f32(out[9], out[10]),
extract_hi_hi_f32(out[11], out[12]),
extract_hi_hi_f32(out[13], out[14]),
extract_hi_hi_f32(out[15], out[16]),
extract_hi_hi_f32(out[17], out[18]),
extract_hi_hi_f32(out[19], out[20]),
extract_hi_hi_f32(out[21], out[22]),
extract_hi_hi_f32(out[23], out[24]),
extract_hi_hi_f32(out[25], out[26]),
extract_hi_hi_f32(out[27], out[28]),
];
write_complex_to_array_strided!(out_packed, buffer, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28});
}
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 29]) -> [__m128; 29] {
let [x1p28, x1m28] = parallel_fft2_interleaved_f32(values[1], values[28]);
let [x2p27, x2m27] = parallel_fft2_interleaved_f32(values[2], values[27]);
let [x3p26, x3m26] = parallel_fft2_interleaved_f32(values[3], values[26]);
let [x4p25, x4m25] = parallel_fft2_interleaved_f32(values[4], values[25]);
let [x5p24, x5m24] = parallel_fft2_interleaved_f32(values[5], values[24]);
let [x6p23, x6m23] = parallel_fft2_interleaved_f32(values[6], values[23]);
let [x7p22, x7m22] = parallel_fft2_interleaved_f32(values[7], values[22]);
let [x8p21, x8m21] = parallel_fft2_interleaved_f32(values[8], values[21]);
let [x9p20, x9m20] = parallel_fft2_interleaved_f32(values[9], values[20]);
let [x10p19, x10m19] = parallel_fft2_interleaved_f32(values[10], values[19]);
let [x11p18, x11m18] = parallel_fft2_interleaved_f32(values[11], values[18]);
let [x12p17, x12m17] = parallel_fft2_interleaved_f32(values[12], values[17]);
let [x13p16, x13m16] = parallel_fft2_interleaved_f32(values[13], values[16]);
let [x14p15, x14m15] = parallel_fft2_interleaved_f32(values[14], values[15]);
let t_a1_1 = _mm_mul_ps(self.twiddle1re, x1p28);
let t_a1_2 = _mm_mul_ps(self.twiddle2re, x2p27);
let t_a1_3 = _mm_mul_ps(self.twiddle3re, x3p26);
let t_a1_4 = _mm_mul_ps(self.twiddle4re, x4p25);
let t_a1_5 = _mm_mul_ps(self.twiddle5re, x5p24);
let t_a1_6 = _mm_mul_ps(self.twiddle6re, x6p23);
let t_a1_7 = _mm_mul_ps(self.twiddle7re, x7p22);
let t_a1_8 = _mm_mul_ps(self.twiddle8re, x8p21);
let t_a1_9 = _mm_mul_ps(self.twiddle9re, x9p20);
let t_a1_10 = _mm_mul_ps(self.twiddle10re, x10p19);
let t_a1_11 = _mm_mul_ps(self.twiddle11re, x11p18);
let t_a1_12 = _mm_mul_ps(self.twiddle12re, x12p17);
let t_a1_13 = _mm_mul_ps(self.twiddle13re, x13p16);
let t_a1_14 = _mm_mul_ps(self.twiddle14re, x14p15);
let t_a2_1 = _mm_mul_ps(self.twiddle2re, x1p28);
let t_a2_2 = _mm_mul_ps(self.twiddle4re, x2p27);
let t_a2_3 = _mm_mul_ps(self.twiddle6re, x3p26);
let t_a2_4 = _mm_mul_ps(self.twiddle8re, x4p25);
let t_a2_5 = _mm_mul_ps(self.twiddle10re, x5p24);
let t_a2_6 = _mm_mul_ps(self.twiddle12re, x6p23);
let t_a2_7 = _mm_mul_ps(self.twiddle14re, x7p22);
let t_a2_8 = _mm_mul_ps(self.twiddle13re, x8p21);
let t_a2_9 = _mm_mul_ps(self.twiddle11re, x9p20);
let t_a2_10 = _mm_mul_ps(self.twiddle9re, x10p19);
let t_a2_11 = _mm_mul_ps(self.twiddle7re, x11p18);
let t_a2_12 = _mm_mul_ps(self.twiddle5re, x12p17);
let t_a2_13 = _mm_mul_ps(self.twiddle3re, x13p16);
let t_a2_14 = _mm_mul_ps(self.twiddle1re, x14p15);
let t_a3_1 = _mm_mul_ps(self.twiddle3re, x1p28);
let t_a3_2 = _mm_mul_ps(self.twiddle6re, x2p27);
let t_a3_3 = _mm_mul_ps(self.twiddle9re, x3p26);
let t_a3_4 = _mm_mul_ps(self.twiddle12re, x4p25);
let t_a3_5 = _mm_mul_ps(self.twiddle14re, x5p24);
let t_a3_6 = _mm_mul_ps(self.twiddle11re, x6p23);
let t_a3_7 = _mm_mul_ps(self.twiddle8re, x7p22);
let t_a3_8 = _mm_mul_ps(self.twiddle5re, x8p21);
let t_a3_9 = _mm_mul_ps(self.twiddle2re, x9p20);
let t_a3_10 = _mm_mul_ps(self.twiddle1re, x10p19);
let t_a3_11 = _mm_mul_ps(self.twiddle4re, x11p18);
let t_a3_12 = _mm_mul_ps(self.twiddle7re, x12p17);
let t_a3_13 = _mm_mul_ps(self.twiddle10re, x13p16);
let t_a3_14 = _mm_mul_ps(self.twiddle13re, x14p15);
let t_a4_1 = _mm_mul_ps(self.twiddle4re, x1p28);
let t_a4_2 = _mm_mul_ps(self.twiddle8re, x2p27);
let t_a4_3 = _mm_mul_ps(self.twiddle12re, x3p26);
let t_a4_4 = _mm_mul_ps(self.twiddle13re, x4p25);
let t_a4_5 = _mm_mul_ps(self.twiddle9re, x5p24);
let t_a4_6 = _mm_mul_ps(self.twiddle5re, x6p23);
let t_a4_7 = _mm_mul_ps(self.twiddle1re, x7p22);
let t_a4_8 = _mm_mul_ps(self.twiddle3re, x8p21);
let t_a4_9 = _mm_mul_ps(self.twiddle7re, x9p20);
let t_a4_10 = _mm_mul_ps(self.twiddle11re, x10p19);
let t_a4_11 = _mm_mul_ps(self.twiddle14re, x11p18);
let t_a4_12 = _mm_mul_ps(self.twiddle10re, x12p17);
let t_a4_13 = _mm_mul_ps(self.twiddle6re, x13p16);
let t_a4_14 = _mm_mul_ps(self.twiddle2re, x14p15);
let t_a5_1 = _mm_mul_ps(self.twiddle5re, x1p28);
let t_a5_2 = _mm_mul_ps(self.twiddle10re, x2p27);
let t_a5_3 = _mm_mul_ps(self.twiddle14re, x3p26);
let t_a5_4 = _mm_mul_ps(self.twiddle9re, x4p25);
let t_a5_5 = _mm_mul_ps(self.twiddle4re, x5p24);
let t_a5_6 = _mm_mul_ps(self.twiddle1re, x6p23);
let t_a5_7 = _mm_mul_ps(self.twiddle6re, x7p22);
let t_a5_8 = _mm_mul_ps(self.twiddle11re, x8p21);
let t_a5_9 = _mm_mul_ps(self.twiddle13re, x9p20);
let t_a5_10 = _mm_mul_ps(self.twiddle8re, x10p19);
let t_a5_11 = _mm_mul_ps(self.twiddle3re, x11p18);
let t_a5_12 = _mm_mul_ps(self.twiddle2re, x12p17);
let t_a5_13 = _mm_mul_ps(self.twiddle7re, x13p16);
let t_a5_14 = _mm_mul_ps(self.twiddle12re, x14p15);
let t_a6_1 = _mm_mul_ps(self.twiddle6re, x1p28);
let t_a6_2 = _mm_mul_ps(self.twiddle12re, x2p27);
let t_a6_3 = _mm_mul_ps(self.twiddle11re, x3p26);
let t_a6_4 = _mm_mul_ps(self.twiddle5re, x4p25);
let t_a6_5 = _mm_mul_ps(self.twiddle1re, x5p24);
let t_a6_6 = _mm_mul_ps(self.twiddle7re, x6p23);
let t_a6_7 = _mm_mul_ps(self.twiddle13re, x7p22);
let t_a6_8 = _mm_mul_ps(self.twiddle10re, x8p21);
let t_a6_9 = _mm_mul_ps(self.twiddle4re, x9p20);
let t_a6_10 = _mm_mul_ps(self.twiddle2re, x10p19);
let t_a6_11 = _mm_mul_ps(self.twiddle8re, x11p18);
let t_a6_12 = _mm_mul_ps(self.twiddle14re, x12p17);
let t_a6_13 = _mm_mul_ps(self.twiddle9re, x13p16);
let t_a6_14 = _mm_mul_ps(self.twiddle3re, x14p15);
let t_a7_1 = _mm_mul_ps(self.twiddle7re, x1p28);
let t_a7_2 = _mm_mul_ps(self.twiddle14re, x2p27);
let t_a7_3 = _mm_mul_ps(self.twiddle8re, x3p26);
let t_a7_4 = _mm_mul_ps(self.twiddle1re, x4p25);
let t_a7_5 = _mm_mul_ps(self.twiddle6re, x5p24);
let t_a7_6 = _mm_mul_ps(self.twiddle13re, x6p23);
let t_a7_7 = _mm_mul_ps(self.twiddle9re, x7p22);
let t_a7_8 = _mm_mul_ps(self.twiddle2re, x8p21);
let t_a7_9 = _mm_mul_ps(self.twiddle5re, x9p20);
let t_a7_10 = _mm_mul_ps(self.twiddle12re, x10p19);
let t_a7_11 = _mm_mul_ps(self.twiddle10re, x11p18);
let t_a7_12 = _mm_mul_ps(self.twiddle3re, x12p17);
let t_a7_13 = _mm_mul_ps(self.twiddle4re, x13p16);
let t_a7_14 = _mm_mul_ps(self.twiddle11re, x14p15);
let t_a8_1 = _mm_mul_ps(self.twiddle8re, x1p28);
let t_a8_2 = _mm_mul_ps(self.twiddle13re, x2p27);
let t_a8_3 = _mm_mul_ps(self.twiddle5re, x3p26);
let t_a8_4 = _mm_mul_ps(self.twiddle3re, x4p25);
let t_a8_5 = _mm_mul_ps(self.twiddle11re, x5p24);
let t_a8_6 = _mm_mul_ps(self.twiddle10re, x6p23);
let t_a8_7 = _mm_mul_ps(self.twiddle2re, x7p22);
let t_a8_8 = _mm_mul_ps(self.twiddle6re, x8p21);
let t_a8_9 = _mm_mul_ps(self.twiddle14re, x9p20);
let t_a8_10 = _mm_mul_ps(self.twiddle7re, x10p19);
let t_a8_11 = _mm_mul_ps(self.twiddle1re, x11p18);
let t_a8_12 = _mm_mul_ps(self.twiddle9re, x12p17);
let t_a8_13 = _mm_mul_ps(self.twiddle12re, x13p16);
let t_a8_14 = _mm_mul_ps(self.twiddle4re, x14p15);
let t_a9_1 = _mm_mul_ps(self.twiddle9re, x1p28);
let t_a9_2 = _mm_mul_ps(self.twiddle11re, x2p27);
let t_a9_3 = _mm_mul_ps(self.twiddle2re, x3p26);
let t_a9_4 = _mm_mul_ps(self.twiddle7re, x4p25);
let t_a9_5 = _mm_mul_ps(self.twiddle13re, x5p24);
let t_a9_6 = _mm_mul_ps(self.twiddle4re, x6p23);
let t_a9_7 = _mm_mul_ps(self.twiddle5re, x7p22);
let t_a9_8 = _mm_mul_ps(self.twiddle14re, x8p21);
let t_a9_9 = _mm_mul_ps(self.twiddle6re, x9p20);
let t_a9_10 = _mm_mul_ps(self.twiddle3re, x10p19);
let t_a9_11 = _mm_mul_ps(self.twiddle12re, x11p18);
let t_a9_12 = _mm_mul_ps(self.twiddle8re, x12p17);
let t_a9_13 = _mm_mul_ps(self.twiddle1re, x13p16);
let t_a9_14 = _mm_mul_ps(self.twiddle10re, x14p15);
let t_a10_1 = _mm_mul_ps(self.twiddle10re, x1p28);
let t_a10_2 = _mm_mul_ps(self.twiddle9re, x2p27);
let t_a10_3 = _mm_mul_ps(self.twiddle1re, x3p26);
let t_a10_4 = _mm_mul_ps(self.twiddle11re, x4p25);
let t_a10_5 = _mm_mul_ps(self.twiddle8re, x5p24);
let t_a10_6 = _mm_mul_ps(self.twiddle2re, x6p23);
let t_a10_7 = _mm_mul_ps(self.twiddle12re, x7p22);
let t_a10_8 = _mm_mul_ps(self.twiddle7re, x8p21);
let t_a10_9 = _mm_mul_ps(self.twiddle3re, x9p20);
let t_a10_10 = _mm_mul_ps(self.twiddle13re, x10p19);
let t_a10_11 = _mm_mul_ps(self.twiddle6re, x11p18);
let t_a10_12 = _mm_mul_ps(self.twiddle4re, x12p17);
let t_a10_13 = _mm_mul_ps(self.twiddle14re, x13p16);
let t_a10_14 = _mm_mul_ps(self.twiddle5re, x14p15);
let t_a11_1 = _mm_mul_ps(self.twiddle11re, x1p28);
let t_a11_2 = _mm_mul_ps(self.twiddle7re, x2p27);
let t_a11_3 = _mm_mul_ps(self.twiddle4re, x3p26);
let t_a11_4 = _mm_mul_ps(self.twiddle14re, x4p25);
let t_a11_5 = _mm_mul_ps(self.twiddle3re, x5p24);
let t_a11_6 = _mm_mul_ps(self.twiddle8re, x6p23);
let t_a11_7 = _mm_mul_ps(self.twiddle10re, x7p22);
let t_a11_8 = _mm_mul_ps(self.twiddle1re, x8p21);
let t_a11_9 = _mm_mul_ps(self.twiddle12re, x9p20);
let t_a11_10 = _mm_mul_ps(self.twiddle6re, x10p19);
let t_a11_11 = _mm_mul_ps(self.twiddle5re, x11p18);
let t_a11_12 = _mm_mul_ps(self.twiddle13re, x12p17);
let t_a11_13 = _mm_mul_ps(self.twiddle2re, x13p16);
let t_a11_14 = _mm_mul_ps(self.twiddle9re, x14p15);
let t_a12_1 = _mm_mul_ps(self.twiddle12re, x1p28);
let t_a12_2 = _mm_mul_ps(self.twiddle5re, x2p27);
let t_a12_3 = _mm_mul_ps(self.twiddle7re, x3p26);
let t_a12_4 = _mm_mul_ps(self.twiddle10re, x4p25);
let t_a12_5 = _mm_mul_ps(self.twiddle2re, x5p24);
let t_a12_6 = _mm_mul_ps(self.twiddle14re, x6p23);
let t_a12_7 = _mm_mul_ps(self.twiddle3re, x7p22);
let t_a12_8 = _mm_mul_ps(self.twiddle9re, x8p21);
let t_a12_9 = _mm_mul_ps(self.twiddle8re, x9p20);
let t_a12_10 = _mm_mul_ps(self.twiddle4re, x10p19);
let t_a12_11 = _mm_mul_ps(self.twiddle13re, x11p18);
let t_a12_12 = _mm_mul_ps(self.twiddle1re, x12p17);
let t_a12_13 = _mm_mul_ps(self.twiddle11re, x13p16);
let t_a12_14 = _mm_mul_ps(self.twiddle6re, x14p15);
let t_a13_1 = _mm_mul_ps(self.twiddle13re, x1p28);
let t_a13_2 = _mm_mul_ps(self.twiddle3re, x2p27);
let t_a13_3 = _mm_mul_ps(self.twiddle10re, x3p26);
let t_a13_4 = _mm_mul_ps(self.twiddle6re, x4p25);
let t_a13_5 = _mm_mul_ps(self.twiddle7re, x5p24);
let t_a13_6 = _mm_mul_ps(self.twiddle9re, x6p23);
let t_a13_7 = _mm_mul_ps(self.twiddle4re, x7p22);
let t_a13_8 = _mm_mul_ps(self.twiddle12re, x8p21);
let t_a13_9 = _mm_mul_ps(self.twiddle1re, x9p20);
let t_a13_10 = _mm_mul_ps(self.twiddle14re, x10p19);
let t_a13_11 = _mm_mul_ps(self.twiddle2re, x11p18);
let t_a13_12 = _mm_mul_ps(self.twiddle11re, x12p17);
let t_a13_13 = _mm_mul_ps(self.twiddle5re, x13p16);
let t_a13_14 = _mm_mul_ps(self.twiddle8re, x14p15);
let t_a14_1 = _mm_mul_ps(self.twiddle14re, x1p28);
let t_a14_2 = _mm_mul_ps(self.twiddle1re, x2p27);
let t_a14_3 = _mm_mul_ps(self.twiddle13re, x3p26);
let t_a14_4 = _mm_mul_ps(self.twiddle2re, x4p25);
let t_a14_5 = _mm_mul_ps(self.twiddle12re, x5p24);
let t_a14_6 = _mm_mul_ps(self.twiddle3re, x6p23);
let t_a14_7 = _mm_mul_ps(self.twiddle11re, x7p22);
let t_a14_8 = _mm_mul_ps(self.twiddle4re, x8p21);
let t_a14_9 = _mm_mul_ps(self.twiddle10re, x9p20);
let t_a14_10 = _mm_mul_ps(self.twiddle5re, x10p19);
let t_a14_11 = _mm_mul_ps(self.twiddle9re, x11p18);
let t_a14_12 = _mm_mul_ps(self.twiddle6re, x12p17);
let t_a14_13 = _mm_mul_ps(self.twiddle8re, x13p16);
let t_a14_14 = _mm_mul_ps(self.twiddle7re, x14p15);
let t_b1_1 = _mm_mul_ps(self.twiddle1im, x1m28);
let t_b1_2 = _mm_mul_ps(self.twiddle2im, x2m27);
let t_b1_3 = _mm_mul_ps(self.twiddle3im, x3m26);
let t_b1_4 = _mm_mul_ps(self.twiddle4im, x4m25);
let t_b1_5 = _mm_mul_ps(self.twiddle5im, x5m24);
let t_b1_6 = _mm_mul_ps(self.twiddle6im, x6m23);
let t_b1_7 = _mm_mul_ps(self.twiddle7im, x7m22);
let t_b1_8 = _mm_mul_ps(self.twiddle8im, x8m21);
let t_b1_9 = _mm_mul_ps(self.twiddle9im, x9m20);
let t_b1_10 = _mm_mul_ps(self.twiddle10im, x10m19);
let t_b1_11 = _mm_mul_ps(self.twiddle11im, x11m18);
let t_b1_12 = _mm_mul_ps(self.twiddle12im, x12m17);
let t_b1_13 = _mm_mul_ps(self.twiddle13im, x13m16);
let t_b1_14 = _mm_mul_ps(self.twiddle14im, x14m15);
let t_b2_1 = _mm_mul_ps(self.twiddle2im, x1m28);
let t_b2_2 = _mm_mul_ps(self.twiddle4im, x2m27);
let t_b2_3 = _mm_mul_ps(self.twiddle6im, x3m26);
let t_b2_4 = _mm_mul_ps(self.twiddle8im, x4m25);
let t_b2_5 = _mm_mul_ps(self.twiddle10im, x5m24);
let t_b2_6 = _mm_mul_ps(self.twiddle12im, x6m23);
let t_b2_7 = _mm_mul_ps(self.twiddle14im, x7m22);
let t_b2_8 = _mm_mul_ps(self.twiddle13im, x8m21);
let t_b2_9 = _mm_mul_ps(self.twiddle11im, x9m20);
let t_b2_10 = _mm_mul_ps(self.twiddle9im, x10m19);
let t_b2_11 = _mm_mul_ps(self.twiddle7im, x11m18);
let t_b2_12 = _mm_mul_ps(self.twiddle5im, x12m17);
let t_b2_13 = _mm_mul_ps(self.twiddle3im, x13m16);
let t_b2_14 = _mm_mul_ps(self.twiddle1im, x14m15);
let t_b3_1 = _mm_mul_ps(self.twiddle3im, x1m28);
let t_b3_2 = _mm_mul_ps(self.twiddle6im, x2m27);
let t_b3_3 = _mm_mul_ps(self.twiddle9im, x3m26);
let t_b3_4 = _mm_mul_ps(self.twiddle12im, x4m25);
let t_b3_5 = _mm_mul_ps(self.twiddle14im, x5m24);
let t_b3_6 = _mm_mul_ps(self.twiddle11im, x6m23);
let t_b3_7 = _mm_mul_ps(self.twiddle8im, x7m22);
let t_b3_8 = _mm_mul_ps(self.twiddle5im, x8m21);
let t_b3_9 = _mm_mul_ps(self.twiddle2im, x9m20);
let t_b3_10 = _mm_mul_ps(self.twiddle1im, x10m19);
let t_b3_11 = _mm_mul_ps(self.twiddle4im, x11m18);
let t_b3_12 = _mm_mul_ps(self.twiddle7im, x12m17);
let t_b3_13 = _mm_mul_ps(self.twiddle10im, x13m16);
let t_b3_14 = _mm_mul_ps(self.twiddle13im, x14m15);
let t_b4_1 = _mm_mul_ps(self.twiddle4im, x1m28);
let t_b4_2 = _mm_mul_ps(self.twiddle8im, x2m27);
let t_b4_3 = _mm_mul_ps(self.twiddle12im, x3m26);
let t_b4_4 = _mm_mul_ps(self.twiddle13im, x4m25);
let t_b4_5 = _mm_mul_ps(self.twiddle9im, x5m24);
let t_b4_6 = _mm_mul_ps(self.twiddle5im, x6m23);
let t_b4_7 = _mm_mul_ps(self.twiddle1im, x7m22);
let t_b4_8 = _mm_mul_ps(self.twiddle3im, x8m21);
let t_b4_9 = _mm_mul_ps(self.twiddle7im, x9m20);
let t_b4_10 = _mm_mul_ps(self.twiddle11im, x10m19);
let t_b4_11 = _mm_mul_ps(self.twiddle14im, x11m18);
let t_b4_12 = _mm_mul_ps(self.twiddle10im, x12m17);
let t_b4_13 = _mm_mul_ps(self.twiddle6im, x13m16);
let t_b4_14 = _mm_mul_ps(self.twiddle2im, x14m15);
let t_b5_1 = _mm_mul_ps(self.twiddle5im, x1m28);
let t_b5_2 = _mm_mul_ps(self.twiddle10im, x2m27);
let t_b5_3 = _mm_mul_ps(self.twiddle14im, x3m26);
let t_b5_4 = _mm_mul_ps(self.twiddle9im, x4m25);
let t_b5_5 = _mm_mul_ps(self.twiddle4im, x5m24);
let t_b5_6 = _mm_mul_ps(self.twiddle1im, x6m23);
let t_b5_7 = _mm_mul_ps(self.twiddle6im, x7m22);
let t_b5_8 = _mm_mul_ps(self.twiddle11im, x8m21);
let t_b5_9 = _mm_mul_ps(self.twiddle13im, x9m20);
let t_b5_10 = _mm_mul_ps(self.twiddle8im, x10m19);
let t_b5_11 = _mm_mul_ps(self.twiddle3im, x11m18);
let t_b5_12 = _mm_mul_ps(self.twiddle2im, x12m17);
let t_b5_13 = _mm_mul_ps(self.twiddle7im, x13m16);
let t_b5_14 = _mm_mul_ps(self.twiddle12im, x14m15);
let t_b6_1 = _mm_mul_ps(self.twiddle6im, x1m28);
let t_b6_2 = _mm_mul_ps(self.twiddle12im, x2m27);
let t_b6_3 = _mm_mul_ps(self.twiddle11im, x3m26);
let t_b6_4 = _mm_mul_ps(self.twiddle5im, x4m25);
let t_b6_5 = _mm_mul_ps(self.twiddle1im, x5m24);
let t_b6_6 = _mm_mul_ps(self.twiddle7im, x6m23);
let t_b6_7 = _mm_mul_ps(self.twiddle13im, x7m22);
let t_b6_8 = _mm_mul_ps(self.twiddle10im, x8m21);
let t_b6_9 = _mm_mul_ps(self.twiddle4im, x9m20);
let t_b6_10 = _mm_mul_ps(self.twiddle2im, x10m19);
let t_b6_11 = _mm_mul_ps(self.twiddle8im, x11m18);
let t_b6_12 = _mm_mul_ps(self.twiddle14im, x12m17);
let t_b6_13 = _mm_mul_ps(self.twiddle9im, x13m16);
let t_b6_14 = _mm_mul_ps(self.twiddle3im, x14m15);
let t_b7_1 = _mm_mul_ps(self.twiddle7im, x1m28);
let t_b7_2 = _mm_mul_ps(self.twiddle14im, x2m27);
let t_b7_3 = _mm_mul_ps(self.twiddle8im, x3m26);
let t_b7_4 = _mm_mul_ps(self.twiddle1im, x4m25);
let t_b7_5 = _mm_mul_ps(self.twiddle6im, x5m24);
let t_b7_6 = _mm_mul_ps(self.twiddle13im, x6m23);
let t_b7_7 = _mm_mul_ps(self.twiddle9im, x7m22);
let t_b7_8 = _mm_mul_ps(self.twiddle2im, x8m21);
let t_b7_9 = _mm_mul_ps(self.twiddle5im, x9m20);
let t_b7_10 = _mm_mul_ps(self.twiddle12im, x10m19);
let t_b7_11 = _mm_mul_ps(self.twiddle10im, x11m18);
let t_b7_12 = _mm_mul_ps(self.twiddle3im, x12m17);
let t_b7_13 = _mm_mul_ps(self.twiddle4im, x13m16);
let t_b7_14 = _mm_mul_ps(self.twiddle11im, x14m15);
let t_b8_1 = _mm_mul_ps(self.twiddle8im, x1m28);
let t_b8_2 = _mm_mul_ps(self.twiddle13im, x2m27);
let t_b8_3 = _mm_mul_ps(self.twiddle5im, x3m26);
let t_b8_4 = _mm_mul_ps(self.twiddle3im, x4m25);
let t_b8_5 = _mm_mul_ps(self.twiddle11im, x5m24);
let t_b8_6 = _mm_mul_ps(self.twiddle10im, x6m23);
let t_b8_7 = _mm_mul_ps(self.twiddle2im, x7m22);
let t_b8_8 = _mm_mul_ps(self.twiddle6im, x8m21);
let t_b8_9 = _mm_mul_ps(self.twiddle14im, x9m20);
let t_b8_10 = _mm_mul_ps(self.twiddle7im, x10m19);
let t_b8_11 = _mm_mul_ps(self.twiddle1im, x11m18);
let t_b8_12 = _mm_mul_ps(self.twiddle9im, x12m17);
let t_b8_13 = _mm_mul_ps(self.twiddle12im, x13m16);
let t_b8_14 = _mm_mul_ps(self.twiddle4im, x14m15);
let t_b9_1 = _mm_mul_ps(self.twiddle9im, x1m28);
let t_b9_2 = _mm_mul_ps(self.twiddle11im, x2m27);
let t_b9_3 = _mm_mul_ps(self.twiddle2im, x3m26);
let t_b9_4 = _mm_mul_ps(self.twiddle7im, x4m25);
let t_b9_5 = _mm_mul_ps(self.twiddle13im, x5m24);
let t_b9_6 = _mm_mul_ps(self.twiddle4im, x6m23);
let t_b9_7 = _mm_mul_ps(self.twiddle5im, x7m22);
let t_b9_8 = _mm_mul_ps(self.twiddle14im, x8m21);
let t_b9_9 = _mm_mul_ps(self.twiddle6im, x9m20);
let t_b9_10 = _mm_mul_ps(self.twiddle3im, x10m19);
let t_b9_11 = _mm_mul_ps(self.twiddle12im, x11m18);
let t_b9_12 = _mm_mul_ps(self.twiddle8im, x12m17);
let t_b9_13 = _mm_mul_ps(self.twiddle1im, x13m16);
let t_b9_14 = _mm_mul_ps(self.twiddle10im, x14m15);
let t_b10_1 = _mm_mul_ps(self.twiddle10im, x1m28);
let t_b10_2 = _mm_mul_ps(self.twiddle9im, x2m27);
let t_b10_3 = _mm_mul_ps(self.twiddle1im, x3m26);
let t_b10_4 = _mm_mul_ps(self.twiddle11im, x4m25);
let t_b10_5 = _mm_mul_ps(self.twiddle8im, x5m24);
let t_b10_6 = _mm_mul_ps(self.twiddle2im, x6m23);
let t_b10_7 = _mm_mul_ps(self.twiddle12im, x7m22);
let t_b10_8 = _mm_mul_ps(self.twiddle7im, x8m21);
let t_b10_9 = _mm_mul_ps(self.twiddle3im, x9m20);
let t_b10_10 = _mm_mul_ps(self.twiddle13im, x10m19);
let t_b10_11 = _mm_mul_ps(self.twiddle6im, x11m18);
let t_b10_12 = _mm_mul_ps(self.twiddle4im, x12m17);
let t_b10_13 = _mm_mul_ps(self.twiddle14im, x13m16);
let t_b10_14 = _mm_mul_ps(self.twiddle5im, x14m15);
let t_b11_1 = _mm_mul_ps(self.twiddle11im, x1m28);
let t_b11_2 = _mm_mul_ps(self.twiddle7im, x2m27);
let t_b11_3 = _mm_mul_ps(self.twiddle4im, x3m26);
let t_b11_4 = _mm_mul_ps(self.twiddle14im, x4m25);
let t_b11_5 = _mm_mul_ps(self.twiddle3im, x5m24);
let t_b11_6 = _mm_mul_ps(self.twiddle8im, x6m23);
let t_b11_7 = _mm_mul_ps(self.twiddle10im, x7m22);
let t_b11_8 = _mm_mul_ps(self.twiddle1im, x8m21);
let t_b11_9 = _mm_mul_ps(self.twiddle12im, x9m20);
let t_b11_10 = _mm_mul_ps(self.twiddle6im, x10m19);
let t_b11_11 = _mm_mul_ps(self.twiddle5im, x11m18);
let t_b11_12 = _mm_mul_ps(self.twiddle13im, x12m17);
let t_b11_13 = _mm_mul_ps(self.twiddle2im, x13m16);
let t_b11_14 = _mm_mul_ps(self.twiddle9im, x14m15);
let t_b12_1 = _mm_mul_ps(self.twiddle12im, x1m28);
let t_b12_2 = _mm_mul_ps(self.twiddle5im, x2m27);
let t_b12_3 = _mm_mul_ps(self.twiddle7im, x3m26);
let t_b12_4 = _mm_mul_ps(self.twiddle10im, x4m25);
let t_b12_5 = _mm_mul_ps(self.twiddle2im, x5m24);
let t_b12_6 = _mm_mul_ps(self.twiddle14im, x6m23);
let t_b12_7 = _mm_mul_ps(self.twiddle3im, x7m22);
let t_b12_8 = _mm_mul_ps(self.twiddle9im, x8m21);
let t_b12_9 = _mm_mul_ps(self.twiddle8im, x9m20);
let t_b12_10 = _mm_mul_ps(self.twiddle4im, x10m19);
let t_b12_11 = _mm_mul_ps(self.twiddle13im, x11m18);
let t_b12_12 = _mm_mul_ps(self.twiddle1im, x12m17);
let t_b12_13 = _mm_mul_ps(self.twiddle11im, x13m16);
let t_b12_14 = _mm_mul_ps(self.twiddle6im, x14m15);
let t_b13_1 = _mm_mul_ps(self.twiddle13im, x1m28);
let t_b13_2 = _mm_mul_ps(self.twiddle3im, x2m27);
let t_b13_3 = _mm_mul_ps(self.twiddle10im, x3m26);
let t_b13_4 = _mm_mul_ps(self.twiddle6im, x4m25);
let t_b13_5 = _mm_mul_ps(self.twiddle7im, x5m24);
let t_b13_6 = _mm_mul_ps(self.twiddle9im, x6m23);
let t_b13_7 = _mm_mul_ps(self.twiddle4im, x7m22);
let t_b13_8 = _mm_mul_ps(self.twiddle12im, x8m21);
let t_b13_9 = _mm_mul_ps(self.twiddle1im, x9m20);
let t_b13_10 = _mm_mul_ps(self.twiddle14im, x10m19);
let t_b13_11 = _mm_mul_ps(self.twiddle2im, x11m18);
let t_b13_12 = _mm_mul_ps(self.twiddle11im, x12m17);
let t_b13_13 = _mm_mul_ps(self.twiddle5im, x13m16);
let t_b13_14 = _mm_mul_ps(self.twiddle8im, x14m15);
let t_b14_1 = _mm_mul_ps(self.twiddle14im, x1m28);
let t_b14_2 = _mm_mul_ps(self.twiddle1im, x2m27);
let t_b14_3 = _mm_mul_ps(self.twiddle13im, x3m26);
let t_b14_4 = _mm_mul_ps(self.twiddle2im, x4m25);
let t_b14_5 = _mm_mul_ps(self.twiddle12im, x5m24);
let t_b14_6 = _mm_mul_ps(self.twiddle3im, x6m23);
let t_b14_7 = _mm_mul_ps(self.twiddle11im, x7m22);
let t_b14_8 = _mm_mul_ps(self.twiddle4im, x8m21);
let t_b14_9 = _mm_mul_ps(self.twiddle10im, x9m20);
let t_b14_10 = _mm_mul_ps(self.twiddle5im, x10m19);
let t_b14_11 = _mm_mul_ps(self.twiddle9im, x11m18);
let t_b14_12 = _mm_mul_ps(self.twiddle6im, x12m17);
let t_b14_13 = _mm_mul_ps(self.twiddle8im, x13m16);
let t_b14_14 = _mm_mul_ps(self.twiddle7im, x14m15);
let x0 = values[0];
let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11 + t_a1_12 + t_a1_13 + t_a1_14);
let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11 + t_a2_12 + t_a2_13 + t_a2_14);
let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11 + t_a3_12 + t_a3_13 + t_a3_14);
let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11 + t_a4_12 + t_a4_13 + t_a4_14);
let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11 + t_a5_12 + t_a5_13 + t_a5_14);
let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11 + t_a6_12 + t_a6_13 + t_a6_14);
let t_a7 = calc_f32!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11 + t_a7_12 + t_a7_13 + t_a7_14);
let t_a8 = calc_f32!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11 + t_a8_12 + t_a8_13 + t_a8_14);
let t_a9 = calc_f32!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11 + t_a9_12 + t_a9_13 + t_a9_14);
let t_a10 = calc_f32!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + t_a10_11 + t_a10_12 + t_a10_13 + t_a10_14);
let t_a11 = calc_f32!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + t_a11_11 + t_a11_12 + t_a11_13 + t_a11_14);
let t_a12 = calc_f32!(x0 + t_a12_1 + t_a12_2 + t_a12_3 + t_a12_4 + t_a12_5 + t_a12_6 + t_a12_7 + t_a12_8 + t_a12_9 + t_a12_10 + t_a12_11 + t_a12_12 + t_a12_13 + t_a12_14);
let t_a13 = calc_f32!(x0 + t_a13_1 + t_a13_2 + t_a13_3 + t_a13_4 + t_a13_5 + t_a13_6 + t_a13_7 + t_a13_8 + t_a13_9 + t_a13_10 + t_a13_11 + t_a13_12 + t_a13_13 + t_a13_14);
let t_a14 = calc_f32!(x0 + t_a14_1 + t_a14_2 + t_a14_3 + t_a14_4 + t_a14_5 + t_a14_6 + t_a14_7 + t_a14_8 + t_a14_9 + t_a14_10 + t_a14_11 + t_a14_12 + t_a14_13 + t_a14_14);
let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11 + t_b1_12 + t_b1_13 + t_b1_14);
let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 + t_b2_6 + t_b2_7 - t_b2_8 - t_b2_9 - t_b2_10 - t_b2_11 - t_b2_12 - t_b2_13 - t_b2_14);
let t_b3 = calc_f32!(t_b3_1 + t_b3_2 + t_b3_3 + t_b3_4 - t_b3_5 - t_b3_6 - t_b3_7 - t_b3_8 - t_b3_9 + t_b3_10 + t_b3_11 + t_b3_12 + t_b3_13 + t_b3_14);
let t_b4 = calc_f32!(t_b4_1 + t_b4_2 + t_b4_3 - t_b4_4 - t_b4_5 - t_b4_6 - t_b4_7 + t_b4_8 + t_b4_9 + t_b4_10 - t_b4_11 - t_b4_12 - t_b4_13 - t_b4_14);
let t_b5 = calc_f32!(t_b5_1 + t_b5_2 - t_b5_3 - t_b5_4 - t_b5_5 + t_b5_6 + t_b5_7 + t_b5_8 - t_b5_9 - t_b5_10 - t_b5_11 + t_b5_12 + t_b5_13 + t_b5_14);
let t_b6 = calc_f32!(t_b6_1 + t_b6_2 - t_b6_3 - t_b6_4 + t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9 + t_b6_10 + t_b6_11 + t_b6_12 - t_b6_13 - t_b6_14);
let t_b7 = calc_f32!(t_b7_1 + t_b7_2 - t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9 + t_b7_10 - t_b7_11 - t_b7_12 + t_b7_13 + t_b7_14);
let t_b8 = calc_f32!(t_b8_1 - t_b8_2 - t_b8_3 + t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 + t_b8_9 - t_b8_10 + t_b8_11 + t_b8_12 - t_b8_13 - t_b8_14);
let t_b9 = calc_f32!(t_b9_1 - t_b9_2 - t_b9_3 + t_b9_4 - t_b9_5 - t_b9_6 + t_b9_7 + t_b9_8 - t_b9_9 + t_b9_10 + t_b9_11 - t_b9_12 + t_b9_13 + t_b9_14);
let t_b10 = calc_f32!(t_b10_1 - t_b10_2 + t_b10_3 + t_b10_4 - t_b10_5 + t_b10_6 + t_b10_7 - t_b10_8 + t_b10_9 + t_b10_10 - t_b10_11 + t_b10_12 + t_b10_13 - t_b10_14);
let t_b11 = calc_f32!(t_b11_1 - t_b11_2 + t_b11_3 - t_b11_4 - t_b11_5 + t_b11_6 - t_b11_7 + t_b11_8 + t_b11_9 - t_b11_10 + t_b11_11 - t_b11_12 - t_b11_13 + t_b11_14);
let t_b12 = calc_f32!(t_b12_1 - t_b12_2 + t_b12_3 - t_b12_4 + t_b12_5 + t_b12_6 - t_b12_7 + t_b12_8 - t_b12_9 + t_b12_10 - t_b12_11 - t_b12_12 + t_b12_13 - t_b12_14);
let t_b13 = calc_f32!(t_b13_1 - t_b13_2 + t_b13_3 - t_b13_4 + t_b13_5 - t_b13_6 + t_b13_7 - t_b13_8 + t_b13_9 + t_b13_10 - t_b13_11 + t_b13_12 - t_b13_13 + t_b13_14);
let t_b14 = calc_f32!(t_b14_1 - t_b14_2 + t_b14_3 - t_b14_4 + t_b14_5 - t_b14_6 + t_b14_7 - t_b14_8 + t_b14_9 - t_b14_10 + t_b14_11 - t_b14_12 + t_b14_13 - t_b14_14);
let t_b1_rot = self.rotate.rotate_both(t_b1);
let t_b2_rot = self.rotate.rotate_both(t_b2);
let t_b3_rot = self.rotate.rotate_both(t_b3);
let t_b4_rot = self.rotate.rotate_both(t_b4);
let t_b5_rot = self.rotate.rotate_both(t_b5);
let t_b6_rot = self.rotate.rotate_both(t_b6);
let t_b7_rot = self.rotate.rotate_both(t_b7);
let t_b8_rot = self.rotate.rotate_both(t_b8);
let t_b9_rot = self.rotate.rotate_both(t_b9);
let t_b10_rot = self.rotate.rotate_both(t_b10);
let t_b11_rot = self.rotate.rotate_both(t_b11);
let t_b12_rot = self.rotate.rotate_both(t_b12);
let t_b13_rot = self.rotate.rotate_both(t_b13);
let t_b14_rot = self.rotate.rotate_both(t_b14);
let y0 = calc_f32!(x0 + x1p28 + x2p27 + x3p26 + x4p25 + x5p24 + x6p23 + x7p22 + x8p21 + x9p20 + x10p19 + x11p18 + x12p17 + x13p16 + x14p15);
let [y1, y28] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot);
let [y2, y27] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot);
let [y3, y26] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot);
let [y4, y25] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot);
let [y5, y24] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot);
let [y6, y23] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot);
let [y7, y22] = parallel_fft2_interleaved_f32(t_a7, t_b7_rot);
let [y8, y21] = parallel_fft2_interleaved_f32(t_a8, t_b8_rot);
let [y9, y20] = parallel_fft2_interleaved_f32(t_a9, t_b9_rot);
let [y10, y19] = parallel_fft2_interleaved_f32(t_a10, t_b10_rot);
let [y11, y18] = parallel_fft2_interleaved_f32(t_a11, t_b11_rot);
let [y12, y17] = parallel_fft2_interleaved_f32(t_a12, t_b12_rot);
let [y13, y16] = parallel_fft2_interleaved_f32(t_a13, t_b13_rot);
let [y14, y15] = parallel_fft2_interleaved_f32(t_a14, t_b14_rot);
[y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28]
}
}
/// SSE (128-bit) implementation of a size-29 FFT butterfly for `f64` samples.
///
/// Each `twiddle{k}re` / `twiddle{k}im` pair holds the real and imaginary part
/// of the k-th twiddle factor for a length-29 FFT (computed in `new` via
/// `twiddles::compute_twiddle(k, 29, direction)`), broadcast into both lanes
/// of a `__m128d` so a single complex value occupies one SIMD register.
/// Only twiddles 1..=14 are stored; the butterfly pairs input k with input
/// 29-k, which covers the remaining factors (see `perform_fft_direct`).
pub struct SseF64Butterfly29<T> {
direction: FftDirection,
// Marker tying the untyped SIMD implementation to the caller's FftNum type
// (new() asserts T is f64 via assert_f64).
_phantom: std::marker::PhantomData<T>,
// Helper that multiplies complex lanes by +/-i; used to turn the imaginary
// accumulator sums into properly rotated outputs.
rotate: Rotate90F64,
twiddle1re: __m128d,
twiddle1im: __m128d,
twiddle2re: __m128d,
twiddle2im: __m128d,
twiddle3re: __m128d,
twiddle3im: __m128d,
twiddle4re: __m128d,
twiddle4im: __m128d,
twiddle5re: __m128d,
twiddle5im: __m128d,
twiddle6re: __m128d,
twiddle6im: __m128d,
twiddle7re: __m128d,
twiddle7im: __m128d,
twiddle8re: __m128d,
twiddle8im: __m128d,
twiddle9re: __m128d,
twiddle9im: __m128d,
twiddle10re: __m128d,
twiddle10im: __m128d,
twiddle11re: __m128d,
twiddle11im: __m128d,
twiddle12re: __m128d,
twiddle12im: __m128d,
twiddle13re: __m128d,
twiddle13im: __m128d,
twiddle14re: __m128d,
twiddle14im: __m128d,
}
// Boilerplate macro invocations for the fixed-size-29 f64 butterfly.
// NOTE(review): the macro definitions are not visible here — presumably they
// generate the `Fft`, `Length`, and `Direction` trait impls (as for the other
// butterflies in this file); the closure supplies the FFT direction. Confirm
// against the macro definitions in the sse_common module.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly29, 29, |this: &SseF64Butterfly29<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly29, 29, |this: &SseF64Butterfly29<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly29<T> {
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
let rotate = Rotate90F64::new(true);
let tw1: Complex<f64> = twiddles::compute_twiddle(1, 29, direction);
let tw2: Complex<f64> = twiddles::compute_twiddle(2, 29, direction);
let tw3: Complex<f64> = twiddles::compute_twiddle(3, 29, direction);
let tw4: Complex<f64> = twiddles::compute_twiddle(4, 29, direction);
let tw5: Complex<f64> = twiddles::compute_twiddle(5, 29, direction);
let tw6: Complex<f64> = twiddles::compute_twiddle(6, 29, direction);
let tw7: Complex<f64> = twiddles::compute_twiddle(7, 29, direction);
let tw8: Complex<f64> = twiddles::compute_twiddle(8, 29, direction);
let tw9: Complex<f64> = twiddles::compute_twiddle(9, 29, direction);
let tw10: Complex<f64> = twiddles::compute_twiddle(10, 29, direction);
let tw11: Complex<f64> = twiddles::compute_twiddle(11, 29, direction);
let tw12: Complex<f64> = twiddles::compute_twiddle(12, 29, direction);
let tw13: Complex<f64> = twiddles::compute_twiddle(13, 29, direction);
let tw14: Complex<f64> = twiddles::compute_twiddle(14, 29, direction);
let twiddle1re = unsafe { _mm_set_pd(tw1.re, tw1.re) };
let twiddle1im = unsafe { _mm_set_pd(tw1.im, tw1.im) };
let twiddle2re = unsafe { _mm_set_pd(tw2.re, tw2.re) };
let twiddle2im = unsafe { _mm_set_pd(tw2.im, tw2.im) };
let twiddle3re = unsafe { _mm_set_pd(tw3.re, tw3.re) };
let twiddle3im = unsafe { _mm_set_pd(tw3.im, tw3.im) };
let twiddle4re = unsafe { _mm_set_pd(tw4.re, tw4.re) };
let twiddle4im = unsafe { _mm_set_pd(tw4.im, tw4.im) };
let twiddle5re = unsafe { _mm_set_pd(tw5.re, tw5.re) };
let twiddle5im = unsafe { _mm_set_pd(tw5.im, tw5.im) };
let twiddle6re = unsafe { _mm_set_pd(tw6.re, tw6.re) };
let twiddle6im = unsafe { _mm_set_pd(tw6.im, tw6.im) };
let twiddle7re = unsafe { _mm_set_pd(tw7.re, tw7.re) };
let twiddle7im = unsafe { _mm_set_pd(tw7.im, tw7.im) };
let twiddle8re = unsafe { _mm_set_pd(tw8.re, tw8.re) };
let twiddle8im = unsafe { _mm_set_pd(tw8.im, tw8.im) };
let twiddle9re = unsafe { _mm_set_pd(tw9.re, tw9.re) };
let twiddle9im = unsafe { _mm_set_pd(tw9.im, tw9.im) };
let twiddle10re = unsafe { _mm_set_pd(tw10.re, tw10.re) };
let twiddle10im = unsafe { _mm_set_pd(tw10.im, tw10.im) };
let twiddle11re = unsafe { _mm_set_pd(tw11.re, tw11.re) };
let twiddle11im = unsafe { _mm_set_pd(tw11.im, tw11.im) };
let twiddle12re = unsafe { _mm_set_pd(tw12.re, tw12.re) };
let twiddle12im = unsafe { _mm_set_pd(tw12.im, tw12.im) };
let twiddle13re = unsafe { _mm_set_pd(tw13.re, tw13.re) };
let twiddle13im = unsafe { _mm_set_pd(tw13.im, tw13.im) };
let twiddle14re = unsafe { _mm_set_pd(tw14.re, tw14.re) };
let twiddle14im = unsafe { _mm_set_pd(tw14.im, tw14.im) };
Self {
direction,
_phantom: std::marker::PhantomData,
rotate,
twiddle1re,
twiddle1im,
twiddle2re,
twiddle2im,
twiddle3re,
twiddle3im,
twiddle4re,
twiddle4im,
twiddle5re,
twiddle5im,
twiddle6re,
twiddle6im,
twiddle7re,
twiddle7im,
twiddle8re,
twiddle8im,
twiddle9re,
twiddle9im,
twiddle10re,
twiddle10im,
twiddle11re,
twiddle11im,
twiddle12re,
twiddle12im,
twiddle13re,
twiddle13im,
twiddle14re,
twiddle14im,
}
}
#[inline(always)]
/// Runs the size-29 butterfly in place on a contiguous buffer of 29
/// complex f64 values.
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
    // Gather all 29 elements into SIMD registers, run the in-register
    // butterfly, then scatter the results back to the same indices.
    let input = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28});
    let output = self.perform_fft_direct(input);
    write_complex_to_array!(output, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28});
}
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 29]) -> [__m128d; 29] {
let [x1p28, x1m28] = solo_fft2_f64(values[1], values[28]);
let [x2p27, x2m27] = solo_fft2_f64(values[2], values[27]);
let [x3p26, x3m26] = solo_fft2_f64(values[3], values[26]);
let [x4p25, x4m25] = solo_fft2_f64(values[4], values[25]);
let [x5p24, x5m24] = solo_fft2_f64(values[5], values[24]);
let [x6p23, x6m23] = solo_fft2_f64(values[6], values[23]);
let [x7p22, x7m22] = solo_fft2_f64(values[7], values[22]);
let [x8p21, x8m21] = solo_fft2_f64(values[8], values[21]);
let [x9p20, x9m20] = solo_fft2_f64(values[9], values[20]);
let [x10p19, x10m19] = solo_fft2_f64(values[10], values[19]);
let [x11p18, x11m18] = solo_fft2_f64(values[11], values[18]);
let [x12p17, x12m17] = solo_fft2_f64(values[12], values[17]);
let [x13p16, x13m16] = solo_fft2_f64(values[13], values[16]);
let [x14p15, x14m15] = solo_fft2_f64(values[14], values[15]);
let t_a1_1 = _mm_mul_pd(self.twiddle1re, x1p28);
let t_a1_2 = _mm_mul_pd(self.twiddle2re, x2p27);
let t_a1_3 = _mm_mul_pd(self.twiddle3re, x3p26);
let t_a1_4 = _mm_mul_pd(self.twiddle4re, x4p25);
let t_a1_5 = _mm_mul_pd(self.twiddle5re, x5p24);
let t_a1_6 = _mm_mul_pd(self.twiddle6re, x6p23);
let t_a1_7 = _mm_mul_pd(self.twiddle7re, x7p22);
let t_a1_8 = _mm_mul_pd(self.twiddle8re, x8p21);
let t_a1_9 = _mm_mul_pd(self.twiddle9re, x9p20);
let t_a1_10 = _mm_mul_pd(self.twiddle10re, x10p19);
let t_a1_11 = _mm_mul_pd(self.twiddle11re, x11p18);
let t_a1_12 = _mm_mul_pd(self.twiddle12re, x12p17);
let t_a1_13 = _mm_mul_pd(self.twiddle13re, x13p16);
let t_a1_14 = _mm_mul_pd(self.twiddle14re, x14p15);
let t_a2_1 = _mm_mul_pd(self.twiddle2re, x1p28);
let t_a2_2 = _mm_mul_pd(self.twiddle4re, x2p27);
let t_a2_3 = _mm_mul_pd(self.twiddle6re, x3p26);
let t_a2_4 = _mm_mul_pd(self.twiddle8re, x4p25);
let t_a2_5 = _mm_mul_pd(self.twiddle10re, x5p24);
let t_a2_6 = _mm_mul_pd(self.twiddle12re, x6p23);
let t_a2_7 = _mm_mul_pd(self.twiddle14re, x7p22);
let t_a2_8 = _mm_mul_pd(self.twiddle13re, x8p21);
let t_a2_9 = _mm_mul_pd(self.twiddle11re, x9p20);
let t_a2_10 = _mm_mul_pd(self.twiddle9re, x10p19);
let t_a2_11 = _mm_mul_pd(self.twiddle7re, x11p18);
let t_a2_12 = _mm_mul_pd(self.twiddle5re, x12p17);
let t_a2_13 = _mm_mul_pd(self.twiddle3re, x13p16);
let t_a2_14 = _mm_mul_pd(self.twiddle1re, x14p15);
let t_a3_1 = _mm_mul_pd(self.twiddle3re, x1p28);
let t_a3_2 = _mm_mul_pd(self.twiddle6re, x2p27);
let t_a3_3 = _mm_mul_pd(self.twiddle9re, x3p26);
let t_a3_4 = _mm_mul_pd(self.twiddle12re, x4p25);
let t_a3_5 = _mm_mul_pd(self.twiddle14re, x5p24);
let t_a3_6 = _mm_mul_pd(self.twiddle11re, x6p23);
let t_a3_7 = _mm_mul_pd(self.twiddle8re, x7p22);
let t_a3_8 = _mm_mul_pd(self.twiddle5re, x8p21);
let t_a3_9 = _mm_mul_pd(self.twiddle2re, x9p20);
let t_a3_10 = _mm_mul_pd(self.twiddle1re, x10p19);
let t_a3_11 = _mm_mul_pd(self.twiddle4re, x11p18);
let t_a3_12 = _mm_mul_pd(self.twiddle7re, x12p17);
let t_a3_13 = _mm_mul_pd(self.twiddle10re, x13p16);
let t_a3_14 = _mm_mul_pd(self.twiddle13re, x14p15);
let t_a4_1 = _mm_mul_pd(self.twiddle4re, x1p28);
let t_a4_2 = _mm_mul_pd(self.twiddle8re, x2p27);
let t_a4_3 = _mm_mul_pd(self.twiddle12re, x3p26);
let t_a4_4 = _mm_mul_pd(self.twiddle13re, x4p25);
let t_a4_5 = _mm_mul_pd(self.twiddle9re, x5p24);
let t_a4_6 = _mm_mul_pd(self.twiddle5re, x6p23);
let t_a4_7 = _mm_mul_pd(self.twiddle1re, x7p22);
let t_a4_8 = _mm_mul_pd(self.twiddle3re, x8p21);
let t_a4_9 = _mm_mul_pd(self.twiddle7re, x9p20);
let t_a4_10 = _mm_mul_pd(self.twiddle11re, x10p19);
let t_a4_11 = _mm_mul_pd(self.twiddle14re, x11p18);
let t_a4_12 = _mm_mul_pd(self.twiddle10re, x12p17);
let t_a4_13 = _mm_mul_pd(self.twiddle6re, x13p16);
let t_a4_14 = _mm_mul_pd(self.twiddle2re, x14p15);
let t_a5_1 = _mm_mul_pd(self.twiddle5re, x1p28);
let t_a5_2 = _mm_mul_pd(self.twiddle10re, x2p27);
let t_a5_3 = _mm_mul_pd(self.twiddle14re, x3p26);
let t_a5_4 = _mm_mul_pd(self.twiddle9re, x4p25);
let t_a5_5 = _mm_mul_pd(self.twiddle4re, x5p24);
let t_a5_6 = _mm_mul_pd(self.twiddle1re, x6p23);
let t_a5_7 = _mm_mul_pd(self.twiddle6re, x7p22);
let t_a5_8 = _mm_mul_pd(self.twiddle11re, x8p21);
let t_a5_9 = _mm_mul_pd(self.twiddle13re, x9p20);
let t_a5_10 = _mm_mul_pd(self.twiddle8re, x10p19);
let t_a5_11 = _mm_mul_pd(self.twiddle3re, x11p18);
let t_a5_12 = _mm_mul_pd(self.twiddle2re, x12p17);
let t_a5_13 = _mm_mul_pd(self.twiddle7re, x13p16);
let t_a5_14 = _mm_mul_pd(self.twiddle12re, x14p15);
let t_a6_1 = _mm_mul_pd(self.twiddle6re, x1p28);
let t_a6_2 = _mm_mul_pd(self.twiddle12re, x2p27);
let t_a6_3 = _mm_mul_pd(self.twiddle11re, x3p26);
let t_a6_4 = _mm_mul_pd(self.twiddle5re, x4p25);
let t_a6_5 = _mm_mul_pd(self.twiddle1re, x5p24);
let t_a6_6 = _mm_mul_pd(self.twiddle7re, x6p23);
let t_a6_7 = _mm_mul_pd(self.twiddle13re, x7p22);
let t_a6_8 = _mm_mul_pd(self.twiddle10re, x8p21);
let t_a6_9 = _mm_mul_pd(self.twiddle4re, x9p20);
let t_a6_10 = _mm_mul_pd(self.twiddle2re, x10p19);
let t_a6_11 = _mm_mul_pd(self.twiddle8re, x11p18);
let t_a6_12 = _mm_mul_pd(self.twiddle14re, x12p17);
let t_a6_13 = _mm_mul_pd(self.twiddle9re, x13p16);
let t_a6_14 = _mm_mul_pd(self.twiddle3re, x14p15);
let t_a7_1 = _mm_mul_pd(self.twiddle7re, x1p28);
let t_a7_2 = _mm_mul_pd(self.twiddle14re, x2p27);
let t_a7_3 = _mm_mul_pd(self.twiddle8re, x3p26);
let t_a7_4 = _mm_mul_pd(self.twiddle1re, x4p25);
let t_a7_5 = _mm_mul_pd(self.twiddle6re, x5p24);
let t_a7_6 = _mm_mul_pd(self.twiddle13re, x6p23);
let t_a7_7 = _mm_mul_pd(self.twiddle9re, x7p22);
let t_a7_8 = _mm_mul_pd(self.twiddle2re, x8p21);
let t_a7_9 = _mm_mul_pd(self.twiddle5re, x9p20);
let t_a7_10 = _mm_mul_pd(self.twiddle12re, x10p19);
let t_a7_11 = _mm_mul_pd(self.twiddle10re, x11p18);
let t_a7_12 = _mm_mul_pd(self.twiddle3re, x12p17);
let t_a7_13 = _mm_mul_pd(self.twiddle4re, x13p16);
let t_a7_14 = _mm_mul_pd(self.twiddle11re, x14p15);
let t_a8_1 = _mm_mul_pd(self.twiddle8re, x1p28);
let t_a8_2 = _mm_mul_pd(self.twiddle13re, x2p27);
let t_a8_3 = _mm_mul_pd(self.twiddle5re, x3p26);
let t_a8_4 = _mm_mul_pd(self.twiddle3re, x4p25);
let t_a8_5 = _mm_mul_pd(self.twiddle11re, x5p24);
let t_a8_6 = _mm_mul_pd(self.twiddle10re, x6p23);
let t_a8_7 = _mm_mul_pd(self.twiddle2re, x7p22);
let t_a8_8 = _mm_mul_pd(self.twiddle6re, x8p21);
let t_a8_9 = _mm_mul_pd(self.twiddle14re, x9p20);
let t_a8_10 = _mm_mul_pd(self.twiddle7re, x10p19);
let t_a8_11 = _mm_mul_pd(self.twiddle1re, x11p18);
let t_a8_12 = _mm_mul_pd(self.twiddle9re, x12p17);
let t_a8_13 = _mm_mul_pd(self.twiddle12re, x13p16);
let t_a8_14 = _mm_mul_pd(self.twiddle4re, x14p15);
let t_a9_1 = _mm_mul_pd(self.twiddle9re, x1p28);
let t_a9_2 = _mm_mul_pd(self.twiddle11re, x2p27);
let t_a9_3 = _mm_mul_pd(self.twiddle2re, x3p26);
let t_a9_4 = _mm_mul_pd(self.twiddle7re, x4p25);
let t_a9_5 = _mm_mul_pd(self.twiddle13re, x5p24);
let t_a9_6 = _mm_mul_pd(self.twiddle4re, x6p23);
let t_a9_7 = _mm_mul_pd(self.twiddle5re, x7p22);
let t_a9_8 = _mm_mul_pd(self.twiddle14re, x8p21);
let t_a9_9 = _mm_mul_pd(self.twiddle6re, x9p20);
let t_a9_10 = _mm_mul_pd(self.twiddle3re, x10p19);
let t_a9_11 = _mm_mul_pd(self.twiddle12re, x11p18);
let t_a9_12 = _mm_mul_pd(self.twiddle8re, x12p17);
let t_a9_13 = _mm_mul_pd(self.twiddle1re, x13p16);
let t_a9_14 = _mm_mul_pd(self.twiddle10re, x14p15);
let t_a10_1 = _mm_mul_pd(self.twiddle10re, x1p28);
let t_a10_2 = _mm_mul_pd(self.twiddle9re, x2p27);
let t_a10_3 = _mm_mul_pd(self.twiddle1re, x3p26);
let t_a10_4 = _mm_mul_pd(self.twiddle11re, x4p25);
let t_a10_5 = _mm_mul_pd(self.twiddle8re, x5p24);
let t_a10_6 = _mm_mul_pd(self.twiddle2re, x6p23);
let t_a10_7 = _mm_mul_pd(self.twiddle12re, x7p22);
let t_a10_8 = _mm_mul_pd(self.twiddle7re, x8p21);
let t_a10_9 = _mm_mul_pd(self.twiddle3re, x9p20);
let t_a10_10 = _mm_mul_pd(self.twiddle13re, x10p19);
let t_a10_11 = _mm_mul_pd(self.twiddle6re, x11p18);
let t_a10_12 = _mm_mul_pd(self.twiddle4re, x12p17);
let t_a10_13 = _mm_mul_pd(self.twiddle14re, x13p16);
let t_a10_14 = _mm_mul_pd(self.twiddle5re, x14p15);
let t_a11_1 = _mm_mul_pd(self.twiddle11re, x1p28);
let t_a11_2 = _mm_mul_pd(self.twiddle7re, x2p27);
let t_a11_3 = _mm_mul_pd(self.twiddle4re, x3p26);
let t_a11_4 = _mm_mul_pd(self.twiddle14re, x4p25);
let t_a11_5 = _mm_mul_pd(self.twiddle3re, x5p24);
let t_a11_6 = _mm_mul_pd(self.twiddle8re, x6p23);
let t_a11_7 = _mm_mul_pd(self.twiddle10re, x7p22);
let t_a11_8 = _mm_mul_pd(self.twiddle1re, x8p21);
let t_a11_9 = _mm_mul_pd(self.twiddle12re, x9p20);
let t_a11_10 = _mm_mul_pd(self.twiddle6re, x10p19);
let t_a11_11 = _mm_mul_pd(self.twiddle5re, x11p18);
let t_a11_12 = _mm_mul_pd(self.twiddle13re, x12p17);
let t_a11_13 = _mm_mul_pd(self.twiddle2re, x13p16);
let t_a11_14 = _mm_mul_pd(self.twiddle9re, x14p15);
let t_a12_1 = _mm_mul_pd(self.twiddle12re, x1p28);
let t_a12_2 = _mm_mul_pd(self.twiddle5re, x2p27);
let t_a12_3 = _mm_mul_pd(self.twiddle7re, x3p26);
let t_a12_4 = _mm_mul_pd(self.twiddle10re, x4p25);
let t_a12_5 = _mm_mul_pd(self.twiddle2re, x5p24);
let t_a12_6 = _mm_mul_pd(self.twiddle14re, x6p23);
let t_a12_7 = _mm_mul_pd(self.twiddle3re, x7p22);
let t_a12_8 = _mm_mul_pd(self.twiddle9re, x8p21);
let t_a12_9 = _mm_mul_pd(self.twiddle8re, x9p20);
let t_a12_10 = _mm_mul_pd(self.twiddle4re, x10p19);
let t_a12_11 = _mm_mul_pd(self.twiddle13re, x11p18);
let t_a12_12 = _mm_mul_pd(self.twiddle1re, x12p17);
let t_a12_13 = _mm_mul_pd(self.twiddle11re, x13p16);
let t_a12_14 = _mm_mul_pd(self.twiddle6re, x14p15);
let t_a13_1 = _mm_mul_pd(self.twiddle13re, x1p28);
let t_a13_2 = _mm_mul_pd(self.twiddle3re, x2p27);
let t_a13_3 = _mm_mul_pd(self.twiddle10re, x3p26);
let t_a13_4 = _mm_mul_pd(self.twiddle6re, x4p25);
let t_a13_5 = _mm_mul_pd(self.twiddle7re, x5p24);
let t_a13_6 = _mm_mul_pd(self.twiddle9re, x6p23);
let t_a13_7 = _mm_mul_pd(self.twiddle4re, x7p22);
let t_a13_8 = _mm_mul_pd(self.twiddle12re, x8p21);
let t_a13_9 = _mm_mul_pd(self.twiddle1re, x9p20);
let t_a13_10 = _mm_mul_pd(self.twiddle14re, x10p19);
let t_a13_11 = _mm_mul_pd(self.twiddle2re, x11p18);
let t_a13_12 = _mm_mul_pd(self.twiddle11re, x12p17);
let t_a13_13 = _mm_mul_pd(self.twiddle5re, x13p16);
let t_a13_14 = _mm_mul_pd(self.twiddle8re, x14p15);
let t_a14_1 = _mm_mul_pd(self.twiddle14re, x1p28);
let t_a14_2 = _mm_mul_pd(self.twiddle1re, x2p27);
let t_a14_3 = _mm_mul_pd(self.twiddle13re, x3p26);
let t_a14_4 = _mm_mul_pd(self.twiddle2re, x4p25);
let t_a14_5 = _mm_mul_pd(self.twiddle12re, x5p24);
let t_a14_6 = _mm_mul_pd(self.twiddle3re, x6p23);
let t_a14_7 = _mm_mul_pd(self.twiddle11re, x7p22);
let t_a14_8 = _mm_mul_pd(self.twiddle4re, x8p21);
let t_a14_9 = _mm_mul_pd(self.twiddle10re, x9p20);
let t_a14_10 = _mm_mul_pd(self.twiddle5re, x10p19);
let t_a14_11 = _mm_mul_pd(self.twiddle9re, x11p18);
let t_a14_12 = _mm_mul_pd(self.twiddle6re, x12p17);
let t_a14_13 = _mm_mul_pd(self.twiddle8re, x13p16);
let t_a14_14 = _mm_mul_pd(self.twiddle7re, x14p15);
let t_b1_1 = _mm_mul_pd(self.twiddle1im, x1m28);
let t_b1_2 = _mm_mul_pd(self.twiddle2im, x2m27);
let t_b1_3 = _mm_mul_pd(self.twiddle3im, x3m26);
let t_b1_4 = _mm_mul_pd(self.twiddle4im, x4m25);
let t_b1_5 = _mm_mul_pd(self.twiddle5im, x5m24);
let t_b1_6 = _mm_mul_pd(self.twiddle6im, x6m23);
let t_b1_7 = _mm_mul_pd(self.twiddle7im, x7m22);
let t_b1_8 = _mm_mul_pd(self.twiddle8im, x8m21);
let t_b1_9 = _mm_mul_pd(self.twiddle9im, x9m20);
let t_b1_10 = _mm_mul_pd(self.twiddle10im, x10m19);
let t_b1_11 = _mm_mul_pd(self.twiddle11im, x11m18);
let t_b1_12 = _mm_mul_pd(self.twiddle12im, x12m17);
let t_b1_13 = _mm_mul_pd(self.twiddle13im, x13m16);
let t_b1_14 = _mm_mul_pd(self.twiddle14im, x14m15);
let t_b2_1 = _mm_mul_pd(self.twiddle2im, x1m28);
let t_b2_2 = _mm_mul_pd(self.twiddle4im, x2m27);
let t_b2_3 = _mm_mul_pd(self.twiddle6im, x3m26);
let t_b2_4 = _mm_mul_pd(self.twiddle8im, x4m25);
let t_b2_5 = _mm_mul_pd(self.twiddle10im, x5m24);
let t_b2_6 = _mm_mul_pd(self.twiddle12im, x6m23);
let t_b2_7 = _mm_mul_pd(self.twiddle14im, x7m22);
let t_b2_8 = _mm_mul_pd(self.twiddle13im, x8m21);
let t_b2_9 = _mm_mul_pd(self.twiddle11im, x9m20);
let t_b2_10 = _mm_mul_pd(self.twiddle9im, x10m19);
let t_b2_11 = _mm_mul_pd(self.twiddle7im, x11m18);
let t_b2_12 = _mm_mul_pd(self.twiddle5im, x12m17);
let t_b2_13 = _mm_mul_pd(self.twiddle3im, x13m16);
let t_b2_14 = _mm_mul_pd(self.twiddle1im, x14m15);
let t_b3_1 = _mm_mul_pd(self.twiddle3im, x1m28);
let t_b3_2 = _mm_mul_pd(self.twiddle6im, x2m27);
let t_b3_3 = _mm_mul_pd(self.twiddle9im, x3m26);
let t_b3_4 = _mm_mul_pd(self.twiddle12im, x4m25);
let t_b3_5 = _mm_mul_pd(self.twiddle14im, x5m24);
let t_b3_6 = _mm_mul_pd(self.twiddle11im, x6m23);
let t_b3_7 = _mm_mul_pd(self.twiddle8im, x7m22);
let t_b3_8 = _mm_mul_pd(self.twiddle5im, x8m21);
let t_b3_9 = _mm_mul_pd(self.twiddle2im, x9m20);
let t_b3_10 = _mm_mul_pd(self.twiddle1im, x10m19);
let t_b3_11 = _mm_mul_pd(self.twiddle4im, x11m18);
let t_b3_12 = _mm_mul_pd(self.twiddle7im, x12m17);
let t_b3_13 = _mm_mul_pd(self.twiddle10im, x13m16);
let t_b3_14 = _mm_mul_pd(self.twiddle13im, x14m15);
let t_b4_1 = _mm_mul_pd(self.twiddle4im, x1m28);
let t_b4_2 = _mm_mul_pd(self.twiddle8im, x2m27);
let t_b4_3 = _mm_mul_pd(self.twiddle12im, x3m26);
let t_b4_4 = _mm_mul_pd(self.twiddle13im, x4m25);
let t_b4_5 = _mm_mul_pd(self.twiddle9im, x5m24);
let t_b4_6 = _mm_mul_pd(self.twiddle5im, x6m23);
let t_b4_7 = _mm_mul_pd(self.twiddle1im, x7m22);
let t_b4_8 = _mm_mul_pd(self.twiddle3im, x8m21);
let t_b4_9 = _mm_mul_pd(self.twiddle7im, x9m20);
let t_b4_10 = _mm_mul_pd(self.twiddle11im, x10m19);
let t_b4_11 = _mm_mul_pd(self.twiddle14im, x11m18);
let t_b4_12 = _mm_mul_pd(self.twiddle10im, x12m17);
let t_b4_13 = _mm_mul_pd(self.twiddle6im, x13m16);
let t_b4_14 = _mm_mul_pd(self.twiddle2im, x14m15);
let t_b5_1 = _mm_mul_pd(self.twiddle5im, x1m28);
let t_b5_2 = _mm_mul_pd(self.twiddle10im, x2m27);
let t_b5_3 = _mm_mul_pd(self.twiddle14im, x3m26);
let t_b5_4 = _mm_mul_pd(self.twiddle9im, x4m25);
let t_b5_5 = _mm_mul_pd(self.twiddle4im, x5m24);
let t_b5_6 = _mm_mul_pd(self.twiddle1im, x6m23);
let t_b5_7 = _mm_mul_pd(self.twiddle6im, x7m22);
let t_b5_8 = _mm_mul_pd(self.twiddle11im, x8m21);
let t_b5_9 = _mm_mul_pd(self.twiddle13im, x9m20);
let t_b5_10 = _mm_mul_pd(self.twiddle8im, x10m19);
let t_b5_11 = _mm_mul_pd(self.twiddle3im, x11m18);
let t_b5_12 = _mm_mul_pd(self.twiddle2im, x12m17);
let t_b5_13 = _mm_mul_pd(self.twiddle7im, x13m16);
let t_b5_14 = _mm_mul_pd(self.twiddle12im, x14m15);
let t_b6_1 = _mm_mul_pd(self.twiddle6im, x1m28);
let t_b6_2 = _mm_mul_pd(self.twiddle12im, x2m27);
let t_b6_3 = _mm_mul_pd(self.twiddle11im, x3m26);
let t_b6_4 = _mm_mul_pd(self.twiddle5im, x4m25);
let t_b6_5 = _mm_mul_pd(self.twiddle1im, x5m24);
let t_b6_6 = _mm_mul_pd(self.twiddle7im, x6m23);
let t_b6_7 = _mm_mul_pd(self.twiddle13im, x7m22);
let t_b6_8 = _mm_mul_pd(self.twiddle10im, x8m21);
let t_b6_9 = _mm_mul_pd(self.twiddle4im, x9m20);
let t_b6_10 = _mm_mul_pd(self.twiddle2im, x10m19);
let t_b6_11 = _mm_mul_pd(self.twiddle8im, x11m18);
let t_b6_12 = _mm_mul_pd(self.twiddle14im, x12m17);
let t_b6_13 = _mm_mul_pd(self.twiddle9im, x13m16);
let t_b6_14 = _mm_mul_pd(self.twiddle3im, x14m15);
let t_b7_1 = _mm_mul_pd(self.twiddle7im, x1m28);
let t_b7_2 = _mm_mul_pd(self.twiddle14im, x2m27);
let t_b7_3 = _mm_mul_pd(self.twiddle8im, x3m26);
let t_b7_4 = _mm_mul_pd(self.twiddle1im, x4m25);
let t_b7_5 = _mm_mul_pd(self.twiddle6im, x5m24);
let t_b7_6 = _mm_mul_pd(self.twiddle13im, x6m23);
let t_b7_7 = _mm_mul_pd(self.twiddle9im, x7m22);
let t_b7_8 = _mm_mul_pd(self.twiddle2im, x8m21);
let t_b7_9 = _mm_mul_pd(self.twiddle5im, x9m20);
let t_b7_10 = _mm_mul_pd(self.twiddle12im, x10m19);
let t_b7_11 = _mm_mul_pd(self.twiddle10im, x11m18);
let t_b7_12 = _mm_mul_pd(self.twiddle3im, x12m17);
let t_b7_13 = _mm_mul_pd(self.twiddle4im, x13m16);
let t_b7_14 = _mm_mul_pd(self.twiddle11im, x14m15);
let t_b8_1 = _mm_mul_pd(self.twiddle8im, x1m28);
let t_b8_2 = _mm_mul_pd(self.twiddle13im, x2m27);
let t_b8_3 = _mm_mul_pd(self.twiddle5im, x3m26);
let t_b8_4 = _mm_mul_pd(self.twiddle3im, x4m25);
let t_b8_5 = _mm_mul_pd(self.twiddle11im, x5m24);
let t_b8_6 = _mm_mul_pd(self.twiddle10im, x6m23);
let t_b8_7 = _mm_mul_pd(self.twiddle2im, x7m22);
let t_b8_8 = _mm_mul_pd(self.twiddle6im, x8m21);
let t_b8_9 = _mm_mul_pd(self.twiddle14im, x9m20);
let t_b8_10 = _mm_mul_pd(self.twiddle7im, x10m19);
let t_b8_11 = _mm_mul_pd(self.twiddle1im, x11m18);
let t_b8_12 = _mm_mul_pd(self.twiddle9im, x12m17);
let t_b8_13 = _mm_mul_pd(self.twiddle12im, x13m16);
let t_b8_14 = _mm_mul_pd(self.twiddle4im, x14m15);
let t_b9_1 = _mm_mul_pd(self.twiddle9im, x1m28);
let t_b9_2 = _mm_mul_pd(self.twiddle11im, x2m27);
let t_b9_3 = _mm_mul_pd(self.twiddle2im, x3m26);
let t_b9_4 = _mm_mul_pd(self.twiddle7im, x4m25);
let t_b9_5 = _mm_mul_pd(self.twiddle13im, x5m24);
let t_b9_6 = _mm_mul_pd(self.twiddle4im, x6m23);
let t_b9_7 = _mm_mul_pd(self.twiddle5im, x7m22);
let t_b9_8 = _mm_mul_pd(self.twiddle14im, x8m21);
let t_b9_9 = _mm_mul_pd(self.twiddle6im, x9m20);
let t_b9_10 = _mm_mul_pd(self.twiddle3im, x10m19);
let t_b9_11 = _mm_mul_pd(self.twiddle12im, x11m18);
let t_b9_12 = _mm_mul_pd(self.twiddle8im, x12m17);
let t_b9_13 = _mm_mul_pd(self.twiddle1im, x13m16);
let t_b9_14 = _mm_mul_pd(self.twiddle10im, x14m15);
let t_b10_1 = _mm_mul_pd(self.twiddle10im, x1m28);
let t_b10_2 = _mm_mul_pd(self.twiddle9im, x2m27);
let t_b10_3 = _mm_mul_pd(self.twiddle1im, x3m26);
let t_b10_4 = _mm_mul_pd(self.twiddle11im, x4m25);
let t_b10_5 = _mm_mul_pd(self.twiddle8im, x5m24);
let t_b10_6 = _mm_mul_pd(self.twiddle2im, x6m23);
let t_b10_7 = _mm_mul_pd(self.twiddle12im, x7m22);
let t_b10_8 = _mm_mul_pd(self.twiddle7im, x8m21);
let t_b10_9 = _mm_mul_pd(self.twiddle3im, x9m20);
let t_b10_10 = _mm_mul_pd(self.twiddle13im, x10m19);
let t_b10_11 = _mm_mul_pd(self.twiddle6im, x11m18);
let t_b10_12 = _mm_mul_pd(self.twiddle4im, x12m17);
let t_b10_13 = _mm_mul_pd(self.twiddle14im, x13m16);
let t_b10_14 = _mm_mul_pd(self.twiddle5im, x14m15);
let t_b11_1 = _mm_mul_pd(self.twiddle11im, x1m28);
let t_b11_2 = _mm_mul_pd(self.twiddle7im, x2m27);
let t_b11_3 = _mm_mul_pd(self.twiddle4im, x3m26);
let t_b11_4 = _mm_mul_pd(self.twiddle14im, x4m25);
let t_b11_5 = _mm_mul_pd(self.twiddle3im, x5m24);
let t_b11_6 = _mm_mul_pd(self.twiddle8im, x6m23);
let t_b11_7 = _mm_mul_pd(self.twiddle10im, x7m22);
let t_b11_8 = _mm_mul_pd(self.twiddle1im, x8m21);
let t_b11_9 = _mm_mul_pd(self.twiddle12im, x9m20);
let t_b11_10 = _mm_mul_pd(self.twiddle6im, x10m19);
let t_b11_11 = _mm_mul_pd(self.twiddle5im, x11m18);
let t_b11_12 = _mm_mul_pd(self.twiddle13im, x12m17);
let t_b11_13 = _mm_mul_pd(self.twiddle2im, x13m16);
let t_b11_14 = _mm_mul_pd(self.twiddle9im, x14m15);
let t_b12_1 = _mm_mul_pd(self.twiddle12im, x1m28);
let t_b12_2 = _mm_mul_pd(self.twiddle5im, x2m27);
let t_b12_3 = _mm_mul_pd(self.twiddle7im, x3m26);
let t_b12_4 = _mm_mul_pd(self.twiddle10im, x4m25);
let t_b12_5 = _mm_mul_pd(self.twiddle2im, x5m24);
let t_b12_6 = _mm_mul_pd(self.twiddle14im, x6m23);
let t_b12_7 = _mm_mul_pd(self.twiddle3im, x7m22);
let t_b12_8 = _mm_mul_pd(self.twiddle9im, x8m21);
let t_b12_9 = _mm_mul_pd(self.twiddle8im, x9m20);
let t_b12_10 = _mm_mul_pd(self.twiddle4im, x10m19);
let t_b12_11 = _mm_mul_pd(self.twiddle13im, x11m18);
let t_b12_12 = _mm_mul_pd(self.twiddle1im, x12m17);
let t_b12_13 = _mm_mul_pd(self.twiddle11im, x13m16);
let t_b12_14 = _mm_mul_pd(self.twiddle6im, x14m15);
let t_b13_1 = _mm_mul_pd(self.twiddle13im, x1m28);
let t_b13_2 = _mm_mul_pd(self.twiddle3im, x2m27);
let t_b13_3 = _mm_mul_pd(self.twiddle10im, x3m26);
let t_b13_4 = _mm_mul_pd(self.twiddle6im, x4m25);
let t_b13_5 = _mm_mul_pd(self.twiddle7im, x5m24);
let t_b13_6 = _mm_mul_pd(self.twiddle9im, x6m23);
let t_b13_7 = _mm_mul_pd(self.twiddle4im, x7m22);
let t_b13_8 = _mm_mul_pd(self.twiddle12im, x8m21);
let t_b13_9 = _mm_mul_pd(self.twiddle1im, x9m20);
let t_b13_10 = _mm_mul_pd(self.twiddle14im, x10m19);
let t_b13_11 = _mm_mul_pd(self.twiddle2im, x11m18);
let t_b13_12 = _mm_mul_pd(self.twiddle11im, x12m17);
let t_b13_13 = _mm_mul_pd(self.twiddle5im, x13m16);
let t_b13_14 = _mm_mul_pd(self.twiddle8im, x14m15);
let t_b14_1 = _mm_mul_pd(self.twiddle14im, x1m28);
let t_b14_2 = _mm_mul_pd(self.twiddle1im, x2m27);
let t_b14_3 = _mm_mul_pd(self.twiddle13im, x3m26);
let t_b14_4 = _mm_mul_pd(self.twiddle2im, x4m25);
let t_b14_5 = _mm_mul_pd(self.twiddle12im, x5m24);
let t_b14_6 = _mm_mul_pd(self.twiddle3im, x6m23);
let t_b14_7 = _mm_mul_pd(self.twiddle11im, x7m22);
let t_b14_8 = _mm_mul_pd(self.twiddle4im, x8m21);
let t_b14_9 = _mm_mul_pd(self.twiddle10im, x9m20);
let t_b14_10 = _mm_mul_pd(self.twiddle5im, x10m19);
let t_b14_11 = _mm_mul_pd(self.twiddle9im, x11m18);
let t_b14_12 = _mm_mul_pd(self.twiddle6im, x12m17);
let t_b14_13 = _mm_mul_pd(self.twiddle8im, x13m16);
let t_b14_14 = _mm_mul_pd(self.twiddle7im, x14m15);
let x0 = values[0];
let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11 + t_a1_12 + t_a1_13 + t_a1_14);
let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11 + t_a2_12 + t_a2_13 + t_a2_14);
let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11 + t_a3_12 + t_a3_13 + t_a3_14);
let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11 + t_a4_12 + t_a4_13 + t_a4_14);
let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11 + t_a5_12 + t_a5_13 + t_a5_14);
let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11 + t_a6_12 + t_a6_13 + t_a6_14);
let t_a7 = calc_f64!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11 + t_a7_12 + t_a7_13 + t_a7_14);
let t_a8 = calc_f64!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11 + t_a8_12 + t_a8_13 + t_a8_14);
let t_a9 = calc_f64!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11 + t_a9_12 + t_a9_13 + t_a9_14);
let t_a10 = calc_f64!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + t_a10_11 + t_a10_12 + t_a10_13 + t_a10_14);
let t_a11 = calc_f64!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + t_a11_11 + t_a11_12 + t_a11_13 + t_a11_14);
let t_a12 = calc_f64!(x0 + t_a12_1 + t_a12_2 + t_a12_3 + t_a12_4 + t_a12_5 + t_a12_6 + t_a12_7 + t_a12_8 + t_a12_9 + t_a12_10 + t_a12_11 + t_a12_12 + t_a12_13 + t_a12_14);
let t_a13 = calc_f64!(x0 + t_a13_1 + t_a13_2 + t_a13_3 + t_a13_4 + t_a13_5 + t_a13_6 + t_a13_7 + t_a13_8 + t_a13_9 + t_a13_10 + t_a13_11 + t_a13_12 + t_a13_13 + t_a13_14);
let t_a14 = calc_f64!(x0 + t_a14_1 + t_a14_2 + t_a14_3 + t_a14_4 + t_a14_5 + t_a14_6 + t_a14_7 + t_a14_8 + t_a14_9 + t_a14_10 + t_a14_11 + t_a14_12 + t_a14_13 + t_a14_14);
let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11 + t_b1_12 + t_b1_13 + t_b1_14);
let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 + t_b2_6 + t_b2_7 - t_b2_8 - t_b2_9 - t_b2_10 - t_b2_11 - t_b2_12 - t_b2_13 - t_b2_14);
let t_b3 = calc_f64!(t_b3_1 + t_b3_2 + t_b3_3 + t_b3_4 - t_b3_5 - t_b3_6 - t_b3_7 - t_b3_8 - t_b3_9 + t_b3_10 + t_b3_11 + t_b3_12 + t_b3_13 + t_b3_14);
let t_b4 = calc_f64!(t_b4_1 + t_b4_2 + t_b4_3 - t_b4_4 - t_b4_5 - t_b4_6 - t_b4_7 + t_b4_8 + t_b4_9 + t_b4_10 - t_b4_11 - t_b4_12 - t_b4_13 - t_b4_14);
let t_b5 = calc_f64!(t_b5_1 + t_b5_2 - t_b5_3 - t_b5_4 - t_b5_5 + t_b5_6 + t_b5_7 + t_b5_8 - t_b5_9 - t_b5_10 - t_b5_11 + t_b5_12 + t_b5_13 + t_b5_14);
let t_b6 = calc_f64!(t_b6_1 + t_b6_2 - t_b6_3 - t_b6_4 + t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9 + t_b6_10 + t_b6_11 + t_b6_12 - t_b6_13 - t_b6_14);
let t_b7 = calc_f64!(t_b7_1 + t_b7_2 - t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9 + t_b7_10 - t_b7_11 - t_b7_12 + t_b7_13 + t_b7_14);
let t_b8 = calc_f64!(t_b8_1 - t_b8_2 - t_b8_3 + t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 + t_b8_9 - t_b8_10 + t_b8_11 + t_b8_12 - t_b8_13 - t_b8_14);
let t_b9 = calc_f64!(t_b9_1 - t_b9_2 - t_b9_3 + t_b9_4 - t_b9_5 - t_b9_6 + t_b9_7 + t_b9_8 - t_b9_9 + t_b9_10 + t_b9_11 - t_b9_12 + t_b9_13 + t_b9_14);
let t_b10 = calc_f64!(t_b10_1 - t_b10_2 + t_b10_3 + t_b10_4 - t_b10_5 + t_b10_6 + t_b10_7 - t_b10_8 + t_b10_9 + t_b10_10 - t_b10_11 + t_b10_12 + t_b10_13 - t_b10_14);
let t_b11 = calc_f64!(t_b11_1 - t_b11_2 + t_b11_3 - t_b11_4 - t_b11_5 + t_b11_6 - t_b11_7 + t_b11_8 + t_b11_9 - t_b11_10 + t_b11_11 - t_b11_12 - t_b11_13 + t_b11_14);
let t_b12 = calc_f64!(t_b12_1 - t_b12_2 + t_b12_3 - t_b12_4 + t_b12_5 + t_b12_6 - t_b12_7 + t_b12_8 - t_b12_9 + t_b12_10 - t_b12_11 - t_b12_12 + t_b12_13 - t_b12_14);
let t_b13 = calc_f64!(t_b13_1 - t_b13_2 + t_b13_3 - t_b13_4 + t_b13_5 - t_b13_6 + t_b13_7 - t_b13_8 + t_b13_9 + t_b13_10 - t_b13_11 + t_b13_12 - t_b13_13 + t_b13_14);
let t_b14 = calc_f64!(t_b14_1 - t_b14_2 + t_b14_3 - t_b14_4 + t_b14_5 - t_b14_6 + t_b14_7 - t_b14_8 + t_b14_9 - t_b14_10 + t_b14_11 - t_b14_12 + t_b14_13 - t_b14_14);
let t_b1_rot = self.rotate.rotate(t_b1);
let t_b2_rot = self.rotate.rotate(t_b2);
let t_b3_rot = self.rotate.rotate(t_b3);
let t_b4_rot = self.rotate.rotate(t_b4);
let t_b5_rot = self.rotate.rotate(t_b5);
let t_b6_rot = self.rotate.rotate(t_b6);
let t_b7_rot = self.rotate.rotate(t_b7);
let t_b8_rot = self.rotate.rotate(t_b8);
let t_b9_rot = self.rotate.rotate(t_b9);
let t_b10_rot = self.rotate.rotate(t_b10);
let t_b11_rot = self.rotate.rotate(t_b11);
let t_b12_rot = self.rotate.rotate(t_b12);
let t_b13_rot = self.rotate.rotate(t_b13);
let t_b14_rot = self.rotate.rotate(t_b14);
let y0 = calc_f64!(x0 + x1p28 + x2p27 + x3p26 + x4p25 + x5p24 + x6p23 + x7p22 + x8p21 + x9p20 + x10p19 + x11p18 + x12p17 + x13p16 + x14p15);
let [y1, y28] = solo_fft2_f64(t_a1, t_b1_rot);
let [y2, y27] = solo_fft2_f64(t_a2, t_b2_rot);
let [y3, y26] = solo_fft2_f64(t_a3, t_b3_rot);
let [y4, y25] = solo_fft2_f64(t_a4, t_b4_rot);
let [y5, y24] = solo_fft2_f64(t_a5, t_b5_rot);
let [y6, y23] = solo_fft2_f64(t_a6, t_b6_rot);
let [y7, y22] = solo_fft2_f64(t_a7, t_b7_rot);
let [y8, y21] = solo_fft2_f64(t_a8, t_b8_rot);
let [y9, y20] = solo_fft2_f64(t_a9, t_b9_rot);
let [y10, y19] = solo_fft2_f64(t_a10, t_b10_rot);
let [y11, y18] = solo_fft2_f64(t_a11, t_b11_rot);
let [y12, y17] = solo_fft2_f64(t_a12, t_b12_rot);
let [y13, y16] = solo_fft2_f64(t_a13, t_b13_rot);
let [y14, y15] = solo_fft2_f64(t_a14, t_b14_rot);
[y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28]
}
}
/// SSE implementation of a size-31 FFT butterfly over `f32` data.
///
/// Each `twiddle{k}re` / `twiddle{k}im` pair holds the k-th twiddle factor
/// for a length-31 FFT (computed via `twiddles::compute_twiddle(k, 31, direction)`),
/// with the real and imaginary scalar each broadcast across all four f32 lanes
/// of an `__m128` so the kernel can process two interleaved complex values at once.
pub struct SseF32Butterfly31<T> {
direction: FftDirection,
// Marker tying the generic `T` to the struct; no `T` values are stored.
_phantom: std::marker::PhantomData<T>,
// Helper for the +/-90-degree complex rotation used by the butterfly kernel.
rotate: Rotate90F32,
twiddle1re: __m128,
twiddle1im: __m128,
twiddle2re: __m128,
twiddle2im: __m128,
twiddle3re: __m128,
twiddle3im: __m128,
twiddle4re: __m128,
twiddle4im: __m128,
twiddle5re: __m128,
twiddle5im: __m128,
twiddle6re: __m128,
twiddle6im: __m128,
twiddle7re: __m128,
twiddle7im: __m128,
twiddle8re: __m128,
twiddle8im: __m128,
twiddle9re: __m128,
twiddle9im: __m128,
twiddle10re: __m128,
twiddle10im: __m128,
twiddle11re: __m128,
twiddle11im: __m128,
twiddle12re: __m128,
twiddle12im: __m128,
twiddle13re: __m128,
twiddle13im: __m128,
twiddle14re: __m128,
twiddle14im: __m128,
twiddle15re: __m128,
twiddle15im: __m128,
}
// Macro-generated trait boilerplate for this size-31 butterfly (presumably the
// `Fft`/`Length`/`Direction` impls — the macros are defined elsewhere in this
// module tree). The closure extracts the stored FFT direction from an instance.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly31, 31, |this: &SseF32Butterfly31<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly31, 31, |this: &SseF32Butterfly31<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly31<T> {
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let rotate = Rotate90F32::new(true);
let tw1: Complex<f32> = twiddles::compute_twiddle(1, 31, direction);
let tw2: Complex<f32> = twiddles::compute_twiddle(2, 31, direction);
let tw3: Complex<f32> = twiddles::compute_twiddle(3, 31, direction);
let tw4: Complex<f32> = twiddles::compute_twiddle(4, 31, direction);
let tw5: Complex<f32> = twiddles::compute_twiddle(5, 31, direction);
let tw6: Complex<f32> = twiddles::compute_twiddle(6, 31, direction);
let tw7: Complex<f32> = twiddles::compute_twiddle(7, 31, direction);
let tw8: Complex<f32> = twiddles::compute_twiddle(8, 31, direction);
let tw9: Complex<f32> = twiddles::compute_twiddle(9, 31, direction);
let tw10: Complex<f32> = twiddles::compute_twiddle(10, 31, direction);
let tw11: Complex<f32> = twiddles::compute_twiddle(11, 31, direction);
let tw12: Complex<f32> = twiddles::compute_twiddle(12, 31, direction);
let tw13: Complex<f32> = twiddles::compute_twiddle(13, 31, direction);
let tw14: Complex<f32> = twiddles::compute_twiddle(14, 31, direction);
let tw15: Complex<f32> = twiddles::compute_twiddle(15, 31, direction);
let twiddle1re = unsafe { _mm_load1_ps(&tw1.re) };
let twiddle1im = unsafe { _mm_load1_ps(&tw1.im) };
let twiddle2re = unsafe { _mm_load1_ps(&tw2.re) };
let twiddle2im = unsafe { _mm_load1_ps(&tw2.im) };
let twiddle3re = unsafe { _mm_load1_ps(&tw3.re) };
let twiddle3im = unsafe { _mm_load1_ps(&tw3.im) };
let twiddle4re = unsafe { _mm_load1_ps(&tw4.re) };
let twiddle4im = unsafe { _mm_load1_ps(&tw4.im) };
let twiddle5re = unsafe { _mm_load1_ps(&tw5.re) };
let twiddle5im = unsafe { _mm_load1_ps(&tw5.im) };
let twiddle6re = unsafe { _mm_load1_ps(&tw6.re) };
let twiddle6im = unsafe { _mm_load1_ps(&tw6.im) };
let twiddle7re = unsafe { _mm_load1_ps(&tw7.re) };
let twiddle7im = unsafe { _mm_load1_ps(&tw7.im) };
let twiddle8re = unsafe { _mm_load1_ps(&tw8.re) };
let twiddle8im = unsafe { _mm_load1_ps(&tw8.im) };
let twiddle9re = unsafe { _mm_load1_ps(&tw9.re) };
let twiddle9im = unsafe { _mm_load1_ps(&tw9.im) };
let twiddle10re = unsafe { _mm_load1_ps(&tw10.re) };
let twiddle10im = unsafe { _mm_load1_ps(&tw10.im) };
let twiddle11re = unsafe { _mm_load1_ps(&tw11.re) };
let twiddle11im = unsafe { _mm_load1_ps(&tw11.im) };
let twiddle12re = unsafe { _mm_load1_ps(&tw12.re) };
let twiddle12im = unsafe { _mm_load1_ps(&tw12.im) };
let twiddle13re = unsafe { _mm_load1_ps(&tw13.re) };
let twiddle13im = unsafe { _mm_load1_ps(&tw13.im) };
let twiddle14re = unsafe { _mm_load1_ps(&tw14.re) };
let twiddle14im = unsafe { _mm_load1_ps(&tw14.im) };
let twiddle15re = unsafe { _mm_load1_ps(&tw15.re) };
let twiddle15im = unsafe { _mm_load1_ps(&tw15.im) };
Self {
direction,
_phantom: std::marker::PhantomData,
rotate,
twiddle1re,
twiddle1im,
twiddle2re,
twiddle2im,
twiddle3re,
twiddle3im,
twiddle4re,
twiddle4im,
twiddle5re,
twiddle5im,
twiddle6re,
twiddle6im,
twiddle7re,
twiddle7im,
twiddle8re,
twiddle8im,
twiddle9re,
twiddle9im,
twiddle10re,
twiddle10im,
twiddle11re,
twiddle11im,
twiddle12re,
twiddle12im,
twiddle13re,
twiddle13im,
twiddle14re,
twiddle14im,
twiddle15re,
twiddle15im,
}
}
#[inline(always)]
// Computes a single size-31 FFT in place. Each of the 31 complex inputs is
// loaded into (the low half of) a vector via `read_partial1_complex_to_array!`,
// processed by the two-at-a-time parallel kernel, and only the low-half
// complex result of each output vector is written back.
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let values = read_partial1_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30});
let out = self.perform_parallel_fft_direct(values);
write_partial_lo_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30});
}
#[inline(always)]
// Computes two interleaved size-31 FFTs at once. `buffer` holds 62 complex
// values with the two transforms interleaved (even indices = first FFT,
// odd indices = second). Each __m128 passed to the kernel carries element i
// of the first FFT in its low complex slot and element i of the second FFT
// in its high slot.
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
// Read 31 vectors, each packing two consecutive complex values (stride 2
// over indices 0..=60 covers all 62 inputs).
let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60});
// De-interleave: values[i] = (element i of FFT 0, element i of FFT 1).
// The alternating lo_hi/hi_lo extracts realize the even/odd split.
let values = [
extract_lo_hi_f32(input_packed[0], input_packed[15]),
extract_hi_lo_f32(input_packed[0], input_packed[16]),
extract_lo_hi_f32(input_packed[1], input_packed[16]),
extract_hi_lo_f32(input_packed[1], input_packed[17]),
extract_lo_hi_f32(input_packed[2], input_packed[17]),
extract_hi_lo_f32(input_packed[2], input_packed[18]),
extract_lo_hi_f32(input_packed[3], input_packed[18]),
extract_hi_lo_f32(input_packed[3], input_packed[19]),
extract_lo_hi_f32(input_packed[4], input_packed[19]),
extract_hi_lo_f32(input_packed[4], input_packed[20]),
extract_lo_hi_f32(input_packed[5], input_packed[20]),
extract_hi_lo_f32(input_packed[5], input_packed[21]),
extract_lo_hi_f32(input_packed[6], input_packed[21]),
extract_hi_lo_f32(input_packed[6], input_packed[22]),
extract_lo_hi_f32(input_packed[7], input_packed[22]),
extract_hi_lo_f32(input_packed[7], input_packed[23]),
extract_lo_hi_f32(input_packed[8], input_packed[23]),
extract_hi_lo_f32(input_packed[8], input_packed[24]),
extract_lo_hi_f32(input_packed[9], input_packed[24]),
extract_hi_lo_f32(input_packed[9], input_packed[25]),
extract_lo_hi_f32(input_packed[10], input_packed[25]),
extract_hi_lo_f32(input_packed[10], input_packed[26]),
extract_lo_hi_f32(input_packed[11], input_packed[26]),
extract_hi_lo_f32(input_packed[11], input_packed[27]),
extract_lo_hi_f32(input_packed[12], input_packed[27]),
extract_hi_lo_f32(input_packed[12], input_packed[28]),
extract_lo_hi_f32(input_packed[13], input_packed[28]),
extract_hi_lo_f32(input_packed[13], input_packed[29]),
extract_lo_hi_f32(input_packed[14], input_packed[29]),
extract_hi_lo_f32(input_packed[14], input_packed[30]),
extract_lo_hi_f32(input_packed[15], input_packed[30]),
];
let out = self.perform_parallel_fft_direct(values);
// Re-interleave the two result sets back into consecutive complex pairs
// matching the input layout.
let out_packed = [
extract_lo_lo_f32(out[0], out[1]),
extract_lo_lo_f32(out[2], out[3]),
extract_lo_lo_f32(out[4], out[5]),
extract_lo_lo_f32(out[6], out[7]),
extract_lo_lo_f32(out[8], out[9]),
extract_lo_lo_f32(out[10], out[11]),
extract_lo_lo_f32(out[12], out[13]),
extract_lo_lo_f32(out[14], out[15]),
extract_lo_lo_f32(out[16], out[17]),
extract_lo_lo_f32(out[18], out[19]),
extract_lo_lo_f32(out[20], out[21]),
extract_lo_lo_f32(out[22], out[23]),
extract_lo_lo_f32(out[24], out[25]),
extract_lo_lo_f32(out[26], out[27]),
extract_lo_lo_f32(out[28], out[29]),
extract_lo_hi_f32(out[30], out[0]),
extract_hi_hi_f32(out[1], out[2]),
extract_hi_hi_f32(out[3], out[4]),
extract_hi_hi_f32(out[5], out[6]),
extract_hi_hi_f32(out[7], out[8]),
extract_hi_hi_f32(out[9], out[10]),
extract_hi_hi_f32(out[11], out[12]),
extract_hi_hi_f32(out[13], out[14]),
extract_hi_hi_f32(out[15], out[16]),
extract_hi_hi_f32(out[17], out[18]),
extract_hi_hi_f32(out[19], out[20]),
extract_hi_hi_f32(out[21], out[22]),
extract_hi_hi_f32(out[23], out[24]),
extract_hi_hi_f32(out[25], out[26]),
extract_hi_hi_f32(out[27], out[28]),
extract_hi_hi_f32(out[29], out[30]),
];
write_complex_to_array_strided!(out_packed, buffer, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30});
}
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 31]) -> [__m128; 31] {
let [x1p30, x1m30] = parallel_fft2_interleaved_f32(values[1], values[30]);
let [x2p29, x2m29] = parallel_fft2_interleaved_f32(values[2], values[29]);
let [x3p28, x3m28] = parallel_fft2_interleaved_f32(values[3], values[28]);
let [x4p27, x4m27] = parallel_fft2_interleaved_f32(values[4], values[27]);
let [x5p26, x5m26] = parallel_fft2_interleaved_f32(values[5], values[26]);
let [x6p25, x6m25] = parallel_fft2_interleaved_f32(values[6], values[25]);
let [x7p24, x7m24] = parallel_fft2_interleaved_f32(values[7], values[24]);
let [x8p23, x8m23] = parallel_fft2_interleaved_f32(values[8], values[23]);
let [x9p22, x9m22] = parallel_fft2_interleaved_f32(values[9], values[22]);
let [x10p21, x10m21] = parallel_fft2_interleaved_f32(values[10], values[21]);
let [x11p20, x11m20] = parallel_fft2_interleaved_f32(values[11], values[20]);
let [x12p19, x12m19] = parallel_fft2_interleaved_f32(values[12], values[19]);
let [x13p18, x13m18] = parallel_fft2_interleaved_f32(values[13], values[18]);
let [x14p17, x14m17] = parallel_fft2_interleaved_f32(values[14], values[17]);
let [x15p16, x15m16] = parallel_fft2_interleaved_f32(values[15], values[16]);
let t_a1_1 = _mm_mul_ps(self.twiddle1re, x1p30);
let t_a1_2 = _mm_mul_ps(self.twiddle2re, x2p29);
let t_a1_3 = _mm_mul_ps(self.twiddle3re, x3p28);
let t_a1_4 = _mm_mul_ps(self.twiddle4re, x4p27);
let t_a1_5 = _mm_mul_ps(self.twiddle5re, x5p26);
let t_a1_6 = _mm_mul_ps(self.twiddle6re, x6p25);
let t_a1_7 = _mm_mul_ps(self.twiddle7re, x7p24);
let t_a1_8 = _mm_mul_ps(self.twiddle8re, x8p23);
let t_a1_9 = _mm_mul_ps(self.twiddle9re, x9p22);
let t_a1_10 = _mm_mul_ps(self.twiddle10re, x10p21);
let t_a1_11 = _mm_mul_ps(self.twiddle11re, x11p20);
let t_a1_12 = _mm_mul_ps(self.twiddle12re, x12p19);
let t_a1_13 = _mm_mul_ps(self.twiddle13re, x13p18);
let t_a1_14 = _mm_mul_ps(self.twiddle14re, x14p17);
let t_a1_15 = _mm_mul_ps(self.twiddle15re, x15p16);
let t_a2_1 = _mm_mul_ps(self.twiddle2re, x1p30);
let t_a2_2 = _mm_mul_ps(self.twiddle4re, x2p29);
let t_a2_3 = _mm_mul_ps(self.twiddle6re, x3p28);
let t_a2_4 = _mm_mul_ps(self.twiddle8re, x4p27);
let t_a2_5 = _mm_mul_ps(self.twiddle10re, x5p26);
let t_a2_6 = _mm_mul_ps(self.twiddle12re, x6p25);
let t_a2_7 = _mm_mul_ps(self.twiddle14re, x7p24);
let t_a2_8 = _mm_mul_ps(self.twiddle15re, x8p23);
let t_a2_9 = _mm_mul_ps(self.twiddle13re, x9p22);
let t_a2_10 = _mm_mul_ps(self.twiddle11re, x10p21);
let t_a2_11 = _mm_mul_ps(self.twiddle9re, x11p20);
let t_a2_12 = _mm_mul_ps(self.twiddle7re, x12p19);
let t_a2_13 = _mm_mul_ps(self.twiddle5re, x13p18);
let t_a2_14 = _mm_mul_ps(self.twiddle3re, x14p17);
let t_a2_15 = _mm_mul_ps(self.twiddle1re, x15p16);
let t_a3_1 = _mm_mul_ps(self.twiddle3re, x1p30);
let t_a3_2 = _mm_mul_ps(self.twiddle6re, x2p29);
let t_a3_3 = _mm_mul_ps(self.twiddle9re, x3p28);
let t_a3_4 = _mm_mul_ps(self.twiddle12re, x4p27);
let t_a3_5 = _mm_mul_ps(self.twiddle15re, x5p26);
let t_a3_6 = _mm_mul_ps(self.twiddle13re, x6p25);
let t_a3_7 = _mm_mul_ps(self.twiddle10re, x7p24);
let t_a3_8 = _mm_mul_ps(self.twiddle7re, x8p23);
let t_a3_9 = _mm_mul_ps(self.twiddle4re, x9p22);
let t_a3_10 = _mm_mul_ps(self.twiddle1re, x10p21);
let t_a3_11 = _mm_mul_ps(self.twiddle2re, x11p20);
let t_a3_12 = _mm_mul_ps(self.twiddle5re, x12p19);
let t_a3_13 = _mm_mul_ps(self.twiddle8re, x13p18);
let t_a3_14 = _mm_mul_ps(self.twiddle11re, x14p17);
let t_a3_15 = _mm_mul_ps(self.twiddle14re, x15p16);
let t_a4_1 = _mm_mul_ps(self.twiddle4re, x1p30);
let t_a4_2 = _mm_mul_ps(self.twiddle8re, x2p29);
let t_a4_3 = _mm_mul_ps(self.twiddle12re, x3p28);
let t_a4_4 = _mm_mul_ps(self.twiddle15re, x4p27);
let t_a4_5 = _mm_mul_ps(self.twiddle11re, x5p26);
let t_a4_6 = _mm_mul_ps(self.twiddle7re, x6p25);
let t_a4_7 = _mm_mul_ps(self.twiddle3re, x7p24);
let t_a4_8 = _mm_mul_ps(self.twiddle1re, x8p23);
let t_a4_9 = _mm_mul_ps(self.twiddle5re, x9p22);
let t_a4_10 = _mm_mul_ps(self.twiddle9re, x10p21);
let t_a4_11 = _mm_mul_ps(self.twiddle13re, x11p20);
let t_a4_12 = _mm_mul_ps(self.twiddle14re, x12p19);
let t_a4_13 = _mm_mul_ps(self.twiddle10re, x13p18);
let t_a4_14 = _mm_mul_ps(self.twiddle6re, x14p17);
let t_a4_15 = _mm_mul_ps(self.twiddle2re, x15p16);
let t_a5_1 = _mm_mul_ps(self.twiddle5re, x1p30);
let t_a5_2 = _mm_mul_ps(self.twiddle10re, x2p29);
let t_a5_3 = _mm_mul_ps(self.twiddle15re, x3p28);
let t_a5_4 = _mm_mul_ps(self.twiddle11re, x4p27);
let t_a5_5 = _mm_mul_ps(self.twiddle6re, x5p26);
let t_a5_6 = _mm_mul_ps(self.twiddle1re, x6p25);
let t_a5_7 = _mm_mul_ps(self.twiddle4re, x7p24);
let t_a5_8 = _mm_mul_ps(self.twiddle9re, x8p23);
let t_a5_9 = _mm_mul_ps(self.twiddle14re, x9p22);
let t_a5_10 = _mm_mul_ps(self.twiddle12re, x10p21);
let t_a5_11 = _mm_mul_ps(self.twiddle7re, x11p20);
let t_a5_12 = _mm_mul_ps(self.twiddle2re, x12p19);
let t_a5_13 = _mm_mul_ps(self.twiddle3re, x13p18);
let t_a5_14 = _mm_mul_ps(self.twiddle8re, x14p17);
let t_a5_15 = _mm_mul_ps(self.twiddle13re, x15p16);
let t_a6_1 = _mm_mul_ps(self.twiddle6re, x1p30);
let t_a6_2 = _mm_mul_ps(self.twiddle12re, x2p29);
let t_a6_3 = _mm_mul_ps(self.twiddle13re, x3p28);
let t_a6_4 = _mm_mul_ps(self.twiddle7re, x4p27);
let t_a6_5 = _mm_mul_ps(self.twiddle1re, x5p26);
let t_a6_6 = _mm_mul_ps(self.twiddle5re, x6p25);
let t_a6_7 = _mm_mul_ps(self.twiddle11re, x7p24);
let t_a6_8 = _mm_mul_ps(self.twiddle14re, x8p23);
let t_a6_9 = _mm_mul_ps(self.twiddle8re, x9p22);
let t_a6_10 = _mm_mul_ps(self.twiddle2re, x10p21);
let t_a6_11 = _mm_mul_ps(self.twiddle4re, x11p20);
let t_a6_12 = _mm_mul_ps(self.twiddle10re, x12p19);
let t_a6_13 = _mm_mul_ps(self.twiddle15re, x13p18);
let t_a6_14 = _mm_mul_ps(self.twiddle9re, x14p17);
let t_a6_15 = _mm_mul_ps(self.twiddle3re, x15p16);
let t_a7_1 = _mm_mul_ps(self.twiddle7re, x1p30);
let t_a7_2 = _mm_mul_ps(self.twiddle14re, x2p29);
let t_a7_3 = _mm_mul_ps(self.twiddle10re, x3p28);
let t_a7_4 = _mm_mul_ps(self.twiddle3re, x4p27);
let t_a7_5 = _mm_mul_ps(self.twiddle4re, x5p26);
let t_a7_6 = _mm_mul_ps(self.twiddle11re, x6p25);
let t_a7_7 = _mm_mul_ps(self.twiddle13re, x7p24);
let t_a7_8 = _mm_mul_ps(self.twiddle6re, x8p23);
let t_a7_9 = _mm_mul_ps(self.twiddle1re, x9p22);
let t_a7_10 = _mm_mul_ps(self.twiddle8re, x10p21);
let t_a7_11 = _mm_mul_ps(self.twiddle15re, x11p20);
let t_a7_12 = _mm_mul_ps(self.twiddle9re, x12p19);
let t_a7_13 = _mm_mul_ps(self.twiddle2re, x13p18);
let t_a7_14 = _mm_mul_ps(self.twiddle5re, x14p17);
let t_a7_15 = _mm_mul_ps(self.twiddle12re, x15p16);
let t_a8_1 = _mm_mul_ps(self.twiddle8re, x1p30);
let t_a8_2 = _mm_mul_ps(self.twiddle15re, x2p29);
let t_a8_3 = _mm_mul_ps(self.twiddle7re, x3p28);
let t_a8_4 = _mm_mul_ps(self.twiddle1re, x4p27);
let t_a8_5 = _mm_mul_ps(self.twiddle9re, x5p26);
let t_a8_6 = _mm_mul_ps(self.twiddle14re, x6p25);
let t_a8_7 = _mm_mul_ps(self.twiddle6re, x7p24);
let t_a8_8 = _mm_mul_ps(self.twiddle2re, x8p23);
let t_a8_9 = _mm_mul_ps(self.twiddle10re, x9p22);
let t_a8_10 = _mm_mul_ps(self.twiddle13re, x10p21);
let t_a8_11 = _mm_mul_ps(self.twiddle5re, x11p20);
let t_a8_12 = _mm_mul_ps(self.twiddle3re, x12p19);
let t_a8_13 = _mm_mul_ps(self.twiddle11re, x13p18);
let t_a8_14 = _mm_mul_ps(self.twiddle12re, x14p17);
let t_a8_15 = _mm_mul_ps(self.twiddle4re, x15p16);
let t_a9_1 = _mm_mul_ps(self.twiddle9re, x1p30);
let t_a9_2 = _mm_mul_ps(self.twiddle13re, x2p29);
let t_a9_3 = _mm_mul_ps(self.twiddle4re, x3p28);
let t_a9_4 = _mm_mul_ps(self.twiddle5re, x4p27);
let t_a9_5 = _mm_mul_ps(self.twiddle14re, x5p26);
let t_a9_6 = _mm_mul_ps(self.twiddle8re, x6p25);
let t_a9_7 = _mm_mul_ps(self.twiddle1re, x7p24);
let t_a9_8 = _mm_mul_ps(self.twiddle10re, x8p23);
let t_a9_9 = _mm_mul_ps(self.twiddle12re, x9p22);
let t_a9_10 = _mm_mul_ps(self.twiddle3re, x10p21);
let t_a9_11 = _mm_mul_ps(self.twiddle6re, x11p20);
let t_a9_12 = _mm_mul_ps(self.twiddle15re, x12p19);
let t_a9_13 = _mm_mul_ps(self.twiddle7re, x13p18);
let t_a9_14 = _mm_mul_ps(self.twiddle2re, x14p17);
let t_a9_15 = _mm_mul_ps(self.twiddle11re, x15p16);
let t_a10_1 = _mm_mul_ps(self.twiddle10re, x1p30);
let t_a10_2 = _mm_mul_ps(self.twiddle11re, x2p29);
let t_a10_3 = _mm_mul_ps(self.twiddle1re, x3p28);
let t_a10_4 = _mm_mul_ps(self.twiddle9re, x4p27);
let t_a10_5 = _mm_mul_ps(self.twiddle12re, x5p26);
let t_a10_6 = _mm_mul_ps(self.twiddle2re, x6p25);
let t_a10_7 = _mm_mul_ps(self.twiddle8re, x7p24);
let t_a10_8 = _mm_mul_ps(self.twiddle13re, x8p23);
let t_a10_9 = _mm_mul_ps(self.twiddle3re, x9p22);
let t_a10_10 = _mm_mul_ps(self.twiddle7re, x10p21);
let t_a10_11 = _mm_mul_ps(self.twiddle14re, x11p20);
let t_a10_12 = _mm_mul_ps(self.twiddle4re, x12p19);
let t_a10_13 = _mm_mul_ps(self.twiddle6re, x13p18);
let t_a10_14 = _mm_mul_ps(self.twiddle15re, x14p17);
let t_a10_15 = _mm_mul_ps(self.twiddle5re, x15p16);
let t_a11_1 = _mm_mul_ps(self.twiddle11re, x1p30);
let t_a11_2 = _mm_mul_ps(self.twiddle9re, x2p29);
let t_a11_3 = _mm_mul_ps(self.twiddle2re, x3p28);
let t_a11_4 = _mm_mul_ps(self.twiddle13re, x4p27);
let t_a11_5 = _mm_mul_ps(self.twiddle7re, x5p26);
let t_a11_6 = _mm_mul_ps(self.twiddle4re, x6p25);
let t_a11_7 = _mm_mul_ps(self.twiddle15re, x7p24);
let t_a11_8 = _mm_mul_ps(self.twiddle5re, x8p23);
let t_a11_9 = _mm_mul_ps(self.twiddle6re, x9p22);
let t_a11_10 = _mm_mul_ps(self.twiddle14re, x10p21);
let t_a11_11 = _mm_mul_ps(self.twiddle3re, x11p20);
let t_a11_12 = _mm_mul_ps(self.twiddle8re, x12p19);
let t_a11_13 = _mm_mul_ps(self.twiddle12re, x13p18);
let t_a11_14 = _mm_mul_ps(self.twiddle1re, x14p17);
let t_a11_15 = _mm_mul_ps(self.twiddle10re, x15p16);
let t_a12_1 = _mm_mul_ps(self.twiddle12re, x1p30);
let t_a12_2 = _mm_mul_ps(self.twiddle7re, x2p29);
let t_a12_3 = _mm_mul_ps(self.twiddle5re, x3p28);
let t_a12_4 = _mm_mul_ps(self.twiddle14re, x4p27);
let t_a12_5 = _mm_mul_ps(self.twiddle2re, x5p26);
let t_a12_6 = _mm_mul_ps(self.twiddle10re, x6p25);
let t_a12_7 = _mm_mul_ps(self.twiddle9re, x7p24);
let t_a12_8 = _mm_mul_ps(self.twiddle3re, x8p23);
let t_a12_9 = _mm_mul_ps(self.twiddle15re, x9p22);
let t_a12_10 = _mm_mul_ps(self.twiddle4re, x10p21);
let t_a12_11 = _mm_mul_ps(self.twiddle8re, x11p20);
let t_a12_12 = _mm_mul_ps(self.twiddle11re, x12p19);
let t_a12_13 = _mm_mul_ps(self.twiddle1re, x13p18);
let t_a12_14 = _mm_mul_ps(self.twiddle13re, x14p17);
let t_a12_15 = _mm_mul_ps(self.twiddle6re, x15p16);
let t_a13_1 = _mm_mul_ps(self.twiddle13re, x1p30);
let t_a13_2 = _mm_mul_ps(self.twiddle5re, x2p29);
let t_a13_3 = _mm_mul_ps(self.twiddle8re, x3p28);
let t_a13_4 = _mm_mul_ps(self.twiddle10re, x4p27);
let t_a13_5 = _mm_mul_ps(self.twiddle3re, x5p26);
let t_a13_6 = _mm_mul_ps(self.twiddle15re, x6p25);
let t_a13_7 = _mm_mul_ps(self.twiddle2re, x7p24);
let t_a13_8 = _mm_mul_ps(self.twiddle11re, x8p23);
let t_a13_9 = _mm_mul_ps(self.twiddle7re, x9p22);
let t_a13_10 = _mm_mul_ps(self.twiddle6re, x10p21);
let t_a13_11 = _mm_mul_ps(self.twiddle12re, x11p20);
let t_a13_12 = _mm_mul_ps(self.twiddle1re, x12p19);
let t_a13_13 = _mm_mul_ps(self.twiddle14re, x13p18);
let t_a13_14 = _mm_mul_ps(self.twiddle4re, x14p17);
let t_a13_15 = _mm_mul_ps(self.twiddle9re, x15p16);
let t_a14_1 = _mm_mul_ps(self.twiddle14re, x1p30);
let t_a14_2 = _mm_mul_ps(self.twiddle3re, x2p29);
let t_a14_3 = _mm_mul_ps(self.twiddle11re, x3p28);
let t_a14_4 = _mm_mul_ps(self.twiddle6re, x4p27);
let t_a14_5 = _mm_mul_ps(self.twiddle8re, x5p26);
let t_a14_6 = _mm_mul_ps(self.twiddle9re, x6p25);
let t_a14_7 = _mm_mul_ps(self.twiddle5re, x7p24);
let t_a14_8 = _mm_mul_ps(self.twiddle12re, x8p23);
let t_a14_9 = _mm_mul_ps(self.twiddle2re, x9p22);
let t_a14_10 = _mm_mul_ps(self.twiddle15re, x10p21);
let t_a14_11 = _mm_mul_ps(self.twiddle1re, x11p20);
let t_a14_12 = _mm_mul_ps(self.twiddle13re, x12p19);
let t_a14_13 = _mm_mul_ps(self.twiddle4re, x13p18);
let t_a14_14 = _mm_mul_ps(self.twiddle10re, x14p17);
let t_a14_15 = _mm_mul_ps(self.twiddle7re, x15p16);
let t_a15_1 = _mm_mul_ps(self.twiddle15re, x1p30);
let t_a15_2 = _mm_mul_ps(self.twiddle1re, x2p29);
let t_a15_3 = _mm_mul_ps(self.twiddle14re, x3p28);
let t_a15_4 = _mm_mul_ps(self.twiddle2re, x4p27);
let t_a15_5 = _mm_mul_ps(self.twiddle13re, x5p26);
let t_a15_6 = _mm_mul_ps(self.twiddle3re, x6p25);
let t_a15_7 = _mm_mul_ps(self.twiddle12re, x7p24);
let t_a15_8 = _mm_mul_ps(self.twiddle4re, x8p23);
let t_a15_9 = _mm_mul_ps(self.twiddle11re, x9p22);
let t_a15_10 = _mm_mul_ps(self.twiddle5re, x10p21);
let t_a15_11 = _mm_mul_ps(self.twiddle10re, x11p20);
let t_a15_12 = _mm_mul_ps(self.twiddle6re, x12p19);
let t_a15_13 = _mm_mul_ps(self.twiddle9re, x13p18);
let t_a15_14 = _mm_mul_ps(self.twiddle7re, x14p17);
let t_a15_15 = _mm_mul_ps(self.twiddle8re, x15p16);
let t_b1_1 = _mm_mul_ps(self.twiddle1im, x1m30);
let t_b1_2 = _mm_mul_ps(self.twiddle2im, x2m29);
let t_b1_3 = _mm_mul_ps(self.twiddle3im, x3m28);
let t_b1_4 = _mm_mul_ps(self.twiddle4im, x4m27);
let t_b1_5 = _mm_mul_ps(self.twiddle5im, x5m26);
let t_b1_6 = _mm_mul_ps(self.twiddle6im, x6m25);
let t_b1_7 = _mm_mul_ps(self.twiddle7im, x7m24);
let t_b1_8 = _mm_mul_ps(self.twiddle8im, x8m23);
let t_b1_9 = _mm_mul_ps(self.twiddle9im, x9m22);
let t_b1_10 = _mm_mul_ps(self.twiddle10im, x10m21);
let t_b1_11 = _mm_mul_ps(self.twiddle11im, x11m20);
let t_b1_12 = _mm_mul_ps(self.twiddle12im, x12m19);
let t_b1_13 = _mm_mul_ps(self.twiddle13im, x13m18);
let t_b1_14 = _mm_mul_ps(self.twiddle14im, x14m17);
let t_b1_15 = _mm_mul_ps(self.twiddle15im, x15m16);
let t_b2_1 = _mm_mul_ps(self.twiddle2im, x1m30);
let t_b2_2 = _mm_mul_ps(self.twiddle4im, x2m29);
let t_b2_3 = _mm_mul_ps(self.twiddle6im, x3m28);
let t_b2_4 = _mm_mul_ps(self.twiddle8im, x4m27);
let t_b2_5 = _mm_mul_ps(self.twiddle10im, x5m26);
let t_b2_6 = _mm_mul_ps(self.twiddle12im, x6m25);
let t_b2_7 = _mm_mul_ps(self.twiddle14im, x7m24);
let t_b2_8 = _mm_mul_ps(self.twiddle15im, x8m23);
let t_b2_9 = _mm_mul_ps(self.twiddle13im, x9m22);
let t_b2_10 = _mm_mul_ps(self.twiddle11im, x10m21);
let t_b2_11 = _mm_mul_ps(self.twiddle9im, x11m20);
let t_b2_12 = _mm_mul_ps(self.twiddle7im, x12m19);
let t_b2_13 = _mm_mul_ps(self.twiddle5im, x13m18);
let t_b2_14 = _mm_mul_ps(self.twiddle3im, x14m17);
let t_b2_15 = _mm_mul_ps(self.twiddle1im, x15m16);
let t_b3_1 = _mm_mul_ps(self.twiddle3im, x1m30);
let t_b3_2 = _mm_mul_ps(self.twiddle6im, x2m29);
let t_b3_3 = _mm_mul_ps(self.twiddle9im, x3m28);
let t_b3_4 = _mm_mul_ps(self.twiddle12im, x4m27);
let t_b3_5 = _mm_mul_ps(self.twiddle15im, x5m26);
let t_b3_6 = _mm_mul_ps(self.twiddle13im, x6m25);
let t_b3_7 = _mm_mul_ps(self.twiddle10im, x7m24);
let t_b3_8 = _mm_mul_ps(self.twiddle7im, x8m23);
let t_b3_9 = _mm_mul_ps(self.twiddle4im, x9m22);
let t_b3_10 = _mm_mul_ps(self.twiddle1im, x10m21);
let t_b3_11 = _mm_mul_ps(self.twiddle2im, x11m20);
let t_b3_12 = _mm_mul_ps(self.twiddle5im, x12m19);
let t_b3_13 = _mm_mul_ps(self.twiddle8im, x13m18);
let t_b3_14 = _mm_mul_ps(self.twiddle11im, x14m17);
let t_b3_15 = _mm_mul_ps(self.twiddle14im, x15m16);
let t_b4_1 = _mm_mul_ps(self.twiddle4im, x1m30);
let t_b4_2 = _mm_mul_ps(self.twiddle8im, x2m29);
let t_b4_3 = _mm_mul_ps(self.twiddle12im, x3m28);
let t_b4_4 = _mm_mul_ps(self.twiddle15im, x4m27);
let t_b4_5 = _mm_mul_ps(self.twiddle11im, x5m26);
let t_b4_6 = _mm_mul_ps(self.twiddle7im, x6m25);
let t_b4_7 = _mm_mul_ps(self.twiddle3im, x7m24);
let t_b4_8 = _mm_mul_ps(self.twiddle1im, x8m23);
let t_b4_9 = _mm_mul_ps(self.twiddle5im, x9m22);
let t_b4_10 = _mm_mul_ps(self.twiddle9im, x10m21);
let t_b4_11 = _mm_mul_ps(self.twiddle13im, x11m20);
let t_b4_12 = _mm_mul_ps(self.twiddle14im, x12m19);
let t_b4_13 = _mm_mul_ps(self.twiddle10im, x13m18);
let t_b4_14 = _mm_mul_ps(self.twiddle6im, x14m17);
let t_b4_15 = _mm_mul_ps(self.twiddle2im, x15m16);
let t_b5_1 = _mm_mul_ps(self.twiddle5im, x1m30);
let t_b5_2 = _mm_mul_ps(self.twiddle10im, x2m29);
let t_b5_3 = _mm_mul_ps(self.twiddle15im, x3m28);
let t_b5_4 = _mm_mul_ps(self.twiddle11im, x4m27);
let t_b5_5 = _mm_mul_ps(self.twiddle6im, x5m26);
let t_b5_6 = _mm_mul_ps(self.twiddle1im, x6m25);
let t_b5_7 = _mm_mul_ps(self.twiddle4im, x7m24);
let t_b5_8 = _mm_mul_ps(self.twiddle9im, x8m23);
let t_b5_9 = _mm_mul_ps(self.twiddle14im, x9m22);
let t_b5_10 = _mm_mul_ps(self.twiddle12im, x10m21);
let t_b5_11 = _mm_mul_ps(self.twiddle7im, x11m20);
let t_b5_12 = _mm_mul_ps(self.twiddle2im, x12m19);
let t_b5_13 = _mm_mul_ps(self.twiddle3im, x13m18);
let t_b5_14 = _mm_mul_ps(self.twiddle8im, x14m17);
let t_b5_15 = _mm_mul_ps(self.twiddle13im, x15m16);
let t_b6_1 = _mm_mul_ps(self.twiddle6im, x1m30);
let t_b6_2 = _mm_mul_ps(self.twiddle12im, x2m29);
let t_b6_3 = _mm_mul_ps(self.twiddle13im, x3m28);
let t_b6_4 = _mm_mul_ps(self.twiddle7im, x4m27);
let t_b6_5 = _mm_mul_ps(self.twiddle1im, x5m26);
let t_b6_6 = _mm_mul_ps(self.twiddle5im, x6m25);
let t_b6_7 = _mm_mul_ps(self.twiddle11im, x7m24);
let t_b6_8 = _mm_mul_ps(self.twiddle14im, x8m23);
let t_b6_9 = _mm_mul_ps(self.twiddle8im, x9m22);
let t_b6_10 = _mm_mul_ps(self.twiddle2im, x10m21);
let t_b6_11 = _mm_mul_ps(self.twiddle4im, x11m20);
let t_b6_12 = _mm_mul_ps(self.twiddle10im, x12m19);
let t_b6_13 = _mm_mul_ps(self.twiddle15im, x13m18);
let t_b6_14 = _mm_mul_ps(self.twiddle9im, x14m17);
let t_b6_15 = _mm_mul_ps(self.twiddle3im, x15m16);
let t_b7_1 = _mm_mul_ps(self.twiddle7im, x1m30);
let t_b7_2 = _mm_mul_ps(self.twiddle14im, x2m29);
let t_b7_3 = _mm_mul_ps(self.twiddle10im, x3m28);
let t_b7_4 = _mm_mul_ps(self.twiddle3im, x4m27);
let t_b7_5 = _mm_mul_ps(self.twiddle4im, x5m26);
let t_b7_6 = _mm_mul_ps(self.twiddle11im, x6m25);
let t_b7_7 = _mm_mul_ps(self.twiddle13im, x7m24);
let t_b7_8 = _mm_mul_ps(self.twiddle6im, x8m23);
let t_b7_9 = _mm_mul_ps(self.twiddle1im, x9m22);
let t_b7_10 = _mm_mul_ps(self.twiddle8im, x10m21);
let t_b7_11 = _mm_mul_ps(self.twiddle15im, x11m20);
let t_b7_12 = _mm_mul_ps(self.twiddle9im, x12m19);
let t_b7_13 = _mm_mul_ps(self.twiddle2im, x13m18);
let t_b7_14 = _mm_mul_ps(self.twiddle5im, x14m17);
let t_b7_15 = _mm_mul_ps(self.twiddle12im, x15m16);
let t_b8_1 = _mm_mul_ps(self.twiddle8im, x1m30);
let t_b8_2 = _mm_mul_ps(self.twiddle15im, x2m29);
let t_b8_3 = _mm_mul_ps(self.twiddle7im, x3m28);
let t_b8_4 = _mm_mul_ps(self.twiddle1im, x4m27);
let t_b8_5 = _mm_mul_ps(self.twiddle9im, x5m26);
let t_b8_6 = _mm_mul_ps(self.twiddle14im, x6m25);
let t_b8_7 = _mm_mul_ps(self.twiddle6im, x7m24);
let t_b8_8 = _mm_mul_ps(self.twiddle2im, x8m23);
let t_b8_9 = _mm_mul_ps(self.twiddle10im, x9m22);
let t_b8_10 = _mm_mul_ps(self.twiddle13im, x10m21);
let t_b8_11 = _mm_mul_ps(self.twiddle5im, x11m20);
let t_b8_12 = _mm_mul_ps(self.twiddle3im, x12m19);
let t_b8_13 = _mm_mul_ps(self.twiddle11im, x13m18);
let t_b8_14 = _mm_mul_ps(self.twiddle12im, x14m17);
let t_b8_15 = _mm_mul_ps(self.twiddle4im, x15m16);
let t_b9_1 = _mm_mul_ps(self.twiddle9im, x1m30);
let t_b9_2 = _mm_mul_ps(self.twiddle13im, x2m29);
let t_b9_3 = _mm_mul_ps(self.twiddle4im, x3m28);
let t_b9_4 = _mm_mul_ps(self.twiddle5im, x4m27);
let t_b9_5 = _mm_mul_ps(self.twiddle14im, x5m26);
let t_b9_6 = _mm_mul_ps(self.twiddle8im, x6m25);
let t_b9_7 = _mm_mul_ps(self.twiddle1im, x7m24);
let t_b9_8 = _mm_mul_ps(self.twiddle10im, x8m23);
let t_b9_9 = _mm_mul_ps(self.twiddle12im, x9m22);
let t_b9_10 = _mm_mul_ps(self.twiddle3im, x10m21);
let t_b9_11 = _mm_mul_ps(self.twiddle6im, x11m20);
let t_b9_12 = _mm_mul_ps(self.twiddle15im, x12m19);
let t_b9_13 = _mm_mul_ps(self.twiddle7im, x13m18);
let t_b9_14 = _mm_mul_ps(self.twiddle2im, x14m17);
let t_b9_15 = _mm_mul_ps(self.twiddle11im, x15m16);
let t_b10_1 = _mm_mul_ps(self.twiddle10im, x1m30);
let t_b10_2 = _mm_mul_ps(self.twiddle11im, x2m29);
let t_b10_3 = _mm_mul_ps(self.twiddle1im, x3m28);
let t_b10_4 = _mm_mul_ps(self.twiddle9im, x4m27);
let t_b10_5 = _mm_mul_ps(self.twiddle12im, x5m26);
let t_b10_6 = _mm_mul_ps(self.twiddle2im, x6m25);
let t_b10_7 = _mm_mul_ps(self.twiddle8im, x7m24);
let t_b10_8 = _mm_mul_ps(self.twiddle13im, x8m23);
let t_b10_9 = _mm_mul_ps(self.twiddle3im, x9m22);
let t_b10_10 = _mm_mul_ps(self.twiddle7im, x10m21);
let t_b10_11 = _mm_mul_ps(self.twiddle14im, x11m20);
let t_b10_12 = _mm_mul_ps(self.twiddle4im, x12m19);
let t_b10_13 = _mm_mul_ps(self.twiddle6im, x13m18);
let t_b10_14 = _mm_mul_ps(self.twiddle15im, x14m17);
let t_b10_15 = _mm_mul_ps(self.twiddle5im, x15m16);
let t_b11_1 = _mm_mul_ps(self.twiddle11im, x1m30);
let t_b11_2 = _mm_mul_ps(self.twiddle9im, x2m29);
let t_b11_3 = _mm_mul_ps(self.twiddle2im, x3m28);
let t_b11_4 = _mm_mul_ps(self.twiddle13im, x4m27);
let t_b11_5 = _mm_mul_ps(self.twiddle7im, x5m26);
let t_b11_6 = _mm_mul_ps(self.twiddle4im, x6m25);
let t_b11_7 = _mm_mul_ps(self.twiddle15im, x7m24);
let t_b11_8 = _mm_mul_ps(self.twiddle5im, x8m23);
let t_b11_9 = _mm_mul_ps(self.twiddle6im, x9m22);
let t_b11_10 = _mm_mul_ps(self.twiddle14im, x10m21);
let t_b11_11 = _mm_mul_ps(self.twiddle3im, x11m20);
let t_b11_12 = _mm_mul_ps(self.twiddle8im, x12m19);
let t_b11_13 = _mm_mul_ps(self.twiddle12im, x13m18);
let t_b11_14 = _mm_mul_ps(self.twiddle1im, x14m17);
let t_b11_15 = _mm_mul_ps(self.twiddle10im, x15m16);
let t_b12_1 = _mm_mul_ps(self.twiddle12im, x1m30);
let t_b12_2 = _mm_mul_ps(self.twiddle7im, x2m29);
let t_b12_3 = _mm_mul_ps(self.twiddle5im, x3m28);
let t_b12_4 = _mm_mul_ps(self.twiddle14im, x4m27);
let t_b12_5 = _mm_mul_ps(self.twiddle2im, x5m26);
let t_b12_6 = _mm_mul_ps(self.twiddle10im, x6m25);
let t_b12_7 = _mm_mul_ps(self.twiddle9im, x7m24);
let t_b12_8 = _mm_mul_ps(self.twiddle3im, x8m23);
let t_b12_9 = _mm_mul_ps(self.twiddle15im, x9m22);
let t_b12_10 = _mm_mul_ps(self.twiddle4im, x10m21);
let t_b12_11 = _mm_mul_ps(self.twiddle8im, x11m20);
let t_b12_12 = _mm_mul_ps(self.twiddle11im, x12m19);
let t_b12_13 = _mm_mul_ps(self.twiddle1im, x13m18);
let t_b12_14 = _mm_mul_ps(self.twiddle13im, x14m17);
let t_b12_15 = _mm_mul_ps(self.twiddle6im, x15m16);
let t_b13_1 = _mm_mul_ps(self.twiddle13im, x1m30);
let t_b13_2 = _mm_mul_ps(self.twiddle5im, x2m29);
let t_b13_3 = _mm_mul_ps(self.twiddle8im, x3m28);
let t_b13_4 = _mm_mul_ps(self.twiddle10im, x4m27);
let t_b13_5 = _mm_mul_ps(self.twiddle3im, x5m26);
let t_b13_6 = _mm_mul_ps(self.twiddle15im, x6m25);
let t_b13_7 = _mm_mul_ps(self.twiddle2im, x7m24);
let t_b13_8 = _mm_mul_ps(self.twiddle11im, x8m23);
let t_b13_9 = _mm_mul_ps(self.twiddle7im, x9m22);
let t_b13_10 = _mm_mul_ps(self.twiddle6im, x10m21);
let t_b13_11 = _mm_mul_ps(self.twiddle12im, x11m20);
let t_b13_12 = _mm_mul_ps(self.twiddle1im, x12m19);
let t_b13_13 = _mm_mul_ps(self.twiddle14im, x13m18);
let t_b13_14 = _mm_mul_ps(self.twiddle4im, x14m17);
let t_b13_15 = _mm_mul_ps(self.twiddle9im, x15m16);
let t_b14_1 = _mm_mul_ps(self.twiddle14im, x1m30);
let t_b14_2 = _mm_mul_ps(self.twiddle3im, x2m29);
let t_b14_3 = _mm_mul_ps(self.twiddle11im, x3m28);
let t_b14_4 = _mm_mul_ps(self.twiddle6im, x4m27);
let t_b14_5 = _mm_mul_ps(self.twiddle8im, x5m26);
let t_b14_6 = _mm_mul_ps(self.twiddle9im, x6m25);
let t_b14_7 = _mm_mul_ps(self.twiddle5im, x7m24);
let t_b14_8 = _mm_mul_ps(self.twiddle12im, x8m23);
let t_b14_9 = _mm_mul_ps(self.twiddle2im, x9m22);
let t_b14_10 = _mm_mul_ps(self.twiddle15im, x10m21);
let t_b14_11 = _mm_mul_ps(self.twiddle1im, x11m20);
let t_b14_12 = _mm_mul_ps(self.twiddle13im, x12m19);
let t_b14_13 = _mm_mul_ps(self.twiddle4im, x13m18);
let t_b14_14 = _mm_mul_ps(self.twiddle10im, x14m17);
let t_b14_15 = _mm_mul_ps(self.twiddle7im, x15m16);
let t_b15_1 = _mm_mul_ps(self.twiddle15im, x1m30);
let t_b15_2 = _mm_mul_ps(self.twiddle1im, x2m29);
let t_b15_3 = _mm_mul_ps(self.twiddle14im, x3m28);
let t_b15_4 = _mm_mul_ps(self.twiddle2im, x4m27);
let t_b15_5 = _mm_mul_ps(self.twiddle13im, x5m26);
let t_b15_6 = _mm_mul_ps(self.twiddle3im, x6m25);
let t_b15_7 = _mm_mul_ps(self.twiddle12im, x7m24);
let t_b15_8 = _mm_mul_ps(self.twiddle4im, x8m23);
let t_b15_9 = _mm_mul_ps(self.twiddle11im, x9m22);
let t_b15_10 = _mm_mul_ps(self.twiddle5im, x10m21);
let t_b15_11 = _mm_mul_ps(self.twiddle10im, x11m20);
let t_b15_12 = _mm_mul_ps(self.twiddle6im, x12m19);
let t_b15_13 = _mm_mul_ps(self.twiddle9im, x13m18);
let t_b15_14 = _mm_mul_ps(self.twiddle7im, x14m17);
let t_b15_15 = _mm_mul_ps(self.twiddle8im, x15m16);
let x0 = values[0];
let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11 + t_a1_12 + t_a1_13 + t_a1_14 + t_a1_15);
let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11 + t_a2_12 + t_a2_13 + t_a2_14 + t_a2_15);
let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11 + t_a3_12 + t_a3_13 + t_a3_14 + t_a3_15);
let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11 + t_a4_12 + t_a4_13 + t_a4_14 + t_a4_15);
let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11 + t_a5_12 + t_a5_13 + t_a5_14 + t_a5_15);
let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11 + t_a6_12 + t_a6_13 + t_a6_14 + t_a6_15);
let t_a7 = calc_f32!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11 + t_a7_12 + t_a7_13 + t_a7_14 + t_a7_15);
let t_a8 = calc_f32!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11 + t_a8_12 + t_a8_13 + t_a8_14 + t_a8_15);
let t_a9 = calc_f32!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11 + t_a9_12 + t_a9_13 + t_a9_14 + t_a9_15);
let t_a10 = calc_f32!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + t_a10_11 + t_a10_12 + t_a10_13 + t_a10_14 + t_a10_15);
let t_a11 = calc_f32!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + t_a11_11 + t_a11_12 + t_a11_13 + t_a11_14 + t_a11_15);
let t_a12 = calc_f32!(x0 + t_a12_1 + t_a12_2 + t_a12_3 + t_a12_4 + t_a12_5 + t_a12_6 + t_a12_7 + t_a12_8 + t_a12_9 + t_a12_10 + t_a12_11 + t_a12_12 + t_a12_13 + t_a12_14 + t_a12_15);
let t_a13 = calc_f32!(x0 + t_a13_1 + t_a13_2 + t_a13_3 + t_a13_4 + t_a13_5 + t_a13_6 + t_a13_7 + t_a13_8 + t_a13_9 + t_a13_10 + t_a13_11 + t_a13_12 + t_a13_13 + t_a13_14 + t_a13_15);
let t_a14 = calc_f32!(x0 + t_a14_1 + t_a14_2 + t_a14_3 + t_a14_4 + t_a14_5 + t_a14_6 + t_a14_7 + t_a14_8 + t_a14_9 + t_a14_10 + t_a14_11 + t_a14_12 + t_a14_13 + t_a14_14 + t_a14_15);
let t_a15 = calc_f32!(x0 + t_a15_1 + t_a15_2 + t_a15_3 + t_a15_4 + t_a15_5 + t_a15_6 + t_a15_7 + t_a15_8 + t_a15_9 + t_a15_10 + t_a15_11 + t_a15_12 + t_a15_13 + t_a15_14 + t_a15_15);
let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11 + t_b1_12 + t_b1_13 + t_b1_14 + t_b1_15);
let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 + t_b2_6 + t_b2_7 - t_b2_8 - t_b2_9 - t_b2_10 - t_b2_11 - t_b2_12 - t_b2_13 - t_b2_14 - t_b2_15);
let t_b3 = calc_f32!(t_b3_1 + t_b3_2 + t_b3_3 + t_b3_4 + t_b3_5 - t_b3_6 - t_b3_7 - t_b3_8 - t_b3_9 - t_b3_10 + t_b3_11 + t_b3_12 + t_b3_13 + t_b3_14 + t_b3_15);
let t_b4 = calc_f32!(t_b4_1 + t_b4_2 + t_b4_3 - t_b4_4 - t_b4_5 - t_b4_6 - t_b4_7 + t_b4_8 + t_b4_9 + t_b4_10 + t_b4_11 - t_b4_12 - t_b4_13 - t_b4_14 - t_b4_15);
let t_b5 = calc_f32!(t_b5_1 + t_b5_2 + t_b5_3 - t_b5_4 - t_b5_5 - t_b5_6 + t_b5_7 + t_b5_8 + t_b5_9 - t_b5_10 - t_b5_11 - t_b5_12 + t_b5_13 + t_b5_14 + t_b5_15);
let t_b6 = calc_f32!(t_b6_1 + t_b6_2 - t_b6_3 - t_b6_4 - t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9 - t_b6_10 + t_b6_11 + t_b6_12 - t_b6_13 - t_b6_14 - t_b6_15);
let t_b7 = calc_f32!(t_b7_1 + t_b7_2 - t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9 + t_b7_10 + t_b7_11 - t_b7_12 - t_b7_13 + t_b7_14 + t_b7_15);
let t_b8 = calc_f32!(t_b8_1 - t_b8_2 - t_b8_3 + t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 + t_b8_9 - t_b8_10 - t_b8_11 + t_b8_12 + t_b8_13 - t_b8_14 - t_b8_15);
let t_b9 = calc_f32!(t_b9_1 - t_b9_2 - t_b9_3 + t_b9_4 + t_b9_5 - t_b9_6 + t_b9_7 + t_b9_8 - t_b9_9 - t_b9_10 + t_b9_11 + t_b9_12 - t_b9_13 + t_b9_14 + t_b9_15);
let t_b10 = calc_f32!(t_b10_1 - t_b10_2 - t_b10_3 + t_b10_4 - t_b10_5 - t_b10_6 + t_b10_7 - t_b10_8 - t_b10_9 + t_b10_10 - t_b10_11 - t_b10_12 + t_b10_13 - t_b10_14 - t_b10_15);
let t_b11 = calc_f32!(t_b11_1 - t_b11_2 + t_b11_3 + t_b11_4 - t_b11_5 + t_b11_6 + t_b11_7 - t_b11_8 + t_b11_9 - t_b11_10 - t_b11_11 + t_b11_12 - t_b11_13 - t_b11_14 + t_b11_15);
let t_b12 = calc_f32!(t_b12_1 - t_b12_2 + t_b12_3 - t_b12_4 - t_b12_5 + t_b12_6 - t_b12_7 + t_b12_8 + t_b12_9 - t_b12_10 + t_b12_11 - t_b12_12 + t_b12_13 + t_b12_14 - t_b12_15);
let t_b13 = calc_f32!(t_b13_1 - t_b13_2 + t_b13_3 - t_b13_4 + t_b13_5 - t_b13_6 - t_b13_7 + t_b13_8 - t_b13_9 + t_b13_10 - t_b13_11 + t_b13_12 + t_b13_13 - t_b13_14 + t_b13_15);
let t_b14 = calc_f32!(t_b14_1 - t_b14_2 + t_b14_3 - t_b14_4 + t_b14_5 - t_b14_6 + t_b14_7 - t_b14_8 + t_b14_9 - t_b14_10 - t_b14_11 + t_b14_12 - t_b14_13 + t_b14_14 - t_b14_15);
let t_b15 = calc_f32!(t_b15_1 - t_b15_2 + t_b15_3 - t_b15_4 + t_b15_5 - t_b15_6 + t_b15_7 - t_b15_8 + t_b15_9 - t_b15_10 + t_b15_11 - t_b15_12 + t_b15_13 - t_b15_14 + t_b15_15);
let t_b1_rot = self.rotate.rotate_both(t_b1);
let t_b2_rot = self.rotate.rotate_both(t_b2);
let t_b3_rot = self.rotate.rotate_both(t_b3);
let t_b4_rot = self.rotate.rotate_both(t_b4);
let t_b5_rot = self.rotate.rotate_both(t_b5);
let t_b6_rot = self.rotate.rotate_both(t_b6);
let t_b7_rot = self.rotate.rotate_both(t_b7);
let t_b8_rot = self.rotate.rotate_both(t_b8);
let t_b9_rot = self.rotate.rotate_both(t_b9);
let t_b10_rot = self.rotate.rotate_both(t_b10);
let t_b11_rot = self.rotate.rotate_both(t_b11);
let t_b12_rot = self.rotate.rotate_both(t_b12);
let t_b13_rot = self.rotate.rotate_both(t_b13);
let t_b14_rot = self.rotate.rotate_both(t_b14);
let t_b15_rot = self.rotate.rotate_both(t_b15);
let y0 = calc_f32!(x0 + x1p30 + x2p29 + x3p28 + x4p27 + x5p26 + x6p25 + x7p24 + x8p23 + x9p22 + x10p21 + x11p20 + x12p19 + x13p18 + x14p17 + x15p16);
let [y1, y30] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot);
let [y2, y29] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot);
let [y3, y28] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot);
let [y4, y27] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot);
let [y5, y26] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot);
let [y6, y25] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot);
let [y7, y24] = parallel_fft2_interleaved_f32(t_a7, t_b7_rot);
let [y8, y23] = parallel_fft2_interleaved_f32(t_a8, t_b8_rot);
let [y9, y22] = parallel_fft2_interleaved_f32(t_a9, t_b9_rot);
let [y10, y21] = parallel_fft2_interleaved_f32(t_a10, t_b10_rot);
let [y11, y20] = parallel_fft2_interleaved_f32(t_a11, t_b11_rot);
let [y12, y19] = parallel_fft2_interleaved_f32(t_a12, t_b12_rot);
let [y13, y18] = parallel_fft2_interleaved_f32(t_a13, t_b13_rot);
let [y14, y17] = parallel_fft2_interleaved_f32(t_a14, t_b14_rot);
let [y15, y16] = parallel_fft2_interleaved_f32(t_a15, t_b15_rot);
[y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28, y29, y30]
}
}
/// Length-31 FFT butterfly for `f64` samples, implemented with SSE2
/// (`__m128d`) intrinsics.
pub struct SseF64Butterfly31<T> {
// Forward/inverse transform direction, exposed through the trait impls
// generated by the boilerplate macros below.
direction: FftDirection,
// Ties the generic sample type `T` to the struct without storing a `T`.
_phantom: std::marker::PhantomData<T>,
// Packed-complex rotation helper, built with `Rotate90F64::new(true)` in
// `new`; presumably multiplies by +/-i — confirm against `Rotate90F64` docs.
rotate: Rotate90F64,
// Real and imaginary parts of `twiddles::compute_twiddle(k, 31, direction)`
// for k = 1..=15, each value broadcast into both lanes of a `__m128d`
// (see `new`). Twiddles for k = 16..=30 appear to be recovered by symmetry
// (the sign patterns in the t_b sums) rather than stored — NOTE(review):
// verify against the generator of this code.
twiddle1re: __m128d,
twiddle1im: __m128d,
twiddle2re: __m128d,
twiddle2im: __m128d,
twiddle3re: __m128d,
twiddle3im: __m128d,
twiddle4re: __m128d,
twiddle4im: __m128d,
twiddle5re: __m128d,
twiddle5im: __m128d,
twiddle6re: __m128d,
twiddle6im: __m128d,
twiddle7re: __m128d,
twiddle7im: __m128d,
twiddle8re: __m128d,
twiddle8im: __m128d,
twiddle9re: __m128d,
twiddle9im: __m128d,
twiddle10re: __m128d,
twiddle10im: __m128d,
twiddle11re: __m128d,
twiddle11im: __m128d,
twiddle12re: __m128d,
twiddle12im: __m128d,
twiddle13re: __m128d,
twiddle13im: __m128d,
twiddle14re: __m128d,
twiddle14im: __m128d,
twiddle15re: __m128d,
twiddle15im: __m128d,
}
// Macro-generated boilerplate for the 31-point f64 butterfly; the closure
// tells the generated code how to read the transform direction. See the
// macro definitions for the exact trait impls produced.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly31, 31, |this: &SseF64Butterfly31<_>| this
.direction);
// Common butterfly boilerplate shared across SSE butterflies (same
// length-31 / direction-accessor pattern as above).
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly31, 31, |this: &SseF64Butterfly31<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly31<T> {
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
let rotate = Rotate90F64::new(true);
let tw1: Complex<f64> = twiddles::compute_twiddle(1, 31, direction);
let tw2: Complex<f64> = twiddles::compute_twiddle(2, 31, direction);
let tw3: Complex<f64> = twiddles::compute_twiddle(3, 31, direction);
let tw4: Complex<f64> = twiddles::compute_twiddle(4, 31, direction);
let tw5: Complex<f64> = twiddles::compute_twiddle(5, 31, direction);
let tw6: Complex<f64> = twiddles::compute_twiddle(6, 31, direction);
let tw7: Complex<f64> = twiddles::compute_twiddle(7, 31, direction);
let tw8: Complex<f64> = twiddles::compute_twiddle(8, 31, direction);
let tw9: Complex<f64> = twiddles::compute_twiddle(9, 31, direction);
let tw10: Complex<f64> = twiddles::compute_twiddle(10, 31, direction);
let tw11: Complex<f64> = twiddles::compute_twiddle(11, 31, direction);
let tw12: Complex<f64> = twiddles::compute_twiddle(12, 31, direction);
let tw13: Complex<f64> = twiddles::compute_twiddle(13, 31, direction);
let tw14: Complex<f64> = twiddles::compute_twiddle(14, 31, direction);
let tw15: Complex<f64> = twiddles::compute_twiddle(15, 31, direction);
let twiddle1re = unsafe { _mm_set_pd(tw1.re, tw1.re) };
let twiddle1im = unsafe { _mm_set_pd(tw1.im, tw1.im) };
let twiddle2re = unsafe { _mm_set_pd(tw2.re, tw2.re) };
let twiddle2im = unsafe { _mm_set_pd(tw2.im, tw2.im) };
let twiddle3re = unsafe { _mm_set_pd(tw3.re, tw3.re) };
let twiddle3im = unsafe { _mm_set_pd(tw3.im, tw3.im) };
let twiddle4re = unsafe { _mm_set_pd(tw4.re, tw4.re) };
let twiddle4im = unsafe { _mm_set_pd(tw4.im, tw4.im) };
let twiddle5re = unsafe { _mm_set_pd(tw5.re, tw5.re) };
let twiddle5im = unsafe { _mm_set_pd(tw5.im, tw5.im) };
let twiddle6re = unsafe { _mm_set_pd(tw6.re, tw6.re) };
let twiddle6im = unsafe { _mm_set_pd(tw6.im, tw6.im) };
let twiddle7re = unsafe { _mm_set_pd(tw7.re, tw7.re) };
let twiddle7im = unsafe { _mm_set_pd(tw7.im, tw7.im) };
let twiddle8re = unsafe { _mm_set_pd(tw8.re, tw8.re) };
let twiddle8im = unsafe { _mm_set_pd(tw8.im, tw8.im) };
let twiddle9re = unsafe { _mm_set_pd(tw9.re, tw9.re) };
let twiddle9im = unsafe { _mm_set_pd(tw9.im, tw9.im) };
let twiddle10re = unsafe { _mm_set_pd(tw10.re, tw10.re) };
let twiddle10im = unsafe { _mm_set_pd(tw10.im, tw10.im) };
let twiddle11re = unsafe { _mm_set_pd(tw11.re, tw11.re) };
let twiddle11im = unsafe { _mm_set_pd(tw11.im, tw11.im) };
let twiddle12re = unsafe { _mm_set_pd(tw12.re, tw12.re) };
let twiddle12im = unsafe { _mm_set_pd(tw12.im, tw12.im) };
let twiddle13re = unsafe { _mm_set_pd(tw13.re, tw13.re) };
let twiddle13im = unsafe { _mm_set_pd(tw13.im, tw13.im) };
let twiddle14re = unsafe { _mm_set_pd(tw14.re, tw14.re) };
let twiddle14im = unsafe { _mm_set_pd(tw14.im, tw14.im) };
let twiddle15re = unsafe { _mm_set_pd(tw15.re, tw15.re) };
let twiddle15im = unsafe { _mm_set_pd(tw15.im, tw15.im) };
Self {
direction,
_phantom: std::marker::PhantomData,
rotate,
twiddle1re,
twiddle1im,
twiddle2re,
twiddle2im,
twiddle3re,
twiddle3im,
twiddle4re,
twiddle4im,
twiddle5re,
twiddle5im,
twiddle6re,
twiddle6im,
twiddle7re,
twiddle7im,
twiddle8re,
twiddle8im,
twiddle9re,
twiddle9im,
twiddle10re,
twiddle10im,
twiddle11re,
twiddle11im,
twiddle12re,
twiddle12im,
twiddle13re,
twiddle13im,
twiddle14re,
twiddle14im,
twiddle15re,
twiddle15im,
}
}
/// Performs the length-31 FFT in place on the first 31 complex elements of
/// `buffer`: reads indices 0..=30, runs [`Self::perform_fft_direct`], and
/// writes the results back to the same indices.
///
/// # Safety
/// This is an `unsafe fn`; the caller must uphold the contract of the
/// `SseArrayMut` read/write macros — presumably that `buffer` holds at
/// least 31 complex elements and SSE2 is available. TODO(review): confirm
/// the exact contract against the macro definitions.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
// One __m128d per complex value (re, im), loaded from indices 0..=30.
let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30});
let out = self.perform_fft_direct(values);
// Store the transformed values back over the inputs (in-place transform).
write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30});
}
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 31]) -> [__m128d; 31] {
let [x1p30, x1m30] = solo_fft2_f64(values[1], values[30]);
let [x2p29, x2m29] = solo_fft2_f64(values[2], values[29]);
let [x3p28, x3m28] = solo_fft2_f64(values[3], values[28]);
let [x4p27, x4m27] = solo_fft2_f64(values[4], values[27]);
let [x5p26, x5m26] = solo_fft2_f64(values[5], values[26]);
let [x6p25, x6m25] = solo_fft2_f64(values[6], values[25]);
let [x7p24, x7m24] = solo_fft2_f64(values[7], values[24]);
let [x8p23, x8m23] = solo_fft2_f64(values[8], values[23]);
let [x9p22, x9m22] = solo_fft2_f64(values[9], values[22]);
let [x10p21, x10m21] = solo_fft2_f64(values[10], values[21]);
let [x11p20, x11m20] = solo_fft2_f64(values[11], values[20]);
let [x12p19, x12m19] = solo_fft2_f64(values[12], values[19]);
let [x13p18, x13m18] = solo_fft2_f64(values[13], values[18]);
let [x14p17, x14m17] = solo_fft2_f64(values[14], values[17]);
let [x15p16, x15m16] = solo_fft2_f64(values[15], values[16]);
let t_a1_1 = _mm_mul_pd(self.twiddle1re, x1p30);
let t_a1_2 = _mm_mul_pd(self.twiddle2re, x2p29);
let t_a1_3 = _mm_mul_pd(self.twiddle3re, x3p28);
let t_a1_4 = _mm_mul_pd(self.twiddle4re, x4p27);
let t_a1_5 = _mm_mul_pd(self.twiddle5re, x5p26);
let t_a1_6 = _mm_mul_pd(self.twiddle6re, x6p25);
let t_a1_7 = _mm_mul_pd(self.twiddle7re, x7p24);
let t_a1_8 = _mm_mul_pd(self.twiddle8re, x8p23);
let t_a1_9 = _mm_mul_pd(self.twiddle9re, x9p22);
let t_a1_10 = _mm_mul_pd(self.twiddle10re, x10p21);
let t_a1_11 = _mm_mul_pd(self.twiddle11re, x11p20);
let t_a1_12 = _mm_mul_pd(self.twiddle12re, x12p19);
let t_a1_13 = _mm_mul_pd(self.twiddle13re, x13p18);
let t_a1_14 = _mm_mul_pd(self.twiddle14re, x14p17);
let t_a1_15 = _mm_mul_pd(self.twiddle15re, x15p16);
let t_a2_1 = _mm_mul_pd(self.twiddle2re, x1p30);
let t_a2_2 = _mm_mul_pd(self.twiddle4re, x2p29);
let t_a2_3 = _mm_mul_pd(self.twiddle6re, x3p28);
let t_a2_4 = _mm_mul_pd(self.twiddle8re, x4p27);
let t_a2_5 = _mm_mul_pd(self.twiddle10re, x5p26);
let t_a2_6 = _mm_mul_pd(self.twiddle12re, x6p25);
let t_a2_7 = _mm_mul_pd(self.twiddle14re, x7p24);
let t_a2_8 = _mm_mul_pd(self.twiddle15re, x8p23);
let t_a2_9 = _mm_mul_pd(self.twiddle13re, x9p22);
let t_a2_10 = _mm_mul_pd(self.twiddle11re, x10p21);
let t_a2_11 = _mm_mul_pd(self.twiddle9re, x11p20);
let t_a2_12 = _mm_mul_pd(self.twiddle7re, x12p19);
let t_a2_13 = _mm_mul_pd(self.twiddle5re, x13p18);
let t_a2_14 = _mm_mul_pd(self.twiddle3re, x14p17);
let t_a2_15 = _mm_mul_pd(self.twiddle1re, x15p16);
let t_a3_1 = _mm_mul_pd(self.twiddle3re, x1p30);
let t_a3_2 = _mm_mul_pd(self.twiddle6re, x2p29);
let t_a3_3 = _mm_mul_pd(self.twiddle9re, x3p28);
let t_a3_4 = _mm_mul_pd(self.twiddle12re, x4p27);
let t_a3_5 = _mm_mul_pd(self.twiddle15re, x5p26);
let t_a3_6 = _mm_mul_pd(self.twiddle13re, x6p25);
let t_a3_7 = _mm_mul_pd(self.twiddle10re, x7p24);
let t_a3_8 = _mm_mul_pd(self.twiddle7re, x8p23);
let t_a3_9 = _mm_mul_pd(self.twiddle4re, x9p22);
let t_a3_10 = _mm_mul_pd(self.twiddle1re, x10p21);
let t_a3_11 = _mm_mul_pd(self.twiddle2re, x11p20);
let t_a3_12 = _mm_mul_pd(self.twiddle5re, x12p19);
let t_a3_13 = _mm_mul_pd(self.twiddle8re, x13p18);
let t_a3_14 = _mm_mul_pd(self.twiddle11re, x14p17);
let t_a3_15 = _mm_mul_pd(self.twiddle14re, x15p16);
let t_a4_1 = _mm_mul_pd(self.twiddle4re, x1p30);
let t_a4_2 = _mm_mul_pd(self.twiddle8re, x2p29);
let t_a4_3 = _mm_mul_pd(self.twiddle12re, x3p28);
let t_a4_4 = _mm_mul_pd(self.twiddle15re, x4p27);
let t_a4_5 = _mm_mul_pd(self.twiddle11re, x5p26);
let t_a4_6 = _mm_mul_pd(self.twiddle7re, x6p25);
let t_a4_7 = _mm_mul_pd(self.twiddle3re, x7p24);
let t_a4_8 = _mm_mul_pd(self.twiddle1re, x8p23);
let t_a4_9 = _mm_mul_pd(self.twiddle5re, x9p22);
let t_a4_10 = _mm_mul_pd(self.twiddle9re, x10p21);
let t_a4_11 = _mm_mul_pd(self.twiddle13re, x11p20);
let t_a4_12 = _mm_mul_pd(self.twiddle14re, x12p19);
let t_a4_13 = _mm_mul_pd(self.twiddle10re, x13p18);
let t_a4_14 = _mm_mul_pd(self.twiddle6re, x14p17);
let t_a4_15 = _mm_mul_pd(self.twiddle2re, x15p16);
let t_a5_1 = _mm_mul_pd(self.twiddle5re, x1p30);
let t_a5_2 = _mm_mul_pd(self.twiddle10re, x2p29);
let t_a5_3 = _mm_mul_pd(self.twiddle15re, x3p28);
let t_a5_4 = _mm_mul_pd(self.twiddle11re, x4p27);
let t_a5_5 = _mm_mul_pd(self.twiddle6re, x5p26);
let t_a5_6 = _mm_mul_pd(self.twiddle1re, x6p25);
let t_a5_7 = _mm_mul_pd(self.twiddle4re, x7p24);
let t_a5_8 = _mm_mul_pd(self.twiddle9re, x8p23);
let t_a5_9 = _mm_mul_pd(self.twiddle14re, x9p22);
let t_a5_10 = _mm_mul_pd(self.twiddle12re, x10p21);
let t_a5_11 = _mm_mul_pd(self.twiddle7re, x11p20);
let t_a5_12 = _mm_mul_pd(self.twiddle2re, x12p19);
let t_a5_13 = _mm_mul_pd(self.twiddle3re, x13p18);
let t_a5_14 = _mm_mul_pd(self.twiddle8re, x14p17);
let t_a5_15 = _mm_mul_pd(self.twiddle13re, x15p16);
let t_a6_1 = _mm_mul_pd(self.twiddle6re, x1p30);
let t_a6_2 = _mm_mul_pd(self.twiddle12re, x2p29);
let t_a6_3 = _mm_mul_pd(self.twiddle13re, x3p28);
let t_a6_4 = _mm_mul_pd(self.twiddle7re, x4p27);
let t_a6_5 = _mm_mul_pd(self.twiddle1re, x5p26);
let t_a6_6 = _mm_mul_pd(self.twiddle5re, x6p25);
let t_a6_7 = _mm_mul_pd(self.twiddle11re, x7p24);
let t_a6_8 = _mm_mul_pd(self.twiddle14re, x8p23);
let t_a6_9 = _mm_mul_pd(self.twiddle8re, x9p22);
let t_a6_10 = _mm_mul_pd(self.twiddle2re, x10p21);
let t_a6_11 = _mm_mul_pd(self.twiddle4re, x11p20);
let t_a6_12 = _mm_mul_pd(self.twiddle10re, x12p19);
let t_a6_13 = _mm_mul_pd(self.twiddle15re, x13p18);
let t_a6_14 = _mm_mul_pd(self.twiddle9re, x14p17);
let t_a6_15 = _mm_mul_pd(self.twiddle3re, x15p16);
let t_a7_1 = _mm_mul_pd(self.twiddle7re, x1p30);
let t_a7_2 = _mm_mul_pd(self.twiddle14re, x2p29);
let t_a7_3 = _mm_mul_pd(self.twiddle10re, x3p28);
let t_a7_4 = _mm_mul_pd(self.twiddle3re, x4p27);
let t_a7_5 = _mm_mul_pd(self.twiddle4re, x5p26);
let t_a7_6 = _mm_mul_pd(self.twiddle11re, x6p25);
let t_a7_7 = _mm_mul_pd(self.twiddle13re, x7p24);
let t_a7_8 = _mm_mul_pd(self.twiddle6re, x8p23);
let t_a7_9 = _mm_mul_pd(self.twiddle1re, x9p22);
let t_a7_10 = _mm_mul_pd(self.twiddle8re, x10p21);
let t_a7_11 = _mm_mul_pd(self.twiddle15re, x11p20);
let t_a7_12 = _mm_mul_pd(self.twiddle9re, x12p19);
let t_a7_13 = _mm_mul_pd(self.twiddle2re, x13p18);
let t_a7_14 = _mm_mul_pd(self.twiddle5re, x14p17);
let t_a7_15 = _mm_mul_pd(self.twiddle12re, x15p16);
let t_a8_1 = _mm_mul_pd(self.twiddle8re, x1p30);
let t_a8_2 = _mm_mul_pd(self.twiddle15re, x2p29);
let t_a8_3 = _mm_mul_pd(self.twiddle7re, x3p28);
let t_a8_4 = _mm_mul_pd(self.twiddle1re, x4p27);
let t_a8_5 = _mm_mul_pd(self.twiddle9re, x5p26);
let t_a8_6 = _mm_mul_pd(self.twiddle14re, x6p25);
let t_a8_7 = _mm_mul_pd(self.twiddle6re, x7p24);
let t_a8_8 = _mm_mul_pd(self.twiddle2re, x8p23);
let t_a8_9 = _mm_mul_pd(self.twiddle10re, x9p22);
let t_a8_10 = _mm_mul_pd(self.twiddle13re, x10p21);
let t_a8_11 = _mm_mul_pd(self.twiddle5re, x11p20);
let t_a8_12 = _mm_mul_pd(self.twiddle3re, x12p19);
let t_a8_13 = _mm_mul_pd(self.twiddle11re, x13p18);
let t_a8_14 = _mm_mul_pd(self.twiddle12re, x14p17);
let t_a8_15 = _mm_mul_pd(self.twiddle4re, x15p16);
let t_a9_1 = _mm_mul_pd(self.twiddle9re, x1p30);
let t_a9_2 = _mm_mul_pd(self.twiddle13re, x2p29);
let t_a9_3 = _mm_mul_pd(self.twiddle4re, x3p28);
let t_a9_4 = _mm_mul_pd(self.twiddle5re, x4p27);
let t_a9_5 = _mm_mul_pd(self.twiddle14re, x5p26);
let t_a9_6 = _mm_mul_pd(self.twiddle8re, x6p25);
let t_a9_7 = _mm_mul_pd(self.twiddle1re, x7p24);
let t_a9_8 = _mm_mul_pd(self.twiddle10re, x8p23);
let t_a9_9 = _mm_mul_pd(self.twiddle12re, x9p22);
let t_a9_10 = _mm_mul_pd(self.twiddle3re, x10p21);
let t_a9_11 = _mm_mul_pd(self.twiddle6re, x11p20);
let t_a9_12 = _mm_mul_pd(self.twiddle15re, x12p19);
let t_a9_13 = _mm_mul_pd(self.twiddle7re, x13p18);
let t_a9_14 = _mm_mul_pd(self.twiddle2re, x14p17);
let t_a9_15 = _mm_mul_pd(self.twiddle11re, x15p16);
let t_a10_1 = _mm_mul_pd(self.twiddle10re, x1p30);
let t_a10_2 = _mm_mul_pd(self.twiddle11re, x2p29);
let t_a10_3 = _mm_mul_pd(self.twiddle1re, x3p28);
let t_a10_4 = _mm_mul_pd(self.twiddle9re, x4p27);
let t_a10_5 = _mm_mul_pd(self.twiddle12re, x5p26);
let t_a10_6 = _mm_mul_pd(self.twiddle2re, x6p25);
let t_a10_7 = _mm_mul_pd(self.twiddle8re, x7p24);
let t_a10_8 = _mm_mul_pd(self.twiddle13re, x8p23);
let t_a10_9 = _mm_mul_pd(self.twiddle3re, x9p22);
let t_a10_10 = _mm_mul_pd(self.twiddle7re, x10p21);
let t_a10_11 = _mm_mul_pd(self.twiddle14re, x11p20);
let t_a10_12 = _mm_mul_pd(self.twiddle4re, x12p19);
let t_a10_13 = _mm_mul_pd(self.twiddle6re, x13p18);
let t_a10_14 = _mm_mul_pd(self.twiddle15re, x14p17);
let t_a10_15 = _mm_mul_pd(self.twiddle5re, x15p16);
let t_a11_1 = _mm_mul_pd(self.twiddle11re, x1p30);
let t_a11_2 = _mm_mul_pd(self.twiddle9re, x2p29);
let t_a11_3 = _mm_mul_pd(self.twiddle2re, x3p28);
let t_a11_4 = _mm_mul_pd(self.twiddle13re, x4p27);
let t_a11_5 = _mm_mul_pd(self.twiddle7re, x5p26);
let t_a11_6 = _mm_mul_pd(self.twiddle4re, x6p25);
let t_a11_7 = _mm_mul_pd(self.twiddle15re, x7p24);
let t_a11_8 = _mm_mul_pd(self.twiddle5re, x8p23);
let t_a11_9 = _mm_mul_pd(self.twiddle6re, x9p22);
let t_a11_10 = _mm_mul_pd(self.twiddle14re, x10p21);
let t_a11_11 = _mm_mul_pd(self.twiddle3re, x11p20);
let t_a11_12 = _mm_mul_pd(self.twiddle8re, x12p19);
let t_a11_13 = _mm_mul_pd(self.twiddle12re, x13p18);
let t_a11_14 = _mm_mul_pd(self.twiddle1re, x14p17);
let t_a11_15 = _mm_mul_pd(self.twiddle10re, x15p16);
let t_a12_1 = _mm_mul_pd(self.twiddle12re, x1p30);
let t_a12_2 = _mm_mul_pd(self.twiddle7re, x2p29);
let t_a12_3 = _mm_mul_pd(self.twiddle5re, x3p28);
let t_a12_4 = _mm_mul_pd(self.twiddle14re, x4p27);
let t_a12_5 = _mm_mul_pd(self.twiddle2re, x5p26);
let t_a12_6 = _mm_mul_pd(self.twiddle10re, x6p25);
let t_a12_7 = _mm_mul_pd(self.twiddle9re, x7p24);
let t_a12_8 = _mm_mul_pd(self.twiddle3re, x8p23);
let t_a12_9 = _mm_mul_pd(self.twiddle15re, x9p22);
let t_a12_10 = _mm_mul_pd(self.twiddle4re, x10p21);
let t_a12_11 = _mm_mul_pd(self.twiddle8re, x11p20);
let t_a12_12 = _mm_mul_pd(self.twiddle11re, x12p19);
let t_a12_13 = _mm_mul_pd(self.twiddle1re, x13p18);
let t_a12_14 = _mm_mul_pd(self.twiddle13re, x14p17);
let t_a12_15 = _mm_mul_pd(self.twiddle6re, x15p16);
let t_a13_1 = _mm_mul_pd(self.twiddle13re, x1p30);
let t_a13_2 = _mm_mul_pd(self.twiddle5re, x2p29);
let t_a13_3 = _mm_mul_pd(self.twiddle8re, x3p28);
let t_a13_4 = _mm_mul_pd(self.twiddle10re, x4p27);
let t_a13_5 = _mm_mul_pd(self.twiddle3re, x5p26);
let t_a13_6 = _mm_mul_pd(self.twiddle15re, x6p25);
let t_a13_7 = _mm_mul_pd(self.twiddle2re, x7p24);
let t_a13_8 = _mm_mul_pd(self.twiddle11re, x8p23);
let t_a13_9 = _mm_mul_pd(self.twiddle7re, x9p22);
let t_a13_10 = _mm_mul_pd(self.twiddle6re, x10p21);
let t_a13_11 = _mm_mul_pd(self.twiddle12re, x11p20);
let t_a13_12 = _mm_mul_pd(self.twiddle1re, x12p19);
let t_a13_13 = _mm_mul_pd(self.twiddle14re, x13p18);
let t_a13_14 = _mm_mul_pd(self.twiddle4re, x14p17);
let t_a13_15 = _mm_mul_pd(self.twiddle9re, x15p16);
let t_a14_1 = _mm_mul_pd(self.twiddle14re, x1p30);
let t_a14_2 = _mm_mul_pd(self.twiddle3re, x2p29);
let t_a14_3 = _mm_mul_pd(self.twiddle11re, x3p28);
let t_a14_4 = _mm_mul_pd(self.twiddle6re, x4p27);
let t_a14_5 = _mm_mul_pd(self.twiddle8re, x5p26);
let t_a14_6 = _mm_mul_pd(self.twiddle9re, x6p25);
let t_a14_7 = _mm_mul_pd(self.twiddle5re, x7p24);
let t_a14_8 = _mm_mul_pd(self.twiddle12re, x8p23);
let t_a14_9 = _mm_mul_pd(self.twiddle2re, x9p22);
let t_a14_10 = _mm_mul_pd(self.twiddle15re, x10p21);
let t_a14_11 = _mm_mul_pd(self.twiddle1re, x11p20);
let t_a14_12 = _mm_mul_pd(self.twiddle13re, x12p19);
let t_a14_13 = _mm_mul_pd(self.twiddle4re, x13p18);
let t_a14_14 = _mm_mul_pd(self.twiddle10re, x14p17);
let t_a14_15 = _mm_mul_pd(self.twiddle7re, x15p16);
let t_a15_1 = _mm_mul_pd(self.twiddle15re, x1p30);
let t_a15_2 = _mm_mul_pd(self.twiddle1re, x2p29);
let t_a15_3 = _mm_mul_pd(self.twiddle14re, x3p28);
let t_a15_4 = _mm_mul_pd(self.twiddle2re, x4p27);
let t_a15_5 = _mm_mul_pd(self.twiddle13re, x5p26);
let t_a15_6 = _mm_mul_pd(self.twiddle3re, x6p25);
let t_a15_7 = _mm_mul_pd(self.twiddle12re, x7p24);
let t_a15_8 = _mm_mul_pd(self.twiddle4re, x8p23);
let t_a15_9 = _mm_mul_pd(self.twiddle11re, x9p22);
let t_a15_10 = _mm_mul_pd(self.twiddle5re, x10p21);
let t_a15_11 = _mm_mul_pd(self.twiddle10re, x11p20);
let t_a15_12 = _mm_mul_pd(self.twiddle6re, x12p19);
let t_a15_13 = _mm_mul_pd(self.twiddle9re, x13p18);
let t_a15_14 = _mm_mul_pd(self.twiddle7re, x14p17);
let t_a15_15 = _mm_mul_pd(self.twiddle8re, x15p16);
let t_b1_1 = _mm_mul_pd(self.twiddle1im, x1m30);
let t_b1_2 = _mm_mul_pd(self.twiddle2im, x2m29);
let t_b1_3 = _mm_mul_pd(self.twiddle3im, x3m28);
let t_b1_4 = _mm_mul_pd(self.twiddle4im, x4m27);
let t_b1_5 = _mm_mul_pd(self.twiddle5im, x5m26);
let t_b1_6 = _mm_mul_pd(self.twiddle6im, x6m25);
let t_b1_7 = _mm_mul_pd(self.twiddle7im, x7m24);
let t_b1_8 = _mm_mul_pd(self.twiddle8im, x8m23);
let t_b1_9 = _mm_mul_pd(self.twiddle9im, x9m22);
let t_b1_10 = _mm_mul_pd(self.twiddle10im, x10m21);
let t_b1_11 = _mm_mul_pd(self.twiddle11im, x11m20);
let t_b1_12 = _mm_mul_pd(self.twiddle12im, x12m19);
let t_b1_13 = _mm_mul_pd(self.twiddle13im, x13m18);
let t_b1_14 = _mm_mul_pd(self.twiddle14im, x14m17);
let t_b1_15 = _mm_mul_pd(self.twiddle15im, x15m16);
let t_b2_1 = _mm_mul_pd(self.twiddle2im, x1m30);
let t_b2_2 = _mm_mul_pd(self.twiddle4im, x2m29);
let t_b2_3 = _mm_mul_pd(self.twiddle6im, x3m28);
let t_b2_4 = _mm_mul_pd(self.twiddle8im, x4m27);
let t_b2_5 = _mm_mul_pd(self.twiddle10im, x5m26);
let t_b2_6 = _mm_mul_pd(self.twiddle12im, x6m25);
let t_b2_7 = _mm_mul_pd(self.twiddle14im, x7m24);
let t_b2_8 = _mm_mul_pd(self.twiddle15im, x8m23);
let t_b2_9 = _mm_mul_pd(self.twiddle13im, x9m22);
let t_b2_10 = _mm_mul_pd(self.twiddle11im, x10m21);
let t_b2_11 = _mm_mul_pd(self.twiddle9im, x11m20);
let t_b2_12 = _mm_mul_pd(self.twiddle7im, x12m19);
let t_b2_13 = _mm_mul_pd(self.twiddle5im, x13m18);
let t_b2_14 = _mm_mul_pd(self.twiddle3im, x14m17);
let t_b2_15 = _mm_mul_pd(self.twiddle1im, x15m16);
let t_b3_1 = _mm_mul_pd(self.twiddle3im, x1m30);
let t_b3_2 = _mm_mul_pd(self.twiddle6im, x2m29);
let t_b3_3 = _mm_mul_pd(self.twiddle9im, x3m28);
let t_b3_4 = _mm_mul_pd(self.twiddle12im, x4m27);
let t_b3_5 = _mm_mul_pd(self.twiddle15im, x5m26);
let t_b3_6 = _mm_mul_pd(self.twiddle13im, x6m25);
let t_b3_7 = _mm_mul_pd(self.twiddle10im, x7m24);
let t_b3_8 = _mm_mul_pd(self.twiddle7im, x8m23);
let t_b3_9 = _mm_mul_pd(self.twiddle4im, x9m22);
let t_b3_10 = _mm_mul_pd(self.twiddle1im, x10m21);
let t_b3_11 = _mm_mul_pd(self.twiddle2im, x11m20);
let t_b3_12 = _mm_mul_pd(self.twiddle5im, x12m19);
let t_b3_13 = _mm_mul_pd(self.twiddle8im, x13m18);
let t_b3_14 = _mm_mul_pd(self.twiddle11im, x14m17);
let t_b3_15 = _mm_mul_pd(self.twiddle14im, x15m16);
let t_b4_1 = _mm_mul_pd(self.twiddle4im, x1m30);
let t_b4_2 = _mm_mul_pd(self.twiddle8im, x2m29);
let t_b4_3 = _mm_mul_pd(self.twiddle12im, x3m28);
let t_b4_4 = _mm_mul_pd(self.twiddle15im, x4m27);
let t_b4_5 = _mm_mul_pd(self.twiddle11im, x5m26);
let t_b4_6 = _mm_mul_pd(self.twiddle7im, x6m25);
let t_b4_7 = _mm_mul_pd(self.twiddle3im, x7m24);
let t_b4_8 = _mm_mul_pd(self.twiddle1im, x8m23);
let t_b4_9 = _mm_mul_pd(self.twiddle5im, x9m22);
let t_b4_10 = _mm_mul_pd(self.twiddle9im, x10m21);
let t_b4_11 = _mm_mul_pd(self.twiddle13im, x11m20);
let t_b4_12 = _mm_mul_pd(self.twiddle14im, x12m19);
let t_b4_13 = _mm_mul_pd(self.twiddle10im, x13m18);
let t_b4_14 = _mm_mul_pd(self.twiddle6im, x14m17);
let t_b4_15 = _mm_mul_pd(self.twiddle2im, x15m16);
let t_b5_1 = _mm_mul_pd(self.twiddle5im, x1m30);
let t_b5_2 = _mm_mul_pd(self.twiddle10im, x2m29);
let t_b5_3 = _mm_mul_pd(self.twiddle15im, x3m28);
let t_b5_4 = _mm_mul_pd(self.twiddle11im, x4m27);
let t_b5_5 = _mm_mul_pd(self.twiddle6im, x5m26);
let t_b5_6 = _mm_mul_pd(self.twiddle1im, x6m25);
let t_b5_7 = _mm_mul_pd(self.twiddle4im, x7m24);
let t_b5_8 = _mm_mul_pd(self.twiddle9im, x8m23);
let t_b5_9 = _mm_mul_pd(self.twiddle14im, x9m22);
let t_b5_10 = _mm_mul_pd(self.twiddle12im, x10m21);
let t_b5_11 = _mm_mul_pd(self.twiddle7im, x11m20);
let t_b5_12 = _mm_mul_pd(self.twiddle2im, x12m19);
let t_b5_13 = _mm_mul_pd(self.twiddle3im, x13m18);
let t_b5_14 = _mm_mul_pd(self.twiddle8im, x14m17);
let t_b5_15 = _mm_mul_pd(self.twiddle13im, x15m16);
let t_b6_1 = _mm_mul_pd(self.twiddle6im, x1m30);
let t_b6_2 = _mm_mul_pd(self.twiddle12im, x2m29);
let t_b6_3 = _mm_mul_pd(self.twiddle13im, x3m28);
let t_b6_4 = _mm_mul_pd(self.twiddle7im, x4m27);
let t_b6_5 = _mm_mul_pd(self.twiddle1im, x5m26);
let t_b6_6 = _mm_mul_pd(self.twiddle5im, x6m25);
let t_b6_7 = _mm_mul_pd(self.twiddle11im, x7m24);
let t_b6_8 = _mm_mul_pd(self.twiddle14im, x8m23);
let t_b6_9 = _mm_mul_pd(self.twiddle8im, x9m22);
let t_b6_10 = _mm_mul_pd(self.twiddle2im, x10m21);
let t_b6_11 = _mm_mul_pd(self.twiddle4im, x11m20);
let t_b6_12 = _mm_mul_pd(self.twiddle10im, x12m19);
let t_b6_13 = _mm_mul_pd(self.twiddle15im, x13m18);
let t_b6_14 = _mm_mul_pd(self.twiddle9im, x14m17);
let t_b6_15 = _mm_mul_pd(self.twiddle3im, x15m16);
let t_b7_1 = _mm_mul_pd(self.twiddle7im, x1m30);
let t_b7_2 = _mm_mul_pd(self.twiddle14im, x2m29);
let t_b7_3 = _mm_mul_pd(self.twiddle10im, x3m28);
let t_b7_4 = _mm_mul_pd(self.twiddle3im, x4m27);
let t_b7_5 = _mm_mul_pd(self.twiddle4im, x5m26);
let t_b7_6 = _mm_mul_pd(self.twiddle11im, x6m25);
let t_b7_7 = _mm_mul_pd(self.twiddle13im, x7m24);
let t_b7_8 = _mm_mul_pd(self.twiddle6im, x8m23);
let t_b7_9 = _mm_mul_pd(self.twiddle1im, x9m22);
let t_b7_10 = _mm_mul_pd(self.twiddle8im, x10m21);
let t_b7_11 = _mm_mul_pd(self.twiddle15im, x11m20);
let t_b7_12 = _mm_mul_pd(self.twiddle9im, x12m19);
let t_b7_13 = _mm_mul_pd(self.twiddle2im, x13m18);
let t_b7_14 = _mm_mul_pd(self.twiddle5im, x14m17);
let t_b7_15 = _mm_mul_pd(self.twiddle12im, x15m16);
let t_b8_1 = _mm_mul_pd(self.twiddle8im, x1m30);
let t_b8_2 = _mm_mul_pd(self.twiddle15im, x2m29);
let t_b8_3 = _mm_mul_pd(self.twiddle7im, x3m28);
let t_b8_4 = _mm_mul_pd(self.twiddle1im, x4m27);
let t_b8_5 = _mm_mul_pd(self.twiddle9im, x5m26);
let t_b8_6 = _mm_mul_pd(self.twiddle14im, x6m25);
let t_b8_7 = _mm_mul_pd(self.twiddle6im, x7m24);
let t_b8_8 = _mm_mul_pd(self.twiddle2im, x8m23);
let t_b8_9 = _mm_mul_pd(self.twiddle10im, x9m22);
let t_b8_10 = _mm_mul_pd(self.twiddle13im, x10m21);
let t_b8_11 = _mm_mul_pd(self.twiddle5im, x11m20);
let t_b8_12 = _mm_mul_pd(self.twiddle3im, x12m19);
let t_b8_13 = _mm_mul_pd(self.twiddle11im, x13m18);
let t_b8_14 = _mm_mul_pd(self.twiddle12im, x14m17);
let t_b8_15 = _mm_mul_pd(self.twiddle4im, x15m16);
let t_b9_1 = _mm_mul_pd(self.twiddle9im, x1m30);
let t_b9_2 = _mm_mul_pd(self.twiddle13im, x2m29);
let t_b9_3 = _mm_mul_pd(self.twiddle4im, x3m28);
let t_b9_4 = _mm_mul_pd(self.twiddle5im, x4m27);
let t_b9_5 = _mm_mul_pd(self.twiddle14im, x5m26);
let t_b9_6 = _mm_mul_pd(self.twiddle8im, x6m25);
let t_b9_7 = _mm_mul_pd(self.twiddle1im, x7m24);
let t_b9_8 = _mm_mul_pd(self.twiddle10im, x8m23);
let t_b9_9 = _mm_mul_pd(self.twiddle12im, x9m22);
let t_b9_10 = _mm_mul_pd(self.twiddle3im, x10m21);
let t_b9_11 = _mm_mul_pd(self.twiddle6im, x11m20);
let t_b9_12 = _mm_mul_pd(self.twiddle15im, x12m19);
let t_b9_13 = _mm_mul_pd(self.twiddle7im, x13m18);
let t_b9_14 = _mm_mul_pd(self.twiddle2im, x14m17);
let t_b9_15 = _mm_mul_pd(self.twiddle11im, x15m16);
let t_b10_1 = _mm_mul_pd(self.twiddle10im, x1m30);
let t_b10_2 = _mm_mul_pd(self.twiddle11im, x2m29);
let t_b10_3 = _mm_mul_pd(self.twiddle1im, x3m28);
let t_b10_4 = _mm_mul_pd(self.twiddle9im, x4m27);
let t_b10_5 = _mm_mul_pd(self.twiddle12im, x5m26);
let t_b10_6 = _mm_mul_pd(self.twiddle2im, x6m25);
let t_b10_7 = _mm_mul_pd(self.twiddle8im, x7m24);
let t_b10_8 = _mm_mul_pd(self.twiddle13im, x8m23);
let t_b10_9 = _mm_mul_pd(self.twiddle3im, x9m22);
let t_b10_10 = _mm_mul_pd(self.twiddle7im, x10m21);
let t_b10_11 = _mm_mul_pd(self.twiddle14im, x11m20);
let t_b10_12 = _mm_mul_pd(self.twiddle4im, x12m19);
let t_b10_13 = _mm_mul_pd(self.twiddle6im, x13m18);
let t_b10_14 = _mm_mul_pd(self.twiddle15im, x14m17);
let t_b10_15 = _mm_mul_pd(self.twiddle5im, x15m16);
let t_b11_1 = _mm_mul_pd(self.twiddle11im, x1m30);
let t_b11_2 = _mm_mul_pd(self.twiddle9im, x2m29);
let t_b11_3 = _mm_mul_pd(self.twiddle2im, x3m28);
let t_b11_4 = _mm_mul_pd(self.twiddle13im, x4m27);
let t_b11_5 = _mm_mul_pd(self.twiddle7im, x5m26);
let t_b11_6 = _mm_mul_pd(self.twiddle4im, x6m25);
let t_b11_7 = _mm_mul_pd(self.twiddle15im, x7m24);
let t_b11_8 = _mm_mul_pd(self.twiddle5im, x8m23);
let t_b11_9 = _mm_mul_pd(self.twiddle6im, x9m22);
let t_b11_10 = _mm_mul_pd(self.twiddle14im, x10m21);
let t_b11_11 = _mm_mul_pd(self.twiddle3im, x11m20);
let t_b11_12 = _mm_mul_pd(self.twiddle8im, x12m19);
let t_b11_13 = _mm_mul_pd(self.twiddle12im, x13m18);
let t_b11_14 = _mm_mul_pd(self.twiddle1im, x14m17);
let t_b11_15 = _mm_mul_pd(self.twiddle10im, x15m16);
let t_b12_1 = _mm_mul_pd(self.twiddle12im, x1m30);
let t_b12_2 = _mm_mul_pd(self.twiddle7im, x2m29);
let t_b12_3 = _mm_mul_pd(self.twiddle5im, x3m28);
let t_b12_4 = _mm_mul_pd(self.twiddle14im, x4m27);
let t_b12_5 = _mm_mul_pd(self.twiddle2im, x5m26);
let t_b12_6 = _mm_mul_pd(self.twiddle10im, x6m25);
let t_b12_7 = _mm_mul_pd(self.twiddle9im, x7m24);
let t_b12_8 = _mm_mul_pd(self.twiddle3im, x8m23);
let t_b12_9 = _mm_mul_pd(self.twiddle15im, x9m22);
let t_b12_10 = _mm_mul_pd(self.twiddle4im, x10m21);
let t_b12_11 = _mm_mul_pd(self.twiddle8im, x11m20);
let t_b12_12 = _mm_mul_pd(self.twiddle11im, x12m19);
let t_b12_13 = _mm_mul_pd(self.twiddle1im, x13m18);
let t_b12_14 = _mm_mul_pd(self.twiddle13im, x14m17);
let t_b12_15 = _mm_mul_pd(self.twiddle6im, x15m16);
let t_b13_1 = _mm_mul_pd(self.twiddle13im, x1m30);
let t_b13_2 = _mm_mul_pd(self.twiddle5im, x2m29);
let t_b13_3 = _mm_mul_pd(self.twiddle8im, x3m28);
let t_b13_4 = _mm_mul_pd(self.twiddle10im, x4m27);
let t_b13_5 = _mm_mul_pd(self.twiddle3im, x5m26);
let t_b13_6 = _mm_mul_pd(self.twiddle15im, x6m25);
let t_b13_7 = _mm_mul_pd(self.twiddle2im, x7m24);
let t_b13_8 = _mm_mul_pd(self.twiddle11im, x8m23);
let t_b13_9 = _mm_mul_pd(self.twiddle7im, x9m22);
let t_b13_10 = _mm_mul_pd(self.twiddle6im, x10m21);
let t_b13_11 = _mm_mul_pd(self.twiddle12im, x11m20);
let t_b13_12 = _mm_mul_pd(self.twiddle1im, x12m19);
let t_b13_13 = _mm_mul_pd(self.twiddle14im, x13m18);
let t_b13_14 = _mm_mul_pd(self.twiddle4im, x14m17);
let t_b13_15 = _mm_mul_pd(self.twiddle9im, x15m16);
let t_b14_1 = _mm_mul_pd(self.twiddle14im, x1m30);
let t_b14_2 = _mm_mul_pd(self.twiddle3im, x2m29);
let t_b14_3 = _mm_mul_pd(self.twiddle11im, x3m28);
let t_b14_4 = _mm_mul_pd(self.twiddle6im, x4m27);
let t_b14_5 = _mm_mul_pd(self.twiddle8im, x5m26);
let t_b14_6 = _mm_mul_pd(self.twiddle9im, x6m25);
let t_b14_7 = _mm_mul_pd(self.twiddle5im, x7m24);
let t_b14_8 = _mm_mul_pd(self.twiddle12im, x8m23);
let t_b14_9 = _mm_mul_pd(self.twiddle2im, x9m22);
let t_b14_10 = _mm_mul_pd(self.twiddle15im, x10m21);
let t_b14_11 = _mm_mul_pd(self.twiddle1im, x11m20);
let t_b14_12 = _mm_mul_pd(self.twiddle13im, x12m19);
let t_b14_13 = _mm_mul_pd(self.twiddle4im, x13m18);
let t_b14_14 = _mm_mul_pd(self.twiddle10im, x14m17);
let t_b14_15 = _mm_mul_pd(self.twiddle7im, x15m16);
let t_b15_1 = _mm_mul_pd(self.twiddle15im, x1m30);
let t_b15_2 = _mm_mul_pd(self.twiddle1im, x2m29);
let t_b15_3 = _mm_mul_pd(self.twiddle14im, x3m28);
let t_b15_4 = _mm_mul_pd(self.twiddle2im, x4m27);
let t_b15_5 = _mm_mul_pd(self.twiddle13im, x5m26);
let t_b15_6 = _mm_mul_pd(self.twiddle3im, x6m25);
let t_b15_7 = _mm_mul_pd(self.twiddle12im, x7m24);
let t_b15_8 = _mm_mul_pd(self.twiddle4im, x8m23);
let t_b15_9 = _mm_mul_pd(self.twiddle11im, x9m22);
let t_b15_10 = _mm_mul_pd(self.twiddle5im, x10m21);
let t_b15_11 = _mm_mul_pd(self.twiddle10im, x11m20);
let t_b15_12 = _mm_mul_pd(self.twiddle6im, x12m19);
let t_b15_13 = _mm_mul_pd(self.twiddle9im, x13m18);
let t_b15_14 = _mm_mul_pd(self.twiddle7im, x14m17);
let t_b15_15 = _mm_mul_pd(self.twiddle8im, x15m16);
let x0 = values[0];
let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11 + t_a1_12 + t_a1_13 + t_a1_14 + t_a1_15);
let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11 + t_a2_12 + t_a2_13 + t_a2_14 + t_a2_15);
let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11 + t_a3_12 + t_a3_13 + t_a3_14 + t_a3_15);
let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11 + t_a4_12 + t_a4_13 + t_a4_14 + t_a4_15);
let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11 + t_a5_12 + t_a5_13 + t_a5_14 + t_a5_15);
let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11 + t_a6_12 + t_a6_13 + t_a6_14 + t_a6_15);
let t_a7 = calc_f64!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11 + t_a7_12 + t_a7_13 + t_a7_14 + t_a7_15);
let t_a8 = calc_f64!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11 + t_a8_12 + t_a8_13 + t_a8_14 + t_a8_15);
let t_a9 = calc_f64!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11 + t_a9_12 + t_a9_13 + t_a9_14 + t_a9_15);
let t_a10 = calc_f64!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + t_a10_11 + t_a10_12 + t_a10_13 + t_a10_14 + t_a10_15);
let t_a11 = calc_f64!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + t_a11_11 + t_a11_12 + t_a11_13 + t_a11_14 + t_a11_15);
let t_a12 = calc_f64!(x0 + t_a12_1 + t_a12_2 + t_a12_3 + t_a12_4 + t_a12_5 + t_a12_6 + t_a12_7 + t_a12_8 + t_a12_9 + t_a12_10 + t_a12_11 + t_a12_12 + t_a12_13 + t_a12_14 + t_a12_15);
let t_a13 = calc_f64!(x0 + t_a13_1 + t_a13_2 + t_a13_3 + t_a13_4 + t_a13_5 + t_a13_6 + t_a13_7 + t_a13_8 + t_a13_9 + t_a13_10 + t_a13_11 + t_a13_12 + t_a13_13 + t_a13_14 + t_a13_15);
let t_a14 = calc_f64!(x0 + t_a14_1 + t_a14_2 + t_a14_3 + t_a14_4 + t_a14_5 + t_a14_6 + t_a14_7 + t_a14_8 + t_a14_9 + t_a14_10 + t_a14_11 + t_a14_12 + t_a14_13 + t_a14_14 + t_a14_15);
let t_a15 = calc_f64!(x0 + t_a15_1 + t_a15_2 + t_a15_3 + t_a15_4 + t_a15_5 + t_a15_6 + t_a15_7 + t_a15_8 + t_a15_9 + t_a15_10 + t_a15_11 + t_a15_12 + t_a15_13 + t_a15_14 + t_a15_15);
let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11 + t_b1_12 + t_b1_13 + t_b1_14 + t_b1_15);
let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 + t_b2_6 + t_b2_7 - t_b2_8 - t_b2_9 - t_b2_10 - t_b2_11 - t_b2_12 - t_b2_13 - t_b2_14 - t_b2_15);
let t_b3 = calc_f64!(t_b3_1 + t_b3_2 + t_b3_3 + t_b3_4 + t_b3_5 - t_b3_6 - t_b3_7 - t_b3_8 - t_b3_9 - t_b3_10 + t_b3_11 + t_b3_12 + t_b3_13 + t_b3_14 + t_b3_15);
let t_b4 = calc_f64!(t_b4_1 + t_b4_2 + t_b4_3 - t_b4_4 - t_b4_5 - t_b4_6 - t_b4_7 + t_b4_8 + t_b4_9 + t_b4_10 + t_b4_11 - t_b4_12 - t_b4_13 - t_b4_14 - t_b4_15);
let t_b5 = calc_f64!(t_b5_1 + t_b5_2 + t_b5_3 - t_b5_4 - t_b5_5 - t_b5_6 + t_b5_7 + t_b5_8 + t_b5_9 - t_b5_10 - t_b5_11 - t_b5_12 + t_b5_13 + t_b5_14 + t_b5_15);
let t_b6 = calc_f64!(t_b6_1 + t_b6_2 - t_b6_3 - t_b6_4 - t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9 - t_b6_10 + t_b6_11 + t_b6_12 - t_b6_13 - t_b6_14 - t_b6_15);
let t_b7 = calc_f64!(t_b7_1 + t_b7_2 - t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9 + t_b7_10 + t_b7_11 - t_b7_12 - t_b7_13 + t_b7_14 + t_b7_15);
let t_b8 = calc_f64!(t_b8_1 - t_b8_2 - t_b8_3 + t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 + t_b8_9 - t_b8_10 - t_b8_11 + t_b8_12 + t_b8_13 - t_b8_14 - t_b8_15);
let t_b9 = calc_f64!(t_b9_1 - t_b9_2 - t_b9_3 + t_b9_4 + t_b9_5 - t_b9_6 + t_b9_7 + t_b9_8 - t_b9_9 - t_b9_10 + t_b9_11 + t_b9_12 - t_b9_13 + t_b9_14 + t_b9_15);
let t_b10 = calc_f64!(t_b10_1 - t_b10_2 - t_b10_3 + t_b10_4 - t_b10_5 - t_b10_6 + t_b10_7 - t_b10_8 - t_b10_9 + t_b10_10 - t_b10_11 - t_b10_12 + t_b10_13 - t_b10_14 - t_b10_15);
let t_b11 = calc_f64!(t_b11_1 - t_b11_2 + t_b11_3 + t_b11_4 - t_b11_5 + t_b11_6 + t_b11_7 - t_b11_8 + t_b11_9 - t_b11_10 - t_b11_11 + t_b11_12 - t_b11_13 - t_b11_14 + t_b11_15);
let t_b12 = calc_f64!(t_b12_1 - t_b12_2 + t_b12_3 - t_b12_4 - t_b12_5 + t_b12_6 - t_b12_7 + t_b12_8 + t_b12_9 - t_b12_10 + t_b12_11 - t_b12_12 + t_b12_13 + t_b12_14 - t_b12_15);
let t_b13 = calc_f64!(t_b13_1 - t_b13_2 + t_b13_3 - t_b13_4 + t_b13_5 - t_b13_6 - t_b13_7 + t_b13_8 - t_b13_9 + t_b13_10 - t_b13_11 + t_b13_12 + t_b13_13 - t_b13_14 + t_b13_15);
let t_b14 = calc_f64!(t_b14_1 - t_b14_2 + t_b14_3 - t_b14_4 + t_b14_5 - t_b14_6 + t_b14_7 - t_b14_8 + t_b14_9 - t_b14_10 - t_b14_11 + t_b14_12 - t_b14_13 + t_b14_14 - t_b14_15);
let t_b15 = calc_f64!(t_b15_1 - t_b15_2 + t_b15_3 - t_b15_4 + t_b15_5 - t_b15_6 + t_b15_7 - t_b15_8 + t_b15_9 - t_b15_10 + t_b15_11 - t_b15_12 + t_b15_13 - t_b15_14 + t_b15_15);
let t_b1_rot = self.rotate.rotate(t_b1);
let t_b2_rot = self.rotate.rotate(t_b2);
let t_b3_rot = self.rotate.rotate(t_b3);
let t_b4_rot = self.rotate.rotate(t_b4);
let t_b5_rot = self.rotate.rotate(t_b5);
let t_b6_rot = self.rotate.rotate(t_b6);
let t_b7_rot = self.rotate.rotate(t_b7);
let t_b8_rot = self.rotate.rotate(t_b8);
let t_b9_rot = self.rotate.rotate(t_b9);
let t_b10_rot = self.rotate.rotate(t_b10);
let t_b11_rot = self.rotate.rotate(t_b11);
let t_b12_rot = self.rotate.rotate(t_b12);
let t_b13_rot = self.rotate.rotate(t_b13);
let t_b14_rot = self.rotate.rotate(t_b14);
let t_b15_rot = self.rotate.rotate(t_b15);
let y0 = calc_f64!(x0 + x1p30 + x2p29 + x3p28 + x4p27 + x5p26 + x6p25 + x7p24 + x8p23 + x9p22 + x10p21 + x11p20 + x12p19 + x13p18 + x14p17 + x15p16);
let [y1, y30] = solo_fft2_f64(t_a1, t_b1_rot);
let [y2, y29] = solo_fft2_f64(t_a2, t_b2_rot);
let [y3, y28] = solo_fft2_f64(t_a3, t_b3_rot);
let [y4, y27] = solo_fft2_f64(t_a4, t_b4_rot);
let [y5, y26] = solo_fft2_f64(t_a5, t_b5_rot);
let [y6, y25] = solo_fft2_f64(t_a6, t_b6_rot);
let [y7, y24] = solo_fft2_f64(t_a7, t_b7_rot);
let [y8, y23] = solo_fft2_f64(t_a8, t_b8_rot);
let [y9, y22] = solo_fft2_f64(t_a9, t_b9_rot);
let [y10, y21] = solo_fft2_f64(t_a10, t_b10_rot);
let [y11, y20] = solo_fft2_f64(t_a11, t_b11_rot);
let [y12, y19] = solo_fft2_f64(t_a12, t_b12_rot);
let [y13, y18] = solo_fft2_f64(t_a13, t_b13_rot);
let [y14, y17] = solo_fft2_f64(t_a14, t_b14_rot);
let [y15, y16] = solo_fft2_f64(t_a15, t_b15_rot);
[y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28, y29, y30]
}
}
#[cfg(test)]
mod unit_tests {
    use super::*;
    use crate::test_utils::check_fft_algorithm;

    /// Expands to a `#[test]` that validates an f32 SSE butterfly of length
    /// `$len` against the reference implementation, in both FFT directions.
    macro_rules! test_butterfly_32_func {
        ($test_name:ident, $butterfly_type:ident, $len:expr) => {
            #[test]
            fn $test_name() {
                // The forward transform must agree with the scalar reference.
                let fwd = $butterfly_type::new(FftDirection::Forward);
                check_fft_algorithm::<f32>(&fwd, $len, FftDirection::Forward);

                // The inverse transform is constructed and checked independently.
                let inv = $butterfly_type::new(FftDirection::Inverse);
                check_fft_algorithm::<f32>(&inv, $len, FftDirection::Inverse);
            }
        };
    }
    test_butterfly_32_func!(test_ssef32_butterfly7, SseF32Butterfly7, 7);
    test_butterfly_32_func!(test_ssef32_butterfly11, SseF32Butterfly11, 11);
    test_butterfly_32_func!(test_ssef32_butterfly13, SseF32Butterfly13, 13);
    test_butterfly_32_func!(test_ssef32_butterfly17, SseF32Butterfly17, 17);
    test_butterfly_32_func!(test_ssef32_butterfly19, SseF32Butterfly19, 19);
    test_butterfly_32_func!(test_ssef32_butterfly23, SseF32Butterfly23, 23);
    test_butterfly_32_func!(test_ssef32_butterfly29, SseF32Butterfly29, 29);
    test_butterfly_32_func!(test_ssef32_butterfly31, SseF32Butterfly31, 31);

    /// Expands to a `#[test]` that validates an f64 SSE butterfly of length
    /// `$len` against the reference implementation, in both FFT directions.
    macro_rules! test_butterfly_64_func {
        ($test_name:ident, $butterfly_type:ident, $len:expr) => {
            #[test]
            fn $test_name() {
                // Forward direction first...
                let fwd = $butterfly_type::new(FftDirection::Forward);
                check_fft_algorithm::<f64>(&fwd, $len, FftDirection::Forward);

                // ...then the inverse.
                let inv = $butterfly_type::new(FftDirection::Inverse);
                check_fft_algorithm::<f64>(&inv, $len, FftDirection::Inverse);
            }
        };
    }
    test_butterfly_64_func!(test_ssef64_butterfly7, SseF64Butterfly7, 7);
    test_butterfly_64_func!(test_ssef64_butterfly11, SseF64Butterfly11, 11);
    test_butterfly_64_func!(test_ssef64_butterfly13, SseF64Butterfly13, 13);
    test_butterfly_64_func!(test_ssef64_butterfly17, SseF64Butterfly17, 17);
    test_butterfly_64_func!(test_ssef64_butterfly19, SseF64Butterfly19, 19);
    test_butterfly_64_func!(test_ssef64_butterfly23, SseF64Butterfly23, 23);
    test_butterfly_64_func!(test_ssef64_butterfly29, SseF64Butterfly29, 29);
    test_butterfly_64_func!(test_ssef64_butterfly31, SseF64Butterfly31, 31);
}