use core::arch::x86_64::*;
use num_complex::Complex;
use crate::{common::FftNum, FftDirection};
use crate::array_utils;
use crate::array_utils::workaround_transmute_mut;
use crate::array_utils::DoubleBuf;
use crate::common::{fft_error_inplace, fft_error_outofplace};
use crate::twiddles;
use crate::{Direction, Fft, Length};
use super::sse_common::{assert_f32, assert_f64};
use super::sse_utils::*;
use super::sse_vector::SseArrayMut;
/// Generates the slice-level entry points shared by every `f32` SSE butterfly.
///
/// The target struct must provide `perform_fft_contiguous` (one FFT) and
/// `perform_parallel_fft_contiguous` (two back-to-back FFTs processed together).
/// `$len` and `$direction_fn` are accepted for signature parity with the other
/// boilerplate macros but are not used here.
#[allow(unused)]
macro_rules! boilerplate_fft_sse_f32_butterfly {
    ($struct_name:ident, $len:expr, $direction_fn:expr) => {
        impl<T: FftNum> $struct_name<T> {
            /// Runs a single FFT in place on `buffer`, passing it to
            /// `perform_fft_contiguous` viewed as a `Complex<f32>` slice.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly(&self, buffer: &mut [Complex<T>]) {
                self.perform_fft_contiguous(workaround_transmute_mut::<_, Complex<f32>>(buffer));
            }

            /// Runs two FFTs stored back to back in `buffer`, passing it to
            /// `perform_parallel_fft_contiguous` viewed as a `Complex<f32>` slice.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_parallel_fft_butterfly(&self, buffer: &mut [Complex<T>]) {
                self.perform_parallel_fft_contiguous(workaround_transmute_mut::<_, Complex<f32>>(
                    buffer,
                ));
            }

            /// Runs every FFT contained in `buffer` in place.
            ///
            /// FFTs are processed two at a time; if an odd number of FFTs is
            /// present, the last one is processed on its own.
            ///
            /// # Errors
            ///
            /// Returns `Err(())` without touching `buffer` when `buffer.len()` is
            /// not a multiple of `self.len()`.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly_multi(
                &self,
                buffer: &mut [Complex<T>],
            ) -> Result<(), ()> {
                let len = buffer.len();
                // Reject buffers that do not hold a whole number of FFTs. Without this
                // check the tail handling below would run one more FFT over elements
                // that an earlier chunk already transformed and then report success,
                // so the caller's `is_err()` check could never trigger.
                if len % self.len() != 0 {
                    return Err(());
                }
                let alldone = array_utils::iter_chunks(buffer, 2 * self.len(), |chunk| {
                    self.perform_parallel_fft_butterfly(chunk)
                });
                // An error here means the paired pass left data unprocessed; given the
                // length check above that can only be exactly one trailing FFT.
                if alldone.is_err() && len >= self.len() {
                    self.perform_fft_butterfly(&mut buffer[len - self.len()..]);
                }
                Ok(())
            }

            /// Runs every FFT contained in `input`, writing the results to `output`.
            ///
            /// FFTs are processed two at a time; if an odd number of FFTs is
            /// present, the last one is processed on its own.
            ///
            /// # Errors
            ///
            /// Returns `Err(())` without running any FFT when `input.len()` is not
            /// a multiple of `self.len()` or when `output.len() != input.len()`.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_oop_fft_butterfly_multi(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
            ) -> Result<(), ()> {
                let len = input.len();
                // The tail handling slices `output` with an offset computed from
                // `input`, so both slices must have the same length, and that length
                // must be a whole number of FFTs.
                if len % self.len() != 0 || output.len() != len {
                    return Err(());
                }
                let alldone = array_utils::iter_chunks_zipped(
                    input,
                    output,
                    2 * self.len(),
                    |in_chunk, out_chunk| {
                        let input_slice = workaround_transmute_mut(in_chunk);
                        let output_slice = workaround_transmute_mut(out_chunk);
                        self.perform_parallel_fft_contiguous(DoubleBuf {
                            input: input_slice,
                            output: output_slice,
                        })
                    },
                );
                // Exactly one FFT is left over when the FFT count is odd.
                if alldone.is_err() && len >= self.len() {
                    let input_slice = workaround_transmute_mut(input);
                    let output_slice = workaround_transmute_mut(output);
                    self.perform_fft_contiguous(DoubleBuf {
                        input: &mut input_slice[len - self.len()..],
                        output: &mut output_slice[len - self.len()..],
                    })
                }
                Ok(())
            }
        }
    };
}
/// Generates the slice-level entry points shared by every `f64` SSE butterfly.
///
/// The target struct must provide `perform_fft_contiguous`. `$len` and
/// `$direction_fn` are accepted for signature parity with the other boilerplate
/// macros but are not used here.
macro_rules! boilerplate_fft_sse_f64_butterfly {
    ($struct_name:ident, $len:expr, $direction_fn:expr) => {
        impl<T: FftNum> $struct_name<T> {
            /// Runs a single FFT in place on `buffer`, viewed as `Complex<f64>`.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly(&self, buffer: &mut [Complex<T>]) {
                let as_f64 = workaround_transmute_mut::<_, Complex<f64>>(buffer);
                self.perform_fft_contiguous(as_f64);
            }

            /// Runs an FFT on each `self.len()`-sized chunk of `buffer` and
            /// returns whatever `array_utils::iter_chunks` reports.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly_multi(
                &self,
                buffer: &mut [Complex<T>],
            ) -> Result<(), ()> {
                let fft_len = self.len();
                array_utils::iter_chunks(buffer, fft_len, |fft_chunk| {
                    self.perform_fft_butterfly(fft_chunk)
                })
            }

            /// Runs an FFT from each `self.len()`-sized chunk of `input` into the
            /// matching chunk of `output` and returns whatever
            /// `array_utils::iter_chunks_zipped` reports.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_oop_fft_butterfly_multi(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
            ) -> Result<(), ()> {
                let fft_len = self.len();
                array_utils::iter_chunks_zipped(input, output, fft_len, |src_chunk, dst_chunk| {
                    let src = workaround_transmute_mut(src_chunk);
                    let dst = workaround_transmute_mut(dst_chunk);
                    let both = DoubleBuf {
                        input: src,
                        output: dst,
                    };
                    self.perform_fft_contiguous(both)
                })
            }
        }
    };
}
/// Implements `Fft`, `Length` and `Direction` for an SSE butterfly struct.
///
/// `$len` becomes the value returned by `len()`, and `$direction_fn` is called
/// with `self` to produce the FFT direction. Neither processing method uses
/// scratch space.
#[allow(unused)]
macro_rules! boilerplate_fft_sse_common_butterfly {
    ($struct_name:ident, $len:expr, $direction_fn:expr) => {
        impl<T: FftNum> Fft<T> for $struct_name<T> {
            fn process_outofplace_with_scratch(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
                _scratch: &mut [Complex<T>],
            ) {
                // Both slices must hold at least one FFT and be equally long.
                let lengths_ok = input.len() >= self.len() && output.len() == input.len();
                if !lengths_ok {
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                    return;
                }
                // NOTE(review): the callee requires SSE4.1; presumably instances are
                // only built after feature detection - confirm against the planner.
                let outcome = unsafe { self.perform_oop_fft_butterfly_multi(input, output) };
                if let Err(()) = outcome {
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                }
            }

            fn process_with_scratch(&self, buffer: &mut [Complex<T>], _scratch: &mut [Complex<T>]) {
                // The buffer must hold at least one FFT.
                let long_enough = buffer.len() >= self.len();
                if !long_enough {
                    fft_error_inplace(self.len(), buffer.len(), 0, 0);
                    return;
                }
                // NOTE(review): same SSE4.1 assumption as above.
                let outcome = unsafe { self.perform_fft_butterfly_multi(buffer) };
                if let Err(()) = outcome {
                    fft_error_inplace(self.len(), buffer.len(), 0, 0);
                }
            }

            /// No scratch space is needed for in-place processing.
            #[inline(always)]
            fn get_inplace_scratch_len(&self) -> usize {
                0
            }

            /// No scratch space is needed for out-of-place processing.
            #[inline(always)]
            fn get_outofplace_scratch_len(&self) -> usize {
                0
            }
        }

        impl<T> Length for $struct_name<T> {
            /// The fixed FFT length handled by this butterfly.
            #[inline(always)]
            fn len(&self) -> usize {
                $len
            }
        }

        impl<T> Direction for $struct_name<T> {
            /// The direction this butterfly was constructed with.
            #[inline(always)]
            fn fft_direction(&self) -> FftDirection {
                $direction_fn(self)
            }
        }
    };
}
/// Length-1 FFT butterfly for `f32` data.
///
/// The kernels below only load values and store them back unchanged.
pub struct SseF32Butterfly1<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
}
boilerplate_fft_sse_f32_butterfly!(
    SseF32Butterfly1,
    1,
    |this: &SseF32Butterfly1<_>| this.direction
);
boilerplate_fft_sse_common_butterfly!(
    SseF32Butterfly1,
    1,
    |this: &SseF32Butterfly1<_>| this.direction
);
impl<T: FftNum> SseF32Butterfly1<T> {
    /// Creates the butterfly for the given direction; calls `assert_f32::<T>()` first.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let _phantom = std::marker::PhantomData;
        Self { direction, _phantom }
    }

    /// Single FFT: reads element 0 with the partial load and writes it back
    /// with the partial (low) store.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let lone = buffer.load_partial1_complex(0);
        buffer.store_partial_lo_complex(lone, 0);
    }

    /// Two FFTs at once: reads one full vector at index 0 and writes it back.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let pair = buffer.load_complex(0);
        buffer.store_complex(pair, 0);
    }
}
/// Length-1 FFT butterfly for `f64` data.
///
/// The kernel below only loads the value and stores it back unchanged.
pub struct SseF64Butterfly1<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
}
boilerplate_fft_sse_f64_butterfly!(
    SseF64Butterfly1,
    1,
    |this: &SseF64Butterfly1<_>| this.direction
);
boilerplate_fft_sse_common_butterfly!(
    SseF64Butterfly1,
    1,
    |this: &SseF64Butterfly1<_>| this.direction
);
impl<T: FftNum> SseF64Butterfly1<T> {
    /// Creates the butterfly for the given direction; calls `assert_f64::<T>()` first.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let _phantom = std::marker::PhantomData;
        Self { direction, _phantom }
    }

    /// Reads element 0 and writes it straight back.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let lone = buffer.load_complex(0);
        buffer.store_complex(lone, 0);
    }
}
/// Length-2 FFT butterfly for `f32` data.
pub struct SseF32Butterfly2<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
}
boilerplate_fft_sse_f32_butterfly!(
    SseF32Butterfly2,
    2,
    |this: &SseF32Butterfly2<_>| this.direction
);
boilerplate_fft_sse_common_butterfly!(
    SseF32Butterfly2,
    2,
    |this: &SseF32Butterfly2<_>| this.direction
);
impl<T: FftNum> SseF32Butterfly2<T> {
    /// Creates the butterfly for the given direction; calls `assert_f32::<T>()` first.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let _phantom = std::marker::PhantomData;
        Self { direction, _phantom }
    }

    /// Single FFT: one vector load at index 0, transform, one vector store.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let packed = buffer.load_complex(0);
        let transformed = self.perform_fft_direct(packed);
        buffer.store_complex(transformed, 0);
    }

    /// Two FFTs at once: loads the vectors at indices 0 and 2, transforms them
    /// together, transposes the result and stores it back at indices 0 and 2.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let first = buffer.load_complex(0);
        let second = buffer.load_complex(2);
        let [sums, diffs] = self.perform_parallel_fft_direct(first, second);
        let [front, back] = transpose_complex_2x2_f32(sums, diffs);
        buffer.store_complex(front, 0);
        buffer.store_complex(back, 2);
    }

    /// Transforms one packed vector via `solo_fft2_f32`.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: __m128) -> __m128 {
        solo_fft2_f32(values)
    }

    /// Transforms two packed vectors via `parallel_fft2_contiguous_f32`.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(
        &self,
        values_x: __m128,
        values_y: __m128,
    ) -> [__m128; 2] {
        parallel_fft2_contiguous_f32(values_x, values_y)
    }
}
/// Returns `[val02 + val13, val02 - val13]`, computed lane by lane.
#[inline(always)]
pub(crate) unsafe fn parallel_fft2_interleaved_f32(val02: __m128, val13: __m128) -> [__m128; 2] {
    [_mm_add_ps(val02, val13), _mm_sub_ps(val02, val13)]
}
/// Transposes `left`/`right` with `transpose_complex_2x2_f32` and feeds the two
/// resulting vectors to `parallel_fft2_interleaved_f32`.
#[inline(always)]
unsafe fn parallel_fft2_contiguous_f32(left: __m128, right: __m128) -> [__m128; 2] {
    let transposed = transpose_complex_2x2_f32(left, right);
    parallel_fft2_interleaved_f32(transposed[0], transposed[1])
}
/// Adds `negate_hi_f32(values)` to `reverse_complex_elements_f32(values)`.
///
/// Presumably this yields `[x0 + x1, x0 - x1]` for a vector holding two complex
/// values - confirm against the helper definitions.
#[inline(always)]
unsafe fn solo_fft2_f32(values: __m128) -> __m128 {
    let swapped = reverse_complex_elements_f32(values);
    let sign_flipped = negate_hi_f32(values);
    _mm_add_ps(sign_flipped, swapped)
}
/// Length-2 FFT butterfly for `f64` data.
pub struct SseF64Butterfly2<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
}
boilerplate_fft_sse_f64_butterfly!(
    SseF64Butterfly2,
    2,
    |this: &SseF64Butterfly2<_>| this.direction
);
boilerplate_fft_sse_common_butterfly!(
    SseF64Butterfly2,
    2,
    |this: &SseF64Butterfly2<_>| this.direction
);
impl<T: FftNum> SseF64Butterfly2<T> {
    /// Creates the butterfly for the given direction; calls `assert_f64::<T>()` first.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let _phantom = std::marker::PhantomData;
        Self { direction, _phantom }
    }

    /// Loads elements 0 and 1, transforms them and stores the two results back
    /// at the same indices.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let first = buffer.load_complex(0);
        let second = buffer.load_complex(1);
        let [sum, diff] = self.perform_fft_direct(first, second);
        buffer.store_complex(sum, 0);
        buffer.store_complex(diff, 1);
    }

    /// Returns `[value0 + value1, value0 - value1]` via `solo_fft2_f64`.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value0: __m128d,
        value1: __m128d,
    ) -> [__m128d; 2] {
        solo_fft2_f64(value0, value1)
    }
}
/// Returns `[left + right, left - right]`, computed lane by lane.
#[inline(always)]
pub(crate) unsafe fn solo_fft2_f64(left: __m128d, right: __m128d) -> [__m128d; 2] {
    [_mm_add_pd(left, right), _mm_sub_pd(left, right)]
}
/// Length-3 FFT butterfly for `f32` data.
pub struct SseF32Butterfly3<T> {
direction: FftDirection,
_phantom: std::marker::PhantomData<T>,
// 90-degree rotation helper, built with `Rotate90F32::new(true)`.
rotate: Rotate90F32,
// Lanes, low to high: [tw1.re, tw1.re, -tw1.im, -tw1.im]; used by the single-FFT kernel.
twiddle: __m128,
// tw1.re broadcast to all four lanes; used by the parallel kernel.
twiddle1re: __m128,
// tw1.im broadcast to all four lanes; used by the parallel kernel.
twiddle1im: __m128,
}
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly3, 3, |this: &SseF32Butterfly3<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly3, 3, |this: &SseF32Butterfly3<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly3<T> {
/// Creates the butterfly for the given direction; calls `assert_f32::<T>()`
/// first and precomputes the twiddle vectors from `compute_twiddle(1, 3, direction)`.
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let rotate = Rotate90F32::new(true);
let tw1: Complex<f32> = twiddles::compute_twiddle(1, 3, direction);
// `_mm_set_ps` takes its arguments from the highest lane down to the lowest.
let twiddle = unsafe { _mm_set_ps(-tw1.im, -tw1.im, tw1.re, tw1.re) };
let twiddle1re = unsafe { _mm_set_ps(tw1.re, tw1.re, tw1.re, tw1.re) };
let twiddle1im = unsafe { _mm_set_ps(tw1.im, tw1.im, tw1.im, tw1.im) };
Self {
direction,
_phantom: std::marker::PhantomData,
rotate,
twiddle,
twiddle1re,
twiddle1im,
}
}
/// Single FFT: element 0 goes through the partial load/store, elements 1 and 2
/// travel together as one full vector.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let value0x = buffer.load_partial1_complex(0);
let value12 = buffer.load_complex(1);
let out = self.perform_fft_direct(value0x, value12);
buffer.store_partial_lo_complex(out[0], 0);
buffer.store_complex(out[1], 1);
}
/// Two FFTs at once over six consecutive elements (three vector loads).
///
/// The variable names suggest the layout a0 a1 | a2 b0 | b1 b2 for FFTs `a`
/// and `b`; the `extract_*` helpers regroup the lanes before and after the
/// kernel (assumed from naming - helper definitions are outside this file).
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let valuea0a1 = buffer.load_complex(0);
let valuea2b0 = buffer.load_complex(2);
let valueb1b2 = buffer.load_complex(4);
// Regroup so that each vector feeds one input slot of the parallel kernel.
let value0 = extract_lo_hi_f32(valuea0a1, valuea2b0);
let value1 = extract_hi_lo_f32(valuea0a1, valueb1b2);
let value2 = extract_lo_hi_f32(valuea2b0, valueb1b2);
let out = self.perform_parallel_fft_direct(value0, value1, value2);
// Regroup the three result vectors back into the buffer's memory order.
let out0 = extract_lo_lo_f32(out[0], out[1]);
let out1 = extract_lo_hi_f32(out[2], out[0]);
let out2 = extract_hi_hi_f32(out[1], out[2]);
buffer.store_complex(out0, 0);
buffer.store_complex(out1, 2);
buffer.store_complex(out2, 4);
}
/// 3-point kernel for a single FFT.
///
/// Takes `value0x` (only its low complex is stored back by the caller) and
/// `value12`, and returns `[out0x, out12]` in the same packing.
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(
&self,
value0x: __m128,
value12: __m128,
) -> [__m128; 2] {
// Combine `value12` with its element-swapped, high-negated copy, then rotate the high half.
let rev12 = negate_hi_f32(reverse_complex_elements_f32(value12));
let temp12pn = self.rotate.rotate_hi(_mm_add_ps(value12, rev12));
let twiddled = _mm_mul_ps(temp12pn, self.twiddle);
let temp = _mm_add_ps(value0x, twiddled);
// A final 2-point step produces the packed outputs 1 and 2.
let out12 = solo_fft2_f32(temp);
let out0x = _mm_add_ps(value0x, temp12pn);
[out0x, out12]
}
/// 3-point kernel applied lane-wise, so each input vector can carry the same
/// element of two independent FFTs. Returns the three outputs in order.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(
&self,
value0: __m128,
value1: __m128,
value2: __m128,
) -> [__m128; 3] {
let x12p = _mm_add_ps(value1, value2);
let x12n = _mm_sub_ps(value1, value2);
// Output 0 is the plain sum of all three inputs.
let sum = _mm_add_ps(value0, x12p);
let temp_a = _mm_mul_ps(self.twiddle1re, x12p);
let temp_a = _mm_add_ps(temp_a, value0);
let n_rot = self.rotate.rotate_both(x12n);
let temp_b = _mm_mul_ps(self.twiddle1im, n_rot);
// Outputs 1 and 2 differ only in the sign of the rotated term.
let x1 = _mm_add_ps(temp_a, temp_b);
let x2 = _mm_sub_ps(temp_a, temp_b);
[sum, x1, x2]
}
}
/// Length-3 FFT butterfly for `f64` data.
pub struct SseF64Butterfly3<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F64,
    twiddle1re: __m128d,
    twiddle1im: __m128d,
}
boilerplate_fft_sse_f64_butterfly!(
    SseF64Butterfly3,
    3,
    |this: &SseF64Butterfly3<_>| this.direction
);
boilerplate_fft_sse_common_butterfly!(
    SseF64Butterfly3,
    3,
    |this: &SseF64Butterfly3<_>| this.direction
);
impl<T: FftNum> SseF64Butterfly3<T> {
    /// Creates the butterfly for the given direction; calls `assert_f64::<T>()`
    /// first and broadcasts the real and imaginary parts of
    /// `compute_twiddle(1, 3, direction)` into their own vectors.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 3, direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate: Rotate90F64::new(true),
            twiddle1re: unsafe { _mm_set_pd(tw1.re, tw1.re) },
            twiddle1im: unsafe { _mm_set_pd(tw1.im, tw1.im) },
        }
    }

    /// Loads elements 0..3, transforms them and stores the results back at the
    /// same indices.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let first = buffer.load_complex(0);
        let second = buffer.load_complex(1);
        let third = buffer.load_complex(2);
        let [out0, out1, out2] = self.perform_fft_direct(first, second, third);
        buffer.store_complex(out0, 0);
        buffer.store_complex(out1, 1);
        buffer.store_complex(out2, 2);
    }

    /// 3-point kernel; returns the three outputs in order.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value0: __m128d,
        value1: __m128d,
        value2: __m128d,
    ) -> [__m128d; 3] {
        let pair_sum = _mm_add_pd(value1, value2);
        let pair_diff = _mm_sub_pd(value1, value2);
        // Term shared by outputs 1 and 2.
        let shared = _mm_add_pd(_mm_mul_pd(self.twiddle1re, pair_sum), value0);
        // Term that enters outputs 1 and 2 with opposite signs.
        let signed = _mm_mul_pd(self.twiddle1im, self.rotate.rotate(pair_diff));
        [
            _mm_add_pd(value0, pair_sum),
            _mm_add_pd(shared, signed),
            _mm_sub_pd(shared, signed),
        ]
    }
}
/// Length-4 FFT butterfly for `f32` data.
pub struct SseF32Butterfly4<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F32,
}
boilerplate_fft_sse_f32_butterfly!(
    SseF32Butterfly4,
    4,
    |this: &SseF32Butterfly4<_>| this.direction
);
boilerplate_fft_sse_common_butterfly!(
    SseF32Butterfly4,
    4,
    |this: &SseF32Butterfly4<_>| this.direction
);
impl<T: FftNum> SseF32Butterfly4<T> {
    /// Creates the butterfly for the given direction; calls `assert_f32::<T>()`
    /// first. The rotation helper receives `true` only for inverse transforms.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let is_inverse = direction == FftDirection::Inverse;
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate: Rotate90F32::new(is_inverse),
        }
    }

    /// Single FFT: loads the vectors at indices 0 and 2, transforms them and
    /// stores the two result vectors back at the same indices.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let lower = buffer.load_complex(0);
        let upper = buffer.load_complex(2);
        let [out_lower, out_upper] = self.perform_fft_direct(lower, upper);
        buffer.store_complex(out_lower, 0);
        buffer.store_complex(out_upper, 2);
    }

    /// Two FFTs at once over eight consecutive elements: the four loaded vectors
    /// are transposed pairwise, run through the parallel kernel, transposed
    /// back and stored.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let a_lower = buffer.load_complex(0);
        let a_upper = buffer.load_complex(2);
        let b_lower = buffer.load_complex(4);
        let b_upper = buffer.load_complex(6);
        let [in0, in1] = transpose_complex_2x2_f32(a_lower, b_lower);
        let [in2, in3] = transpose_complex_2x2_f32(a_upper, b_upper);
        let [res0, res1, res2, res3] = self.perform_parallel_fft_direct(in0, in1, in2, in3);
        let [store0, store4] = transpose_complex_2x2_f32(res0, res1);
        let [store2, store6] = transpose_complex_2x2_f32(res2, res3);
        buffer.store_complex(store0, 0);
        buffer.store_complex(store4, 4);
        buffer.store_complex(store2, 2);
        buffer.store_complex(store6, 6);
    }

    /// 4-point kernel for a single FFT packed as two vectors: a lane-wise
    /// 2-point step, a rotation of the high half of the difference vector, then
    /// a second 2-point step on the transposed data.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value01: __m128,
        value23: __m128,
    ) -> [__m128; 2] {
        let [sums, diffs] = parallel_fft2_interleaved_f32(value01, value23);
        let rotated = self.rotate.rotate_hi(diffs);
        parallel_fft2_contiguous_f32(sums, rotated)
    }

    /// 4-point kernel applied lane-wise to four input vectors; returns the four
    /// outputs in order.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(
        &self,
        values0: __m128,
        values1: __m128,
        values2: __m128,
        values3: __m128,
    ) -> [__m128; 4] {
        let [even_sum, even_diff] = parallel_fft2_interleaved_f32(values0, values2);
        let [odd_sum, odd_diff] = parallel_fft2_interleaved_f32(values1, values3);
        let odd_diff = self.rotate.rotate_both(odd_diff);
        let [out0, out2] = parallel_fft2_interleaved_f32(even_sum, odd_sum);
        let [out1, out3] = parallel_fft2_interleaved_f32(even_diff, odd_diff);
        [out0, out1, out2, out3]
    }
}
/// Length-4 FFT butterfly for `f64` data.
pub struct SseF64Butterfly4<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F64,
}
boilerplate_fft_sse_f64_butterfly!(
    SseF64Butterfly4,
    4,
    |this: &SseF64Butterfly4<_>| this.direction
);
boilerplate_fft_sse_common_butterfly!(
    SseF64Butterfly4,
    4,
    |this: &SseF64Butterfly4<_>| this.direction
);
impl<T: FftNum> SseF64Butterfly4<T> {
    /// Creates the butterfly for the given direction; calls `assert_f64::<T>()`
    /// first. The rotation helper receives `true` only for inverse transforms.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let is_inverse = direction == FftDirection::Inverse;
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate: Rotate90F64::new(is_inverse),
        }
    }

    /// Loads elements 0..4, transforms them and stores the results back at the
    /// same indices.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let in0 = buffer.load_complex(0);
        let in1 = buffer.load_complex(1);
        let in2 = buffer.load_complex(2);
        let in3 = buffer.load_complex(3);
        let [out0, out1, out2, out3] = self.perform_fft_direct(in0, in1, in2, in3);
        buffer.store_complex(out0, 0);
        buffer.store_complex(out1, 1);
        buffer.store_complex(out2, 2);
        buffer.store_complex(out3, 3);
    }

    /// 4-point kernel: two 2-point steps, a rotation of the odd difference, and
    /// two more 2-point steps. Returns the four outputs in order.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value0: __m128d,
        value1: __m128d,
        value2: __m128d,
        value3: __m128d,
    ) -> [__m128d; 4] {
        let [even_sum, even_diff] = solo_fft2_f64(value0, value2);
        let [odd_sum, odd_diff] = solo_fft2_f64(value1, value3);
        let odd_diff = self.rotate.rotate(odd_diff);
        let [out0, out2] = solo_fft2_f64(even_sum, odd_sum);
        let [out1, out3] = solo_fft2_f64(even_diff, odd_diff);
        [out0, out1, out2, out3]
    }
}
/// Length-5 FFT butterfly for `f32` data.
pub struct SseF32Butterfly5<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    // 90-degree rotation helper, built with `Rotate90F32::new(true)`.
    rotate: Rotate90F32,
    // Mixed twiddle vectors for the single-FFT kernel; lanes low to high:
    // twiddle12re = [tw1.re, tw1.re, tw2.re, tw2.re]
    twiddle12re: __m128,
    // twiddle21re = [tw2.re, tw2.re, tw1.re, tw1.re]
    twiddle21re: __m128,
    // twiddle12im = [tw1.im, tw1.im, tw2.im, tw2.im]
    twiddle12im: __m128,
    // twiddle21im = [tw2.im, tw2.im, -tw1.im, -tw1.im]
    twiddle21im: __m128,
    // Broadcast twiddle components for the parallel kernel.
    twiddle1re: __m128,
    twiddle1im: __m128,
    twiddle2re: __m128,
    twiddle2im: __m128,
}
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly5, 5, |this: &SseF32Butterfly5<_>| this
    .direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly5, 5, |this: &SseF32Butterfly5<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly5<T> {
    /// Creates the butterfly for the given direction; calls `assert_f32::<T>()`
    /// first and precomputes all twiddle vectors from
    /// `compute_twiddle(1, 5, direction)` and `compute_twiddle(2, 5, direction)`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let rotate = Rotate90F32::new(true);
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 5, direction);
        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 5, direction);
        // `_mm_set_ps` takes its arguments from the highest lane down to the lowest.
        let twiddle12re = unsafe { _mm_set_ps(tw2.re, tw2.re, tw1.re, tw1.re) };
        let twiddle21re = unsafe { _mm_set_ps(tw1.re, tw1.re, tw2.re, tw2.re) };
        let twiddle12im = unsafe { _mm_set_ps(tw2.im, tw2.im, tw1.im, tw1.im) };
        let twiddle21im = unsafe { _mm_set_ps(-tw1.im, -tw1.im, tw2.im, tw2.im) };
        let twiddle1re = unsafe { _mm_set_ps(tw1.re, tw1.re, tw1.re, tw1.re) };
        let twiddle1im = unsafe { _mm_set_ps(tw1.im, tw1.im, tw1.im, tw1.im) };
        let twiddle2re = unsafe { _mm_set_ps(tw2.re, tw2.re, tw2.re, tw2.re) };
        let twiddle2im = unsafe { _mm_set_ps(tw2.im, tw2.im, tw2.im, tw2.im) };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle12re,
            twiddle21re,
            twiddle12im,
            twiddle21im,
            twiddle1re,
            twiddle1im,
            twiddle2re,
            twiddle2im,
        }
    }

    /// Single FFT: element 0 is read with `load1_complex`, elements 1-2 and 3-4
    /// as full vectors. Output 0 is written with the partial (low) store.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let value00 = buffer.load1_complex(0);
        let value12 = buffer.load_complex(1);
        let value34 = buffer.load_complex(3);
        let out = self.perform_fft_direct(value00, value12, value34);
        buffer.store_partial_lo_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
        buffer.store_complex(out[2], 3);
    }

    /// Two FFTs at once over ten consecutive elements (five vector loads). The
    /// `extract_*` helpers regroup lanes before and after the parallel kernel.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4 ,6, 8});
        // Regroup so that each vector feeds one input slot of the parallel kernel.
        let value0 = extract_lo_hi_f32(input_packed[0], input_packed[2]);
        let value1 = extract_hi_lo_f32(input_packed[0], input_packed[3]);
        let value2 = extract_lo_hi_f32(input_packed[1], input_packed[3]);
        let value3 = extract_hi_lo_f32(input_packed[1], input_packed[4]);
        let value4 = extract_lo_hi_f32(input_packed[2], input_packed[4]);
        let out = self.perform_parallel_fft_direct(value0, value1, value2, value3, value4);
        // Regroup the five result vectors back into the buffer's memory order.
        let out_packed = [
            extract_lo_lo_f32(out[0], out[1]),
            extract_lo_lo_f32(out[2], out[3]),
            extract_lo_hi_f32(out[4], out[0]),
            extract_hi_hi_f32(out[1], out[2]),
            extract_hi_hi_f32(out[3], out[4]),
        ];
        write_complex_to_array_strided!(out_packed, buffer, 2, {0, 1, 2, 3, 4});
    }

    /// 5-point kernel for a single FFT.
    ///
    /// Takes element 0 in `value00` and elements 1-2 / 3-4 packed in `value12` /
    /// `value34`. Returns `[x00, x12, x34]` in the same packing.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value00: __m128,
        value12: __m128,
        value34: __m128,
    ) -> [__m128; 3] {
        // Swap the halves of `value34` before pairing it with `value12`.
        let temp43 = reverse_complex_elements_f32(value34);
        let x1423p = _mm_add_ps(value12, temp43);
        let x1423n = _mm_sub_ps(value12, temp43);
        let x1414p = duplicate_lo_f32(x1423p);
        let x2323p = duplicate_hi_f32(x1423p);
        let x1414n = duplicate_lo_f32(x1423n);
        let x2323n = duplicate_hi_f32(x1423n);
        let temp_a1 = _mm_mul_ps(self.twiddle12re, x1414p);
        let temp_a2 = _mm_mul_ps(self.twiddle21re, x2323p);
        let temp_b1 = _mm_mul_ps(self.twiddle12im, x1414n);
        let temp_b2 = _mm_mul_ps(self.twiddle21im, x2323n);
        let temp_a = _mm_add_ps(temp_a1, temp_a2);
        let temp_a = _mm_add_ps(value00, temp_a);
        let temp_b = _mm_add_ps(temp_b1, temp_b2);
        let b_rot = self.rotate.rotate_both(temp_b);
        // Output 0 is the plain sum of all five inputs.
        let x00 = _mm_add_ps(value00, _mm_add_ps(x1414p, x2323p));
        let x12 = _mm_add_ps(temp_a, b_rot);
        // The difference comes out with its halves swapped, so swap them back.
        let x34 = reverse_complex_elements_f32(_mm_sub_ps(temp_a, b_rot));
        [x00, x12, x34]
    }

    /// 5-point kernel applied lane-wise, so each input vector can carry the same
    /// element of two independent FFTs. Returns the five outputs in order.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(
        &self,
        value0: __m128,
        value1: __m128,
        value2: __m128,
        value3: __m128,
        value4: __m128,
    ) -> [__m128; 5] {
        let x14p = _mm_add_ps(value1, value4);
        let x14n = _mm_sub_ps(value1, value4);
        let x23p = _mm_add_ps(value2, value3);
        let x23n = _mm_sub_ps(value2, value3);
        let temp_a1_1 = _mm_mul_ps(self.twiddle1re, x14p);
        let temp_a1_2 = _mm_mul_ps(self.twiddle2re, x23p);
        let temp_b1_1 = _mm_mul_ps(self.twiddle1im, x14n);
        let temp_b1_2 = _mm_mul_ps(self.twiddle2im, x23n);
        let temp_a2_1 = _mm_mul_ps(self.twiddle1re, x23p);
        let temp_a2_2 = _mm_mul_ps(self.twiddle2re, x14p);
        let temp_b2_1 = _mm_mul_ps(self.twiddle2im, x14n);
        let temp_b2_2 = _mm_mul_ps(self.twiddle1im, x23n);
        let temp_a1 = _mm_add_ps(value0, _mm_add_ps(temp_a1_1, temp_a1_2));
        let temp_b1 = _mm_add_ps(temp_b1_1, temp_b1_2);
        let temp_a2 = _mm_add_ps(value0, _mm_add_ps(temp_a2_1, temp_a2_2));
        let temp_b2 = _mm_sub_ps(temp_b2_1, temp_b2_2);
        // Rotate each imaginary-side term once and reuse it for the mirrored
        // output pair, as the f64 kernel does.
        let temp_b1_rot = self.rotate.rotate_both(temp_b1);
        let temp_b2_rot = self.rotate.rotate_both(temp_b2);
        [
            _mm_add_ps(value0, _mm_add_ps(x14p, x23p)),
            _mm_add_ps(temp_a1, temp_b1_rot),
            _mm_add_ps(temp_a2, temp_b2_rot),
            _mm_sub_ps(temp_a2, temp_b2_rot),
            _mm_sub_ps(temp_a1, temp_b1_rot),
        ]
    }
}
/// Length-5 FFT butterfly for `f64` data.
pub struct SseF64Butterfly5<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F64,
    twiddle1re: __m128d,
    twiddle1im: __m128d,
    twiddle2re: __m128d,
    twiddle2im: __m128d,
}
boilerplate_fft_sse_f64_butterfly!(
    SseF64Butterfly5,
    5,
    |this: &SseF64Butterfly5<_>| this.direction
);
boilerplate_fft_sse_common_butterfly!(
    SseF64Butterfly5,
    5,
    |this: &SseF64Butterfly5<_>| this.direction
);
impl<T: FftNum> SseF64Butterfly5<T> {
    /// Creates the butterfly for the given direction; calls `assert_f64::<T>()`
    /// first and broadcasts the components of `compute_twiddle(1, 5, direction)`
    /// and `compute_twiddle(2, 5, direction)` into their own vectors.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 5, direction);
        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 5, direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate: Rotate90F64::new(true),
            twiddle1re: unsafe { _mm_set_pd(tw1.re, tw1.re) },
            twiddle1im: unsafe { _mm_set_pd(tw1.im, tw1.im) },
            twiddle2re: unsafe { _mm_set_pd(tw2.re, tw2.re) },
            twiddle2im: unsafe { _mm_set_pd(tw2.im, tw2.im) },
        }
    }

    /// Loads elements 0..5, transforms them and stores the results back at the
    /// same indices.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let in0 = buffer.load_complex(0);
        let in1 = buffer.load_complex(1);
        let in2 = buffer.load_complex(2);
        let in3 = buffer.load_complex(3);
        let in4 = buffer.load_complex(4);
        let [out0, out1, out2, out3, out4] = self.perform_fft_direct(in0, in1, in2, in3, in4);
        buffer.store_complex(out0, 0);
        buffer.store_complex(out1, 1);
        buffer.store_complex(out2, 2);
        buffer.store_complex(out3, 3);
        buffer.store_complex(out4, 4);
    }

    /// 5-point kernel; returns the five outputs in order.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value0: __m128d,
        value1: __m128d,
        value2: __m128d,
        value3: __m128d,
        value4: __m128d,
    ) -> [__m128d; 5] {
        // Sums and differences of the mirrored input pairs (1,4) and (2,3).
        let sum14 = _mm_add_pd(value1, value4);
        let diff14 = _mm_sub_pd(value1, value4);
        let sum23 = _mm_add_pd(value2, value3);
        let diff23 = _mm_sub_pd(value2, value3);

        // Real-twiddle combinations of the sums.
        let re_mix1 = _mm_add_pd(
            _mm_mul_pd(self.twiddle1re, sum14),
            _mm_mul_pd(self.twiddle2re, sum23),
        );
        let re_mix2 = _mm_add_pd(
            _mm_mul_pd(self.twiddle2re, sum14),
            _mm_mul_pd(self.twiddle1re, sum23),
        );

        // Imaginary-twiddle combinations of the differences.
        let im_mix1 = _mm_add_pd(
            _mm_mul_pd(self.twiddle1im, diff14),
            _mm_mul_pd(self.twiddle2im, diff23),
        );
        let im_mix2 = _mm_sub_pd(
            _mm_mul_pd(self.twiddle2im, diff14),
            _mm_mul_pd(self.twiddle1im, diff23),
        );

        let base1 = _mm_add_pd(value0, re_mix1);
        let base2 = _mm_add_pd(value0, re_mix2);
        let turned1 = self.rotate.rotate(im_mix1);
        let turned2 = self.rotate.rotate(im_mix2);

        // Outputs 1/4 and 2/3 are mirrored: same base, opposite sign on the rotated term.
        [
            _mm_add_pd(value0, _mm_add_pd(sum14, sum23)),
            _mm_add_pd(base1, turned1),
            _mm_add_pd(base2, turned2),
            _mm_sub_pd(base2, turned2),
            _mm_sub_pd(base1, turned1),
        ]
    }
}
/// Length-6 FFT butterfly for `f32` data, built from a length-3 butterfly plus
/// 2-point steps.
pub struct SseF32Butterfly6<T> {
direction: FftDirection,
_phantom: std::marker::PhantomData<T>,
// Inner 3-point butterfly constructed with the same direction.
bf3: SseF32Butterfly3<T>,
}
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly6, 6, |this: &SseF32Butterfly6<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly6, 6, |this: &SseF32Butterfly6<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly6<T> {
/// Creates the butterfly for the given direction; calls `assert_f32::<T>()`
/// first and builds the inner 3-point butterfly.
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let bf3 = SseF32Butterfly3::new(direction);
Self {
direction,
_phantom: std::marker::PhantomData,
bf3,
}
}
/// Single FFT: loads the vectors at indices 0, 2 and 4, transforms them and
/// stores the three result vectors back at the same indices.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let value01 = buffer.load_complex(0);
let value23 = buffer.load_complex(2);
let value45 = buffer.load_complex(4);
let out = self.perform_fft_direct(value01, value23, value45);
buffer.store_complex(out[0], 0);
buffer.store_complex(out[1], 2);
buffer.store_complex(out[2], 4);
}
/// Two FFTs at once over twelve consecutive elements.
///
/// The project macros load the packed vectors, regroup them for the parallel
/// kernel, regroup the results and store them (macro behavior assumed from
/// names - their definitions are outside this file).
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10});
let values = interleave_complex_f32!(input_packed, 3, {0, 1, 2});
let out = self.perform_parallel_fft_direct(
values[0], values[1], values[2], values[3], values[4], values[5],
);
let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4});
write_complex_to_array_strided!(out_sorted, buffer, 2, {0, 1, 2, 3, 4, 5});
}
/// 6-point kernel for a single FFT packed as three vectors.
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(
&self,
value01: __m128,
value23: __m128,
value45: __m128,
) -> [__m128; 3] {
// Reorder the inputs so the inner 3-point kernel processes two columns at once.
let reord0 = extract_lo_hi_f32(value01, value23);
let reord1 = extract_lo_hi_f32(value23, value45);
let reord2 = extract_lo_hi_f32(value45, value01);
let mid = self.bf3.perform_parallel_fft_direct(reord0, reord1, reord2);
// No twiddle multiplication happens between the 3-point and 2-point stages.
let [output0, output1] = parallel_fft2_contiguous_f32(mid[0], mid[1]);
let output2 = solo_fft2_f32(mid[2]);
// Reorder the 2-point results into the packed output layout.
[
extract_lo_hi_f32(output0, output1),
extract_lo_lo_f32(output2, output1),
extract_hi_hi_f32(output0, output2),
]
}
/// 6-point kernel applied lane-wise to six input vectors; returns the six
/// outputs in order.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(
&self,
value0: __m128,
value1: __m128,
value2: __m128,
value3: __m128,
value4: __m128,
value5: __m128,
) -> [__m128; 6] {
// Two 3-point FFTs over the index sets (0, 2, 4) and (3, 5, 1).
let mid0 = self.bf3.perform_parallel_fft_direct(value0, value2, value4);
let mid1 = self.bf3.perform_parallel_fft_direct(value3, value5, value1);
// 2-point steps across the two 3-point results; no twiddles in between.
let [output0, output1] = parallel_fft2_interleaved_f32(mid0[0], mid1[0]);
let [output2, output3] = parallel_fft2_interleaved_f32(mid0[1], mid1[1]);
let [output4, output5] = parallel_fft2_interleaved_f32(mid0[2], mid1[2]);
// Permute into natural output order.
[output0, output3, output4, output1, output2, output5]
}
}
/// Length-6 FFT butterfly for `f64` data, built from a length-3 butterfly plus
/// 2-point steps.
pub struct SseF64Butterfly6<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    bf3: SseF64Butterfly3<T>,
}
boilerplate_fft_sse_f64_butterfly!(
    SseF64Butterfly6,
    6,
    |this: &SseF64Butterfly6<_>| this.direction
);
boilerplate_fft_sse_common_butterfly!(
    SseF64Butterfly6,
    6,
    |this: &SseF64Butterfly6<_>| this.direction
);
impl<T: FftNum> SseF64Butterfly6<T> {
    /// Creates the butterfly for the given direction; calls `assert_f64::<T>()`
    /// first and builds the inner 3-point butterfly.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3: SseF64Butterfly3::new(direction),
        }
    }

    /// Loads elements 0..6, transforms them and stores the results back at the
    /// same indices.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let in0 = buffer.load_complex(0);
        let in1 = buffer.load_complex(1);
        let in2 = buffer.load_complex(2);
        let in3 = buffer.load_complex(3);
        let in4 = buffer.load_complex(4);
        let in5 = buffer.load_complex(5);
        let [out0, out1, out2, out3, out4, out5] =
            self.perform_fft_direct(in0, in1, in2, in3, in4, in5);
        buffer.store_complex(out0, 0);
        buffer.store_complex(out1, 1);
        buffer.store_complex(out2, 2);
        buffer.store_complex(out3, 3);
        buffer.store_complex(out4, 4);
        buffer.store_complex(out5, 5);
    }

    /// 6-point kernel: two 3-point FFTs over the index sets (0, 2, 4) and
    /// (3, 5, 1), followed by three 2-point steps with no twiddles in between.
    /// Returns the six outputs in order.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value0: __m128d,
        value1: __m128d,
        value2: __m128d,
        value3: __m128d,
        value4: __m128d,
        value5: __m128d,
    ) -> [__m128d; 6] {
        let [left0, left1, left2] = self.bf3.perform_fft_direct(value0, value2, value4);
        let [right0, right1, right2] = self.bf3.perform_fft_direct(value3, value5, value1);
        let [output0, output1] = solo_fft2_f64(left0, right0);
        let [output2, output3] = solo_fft2_f64(left1, right1);
        let [output4, output5] = solo_fft2_f64(left2, right2);
        // Permute into natural output order.
        [output0, output3, output4, output1, output2, output5]
    }
}
/// Length-8 FFT butterfly for `f32` data, built from a length-4 butterfly plus
/// a twiddle stage and 2-point steps.
pub struct SseF32Butterfly8<T> {
// Lanes, low to high: [1.0, 1.0, sqrt(0.5), sqrt(0.5)]; used by the single-FFT kernel.
root2: __m128,
// sqrt(0.5) broadcast to all four lanes; used by the parallel kernel.
root2_dual: __m128,
direction: FftDirection,
// Inner 4-point butterfly constructed with the same direction.
bf4: SseF32Butterfly4<T>,
// 90-degree rotation helper; receives `true` only for inverse transforms.
rotate90: Rotate90F32,
}
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly8, 8, |this: &SseF32Butterfly8<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly8, 8, |this: &SseF32Butterfly8<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly8<T> {
/// Creates the butterfly for the given direction; calls `assert_f32::<T>()`
/// first and precomputes the scaling vectors and rotation helper.
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let bf4 = SseF32Butterfly4::new(direction);
// `_mm_set_ps` takes its arguments from the highest lane down to the lowest.
let root2 = unsafe { _mm_set_ps(0.5f32.sqrt(), 0.5f32.sqrt(), 1.0, 1.0) };
let root2_dual = unsafe { _mm_load1_ps(&0.5f32.sqrt()) };
let rotate90 = if direction == FftDirection::Inverse {
Rotate90F32::new(true)
} else {
Rotate90F32::new(false)
};
Self {
root2,
root2_dual,
direction,
bf4,
rotate90,
}
}
/// Single FFT over eight consecutive elements (four packed vectors).
#[inline(always)]
unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6});
let out = self.perform_fft_direct(input_packed);
write_complex_to_array_strided!(out, buffer, 2, {0,1,2,3});
}
/// Two FFTs at once over sixteen consecutive elements.
///
/// The project macros load the packed vectors, regroup them for the parallel
/// kernel, regroup the results and store them (macro behavior assumed from
/// names - their definitions are outside this file).
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14});
let values = interleave_complex_f32!(input_packed, 4, {0, 1, 2, 3});
let out = self.perform_parallel_fft_direct(values);
let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6});
write_complex_to_array_strided!(out_sorted, buffer, 2, {0,1,2,3,4,5,6,7});
}
/// 8-point kernel for a single FFT packed as four vectors.
#[inline(always)]
unsafe fn perform_fft_direct(&self, values: [__m128; 4]) -> [__m128; 4] {
// Regroup the packed input; the names suggest even/odd element pairs.
let [in02, in13] = transpose_complex_2x2_f32(values[0], values[1]);
let [in46, in57] = transpose_complex_2x2_f32(values[2], values[3]);
// Two 4-point FFTs, one per regrouped half.
let val0 = self.bf4.perform_fft_direct(in02, in46);
let mut val2 = self.bf4.perform_fft_direct(in13, in57);
// Twiddle stage for the first result vector: the low half is kept as is, the
// high half becomes (rotated + original) scaled by sqrt(0.5).
let val2b = self.rotate90.rotate_hi(val2[0]);
let val2c = _mm_add_ps(val2b, val2[0]);
let val2d = _mm_mul_ps(val2c, self.root2);
val2[0] = extract_lo_hi_f32(val2[0], val2d);
// Twiddle stage for the second result vector: the low half is the plain
// rotation, the high half becomes (rotated - original) scaled by sqrt(0.5).
let val3b = self.rotate90.rotate_both(val2[1]);
let val3c = _mm_sub_ps(val3b, val2[1]);
let val3d = _mm_mul_ps(val3c, self.root2);
val2[1] = extract_lo_hi_f32(val3b, val3d);
// Final 2-point steps combining the two 4-point results.
let out0 = parallel_fft2_interleaved_f32(val0[0], val2[0]);
let out1 = parallel_fft2_interleaved_f32(val0[1], val2[1]);
// Sums first, then differences.
[out0[0], out1[0], out0[1], out1[1]]
}
/// 8-point kernel applied lane-wise to eight input vectors; returns the eight
/// outputs in order.
#[inline(always)]
unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 8]) -> [__m128; 8] {
// 4-point FFTs over the even-indexed and odd-indexed inputs.
let val03 = self
.bf4
.perform_parallel_fft_direct(values[0], values[2], values[4], values[6]);
let mut val47 = self
.bf4
.perform_parallel_fft_direct(values[1], values[3], values[5], values[7]);
// Twiddle stage on the odd results: element 0 is left untouched.
// Element 1: (rotated + original) * sqrt(0.5).
let val5b = self.rotate90.rotate_both(val47[1]);
let val5c = _mm_add_ps(val5b, val47[1]);
val47[1] = _mm_mul_ps(val5c, self.root2_dual);
// Element 2: plain rotation.
val47[2] = self.rotate90.rotate_both(val47[2]);
// Element 3: (rotated - original) * sqrt(0.5).
let val7b = self.rotate90.rotate_both(val47[3]);
let val7c = _mm_sub_ps(val7b, val47[3]);
val47[3] = _mm_mul_ps(val7c, self.root2_dual);
// Final 2-point steps combining even and odd results.
let out0 = parallel_fft2_interleaved_f32(val03[0], val47[0]);
let out1 = parallel_fft2_interleaved_f32(val03[1], val47[1]);
let out2 = parallel_fft2_interleaved_f32(val03[2], val47[2]);
let out3 = parallel_fft2_interleaved_f32(val03[3], val47[3]);
// Sums fill outputs 0..4, differences fill outputs 4..8.
[
out0[0], out1[0], out2[0], out3[0], out0[1], out1[1], out2[1], out3[1],
]
}
}
/// Size-8 FFT butterfly for `f64` data built on SSE intrinsics.
pub struct SseF64Butterfly8<T> {
// sqrt(0.5) in both lanes (loaded with `_mm_load1_pd`).
root2: __m128d,
// Direction this instance was constructed for; read by the boilerplate macros below.
direction: FftDirection,
// Size-4 butterfly applied to the even- and odd-indexed inputs.
bf4: SseF64Butterfly4<T>,
// Rotation helper, constructed with `true` for `FftDirection::Inverse`, `false` otherwise.
rotate90: Rotate90F64,
}
// f64 butterfly boilerplate; the macro is defined outside this view
// (presumably the f64 counterpart of `boilerplate_fft_sse_f32_butterfly!` -- confirm).
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly8, 8, |this: &SseF64Butterfly8<_>| this
.direction);
// Shared butterfly boilerplate; the macro body is defined elsewhere in this file.
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly8, 8, |this: &SseF64Butterfly8<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly8<T> {
    /// Creates a size-8 `f64` butterfly for the given `direction`.
    ///
    /// Checks the element type via `assert_f64::<T>()` and precomputes the sqrt(0.5)
    /// scaling vector and the rotation helper.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let inv_sqrt2 = 0.5f64.sqrt();
        Self {
            root2: unsafe { _mm_load1_pd(&inv_sqrt2) },
            direction,
            bf4: SseF64Butterfly4::new(direction),
            rotate90: Rotate90F64::new(direction == FftDirection::Inverse),
        }
    }

    /// Loads eight complex values from `buffer`, transforms them and stores them back.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let loaded = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7});
        let transformed = self.perform_fft_direct(loaded);
        write_complex_to_array!(transformed, buffer, {0, 1, 2, 3, 4, 5, 6, 7});
    }

    /// Size-8 FFT on eight vectors, one complex value per vector.
    ///
    /// Size-4 butterflies on the even and odd inputs, a rotate/scale step on the odd
    /// half, then four size-2 butterflies.
    #[inline(always)]
    unsafe fn perform_fft_direct(&self, values: [__m128d; 8]) -> [__m128d; 8] {
        let [v0, v1, v2, v3, v4, v5, v6, v7] = values;
        let evens = self.bf4.perform_fft_direct(v0, v2, v4, v6);
        let odds = self.bf4.perform_fft_direct(v1, v3, v5, v7);

        // Rotate/scale step on the odd half; element 0 is used unchanged.
        let odd1_rotated = self.rotate90.rotate(odds[1]);
        let odd1 = _mm_mul_pd(_mm_add_pd(odd1_rotated, odds[1]), self.root2);
        let odd2 = self.rotate90.rotate(odds[2]);
        let odd3_rotated = self.rotate90.rotate(odds[3]);
        let odd3 = _mm_mul_pd(_mm_sub_pd(odd3_rotated, odds[3]), self.root2);

        let pair0 = solo_fft2_f64(evens[0], odds[0]);
        let pair1 = solo_fft2_f64(evens[1], odd1);
        let pair2 = solo_fft2_f64(evens[2], odd2);
        let pair3 = solo_fft2_f64(evens[3], odd3);
        [
            pair0[0], pair1[0], pair2[0], pair3[0], pair0[1], pair1[1], pair2[1], pair3[1],
        ]
    }
}
/// Size-9 FFT butterfly for `f32` data built on SSE intrinsics.
pub struct SseF32Butterfly9<T> {
// Direction this instance was constructed for; read by the boilerplate macros below.
direction: FftDirection,
// Ties the struct to `T` without storing a value of that type.
_phantom: std::marker::PhantomData<T>,
// Size-3 butterfly used for both stages of the transform.
bf3: SseF32Butterfly3<T>,
// `compute_twiddle(1, 9, direction)` duplicated into both halves: [re, im, re, im].
twiddle1: __m128,
// `compute_twiddle(2, 9, direction)` duplicated into both halves.
twiddle2: __m128,
// `compute_twiddle(4, 9, direction)` duplicated into both halves.
twiddle4: __m128,
}
// Generates the `Complex<T>`-slice entry points (`perform_fft_butterfly`,
// `perform_parallel_fft_butterfly` and the `_multi` variants) that forward to
// `perform_fft_contiguous` / `perform_parallel_fft_contiguous`.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly9, 9, |this: &SseF32Butterfly9<_>| this
.direction);
// Shared butterfly boilerplate; the macro body is defined elsewhere in this file.
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly9, 9, |this: &SseF32Butterfly9<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly9<T> {
    /// Creates a size-9 `f32` butterfly for the given `direction`.
    ///
    /// Precomputes twiddles 1, 2 and 4 of a length-9 transform, each duplicated into
    /// both halves of a vector.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let w1: Complex<f32> = twiddles::compute_twiddle(1, 9, direction);
        let w2: Complex<f32> = twiddles::compute_twiddle(2, 9, direction);
        let w4: Complex<f32> = twiddles::compute_twiddle(4, 9, direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3: SseF32Butterfly3::new(direction),
            twiddle1: unsafe { _mm_set_ps(w1.im, w1.re, w1.im, w1.re) },
            twiddle2: unsafe { _mm_set_ps(w2.im, w2.re, w2.im, w2.re) },
            twiddle4: unsafe { _mm_set_ps(w4.im, w4.re, w4.im, w4.re) },
        }
    }

    /// Runs one size-9 FFT on `buffer`.
    ///
    /// Reuses the two-at-once kernel with partially loaded vectors and stores only the
    /// low complex value of each result vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let lanes = read_partial1_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8});
        let transformed = self.perform_parallel_fft_direct(lanes);
        for (index, vector) in transformed.iter().enumerate() {
            buffer.store_partial_lo_complex(*vector, index);
        }
    }

    /// Runs two size-9 FFTs stored back to back in `buffer` (18 complex values).
    ///
    /// Because 9 is odd, the boundary between the two FFTs falls in the middle of a
    /// loaded vector (assuming each load holds two consecutive complex values), so the
    /// pairing alternates between lo/hi and hi/lo extraction.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16});
        let [p0, p1, p2, p3, p4, p5, p6, p7, p8] = packed;

        let paired = [
            extract_lo_hi_f32(p0, p4),
            extract_hi_lo_f32(p0, p5),
            extract_lo_hi_f32(p1, p5),
            extract_hi_lo_f32(p1, p6),
            extract_lo_hi_f32(p2, p6),
            extract_hi_lo_f32(p2, p7),
            extract_lo_hi_f32(p3, p7),
            extract_hi_lo_f32(p3, p8),
            extract_lo_hi_f32(p4, p8),
        ];

        let [r0, r1, r2, r3, r4, r5, r6, r7, r8] = self.perform_parallel_fft_direct(paired);

        // Undo the pairing: low halves first, then the vector that straddles the
        // boundary, then high halves.
        let repacked = [
            extract_lo_lo_f32(r0, r1),
            extract_lo_lo_f32(r2, r3),
            extract_lo_lo_f32(r4, r5),
            extract_lo_lo_f32(r6, r7),
            extract_lo_hi_f32(r8, r0),
            extract_hi_hi_f32(r1, r2),
            extract_hi_hi_f32(r3, r4),
            extract_hi_hi_f32(r5, r6),
            extract_hi_hi_f32(r7, r8),
        ];
        write_complex_to_array_strided!(repacked, buffer, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8});
    }

    /// Two size-9 FFTs at once as a 3x3 decomposition.
    ///
    /// Size-3 butterflies on stride-3 inputs, twiddle multiplication, then size-3
    /// butterflies across the intermediate results.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 9]) -> [__m128; 9] {
        let [v0, v1, v2, v3, v4, v5, v6, v7, v8] = values;

        let [x0, x1, x2] = self.bf3.perform_parallel_fft_direct(v0, v3, v6);
        let [y0, y1, y2] = self.bf3.perform_parallel_fft_direct(v1, v4, v7);
        let [z0, z1, z2] = self.bf3.perform_parallel_fft_direct(v2, v5, v8);

        // twiddle2 is shared by the second and third groups.
        let y1 = mul_complex_f32(self.twiddle1, y1);
        let y2 = mul_complex_f32(self.twiddle2, y2);
        let z1 = mul_complex_f32(self.twiddle2, z1);
        let z2 = mul_complex_f32(self.twiddle4, z2);

        let [a0, a1, a2] = self.bf3.perform_parallel_fft_direct(x0, y0, z0);
        let [b0, b1, b2] = self.bf3.perform_parallel_fft_direct(x1, y1, z1);
        let [c0, c1, c2] = self.bf3.perform_parallel_fft_direct(x2, y2, z2);

        [a0, b0, c0, a1, b1, c1, a2, b2, c2]
    }
}
/// Size-9 FFT butterfly for `f64` data built on SSE intrinsics.
pub struct SseF64Butterfly9<T> {
// Direction this instance was constructed for; read by the boilerplate macros below.
direction: FftDirection,
// Ties the struct to `T` without storing a value of that type.
_phantom: std::marker::PhantomData<T>,
// Size-3 butterfly used for both stages of the transform.
bf3: SseF64Butterfly3<T>,
// `compute_twiddle(1, 9, direction)` as one vector: [re, im].
twiddle1: __m128d,
// `compute_twiddle(2, 9, direction)` as one vector.
twiddle2: __m128d,
// `compute_twiddle(4, 9, direction)` as one vector.
twiddle4: __m128d,
}
// f64 butterfly boilerplate; the macro is defined outside this view
// (presumably the f64 counterpart of `boilerplate_fft_sse_f32_butterfly!` -- confirm).
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly9, 9, |this: &SseF64Butterfly9<_>| this
.direction);
// Shared butterfly boilerplate; the macro body is defined elsewhere in this file.
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly9, 9, |this: &SseF64Butterfly9<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly9<T> {
    /// Creates a size-9 `f64` butterfly for the given `direction`.
    ///
    /// Precomputes twiddles 1, 2 and 4 of a length-9 transform as `[re, im]` vectors.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let w1: Complex<f64> = twiddles::compute_twiddle(1, 9, direction);
        let w2: Complex<f64> = twiddles::compute_twiddle(2, 9, direction);
        let w4: Complex<f64> = twiddles::compute_twiddle(4, 9, direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3: SseF64Butterfly3::new(direction),
            twiddle1: unsafe { _mm_set_pd(w1.im, w1.re) },
            twiddle2: unsafe { _mm_set_pd(w2.im, w2.re) },
            twiddle4: unsafe { _mm_set_pd(w4.im, w4.re) },
        }
    }

    /// Loads nine complex values from `buffer`, transforms them and stores them back.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let loaded = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8});
        let transformed = self.perform_fft_direct(loaded);
        write_complex_to_array!(transformed, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8});
    }

    /// Size-9 FFT as a 3x3 decomposition.
    ///
    /// Size-3 butterflies on stride-3 inputs, twiddle multiplication, then size-3
    /// butterflies across the intermediate results.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 9]) -> [__m128d; 9] {
        let [v0, v1, v2, v3, v4, v5, v6, v7, v8] = values;

        let [x0, x1, x2] = self.bf3.perform_fft_direct(v0, v3, v6);
        let [y0, y1, y2] = self.bf3.perform_fft_direct(v1, v4, v7);
        let [z0, z1, z2] = self.bf3.perform_fft_direct(v2, v5, v8);

        // twiddle2 is shared by the second and third groups.
        let y1 = mul_complex_f64(self.twiddle1, y1);
        let y2 = mul_complex_f64(self.twiddle2, y2);
        let z1 = mul_complex_f64(self.twiddle2, z1);
        let z2 = mul_complex_f64(self.twiddle4, z2);

        let [a0, a1, a2] = self.bf3.perform_fft_direct(x0, y0, z0);
        let [b0, b1, b2] = self.bf3.perform_fft_direct(x1, y1, z1);
        let [c0, c1, c2] = self.bf3.perform_fft_direct(x2, y2, z2);

        [a0, b0, c0, a1, b1, c1, a2, b2, c2]
    }
}
/// Size-10 FFT butterfly for `f32` data built on SSE intrinsics.
pub struct SseF32Butterfly10<T> {
// Direction this instance was constructed for; read by the boilerplate macros below.
direction: FftDirection,
// Ties the struct to `T` without storing a value of that type.
_phantom: std::marker::PhantomData<T>,
// Size-5 butterfly used for the first stage; the size-2 stage uses free functions.
bf5: SseF32Butterfly5<T>,
}
// Generates the `Complex<T>`-slice entry points (`perform_fft_butterfly`,
// `perform_parallel_fft_butterfly` and the `_multi` variants) that forward to
// `perform_fft_contiguous` / `perform_parallel_fft_contiguous`.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly10, 10, |this: &SseF32Butterfly10<_>| this
.direction);
// Shared butterfly boilerplate; the macro body is defined elsewhere in this file.
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly10, 10, |this: &SseF32Butterfly10<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly10<T> {
    /// Creates a size-10 `f32` butterfly for the given `direction`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf5: SseF32Butterfly5::new(direction),
        }
    }

    /// Runs one size-10 FFT on `buffer`: five vector loads, the kernel, five vector stores.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let packed_in = read_complex_to_array!(buffer, {0, 2, 4, 6, 8});
        let packed_out = self.perform_fft_direct(packed_in);
        write_complex_to_array_strided!(packed_out, buffer, 2, {0, 1, 2, 3, 4});
    }

    /// Runs two size-10 FFTs stored back to back in `buffer` (20 complex values).
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let packed_in = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18});
        let interleaved = interleave_complex_f32!(packed_in, 5, {0, 1, 2, 3, 4});
        let transformed = self.perform_parallel_fft_direct(interleaved);
        let separated = separate_interleaved_complex_f32!(transformed, {0, 2, 4, 6, 8});
        write_complex_to_array_strided!(separated, buffer, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
    }

    /// Single size-10 FFT on five packed vectors.
    ///
    /// The inputs are re-paired so one call to the two-at-once size-5 butterfly handles
    /// both groups. No twiddle factors are applied between that stage and the size-2
    /// butterflies (prime-factor style 5x2 decomposition).
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128; 5]) -> [__m128; 5] {
        let [v0, v1, v2, v3, v4] = values;

        let fives = self.bf5.perform_parallel_fft_direct(
            extract_lo_hi_f32(v0, v2),
            extract_lo_hi_f32(v1, v3),
            extract_lo_hi_f32(v2, v4),
            extract_lo_hi_f32(v3, v0),
            extract_lo_hi_f32(v4, v1),
        );

        let [pair_a, pair_b] = parallel_fft2_contiguous_f32(fives[0], fives[1]);
        let [pair_c, pair_d] = parallel_fft2_contiguous_f32(fives[2], fives[3]);
        let pair_e = solo_fft2_f32(fives[4]);

        // Shuffle the size-2 results back into output order.
        [
            extract_lo_hi_f32(pair_a, pair_b),
            extract_lo_hi_f32(pair_c, pair_d),
            extract_lo_lo_f32(pair_e, pair_b),
            extract_hi_lo_f32(pair_a, pair_d),
            extract_hi_hi_f32(pair_c, pair_e),
        ]
    }

    /// Two size-10 FFTs at once; every vector carries one element from each FFT.
    ///
    /// Size-5 butterflies on permuted inputs, then size-2 butterflies, with no twiddle
    /// factors in between.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 10]) -> [__m128; 10] {
        let [v0, v1, v2, v3, v4, v5, v6, v7, v8, v9] = values;

        let first = self.bf5.perform_parallel_fft_direct(v0, v2, v4, v6, v8);
        let second = self.bf5.perform_parallel_fft_direct(v5, v7, v9, v1, v3);

        let [a0, a1] = parallel_fft2_interleaved_f32(first[0], second[0]);
        let [b0, b1] = parallel_fft2_interleaved_f32(first[1], second[1]);
        let [c0, c1] = parallel_fft2_interleaved_f32(first[2], second[2]);
        let [d0, d1] = parallel_fft2_interleaved_f32(first[3], second[3]);
        let [e0, e1] = parallel_fft2_interleaved_f32(first[4], second[4]);

        [a0, b1, c0, d1, e0, a1, b0, c1, d0, e1]
    }
}
/// Size-10 FFT butterfly for `f64` data built on SSE intrinsics.
pub struct SseF64Butterfly10<T> {
// Direction this instance was constructed for; read by the boilerplate macros below.
direction: FftDirection,
// Ties the struct to `T` without storing a value of that type.
_phantom: std::marker::PhantomData<T>,
// Size-2 butterfly used for the second stage.
bf2: SseF64Butterfly2<T>,
// Size-5 butterfly used for the first stage.
bf5: SseF64Butterfly5<T>,
}
// f64 butterfly boilerplate; the macro is defined outside this view
// (presumably the f64 counterpart of `boilerplate_fft_sse_f32_butterfly!` -- confirm).
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly10, 10, |this: &SseF64Butterfly10<_>| this
.direction);
// Shared butterfly boilerplate; the macro body is defined elsewhere in this file.
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly10, 10, |this: &SseF64Butterfly10<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly10<T> {
    /// Creates a size-10 `f64` butterfly for the given `direction`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf2: SseF64Butterfly2::new(direction),
            bf5: SseF64Butterfly5::new(direction),
        }
    }

    /// Loads ten complex values from `buffer`, transforms them and stores them back.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let loaded = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
        let transformed = self.perform_fft_direct(loaded);
        write_complex_to_array!(transformed, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
    }

    /// Size-10 FFT as a 5x2 decomposition.
    ///
    /// Size-5 butterflies on permuted inputs, then size-2 butterflies, with no twiddle
    /// factors in between.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 10]) -> [__m128d; 10] {
        let [v0, v1, v2, v3, v4, v5, v6, v7, v8, v9] = values;

        let first = self.bf5.perform_fft_direct(v0, v2, v4, v6, v8);
        let second = self.bf5.perform_fft_direct(v5, v7, v9, v1, v3);

        let [a0, a1] = self.bf2.perform_fft_direct(first[0], second[0]);
        let [b0, b1] = self.bf2.perform_fft_direct(first[1], second[1]);
        let [c0, c1] = self.bf2.perform_fft_direct(first[2], second[2]);
        let [d0, d1] = self.bf2.perform_fft_direct(first[3], second[3]);
        let [e0, e1] = self.bf2.perform_fft_direct(first[4], second[4]);

        [a0, b1, c0, d1, e0, a1, b0, c1, d0, e1]
    }
}
/// Size-12 FFT butterfly for `f32` data built on SSE intrinsics.
pub struct SseF32Butterfly12<T> {
// Direction this instance was constructed for; read by the boilerplate macros below.
direction: FftDirection,
// Ties the struct to `T` without storing a value of that type.
_phantom: std::marker::PhantomData<T>,
// Size-3 butterfly used for the second stage.
bf3: SseF32Butterfly3<T>,
// Size-4 butterfly used for the first stage.
bf4: SseF32Butterfly4<T>,
}
// Generates the `Complex<T>`-slice entry points (`perform_fft_butterfly`,
// `perform_parallel_fft_butterfly` and the `_multi` variants) that forward to
// `perform_fft_contiguous` / `perform_parallel_fft_contiguous`.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly12, 12, |this: &SseF32Butterfly12<_>| this
.direction);
// Shared butterfly boilerplate; the macro body is defined elsewhere in this file.
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly12, 12, |this: &SseF32Butterfly12<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly12<T> {
    /// Creates a size-12 `f32` butterfly for the given `direction`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3: SseF32Butterfly3::new(direction),
            bf4: SseF32Butterfly4::new(direction),
        }
    }

    /// Runs one size-12 FFT on `buffer`: six vector loads, the kernel, six vector stores.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let packed_in = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10});
        let packed_out = self.perform_fft_direct(packed_in);
        write_complex_to_array_strided!(packed_out, buffer, 2, {0, 1, 2, 3, 4, 5});
    }

    /// Runs two size-12 FFTs stored back to back in `buffer` (24 complex values).
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let packed_in =
            read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22});
        let interleaved = interleave_complex_f32!(packed_in, 6, {0, 1, 2, 3, 4, 5});
        let transformed = self.perform_parallel_fft_direct(interleaved);
        let separated = separate_interleaved_complex_f32!(transformed, {0, 2, 4, 6, 8, 10});
        write_complex_to_array_strided!(separated, buffer, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
    }

    /// Single size-12 FFT on six packed vectors.
    ///
    /// Inputs are re-paired for three size-4 butterflies, followed by two calls to the
    /// two-at-once size-3 butterfly; no twiddle factors are applied between the stages
    /// (prime-factor style 4x3 decomposition).
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128; 6]) -> [__m128; 6] {
        let [v0, v1, v2, v3, v4, v5] = values;

        let fours0 = self
            .bf4
            .perform_fft_direct(extract_lo_hi_f32(v0, v1), extract_lo_hi_f32(v3, v4));
        let fours1 = self
            .bf4
            .perform_fft_direct(extract_lo_hi_f32(v2, v3), extract_lo_hi_f32(v5, v0));
        let fours2 = self
            .bf4
            .perform_fft_direct(extract_lo_hi_f32(v4, v5), extract_lo_hi_f32(v1, v2));

        let [a0, a1, a2] = self
            .bf3
            .perform_parallel_fft_direct(fours0[0], fours1[0], fours2[0]);
        let [b0, b1, b2] = self
            .bf3
            .perform_parallel_fft_direct(fours0[1], fours1[1], fours2[1]);

        // Shuffle the size-3 results back into output order.
        [
            extract_lo_hi_f32(a0, a1),
            extract_lo_hi_f32(b2, b0),
            extract_lo_hi_f32(a1, a2),
            extract_lo_hi_f32(b0, b1),
            extract_lo_hi_f32(a2, a0),
            extract_lo_hi_f32(b1, b2),
        ]
    }

    /// Two size-12 FFTs at once; every vector carries one element from each FFT.
    ///
    /// Size-4 butterflies on permuted inputs, then size-3 butterflies, with no twiddle
    /// factors in between.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 12]) -> [__m128; 12] {
        let [v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11] = values;

        let fours0 = self.bf4.perform_parallel_fft_direct(v0, v3, v6, v9);
        let fours1 = self.bf4.perform_parallel_fft_direct(v4, v7, v10, v1);
        let fours2 = self.bf4.perform_parallel_fft_direct(v8, v11, v2, v5);

        let [a0, a1, a2] = self
            .bf3
            .perform_parallel_fft_direct(fours0[0], fours1[0], fours2[0]);
        let [b0, b1, b2] = self
            .bf3
            .perform_parallel_fft_direct(fours0[1], fours1[1], fours2[1]);
        let [c0, c1, c2] = self
            .bf3
            .perform_parallel_fft_direct(fours0[2], fours1[2], fours2[2]);
        let [d0, d1, d2] = self
            .bf3
            .perform_parallel_fft_direct(fours0[3], fours1[3], fours2[3]);

        [a0, b1, c2, d0, a1, b2, c0, d1, a2, b0, c1, d2]
    }
}
/// Size-12 FFT butterfly for `f64` data built on SSE intrinsics.
pub struct SseF64Butterfly12<T> {
// Direction this instance was constructed for; read by the boilerplate macros below.
direction: FftDirection,
// Ties the struct to `T` without storing a value of that type.
_phantom: std::marker::PhantomData<T>,
// Size-3 butterfly used for the second stage.
bf3: SseF64Butterfly3<T>,
// Size-4 butterfly used for the first stage.
bf4: SseF64Butterfly4<T>,
}
// f64 butterfly boilerplate; the macro is defined outside this view
// (presumably the f64 counterpart of `boilerplate_fft_sse_f32_butterfly!` -- confirm).
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly12, 12, |this: &SseF64Butterfly12<_>| this
.direction);
// Shared butterfly boilerplate; the macro body is defined elsewhere in this file.
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly12, 12, |this: &SseF64Butterfly12<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly12<T> {
    /// Creates a size-12 `f64` butterfly for the given `direction`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3: SseF64Butterfly3::new(direction),
            bf4: SseF64Butterfly4::new(direction),
        }
    }

    /// Loads twelve complex values from `buffer`, transforms them and stores them back.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let loaded = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
        let transformed = self.perform_fft_direct(loaded);
        write_complex_to_array!(transformed, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
    }

    /// Size-12 FFT as a 4x3 decomposition.
    ///
    /// Size-4 butterflies on permuted inputs, then size-3 butterflies, with no twiddle
    /// factors in between.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 12]) -> [__m128d; 12] {
        let [v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11] = values;

        let fours0 = self.bf4.perform_fft_direct(v0, v3, v6, v9);
        let fours1 = self.bf4.perform_fft_direct(v4, v7, v10, v1);
        let fours2 = self.bf4.perform_fft_direct(v8, v11, v2, v5);

        let [a0, a1, a2] = self.bf3.perform_fft_direct(fours0[0], fours1[0], fours2[0]);
        let [b0, b1, b2] = self.bf3.perform_fft_direct(fours0[1], fours1[1], fours2[1]);
        let [c0, c1, c2] = self.bf3.perform_fft_direct(fours0[2], fours1[2], fours2[2]);
        let [d0, d1, d2] = self.bf3.perform_fft_direct(fours0[3], fours1[3], fours2[3]);

        [a0, b1, c2, d0, a1, b2, c0, d1, a2, b0, c1, d2]
    }
}
/// Size-15 FFT butterfly for `f32` data built on SSE intrinsics.
pub struct SseF32Butterfly15<T> {
// Direction this instance was constructed for; read by the boilerplate macros below.
direction: FftDirection,
// Ties the struct to `T` without storing a value of that type.
_phantom: std::marker::PhantomData<T>,
// Size-3 butterfly used for the second stage.
bf3: SseF32Butterfly3<T>,
// Size-5 butterfly used for the first stage.
bf5: SseF32Butterfly5<T>,
}
// Generates the `Complex<T>`-slice entry points (`perform_fft_butterfly`,
// `perform_parallel_fft_butterfly` and the `_multi` variants) that forward to
// `perform_fft_contiguous` / `perform_parallel_fft_contiguous`.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly15, 15, |this: &SseF32Butterfly15<_>| this
.direction);
// Shared butterfly boilerplate; the macro body is defined elsewhere in this file.
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly15, 15, |this: &SseF32Butterfly15<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly15<T> {
    /// Creates a size-15 `f32` butterfly for the given `direction`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3: SseF32Butterfly3::new(direction),
            bf5: SseF32Butterfly5::new(direction),
        }
    }

    /// Runs one size-15 FFT on `buffer`.
    ///
    /// Reuses the two-at-once kernel with partially loaded vectors and stores only the
    /// low complex value of each result vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let lanes =
            read_partial1_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
        let transformed = self.perform_parallel_fft_direct(lanes);
        for (index, vector) in transformed.iter().enumerate() {
            buffer.store_partial_lo_complex(*vector, index);
        }
    }

    /// Runs two size-15 FFTs stored back to back in `buffer` (30 complex values).
    ///
    /// Because 15 is odd, the boundary between the two FFTs falls in the middle of a
    /// loaded vector (assuming each load holds two consecutive complex values), so the
    /// pairing alternates between lo/hi and hi/lo extraction.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let packed =
            read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28});
        let [p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14] = packed;

        let paired = [
            extract_lo_hi_f32(p0, p7),
            extract_hi_lo_f32(p0, p8),
            extract_lo_hi_f32(p1, p8),
            extract_hi_lo_f32(p1, p9),
            extract_lo_hi_f32(p2, p9),
            extract_hi_lo_f32(p2, p10),
            extract_lo_hi_f32(p3, p10),
            extract_hi_lo_f32(p3, p11),
            extract_lo_hi_f32(p4, p11),
            extract_hi_lo_f32(p4, p12),
            extract_lo_hi_f32(p5, p12),
            extract_hi_lo_f32(p5, p13),
            extract_lo_hi_f32(p6, p13),
            extract_hi_lo_f32(p6, p14),
            extract_lo_hi_f32(p7, p14),
        ];

        let [r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14] =
            self.perform_parallel_fft_direct(paired);

        // Undo the pairing: low halves first, then the vector that straddles the
        // boundary, then high halves.
        let repacked = [
            extract_lo_lo_f32(r0, r1),
            extract_lo_lo_f32(r2, r3),
            extract_lo_lo_f32(r4, r5),
            extract_lo_lo_f32(r6, r7),
            extract_lo_lo_f32(r8, r9),
            extract_lo_lo_f32(r10, r11),
            extract_lo_lo_f32(r12, r13),
            extract_lo_hi_f32(r14, r0),
            extract_hi_hi_f32(r1, r2),
            extract_hi_hi_f32(r3, r4),
            extract_hi_hi_f32(r5, r6),
            extract_hi_hi_f32(r7, r8),
            extract_hi_hi_f32(r9, r10),
            extract_hi_hi_f32(r11, r12),
            extract_hi_hi_f32(r13, r14),
        ];
        write_complex_to_array_strided!(repacked, buffer, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
    }

    /// Two size-15 FFTs at once; every vector carries one element from each FFT.
    ///
    /// Size-5 butterflies on permuted inputs, then size-3 butterflies, with no twiddle
    /// factors in between (prime-factor style 5x3 decomposition).
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 15]) -> [__m128; 15] {
        let [v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14] = values;

        let fives0 = self.bf5.perform_parallel_fft_direct(v0, v3, v6, v9, v12);
        let fives1 = self.bf5.perform_parallel_fft_direct(v5, v8, v11, v14, v2);
        let fives2 = self.bf5.perform_parallel_fft_direct(v10, v13, v1, v4, v7);

        let [a0, a1, a2] = self
            .bf3
            .perform_parallel_fft_direct(fives0[0], fives1[0], fives2[0]);
        let [b0, b1, b2] = self
            .bf3
            .perform_parallel_fft_direct(fives0[1], fives1[1], fives2[1]);
        let [c0, c1, c2] = self
            .bf3
            .perform_parallel_fft_direct(fives0[2], fives1[2], fives2[2]);
        let [d0, d1, d2] = self
            .bf3
            .perform_parallel_fft_direct(fives0[3], fives1[3], fives2[3]);
        let [e0, e1, e2] = self
            .bf3
            .perform_parallel_fft_direct(fives0[4], fives1[4], fives2[4]);

        [
            a0, b1, c2, d0, e1, a2, b0, c1, d2, e0, a1, b2, c0, d1, e2,
        ]
    }
}
/// Size-15 FFT butterfly for `f64` data built on SSE intrinsics.
pub struct SseF64Butterfly15<T> {
// Direction this instance was constructed for; read by the boilerplate macros below.
direction: FftDirection,
// Ties the struct to `T` without storing a value of that type.
_phantom: std::marker::PhantomData<T>,
// Size-3 butterfly used for the second stage.
bf3: SseF64Butterfly3<T>,
// Size-5 butterfly used for the first stage.
bf5: SseF64Butterfly5<T>,
}
// f64 butterfly boilerplate; the macro is defined outside this view
// (presumably the f64 counterpart of `boilerplate_fft_sse_f32_butterfly!` -- confirm).
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly15, 15, |this: &SseF64Butterfly15<_>| this
.direction);
// Shared butterfly boilerplate; the macro body is defined elsewhere in this file.
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly15, 15, |this: &SseF64Butterfly15<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly15<T> {
    /// Creates a size-15 `f64` butterfly for the given `direction`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3: SseF64Butterfly3::new(direction),
            bf5: SseF64Butterfly5::new(direction),
        }
    }

    /// Loads fifteen complex values from `buffer`, transforms them and stores them back.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let loaded =
            read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
        let transformed = self.perform_fft_direct(loaded);
        write_complex_to_array!(transformed, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
    }

    /// Size-15 FFT as a 5x3 decomposition.
    ///
    /// Size-5 butterflies on permuted inputs, then size-3 butterflies, with no twiddle
    /// factors in between.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 15]) -> [__m128d; 15] {
        let [v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14] = values;

        let fives0 = self.bf5.perform_fft_direct(v0, v3, v6, v9, v12);
        let fives1 = self.bf5.perform_fft_direct(v5, v8, v11, v14, v2);
        let fives2 = self.bf5.perform_fft_direct(v10, v13, v1, v4, v7);

        let [a0, a1, a2] = self.bf3.perform_fft_direct(fives0[0], fives1[0], fives2[0]);
        let [b0, b1, b2] = self.bf3.perform_fft_direct(fives0[1], fives1[1], fives2[1]);
        let [c0, c1, c2] = self.bf3.perform_fft_direct(fives0[2], fives1[2], fives2[2]);
        let [d0, d1, d2] = self.bf3.perform_fft_direct(fives0[3], fives1[3], fives2[3]);
        let [e0, e1, e2] = self.bf3.perform_fft_direct(fives0[4], fives1[4], fives2[4]);

        [
            a0, b1, c2, d0, e1, a2, b0, c1, d2, e0, a1, b2, c0, d1, e2,
        ]
    }
}
/// Size-16 FFT butterfly for `f32` data built on SSE intrinsics.
///
/// Holds two sets of twiddles: the packed `twiddleNN` vectors for the single-FFT kernel
/// and the duplicated `twiddleN` vectors for the two-FFTs-at-once kernel.
pub struct SseF32Butterfly16<T> {
// Direction this instance was constructed for; read by the boilerplate macros below.
direction: FftDirection,
// Size-4 butterfly applied to the odd-indexed inputs.
bf4: SseF32Butterfly4<T>,
// Size-8 butterfly applied to the even-indexed inputs.
bf8: SseF32Butterfly8<T>,
// Rotation helper, constructed with `true` for `FftDirection::Inverse`, `false` otherwise.
rotate90: Rotate90F32,
// [1, 0, tw1.re, tw1.im]: the identity in the low half, twiddle 1 of 16 in the high half.
twiddle01: __m128,
// [tw2.re, tw2.im, tw3.re, tw3.im]: twiddles 2 and 3 of 16.
twiddle23: __m128,
// Same as `twiddle01` with the imaginary part of twiddle 1 negated.
twiddle01conj: __m128,
// Same as `twiddle23` with both imaginary parts negated.
twiddle23conj: __m128,
// Twiddle 1 of 16 duplicated into both halves: [re, im, re, im].
twiddle1: __m128,
// Twiddle 2 of 16 duplicated into both halves.
twiddle2: __m128,
// Twiddle 3 of 16 duplicated into both halves.
twiddle3: __m128,
// `twiddle1` with negated imaginary parts.
twiddle1c: __m128,
// `twiddle2` with negated imaginary parts.
twiddle2c: __m128,
// `twiddle3` with negated imaginary parts.
twiddle3c: __m128,
}
// Generates the `Complex<T>`-slice entry points (`perform_fft_butterfly`,
// `perform_parallel_fft_butterfly` and the `_multi` variants) that forward to
// `perform_fft_contiguous` / `perform_parallel_fft_contiguous`.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly16, 16, |this: &SseF32Butterfly16<_>| this
.direction);
// Shared butterfly boilerplate; the macro body is defined elsewhere in this file.
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly16, 16, |this: &SseF32Butterfly16<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly16<T> {
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let bf8 = SseF32Butterfly8::new(direction);
let bf4 = SseF32Butterfly4::new(direction);
let rotate90 = if direction == FftDirection::Inverse {
Rotate90F32::new(true)
} else {
Rotate90F32::new(false)
};
let tw1: Complex<f32> = twiddles::compute_twiddle(1, 16, direction);
let tw2: Complex<f32> = twiddles::compute_twiddle(2, 16, direction);
let tw3: Complex<f32> = twiddles::compute_twiddle(3, 16, direction);
let twiddle01 = unsafe { _mm_set_ps(tw1.im, tw1.re, 0.0, 1.0) };
let twiddle23 = unsafe { _mm_set_ps(tw3.im, tw3.re, tw2.im, tw2.re) };
let twiddle01conj = unsafe { _mm_set_ps(-tw1.im, tw1.re, 0.0, 1.0) };
let twiddle23conj = unsafe { _mm_set_ps(-tw3.im, tw3.re, -tw2.im, tw2.re) };
let twiddle1 = unsafe { _mm_set_ps(tw1.im, tw1.re, tw1.im, tw1.re) };
let twiddle2 = unsafe { _mm_set_ps(tw2.im, tw2.re, tw2.im, tw2.re) };
let twiddle3 = unsafe { _mm_set_ps(tw3.im, tw3.re, tw3.im, tw3.re) };
let twiddle1c = unsafe { _mm_set_ps(-tw1.im, tw1.re, -tw1.im, tw1.re) };
let twiddle2c = unsafe { _mm_set_ps(-tw2.im, tw2.re, -tw2.im, tw2.re) };
let twiddle3c = unsafe { _mm_set_ps(-tw3.im, tw3.re, -tw3.im, tw3.re) };
Self {
direction,
bf4,
bf8,
rotate90,
twiddle01,
twiddle23,
twiddle01conj,
twiddle23conj,
twiddle1,
twiddle2,
twiddle3,
twiddle1c,
twiddle2c,
twiddle3c,
}
}
#[inline(always)]
unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14 });
let out = self.perform_fft_direct(input_packed);
write_complex_to_array_strided!(out, buffer, 2, {0,1,2,3,4,5,6,7});
}
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
let values = interleave_complex_f32!(input_packed, 8, {0, 1, 2, 3 ,4 ,5 ,6 ,7});
let out = self.perform_parallel_fft_direct(values);
let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6, 8, 10, 12, 14});
write_complex_to_array_strided!(out_sorted, buffer, 2, {0,1,2,3,4,5,6,7,8,9, 10, 11,12,13,14, 15});
}
#[inline(always)]
unsafe fn perform_fft_direct(&self, input: [__m128; 8]) -> [__m128; 8] {
let in0002 = extract_lo_lo_f32(input[0], input[1]);
let in0406 = extract_lo_lo_f32(input[2], input[3]);
let in0810 = extract_lo_lo_f32(input[4], input[5]);
let in1214 = extract_lo_lo_f32(input[6], input[7]);
let in0105 = extract_hi_hi_f32(input[0], input[2]);
let in0913 = extract_hi_hi_f32(input[4], input[6]);
let in1503 = extract_hi_hi_f32(input[7], input[1]);
let in0711 = extract_hi_hi_f32(input[3], input[5]);
let in_evens = [in0002, in0406, in0810, in1214];
let evens = self.bf8.perform_fft_direct(in_evens);
let mut odds1 = self.bf4.perform_fft_direct(in0105, in0913);
let mut odds3 = self.bf4.perform_fft_direct(in1503, in0711);
odds1[0] = mul_complex_f32(odds1[0], self.twiddle01);
odds3[0] = mul_complex_f32(odds3[0], self.twiddle01conj);
odds1[1] = mul_complex_f32(odds1[1], self.twiddle23);
odds3[1] = mul_complex_f32(odds3[1], self.twiddle23conj);
let mut temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]);
let mut temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]);
temp0[1] = self.rotate90.rotate_both(temp0[1]);
temp1[1] = self.rotate90.rotate_both(temp1[1]);
[
_mm_add_ps(evens[0], temp0[0]),
_mm_add_ps(evens[1], temp1[0]),
_mm_add_ps(evens[2], temp0[1]),
_mm_add_ps(evens[3], temp1[1]),
_mm_sub_ps(evens[0], temp0[0]),
_mm_sub_ps(evens[1], temp1[0]),
_mm_sub_ps(evens[2], temp0[1]),
_mm_sub_ps(evens[3], temp1[1]),
]
}
#[inline(always)]
unsafe fn perform_parallel_fft_direct(&self, input: [__m128; 16]) -> [__m128; 16] {
let evens = self.bf8.perform_parallel_fft_direct([
input[0], input[2], input[4], input[6], input[8], input[10], input[12], input[14],
]);
let mut odds1 = self
.bf4
.perform_parallel_fft_direct(input[1], input[5], input[9], input[13]);
let mut odds3 = self
.bf4
.perform_parallel_fft_direct(input[15], input[3], input[7], input[11]);
odds1[1] = mul_complex_f32(odds1[1], self.twiddle1);
odds3[1] = mul_complex_f32(odds3[1], self.twiddle1c);
odds1[2] = mul_complex_f32(odds1[2], self.twiddle2);
odds3[2] = mul_complex_f32(odds3[2], self.twiddle2c);
odds1[3] = mul_complex_f32(odds1[3], self.twiddle3);
odds3[3] = mul_complex_f32(odds3[3], self.twiddle3c);
let mut temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]);
let mut temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]);
let mut temp2 = parallel_fft2_interleaved_f32(odds1[2], odds3[2]);
let mut temp3 = parallel_fft2_interleaved_f32(odds1[3], odds3[3]);
temp0[1] = self.rotate90.rotate_both(temp0[1]);
temp1[1] = self.rotate90.rotate_both(temp1[1]);
temp2[1] = self.rotate90.rotate_both(temp2[1]);
temp3[1] = self.rotate90.rotate_both(temp3[1]);
[
_mm_add_ps(evens[0], temp0[0]),
_mm_add_ps(evens[1], temp1[0]),
_mm_add_ps(evens[2], temp2[0]),
_mm_add_ps(evens[3], temp3[0]),
_mm_add_ps(evens[4], temp0[1]),
_mm_add_ps(evens[5], temp1[1]),
_mm_add_ps(evens[6], temp2[1]),
_mm_add_ps(evens[7], temp3[1]),
_mm_sub_ps(evens[0], temp0[0]),
_mm_sub_ps(evens[1], temp1[0]),
_mm_sub_ps(evens[2], temp2[0]),
_mm_sub_ps(evens[3], temp3[0]),
_mm_sub_ps(evens[4], temp0[1]),
_mm_sub_ps(evens[5], temp1[1]),
_mm_sub_ps(evens[6], temp2[1]),
_mm_sub_ps(evens[7], temp3[1]),
]
}
}
/// Size-16 FFT butterfly for `f64` data built on SSE intrinsics.
pub struct SseF64Butterfly16<T> {
// Direction this instance was constructed for; read by the boilerplate macros below.
direction: FftDirection,
// Size-4 butterfly applied to the odd-indexed inputs.
bf4: SseF64Butterfly4<T>,
// Size-8 butterfly applied to the even-indexed inputs.
bf8: SseF64Butterfly8<T>,
// Rotation helper, constructed with `true` for `FftDirection::Inverse`, `false` otherwise.
rotate90: Rotate90F64,
// Twiddle 1 of 16, loaded as two consecutive f64 starting at the `re` field
// (presumably [re, im] -- relies on the field layout of `Complex`).
twiddle1: __m128d,
// Twiddle 2 of 16, loaded the same way.
twiddle2: __m128d,
// Twiddle 3 of 16, loaded the same way.
twiddle3: __m128d,
// Complex conjugate (`.conj()`) of twiddle 1.
twiddle1c: __m128d,
// Complex conjugate of twiddle 2.
twiddle2c: __m128d,
// Complex conjugate of twiddle 3.
twiddle3c: __m128d,
}
// f64 butterfly boilerplate; the macro is defined outside this view
// (presumably the f64 counterpart of `boilerplate_fft_sse_f32_butterfly!` -- confirm).
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly16, 16, |this: &SseF64Butterfly16<_>| this
.direction);
// Shared butterfly boilerplate; the macro body is defined elsewhere in this file.
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly16, 16, |this: &SseF64Butterfly16<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly16<T> {
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
let bf8 = SseF64Butterfly8::new(direction);
let bf4 = SseF64Butterfly4::new(direction);
let rotate90 = if direction == FftDirection::Inverse {
Rotate90F64::new(true)
} else {
Rotate90F64::new(false)
};
let twiddle1 =
unsafe { _mm_loadu_pd(&twiddles::compute_twiddle(1, 16, direction).re as *const f64) };
let twiddle2 =
unsafe { _mm_loadu_pd(&twiddles::compute_twiddle(2, 16, direction).re as *const f64) };
let twiddle3 =
unsafe { _mm_loadu_pd(&twiddles::compute_twiddle(3, 16, direction).re as *const f64) };
let twiddle1c = unsafe {
_mm_loadu_pd(&twiddles::compute_twiddle(1, 16, direction).conj().re as *const f64)
};
let twiddle2c = unsafe {
_mm_loadu_pd(&twiddles::compute_twiddle(2, 16, direction).conj().re as *const f64)
};
let twiddle3c = unsafe {
_mm_loadu_pd(&twiddles::compute_twiddle(3, 16, direction).conj().re as *const f64)
};
Self {
direction,
bf4,
bf8,
rotate90,
twiddle1,
twiddle2,
twiddle3,
twiddle1c,
twiddle2c,
twiddle3c,
}
}
#[inline(always)]
unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
let values =
read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
let out = self.perform_fft_direct(values);
write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
}
#[inline(always)]
unsafe fn perform_fft_direct(&self, input: [__m128d; 16]) -> [__m128d; 16] {
let evens = self.bf8.perform_fft_direct([
input[0], input[2], input[4], input[6], input[8], input[10], input[12], input[14],
]);
let mut odds1 = self
.bf4
.perform_fft_direct(input[1], input[5], input[9], input[13]);
let mut odds3 = self
.bf4
.perform_fft_direct(input[15], input[3], input[7], input[11]);
odds1[1] = mul_complex_f64(odds1[1], self.twiddle1);
odds3[1] = mul_complex_f64(odds3[1], self.twiddle1c);
odds1[2] = mul_complex_f64(odds1[2], self.twiddle2);
odds3[2] = mul_complex_f64(odds3[2], self.twiddle2c);
odds1[3] = mul_complex_f64(odds1[3], self.twiddle3);
odds3[3] = mul_complex_f64(odds3[3], self.twiddle3c);
let mut temp0 = solo_fft2_f64(odds1[0], odds3[0]);
let mut temp1 = solo_fft2_f64(odds1[1], odds3[1]);
let mut temp2 = solo_fft2_f64(odds1[2], odds3[2]);
let mut temp3 = solo_fft2_f64(odds1[3], odds3[3]);
temp0[1] = self.rotate90.rotate(temp0[1]);
temp1[1] = self.rotate90.rotate(temp1[1]);
temp2[1] = self.rotate90.rotate(temp2[1]);
temp3[1] = self.rotate90.rotate(temp3[1]);
[
_mm_add_pd(evens[0], temp0[0]),
_mm_add_pd(evens[1], temp1[0]),
_mm_add_pd(evens[2], temp2[0]),
_mm_add_pd(evens[3], temp3[0]),
_mm_add_pd(evens[4], temp0[1]),
_mm_add_pd(evens[5], temp1[1]),
_mm_add_pd(evens[6], temp2[1]),
_mm_add_pd(evens[7], temp3[1]),
_mm_sub_pd(evens[0], temp0[0]),
_mm_sub_pd(evens[1], temp1[0]),
_mm_sub_pd(evens[2], temp2[0]),
_mm_sub_pd(evens[3], temp3[0]),
_mm_sub_pd(evens[4], temp0[1]),
_mm_sub_pd(evens[5], temp1[1]),
_mm_sub_pd(evens[6], temp2[1]),
_mm_sub_pd(evens[7], temp3[1]),
]
}
}
/// SSE butterfly computing a length-32 FFT on `f32` complex data. An `__m128`
/// register holds two complex values, so the struct keeps two sets of twiddles:
/// one for the single-FFT kernel and one for the kernel that processes two
/// FFTs at once.
pub struct SseF32Butterfly32<T> {
// Transform direction given to `new`; the boilerplate macros below read it
// through their closure argument.
direction: FftDirection,
// Length-8 butterfly, run on each of the two odd-index input groups.
bf8: SseF32Butterfly8<T>,
// Length-16 butterfly, run on the even-index inputs.
bf16: SseF32Butterfly16<T>,
// Rotation helper, built with `true` for inverse transforms and `false` otherwise.
rotate90: Rotate90F32,
// Twiddles for the single-FFT kernel: `twiddleAB` packs twiddle A in the low
// half and twiddle B in the high half, where twiddle k is
// `compute_twiddle(k, 32, direction)`. `twiddle01` uses 1 + 0i for slot 0.
twiddle01: __m128,
twiddle23: __m128,
twiddle45: __m128,
twiddle67: __m128,
// Same pairs with the imaginary parts of twiddles 1..=7 negated; slot 0 of
// `twiddle01conj` stays 1 + 0i.
twiddle01conj: __m128,
twiddle23conj: __m128,
twiddle45conj: __m128,
twiddle67conj: __m128,
// Twiddles for the two-FFTs-at-once kernel: twiddle k duplicated in both halves.
twiddle1: __m128,
twiddle2: __m128,
twiddle3: __m128,
twiddle4: __m128,
twiddle5: __m128,
twiddle6: __m128,
twiddle7: __m128,
// Conjugates of the duplicated twiddles above.
twiddle1c: __m128,
twiddle2c: __m128,
twiddle3c: __m128,
twiddle4c: __m128,
twiddle5c: __m128,
twiddle6c: __m128,
twiddle7c: __m128,
}
// Macro-generated glue for this butterfly. Each macro receives the struct name,
// the FFT length (32) and a closure returning the stored direction. The f32
// macro (defined at the top of this file) adds, among others,
// `perform_fft_butterfly`, `perform_parallel_fft_butterfly` and the `*_multi`
// drivers that call the `perform_*_contiguous` methods of the impl below.
// NOTE(review): the `common` macro is defined outside this excerpt.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly32, 32, |this: &SseF32Butterfly32<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly32, 32, |this: &SseF32Butterfly32<_>| this
.direction);
impl<T: FftNum> SseF32Butterfly32<T> {
    /// Creates a length-32 `f32` butterfly for `direction`.
    ///
    /// Calls `assert_f32::<T>()` to guard the element type, builds the
    /// length-8 and length-16 sub-butterflies and precomputes twiddles 1..=7
    /// of a size-32 FFT in the two register layouts used by the kernels.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let bf8 = SseF32Butterfly8::new(direction);
        let bf16 = SseF32Butterfly16::new(direction);
        let rotate90 = Rotate90F32::new(direction == FftDirection::Inverse);

        let w1: Complex<f32> = twiddles::compute_twiddle(1, 32, direction);
        let w2: Complex<f32> = twiddles::compute_twiddle(2, 32, direction);
        let w3: Complex<f32> = twiddles::compute_twiddle(3, 32, direction);
        let w4: Complex<f32> = twiddles::compute_twiddle(4, 32, direction);
        let w5: Complex<f32> = twiddles::compute_twiddle(5, 32, direction);
        let w6: Complex<f32> = twiddles::compute_twiddle(6, 32, direction);
        let w7: Complex<f32> = twiddles::compute_twiddle(7, 32, direction);
        // Twiddle 0. It is used as-is in the conjugated pair too, so its
        // imaginary part stays +0.0 there.
        let unit = Complex::new(1.0f32, 0.0f32);

        // Packs two complex values into one register, `lo` in the lower half.
        let pair = |lo: Complex<f32>, hi: Complex<f32>| unsafe {
            _mm_set_ps(hi.im, hi.re, lo.im, lo.re)
        };
        // Same value in both halves of the register.
        let splat = |w: Complex<f32>| pair(w, w);
        // Complex conjugate: real part kept, imaginary part negated.
        let conj = |w: Complex<f32>| Complex::new(w.re, -w.im);

        Self {
            direction,
            bf8,
            bf16,
            rotate90,
            twiddle01: pair(unit, w1),
            twiddle23: pair(w2, w3),
            twiddle45: pair(w4, w5),
            twiddle67: pair(w6, w7),
            twiddle01conj: pair(unit, conj(w1)),
            twiddle23conj: pair(conj(w2), conj(w3)),
            twiddle45conj: pair(conj(w4), conj(w5)),
            twiddle67conj: pair(conj(w6), conj(w7)),
            twiddle1: splat(w1),
            twiddle2: splat(w2),
            twiddle3: splat(w3),
            twiddle4: splat(w4),
            twiddle5: splat(w5),
            twiddle6: splat(w6),
            twiddle7: splat(w7),
            twiddle1c: splat(conj(w1)),
            twiddle2c: splat(conj(w2)),
            twiddle3c: splat(conj(w3)),
            twiddle4c: splat(conj(w4)),
            twiddle5c: splat(conj(w5)),
            twiddle6c: splat(conj(w6)),
            twiddle7c: splat(conj(w7)),
        }
    }

    /// Runs one length-32 FFT over `buffer`: sixteen registers are read at the
    /// even element indexes 0..=30, transformed by `perform_fft_direct`, and
    /// written back with a stride of 2.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
        let transformed = self.perform_fft_direct(packed);
        write_complex_to_array_strided!(transformed, buffer, 2, {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15});
    }

    /// Runs two length-32 FFTs over `buffer` at once: thirty-two registers are
    /// read at the even element indexes 0..=62, interleaved, transformed by
    /// `perform_parallel_fft_direct`, separated again, and written back with a
    /// stride of 2.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62});
        let interleaved = interleave_complex_f32!(packed, 16, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
        let transformed = self.perform_parallel_fft_direct(interleaved);
        let separated = separate_interleaved_complex_f32!(transformed, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
        write_complex_to_array_strided!(separated, buffer, 2, {0,1,2,3,4,5,6,7,8,9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 });
    }

    /// Computes a single length-32 FFT. `input[k]` holds elements `2k` (low
    /// half) and `2k + 1` (high half); the result uses the same packing.
    ///
    /// The even elements go through the length-16 butterfly, the odd elements
    /// through two length-8 butterflies, and the three results are recombined
    /// with twiddle factors.
    #[inline(always)]
    unsafe fn perform_fft_direct(&self, input: [__m128; 16]) -> [__m128; 16] {
        // Low halves are the even elements 0, 2, 4, ..., 30.
        let evens = self.bf16.perform_fft_direct([
            extract_lo_lo_f32(input[0], input[1]),
            extract_lo_lo_f32(input[2], input[3]),
            extract_lo_lo_f32(input[4], input[5]),
            extract_lo_lo_f32(input[6], input[7]),
            extract_lo_lo_f32(input[8], input[9]),
            extract_lo_lo_f32(input[10], input[11]),
            extract_lo_lo_f32(input[12], input[13]),
            extract_lo_lo_f32(input[14], input[15]),
        ]);
        // High halves are the odd elements; first group is 1, 5, 9, ..., 29.
        let group1 = self.bf8.perform_fft_direct([
            extract_hi_hi_f32(input[0], input[2]),
            extract_hi_hi_f32(input[4], input[6]),
            extract_hi_hi_f32(input[8], input[10]),
            extract_hi_hi_f32(input[12], input[14]),
        ]);
        // Second group is 31, 3, 7, ..., 27.
        let group3 = self.bf8.perform_fft_direct([
            extract_hi_hi_f32(input[15], input[1]),
            extract_hi_hi_f32(input[3], input[5]),
            extract_hi_hi_f32(input[7], input[9]),
            extract_hi_hi_f32(input[11], input[13]),
        ]);

        // Twiddle both groups (the second with conjugates), then run a
        // length-2 FFT across them.
        let cross0 = parallel_fft2_interleaved_f32(
            mul_complex_f32(group1[0], self.twiddle01),
            mul_complex_f32(group3[0], self.twiddle01conj),
        );
        let cross1 = parallel_fft2_interleaved_f32(
            mul_complex_f32(group1[1], self.twiddle23),
            mul_complex_f32(group3[1], self.twiddle23conj),
        );
        let cross2 = parallel_fft2_interleaved_f32(
            mul_complex_f32(group1[2], self.twiddle45),
            mul_complex_f32(group3[2], self.twiddle45conj),
        );
        let cross3 = parallel_fft2_interleaved_f32(
            mul_complex_f32(group1[3], self.twiddle67),
            mul_complex_f32(group3[3], self.twiddle67conj),
        );

        // The second output of each pair additionally gets the quarter-turn
        // rotation configured in `new`.
        let turned0 = self.rotate90.rotate_both(cross0[1]);
        let turned1 = self.rotate90.rotate_both(cross1[1]);
        let turned2 = self.rotate90.rotate_both(cross2[1]);
        let turned3 = self.rotate90.rotate_both(cross3[1]);

        // First half of the output: sums; second half: matching differences.
        [
            _mm_add_ps(evens[0], cross0[0]),
            _mm_add_ps(evens[1], cross1[0]),
            _mm_add_ps(evens[2], cross2[0]),
            _mm_add_ps(evens[3], cross3[0]),
            _mm_add_ps(evens[4], turned0),
            _mm_add_ps(evens[5], turned1),
            _mm_add_ps(evens[6], turned2),
            _mm_add_ps(evens[7], turned3),
            _mm_sub_ps(evens[0], cross0[0]),
            _mm_sub_ps(evens[1], cross1[0]),
            _mm_sub_ps(evens[2], cross2[0]),
            _mm_sub_ps(evens[3], cross3[0]),
            _mm_sub_ps(evens[4], turned0),
            _mm_sub_ps(evens[5], turned1),
            _mm_sub_ps(evens[6], turned2),
            _mm_sub_ps(evens[7], turned3),
        ]
    }

    /// Computes two length-32 FFTs at once. `input[k]` carries element `k` of
    /// both transforms, one per register half; the output uses the same layout.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, input: [__m128; 32]) -> [__m128; 32] {
        // Even elements through the length-16 butterfly.
        let evens = self.bf16.perform_parallel_fft_direct([
            input[0], input[2], input[4], input[6], input[8], input[10], input[12], input[14],
            input[16], input[18], input[20], input[22], input[24], input[26], input[28], input[30],
        ]);
        // Odd elements 1, 5, ..., 29 and 31, 3, ..., 27 through the length-8 butterfly.
        let group1 = self.bf8.perform_parallel_fft_direct([
            input[1], input[5], input[9], input[13], input[17], input[21], input[25], input[29],
        ]);
        let group3 = self.bf8.perform_parallel_fft_direct([
            input[31], input[3], input[7], input[11], input[15], input[19], input[23], input[27],
        ]);

        // Index 0 needs no twiddle; the others are twiddled (second group with
        // conjugates) before the length-2 FFT across the groups.
        let cross0 = parallel_fft2_interleaved_f32(group1[0], group3[0]);
        let cross1 = parallel_fft2_interleaved_f32(
            mul_complex_f32(group1[1], self.twiddle1),
            mul_complex_f32(group3[1], self.twiddle1c),
        );
        let cross2 = parallel_fft2_interleaved_f32(
            mul_complex_f32(group1[2], self.twiddle2),
            mul_complex_f32(group3[2], self.twiddle2c),
        );
        let cross3 = parallel_fft2_interleaved_f32(
            mul_complex_f32(group1[3], self.twiddle3),
            mul_complex_f32(group3[3], self.twiddle3c),
        );
        let cross4 = parallel_fft2_interleaved_f32(
            mul_complex_f32(group1[4], self.twiddle4),
            mul_complex_f32(group3[4], self.twiddle4c),
        );
        let cross5 = parallel_fft2_interleaved_f32(
            mul_complex_f32(group1[5], self.twiddle5),
            mul_complex_f32(group3[5], self.twiddle5c),
        );
        let cross6 = parallel_fft2_interleaved_f32(
            mul_complex_f32(group1[6], self.twiddle6),
            mul_complex_f32(group3[6], self.twiddle6c),
        );
        let cross7 = parallel_fft2_interleaved_f32(
            mul_complex_f32(group1[7], self.twiddle7),
            mul_complex_f32(group3[7], self.twiddle7c),
        );

        // The second output of each pair additionally gets the quarter-turn
        // rotation configured in `new`.
        let turned0 = self.rotate90.rotate_both(cross0[1]);
        let turned1 = self.rotate90.rotate_both(cross1[1]);
        let turned2 = self.rotate90.rotate_both(cross2[1]);
        let turned3 = self.rotate90.rotate_both(cross3[1]);
        let turned4 = self.rotate90.rotate_both(cross4[1]);
        let turned5 = self.rotate90.rotate_both(cross5[1]);
        let turned6 = self.rotate90.rotate_both(cross6[1]);
        let turned7 = self.rotate90.rotate_both(cross7[1]);

        // First half of the output: sums; second half: matching differences.
        [
            _mm_add_ps(evens[0], cross0[0]),
            _mm_add_ps(evens[1], cross1[0]),
            _mm_add_ps(evens[2], cross2[0]),
            _mm_add_ps(evens[3], cross3[0]),
            _mm_add_ps(evens[4], cross4[0]),
            _mm_add_ps(evens[5], cross5[0]),
            _mm_add_ps(evens[6], cross6[0]),
            _mm_add_ps(evens[7], cross7[0]),
            _mm_add_ps(evens[8], turned0),
            _mm_add_ps(evens[9], turned1),
            _mm_add_ps(evens[10], turned2),
            _mm_add_ps(evens[11], turned3),
            _mm_add_ps(evens[12], turned4),
            _mm_add_ps(evens[13], turned5),
            _mm_add_ps(evens[14], turned6),
            _mm_add_ps(evens[15], turned7),
            _mm_sub_ps(evens[0], cross0[0]),
            _mm_sub_ps(evens[1], cross1[0]),
            _mm_sub_ps(evens[2], cross2[0]),
            _mm_sub_ps(evens[3], cross3[0]),
            _mm_sub_ps(evens[4], cross4[0]),
            _mm_sub_ps(evens[5], cross5[0]),
            _mm_sub_ps(evens[6], cross6[0]),
            _mm_sub_ps(evens[7], cross7[0]),
            _mm_sub_ps(evens[8], turned0),
            _mm_sub_ps(evens[9], turned1),
            _mm_sub_ps(evens[10], turned2),
            _mm_sub_ps(evens[11], turned3),
            _mm_sub_ps(evens[12], turned4),
            _mm_sub_ps(evens[13], turned5),
            _mm_sub_ps(evens[14], turned6),
            _mm_sub_ps(evens[15], turned7),
        ]
    }
}
/// SSE butterfly computing a length-32 FFT on `f64` complex data, with one
/// complex value held per `__m128d` register.
pub struct SseF64Butterfly32<T> {
// Transform direction given to `new`; the boilerplate macros below read it
// through their closure argument.
direction: FftDirection,
// Length-8 butterfly, run on each of the two odd-index input groups.
bf8: SseF64Butterfly8<T>,
// Length-16 butterfly, run on the even-index inputs.
bf16: SseF64Butterfly16<T>,
// Rotation helper, built with `true` for inverse transforms and `false` otherwise.
rotate90: Rotate90F64,
// `twiddles::compute_twiddle(k, 32, direction)` for k = 1..=7.
twiddle1: __m128d,
twiddle2: __m128d,
twiddle3: __m128d,
twiddle4: __m128d,
twiddle5: __m128d,
twiddle6: __m128d,
twiddle7: __m128d,
// Complex conjugates of the seven twiddles above.
twiddle1c: __m128d,
twiddle2c: __m128d,
twiddle3c: __m128d,
twiddle4c: __m128d,
twiddle5c: __m128d,
twiddle6c: __m128d,
twiddle7c: __m128d,
}
// Macro-generated glue for this butterfly. Each macro receives the struct name,
// the FFT length (32) and a closure returning the stored direction.
// NOTE(review): both macros are defined outside this excerpt; check their
// definitions for the exact set of methods/trait impls they add.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly32, 32, |this: &SseF64Butterfly32<_>| this
.direction);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly32, 32, |this: &SseF64Butterfly32<_>| this
.direction);
impl<T: FftNum> SseF64Butterfly32<T> {
#[inline(always)]
pub fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
let bf8 = SseF64Butterfly8::new(direction);
let bf16 = SseF64Butterfly16::new(direction);
let rotate90 = if direction == FftDirection::Inverse {
Rotate90F64::new(true)
} else {
Rotate90F64::new(false)
};
let twiddle1 =
unsafe { _mm_loadu_pd(&twiddles::compute_twiddle(1, 32, direction).re as *const f64) };
let twiddle2 =
unsafe { _mm_loadu_pd(&twiddles::compute_twiddle(2, 32, direction).re as *const f64) };
let twiddle3 =
unsafe { _mm_loadu_pd(&twiddles::compute_twiddle(3, 32, direction).re as *const f64) };
let twiddle4 =
unsafe { _mm_loadu_pd(&twiddles::compute_twiddle(4, 32, direction).re as *const f64) };
let twiddle5 =
unsafe { _mm_loadu_pd(&twiddles::compute_twiddle(5, 32, direction).re as *const f64) };
let twiddle6 =
unsafe { _mm_loadu_pd(&twiddles::compute_twiddle(6, 32, direction).re as *const f64) };
let twiddle7 =
unsafe { _mm_loadu_pd(&twiddles::compute_twiddle(7, 32, direction).re as *const f64) };
let twiddle1c = unsafe {
_mm_loadu_pd(&twiddles::compute_twiddle(1, 32, direction).conj().re as *const f64)
};
let twiddle2c = unsafe {
_mm_loadu_pd(&twiddles::compute_twiddle(2, 32, direction).conj().re as *const f64)
};
let twiddle3c = unsafe {
_mm_loadu_pd(&twiddles::compute_twiddle(3, 32, direction).conj().re as *const f64)
};
let twiddle4c = unsafe {
_mm_loadu_pd(&twiddles::compute_twiddle(4, 32, direction).conj().re as *const f64)
};
let twiddle5c = unsafe {
_mm_loadu_pd(&twiddles::compute_twiddle(5, 32, direction).conj().re as *const f64)
};
let twiddle6c = unsafe {
_mm_loadu_pd(&twiddles::compute_twiddle(6, 32, direction).conj().re as *const f64)
};
let twiddle7c = unsafe {
_mm_loadu_pd(&twiddles::compute_twiddle(7, 32, direction).conj().re as *const f64)
};
Self {
direction,
bf8,
bf16,
rotate90,
twiddle1,
twiddle2,
twiddle3,
twiddle4,
twiddle5,
twiddle6,
twiddle7,
twiddle1c,
twiddle2c,
twiddle3c,
twiddle4c,
twiddle5c,
twiddle6c,
twiddle7c,
}
}
#[inline(always)]
unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
let out = self.perform_fft_direct(values);
write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
#[inline(always)]
unsafe fn perform_fft_direct(&self, input: [__m128d; 32]) -> [__m128d; 32] {
let evens = self.bf16.perform_fft_direct([
input[0], input[2], input[4], input[6], input[8], input[10], input[12], input[14],
input[16], input[18], input[20], input[22], input[24], input[26], input[28], input[30],
]);
let mut odds1 = self.bf8.perform_fft_direct([
input[1], input[5], input[9], input[13], input[17], input[21], input[25], input[29],
]);
let mut odds3 = self.bf8.perform_fft_direct([
input[31], input[3], input[7], input[11], input[15], input[19], input[23], input[27],
]);
odds1[1] = mul_complex_f64(odds1[1], self.twiddle1);
odds3[1] = mul_complex_f64(odds3[1], self.twiddle1c);
odds1[2] = mul_complex_f64(odds1[2], self.twiddle2);
odds3[2] = mul_complex_f64(odds3[2], self.twiddle2c);
odds1[3] = mul_complex_f64(odds1[3], self.twiddle3);
odds3[3] = mul_complex_f64(odds3[3], self.twiddle3c);
odds1[4] = mul_complex_f64(odds1[4], self.twiddle4);
odds3[4] = mul_complex_f64(odds3[4], self.twiddle4c);
odds1[5] = mul_complex_f64(odds1[5], self.twiddle5);
odds3[5] = mul_complex_f64(odds3[5], self.twiddle5c);
odds1[6] = mul_complex_f64(odds1[6], self.twiddle6);
odds3[6] = mul_complex_f64(odds3[6], self.twiddle6c);
odds1[7] = mul_complex_f64(odds1[7], self.twiddle7);
odds3[7] = mul_complex_f64(odds3[7], self.twiddle7c);
let mut temp0 = solo_fft2_f64(odds1[0], odds3[0]);
let mut temp1 = solo_fft2_f64(odds1[1], odds3[1]);
let mut temp2 = solo_fft2_f64(odds1[2], odds3[2]);
let mut temp3 = solo_fft2_f64(odds1[3], odds3[3]);
let mut temp4 = solo_fft2_f64(odds1[4], odds3[4]);
let mut temp5 = solo_fft2_f64(odds1[5], odds3[5]);
let mut temp6 = solo_fft2_f64(odds1[6], odds3[6]);
let mut temp7 = solo_fft2_f64(odds1[7], odds3[7]);
temp0[1] = self.rotate90.rotate(temp0[1]);
temp1[1] = self.rotate90.rotate(temp1[1]);
temp2[1] = self.rotate90.rotate(temp2[1]);
temp3[1] = self.rotate90.rotate(temp3[1]);
temp4[1] = self.rotate90.rotate(temp4[1]);
temp5[1] = self.rotate90.rotate(temp5[1]);
temp6[1] = self.rotate90.rotate(temp6[1]);
temp7[1] = self.rotate90.rotate(temp7[1]);
[
_mm_add_pd(evens[0], temp0[0]),
_mm_add_pd(evens[1], temp1[0]),
_mm_add_pd(evens[2], temp2[0]),
_mm_add_pd(evens[3], temp3[0]),
_mm_add_pd(evens[4], temp4[0]),
_mm_add_pd(evens[5], temp5[0]),
_mm_add_pd(evens[6], temp6[0]),
_mm_add_pd(evens[7], temp7[0]),
_mm_add_pd(evens[8], temp0[1]),
_mm_add_pd(evens[9], temp1[1]),
_mm_add_pd(evens[10], temp2[1]),
_mm_add_pd(evens[11], temp3[1]),
_mm_add_pd(evens[12], temp4[1]),
_mm_add_pd(evens[13], temp5[1]),
_mm_add_pd(evens[14], temp6[1]),
_mm_add_pd(evens[15], temp7[1]),
_mm_sub_pd(evens[0], temp0[0]),
_mm_sub_pd(evens[1], temp1[0]),
_mm_sub_pd(evens[2], temp2[0]),
_mm_sub_pd(evens[3], temp3[0]),
_mm_sub_pd(evens[4], temp4[0]),
_mm_sub_pd(evens[5], temp5[0]),
_mm_sub_pd(evens[6], temp6[0]),
_mm_sub_pd(evens[7], temp7[0]),
_mm_sub_pd(evens[8], temp0[1]),
_mm_sub_pd(evens[9], temp1[1]),
_mm_sub_pd(evens[10], temp2[1]),
_mm_sub_pd(evens[11], temp3[1]),
_mm_sub_pd(evens[12], temp4[1]),
_mm_sub_pd(evens[13], temp5[1]),
_mm_sub_pd(evens[14], temp6[1]),
_mm_sub_pd(evens[15], temp7[1]),
]
}
}
#[cfg(test)]
mod unit_tests {
    use super::*;
    use crate::algorithm::Dft;
    use crate::test_utils::{check_fft_algorithm, compare_vectors};

    // Generates a test that builds the given f32 butterfly for the forward and
    // then the inverse direction and runs `check_fft_algorithm` on each.
    macro_rules! test_butterfly_32_func {
        ($test_name:ident, $struct_name:ident, $size:expr) => {
            #[test]
            fn $test_name() {
                for &direction in &[FftDirection::Forward, FftDirection::Inverse] {
                    let butterfly = $struct_name::new(direction);
                    check_fft_algorithm::<f32>(&butterfly, $size, direction);
                }
            }
        };
    }
    test_butterfly_32_func!(test_ssef32_butterfly1, SseF32Butterfly1, 1);
    test_butterfly_32_func!(test_ssef32_butterfly2, SseF32Butterfly2, 2);
    test_butterfly_32_func!(test_ssef32_butterfly3, SseF32Butterfly3, 3);
    test_butterfly_32_func!(test_ssef32_butterfly4, SseF32Butterfly4, 4);
    test_butterfly_32_func!(test_ssef32_butterfly5, SseF32Butterfly5, 5);
    test_butterfly_32_func!(test_ssef32_butterfly6, SseF32Butterfly6, 6);
    test_butterfly_32_func!(test_ssef32_butterfly8, SseF32Butterfly8, 8);
    test_butterfly_32_func!(test_ssef32_butterfly9, SseF32Butterfly9, 9);
    test_butterfly_32_func!(test_ssef32_butterfly10, SseF32Butterfly10, 10);
    test_butterfly_32_func!(test_ssef32_butterfly12, SseF32Butterfly12, 12);
    test_butterfly_32_func!(test_ssef32_butterfly15, SseF32Butterfly15, 15);
    test_butterfly_32_func!(test_ssef32_butterfly16, SseF32Butterfly16, 16);
    test_butterfly_32_func!(test_ssef32_butterfly32, SseF32Butterfly32, 32);

    // Same as above for the f64 butterflies.
    macro_rules! test_butterfly_64_func {
        ($test_name:ident, $struct_name:ident, $size:expr) => {
            #[test]
            fn $test_name() {
                for &direction in &[FftDirection::Forward, FftDirection::Inverse] {
                    let butterfly = $struct_name::new(direction);
                    check_fft_algorithm::<f64>(&butterfly, $size, direction);
                }
            }
        };
    }
    test_butterfly_64_func!(test_ssef64_butterfly1, SseF64Butterfly1, 1);
    test_butterfly_64_func!(test_ssef64_butterfly2, SseF64Butterfly2, 2);
    test_butterfly_64_func!(test_ssef64_butterfly3, SseF64Butterfly3, 3);
    test_butterfly_64_func!(test_ssef64_butterfly4, SseF64Butterfly4, 4);
    test_butterfly_64_func!(test_ssef64_butterfly5, SseF64Butterfly5, 5);
    test_butterfly_64_func!(test_ssef64_butterfly6, SseF64Butterfly6, 6);
    test_butterfly_64_func!(test_ssef64_butterfly8, SseF64Butterfly8, 8);
    test_butterfly_64_func!(test_ssef64_butterfly9, SseF64Butterfly9, 9);
    test_butterfly_64_func!(test_ssef64_butterfly10, SseF64Butterfly10, 10);
    test_butterfly_64_func!(test_ssef64_butterfly12, SseF64Butterfly12, 12);
    test_butterfly_64_func!(test_ssef64_butterfly15, SseF64Butterfly15, 15);
    test_butterfly_64_func!(test_ssef64_butterfly16, SseF64Butterfly16, 16);
    test_butterfly_64_func!(test_ssef64_butterfly32, SseF64Butterfly32, 32);

    // (7 + 5i) * (2 + 1i), with the real part in the low lane of each register.
    #[test]
    fn test_mul_complex_f64() {
        unsafe {
            let lhs = _mm_set_pd(5.0, 7.0);
            let rhs = _mm_set_pd(1.0, 2.0);
            let product = mul_complex_f64(lhs, rhs);
            let wanted = _mm_set_pd(2.0 * 5.0 + 1.0 * 7.0, 2.0 * 7.0 - 1.0 * 5.0);

            let product: Complex<f64> = std::mem::transmute::<__m128d, Complex<f64>>(product);
            let wanted: Complex<f64> = std::mem::transmute::<__m128d, Complex<f64>>(wanted);
            assert_eq!(product, wanted);
        }
    }

    // Two complex products per register: low halves and high halves are
    // multiplied independently.
    #[test]
    fn test_mul_complex_f32() {
        unsafe {
            let a_lo = Complex::<f32>::new(1.0, 2.5);
            let a_hi = Complex::<f32>::new(3.2, 4.2);
            let b_lo = Complex::<f32>::new(5.6, 6.2);
            let b_hi = Complex::<f32>::new(7.4, 8.3);

            let reg_b = _mm_set_ps(b_hi.im, b_hi.re, b_lo.im, b_lo.re);
            let reg_a = _mm_set_ps(a_hi.im, a_hi.re, a_lo.im, a_lo.re);

            let products =
                std::mem::transmute::<__m128, [Complex<f32>; 2]>(mul_complex_f32(reg_a, reg_b));
            assert_eq!(products, [a_lo * b_lo, a_hi * b_hi]);
        }
    }

    // Runs two independent length-4 FFTs through the parallel kernel and
    // compares each against the reference `Dft`.
    #[test]
    fn test_parallel_fft4_32() {
        unsafe {
            let a0 = Complex::<f32>::new(1.0, 2.5);
            let a1 = Complex::<f32>::new(3.2, 4.2);
            let a2 = Complex::<f32>::new(5.6, 6.2);
            let a3 = Complex::<f32>::new(7.4, 8.3);
            let b0 = Complex::<f32>::new(6.0, 24.5);
            let b1 = Complex::<f32>::new(4.2, 34.2);
            let b2 = Complex::<f32>::new(9.6, 61.2);
            let b3 = Complex::<f32>::new(17.4, 81.3);

            // Signal A occupies the low half of each register, signal B the high half.
            let reg0 = _mm_set_ps(b0.im, b0.re, a0.im, a0.re);
            let reg1 = _mm_set_ps(b1.im, b1.re, a1.im, a1.re);
            let reg2 = _mm_set_ps(b2.im, b2.re, a2.im, a2.re);
            let reg3 = _mm_set_ps(b3.im, b3.re, a3.im, a3.re);

            let mut reference_a = vec![a0, a1, a2, a3];
            let mut reference_b = vec![b0, b1, b2, b3];
            let dft = Dft::new(4, FftDirection::Forward);
            let bf4 = SseF32Butterfly4::<f32>::new(FftDirection::Forward);
            dft.process(&mut reference_a);
            dft.process(&mut reference_b);

            let packed = bf4.perform_parallel_fft_direct(reg0, reg1, reg2, reg3);
            let flat = std::mem::transmute::<[__m128; 4], [Complex<f32>; 8]>(packed);
            let from_sse_a = [flat[0], flat[2], flat[4], flat[6]];
            let from_sse_b = [flat[1], flat[3], flat[5], flat[7]];
            assert!(compare_vectors(&reference_a, &from_sse_a));
            assert!(compare_vectors(&reference_b, &from_sse_b));
        }
    }

    // `extract_lo_lo_f32` gathers the low complex of both inputs,
    // `extract_hi_hi_f32` the high complex of both inputs.
    #[test]
    fn test_pack() {
        unsafe {
            let upper = _mm_set_ps(8.0, 7.0, 6.0, 5.0);
            let lower = _mm_set_ps(4.0, 3.0, 2.0, 1.0);

            let lows = std::mem::transmute::<__m128, [Complex<f32>; 2]>(extract_lo_lo_f32(
                lower, upper,
            ));
            let highs = std::mem::transmute::<__m128, [Complex<f32>; 2]>(extract_hi_hi_f32(
                lower, upper,
            ));

            assert_eq!(lows, [Complex::new(1.0, 2.0), Complex::new(5.0, 6.0)]);
            assert_eq!(highs, [Complex::new(3.0, 4.0), Complex::new(7.0, 8.0)]);
        }
    }
}