rustfft/sse/
sse_butterflies.rs

1use core::arch::x86_64::*;
2use num_complex::Complex;
3
4use crate::{common::FftNum, FftDirection};
5
6use crate::array_utils;
7use crate::array_utils::workaround_transmute_mut;
8use crate::array_utils::DoubleBuf;
9use crate::common::{fft_error_inplace, fft_error_outofplace};
10use crate::twiddles;
11use crate::{Direction, Fft, Length};
12
13use super::sse_common::{assert_f32, assert_f64};
14use super::sse_utils::*;
15use super::sse_vector::{SseArrayMut, SseVector};
16
17#[inline(always)]
18unsafe fn pack_32(a: Complex<f32>, b: Complex<f32>) -> __m128 {
19    _mm_set_ps(b.im, b.re, a.im, a.re)
20}
21#[inline(always)]
22unsafe fn pack_64(a: Complex<f64>) -> __m128d {
23    _mm_set_pd(a.im, a.re)
24}
25
// Generates the SIMD entry points for f32 butterflies that provide both a
// solo (`perform_fft_contiguous`) and a dual (`perform_parallel_fft_contiguous`)
// kernel. The dual kernel computes two interleaved FFTs per call, so the
// "multi" drivers below walk the buffer in chunks of `2 * self.len()` and
// finish a trailing odd FFT with the solo kernel.
#[allow(unused)]
macro_rules! boilerplate_fft_sse_f32_butterfly {
    ($struct_name:ident) => {
        impl<T: FftNum> $struct_name<T> {
            // Do a single fft. The generic buffer is reinterpreted as f32;
            // the constructor has already asserted that T is f32.
            #[target_feature(enable = "sse4.1")]
            //#[inline(always)]
            pub(crate) unsafe fn perform_fft_butterfly(&self, buffer: &mut [Complex<T>]) {
                self.perform_fft_contiguous(workaround_transmute_mut::<_, Complex<f32>>(buffer));
            }

            // Do two interleaved ffts at once over a buffer of length `2 * self.len()`.
            #[target_feature(enable = "sse4.1")]
            //#[inline(always)]
            pub(crate) unsafe fn perform_parallel_fft_butterfly(&self, buffer: &mut [Complex<T>]) {
                self.perform_parallel_fft_contiguous(workaround_transmute_mut::<_, Complex<f32>>(
                    buffer,
                ));
            }

            // Do multiple ffts over a longer vector inplace, called from "process_with_scratch" of Fft trait
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly_multi(
                &self,
                buffer: &mut [Complex<T>],
            ) -> Result<(), ()> {
                let len = buffer.len();
                // Process FFTs two at a time; `iter_chunks` returns Err when a
                // partial chunk remains at the end.
                let alldone = array_utils::iter_chunks(buffer, 2 * self.len(), |chunk| {
                    self.perform_parallel_fft_butterfly(chunk)
                });
                if alldone.is_err() {
                    if len % (2 * self.len()) == self.len() {
                        // Odd number of FFTs: finish the final one with the solo kernel.
                        self.perform_fft_butterfly(&mut buffer[len - self.len()..]);
                    } else {
                        // Buffer length is not a multiple of the FFT size.
                        // Propagate the error so the caller raises the proper panic,
                        // matching the behavior of the non-parallel macros.
                        return Err(());
                    }
                }
                Ok(())
            }

            // Do multiple ffts over a longer vector outofplace, called from "process_outofplace_with_scratch" of Fft trait
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_oop_fft_butterfly_multi(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
            ) -> Result<(), ()> {
                let len = input.len();
                // Process FFTs two at a time, reading from `input` and writing
                // to `output` through a DoubleBuf.
                let alldone = array_utils::iter_chunks_zipped(
                    input,
                    output,
                    2 * self.len(),
                    |in_chunk, out_chunk| {
                        let input_slice = workaround_transmute_mut(in_chunk);
                        let output_slice = workaround_transmute_mut(out_chunk);
                        self.perform_parallel_fft_contiguous(DoubleBuf {
                            input: input_slice,
                            output: output_slice,
                        })
                    },
                );
                if alldone.is_err() {
                    if len % (2 * self.len()) == self.len() {
                        // Odd number of FFTs: finish the final one with the solo kernel.
                        let input_slice = workaround_transmute_mut(input);
                        let output_slice = workaround_transmute_mut(output);
                        self.perform_fft_contiguous(DoubleBuf {
                            input: &mut input_slice[len - self.len()..],
                            output: &mut output_slice[len - self.len()..],
                        })
                    } else {
                        // Invalid buffer length: propagate the error to the caller.
                        return Err(());
                    }
                }
                Ok(())
            }
        }
    };
}
94
// Generates the SIMD entry points for f32 butterflies that only have a solo
// kernel (no dual/parallel implementation). The "multi" drivers walk the
// buffer in chunks of `self.len()` and return `iter_chunks`' error directly
// for buffer lengths that are not a multiple of the FFT size.
macro_rules! boilerplate_fft_sse_f32_butterfly_noparallel {
    ($struct_name:ident) => {
        impl<T: FftNum> $struct_name<T> {
            // Do a single fft. The generic buffer is reinterpreted as f32;
            // the constructor has already asserted that T is f32.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly(&self, buffer: &mut [Complex<T>]) {
                self.perform_fft_contiguous(workaround_transmute_mut::<_, Complex<f32>>(buffer));
            }

            // Do multiple ffts over a longer vector inplace, called from "process_with_scratch" of Fft trait
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly_multi(
                &self,
                buffer: &mut [Complex<T>],
            ) -> Result<(), ()> {
                array_utils::iter_chunks(buffer, self.len(), |chunk| {
                    self.perform_fft_butterfly(chunk)
                })
            }

            // Do multiple ffts over a longer vector outofplace, called from "process_outofplace_with_scratch" of Fft trait
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_oop_fft_butterfly_multi(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
            ) -> Result<(), ()> {
                // Each chunk pair is reinterpreted as f32 and processed through a
                // DoubleBuf, which reads from `input` and writes to `output`.
                array_utils::iter_chunks_zipped(input, output, self.len(), |in_chunk, out_chunk| {
                    let input_slice = workaround_transmute_mut(in_chunk);
                    let output_slice = workaround_transmute_mut(out_chunk);
                    self.perform_fft_contiguous(DoubleBuf {
                        input: input_slice,
                        output: output_slice,
                    })
                })
            }
        }
    };
}
134
// Generates the SIMD entry points for f64 butterflies. One __m128d holds a
// single f64 complex number, so there is no dual/parallel path; the "multi"
// drivers walk the buffer in chunks of `self.len()` and return `iter_chunks`'
// error directly for buffer lengths that are not a multiple of the FFT size.
macro_rules! boilerplate_fft_sse_f64_butterfly {
    ($struct_name:ident) => {
        impl<T: FftNum> $struct_name<T> {
            // Do a single fft. The generic buffer is reinterpreted as f64;
            // the constructor has already asserted that T is f64.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly(&self, buffer: &mut [Complex<T>]) {
                self.perform_fft_contiguous(workaround_transmute_mut::<_, Complex<f64>>(buffer));
            }

            // Do multiple ffts over a longer vector inplace, called from "process_with_scratch" of Fft trait
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly_multi(
                &self,
                buffer: &mut [Complex<T>],
            ) -> Result<(), ()> {
                array_utils::iter_chunks(buffer, self.len(), |chunk| {
                    self.perform_fft_butterfly(chunk)
                })
            }

            // Do multiple ffts over a longer vector outofplace, called from "process_outofplace_with_scratch" of Fft trait
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_oop_fft_butterfly_multi(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
            ) -> Result<(), ()> {
                // Each chunk pair is reinterpreted as f64 and processed through a
                // DoubleBuf, which reads from `input` and writes to `output`.
                array_utils::iter_chunks_zipped(input, output, self.len(), |in_chunk, out_chunk| {
                    let input_slice = workaround_transmute_mut(in_chunk);
                    let output_slice = workaround_transmute_mut(out_chunk);
                    self.perform_fft_contiguous(DoubleBuf {
                        input: input_slice,
                        output: output_slice,
                    })
                })
            }
        }
    };
}
174
// Generates the public `Fft`, `Length`, and `Direction` trait impls for a
// butterfly struct. Butterflies need no scratch space, so both scratch-length
// getters return 0 and the `_scratch` parameters are ignored.
#[allow(unused)]
macro_rules! boilerplate_fft_sse_common_butterfly {
    ($struct_name:ident, $len:expr, $direction_fn:expr) => {
        impl<T: FftNum> Fft<T> for $struct_name<T> {
            fn process_outofplace_with_scratch(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
                _scratch: &mut [Complex<T>],
            ) {
                if input.len() < self.len() || output.len() != input.len() {
                    // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                    return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here
                }
                let result = unsafe { self.perform_oop_fft_butterfly_multi(input, output) };

                if result.is_err() {
                    // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size,
                    // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                }
            }
            fn process_with_scratch(&self, buffer: &mut [Complex<T>], _scratch: &mut [Complex<T>]) {
                if buffer.len() < self.len() {
                    // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_inplace(self.len(), buffer.len(), 0, 0);
                    return; // Unreachable, because fft_error_inplace asserts, but it helps codegen to put it here
                }

                let result = unsafe { self.perform_fft_butterfly_multi(buffer) };

                if result.is_err() {
                    // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size,
                    // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_inplace(self.len(), buffer.len(), 0, 0);
                }
            }
            // Butterflies operate fully in registers: no scratch needed.
            #[inline(always)]
            fn get_inplace_scratch_len(&self) -> usize {
                0
            }
            #[inline(always)]
            fn get_outofplace_scratch_len(&self) -> usize {
                0
            }
        }
        impl<T> Length for $struct_name<T> {
            #[inline(always)]
            fn len(&self) -> usize {
                $len
            }
        }
        impl<T> Direction for $struct_name<T> {
            #[inline(always)]
            fn fft_direction(&self) -> FftDirection {
                $direction_fn(self)
            }
        }
    };
}
236
237//   _            _________  _     _ _
238//  / |          |___ /___ \| |__ (_) |_
239//  | |   _____    |_ \ __) | '_ \| | __|
240//  | |  |_____|  ___) / __/| |_) | | |_
241//  |_|          |____/_____|_.__/|_|\__|
242//
243
/// SSE implementation of a length-1 FFT (the identity transform) over f32 data.
pub struct SseF32Butterfly1<T> {
    direction: FftDirection,
    // Zero-sized marker tying the struct to the generic scalar type T.
    _phantom: std::marker::PhantomData<T>,
}

boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly1);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly1, 1, |this: &SseF32Butterfly1<_>| this
    .direction);
252impl<T: FftNum> SseF32Butterfly1<T> {
253    #[inline(always)]
254    pub fn new(direction: FftDirection) -> Self {
255        assert_f32::<T>();
256        Self {
257            direction,
258            _phantom: std::marker::PhantomData,
259        }
260    }
261    #[inline(always)]
262    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
263        let value = buffer.load_partial_lo_complex(0);
264        buffer.store_partial_lo_complex(value, 0);
265    }
266
267    #[inline(always)]
268    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
269        let value = buffer.load_complex(0);
270        buffer.store_complex(value, 0);
271    }
272}
273
274//   _             __   _  _   _     _ _
275//  / |           / /_ | || | | |__ (_) |_
276//  | |   _____  | '_ \| || |_| '_ \| | __|
277//  | |  |_____| | (_) |__   _| |_) | | |_
278//  |_|           \___/   |_| |_.__/|_|\__|
279//
280
/// SSE implementation of a length-1 FFT (the identity transform) over f64 data.
pub struct SseF64Butterfly1<T> {
    direction: FftDirection,
    // Zero-sized marker tying the struct to the generic scalar type T.
    _phantom: std::marker::PhantomData<T>,
}

boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly1);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly1, 1, |this: &SseF64Butterfly1<_>| this
    .direction);
289impl<T: FftNum> SseF64Butterfly1<T> {
290    #[inline(always)]
291    pub fn new(direction: FftDirection) -> Self {
292        assert_f64::<T>();
293        Self {
294            direction,
295            _phantom: std::marker::PhantomData,
296        }
297    }
298    #[inline(always)]
299    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
300        let value = buffer.load_complex(0);
301        buffer.store_complex(value, 0);
302    }
303}
304
305//   ____            _________  _     _ _
306//  |___ \          |___ /___ \| |__ (_) |_
307//    __) |  _____    |_ \ __) | '_ \| | __|
308//   / __/  |_____|  ___) / __/| |_) | | |_
309//  |_____|         |____/_____|_.__/|_|\__|
310//
311
/// SSE implementation of a length-2 FFT butterfly over f32 data.
pub struct SseF32Butterfly2<T> {
    direction: FftDirection,
    // Zero-sized marker tying the struct to the generic scalar type T.
    _phantom: std::marker::PhantomData<T>,
}

boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly2);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly2, 2, |this: &SseF32Butterfly2<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly2<T> {
    /// Creates a new length-2 butterfly. Panics if `T` is not `f32`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        Self {
            direction,
            _phantom: std::marker::PhantomData,
        }
    }
    // Single length-2 FFT: both complex elements fit in one vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let values = buffer.load_complex(0);

        let temp = self.perform_fft_direct(values);

        buffer.store_complex(temp, 0);
    }

    // Two length-2 FFTs at once: load both, run the dual kernel, then
    // transpose the [X0, Y0]/[X1, Y1] result back into [X0, X1]/[Y0, Y1]
    // storage order before writing out.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let values_a = buffer.load_complex(0);
        let values_b = buffer.load_complex(2);

        let out = self.perform_parallel_fft_direct(values_a, values_b);

        let [out02, out13] = transpose_complex_2x2_f32(out[0], out[1]);

        buffer.store_complex(out02, 0);
        buffer.store_complex(out13, 2);
    }

    // length 2 fft of x, given as [x0, x1]
    // result is [X0, X1]
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: __m128) -> __m128 {
        solo_fft2_f32(values)
    }

    // dual length 2 fft of x and y, given as [x0, x1], [y0, y1]
    // result is [X0, Y0], [X1, Y1]
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(
        &self,
        values_x: __m128,
        values_y: __m128,
    ) -> [__m128; 2] {
        parallel_fft2_contiguous_f32(values_x, values_y)
    }
}
369
// Dual length-2 fft of a and b, given interleaved as [x0, y0], [x1, y1].
// Result is [X0, Y0], [X1, Y1]: a 2-point FFT is just (sum, difference).
#[inline(always)]
pub(crate) unsafe fn parallel_fft2_interleaved_f32(val02: __m128, val13: __m128) -> [__m128; 2] {
    [_mm_add_ps(val02, val13), _mm_sub_ps(val02, val13)]
}
378
379// double lenth 2 fft of a and b, given as [x0, x1], [y0, y1]
380// result is [X0, Y0], [X1, Y1]
381#[inline(always)]
382unsafe fn parallel_fft2_contiguous_f32(left: __m128, right: __m128) -> [__m128; 2] {
383    let [temp02, temp13] = transpose_complex_2x2_f32(left, right);
384    parallel_fft2_interleaved_f32(temp02, temp13)
385}
386
387// length 2 fft of x, given as [x0, x1]
388// result is [X0, X1]
389#[inline(always)]
390unsafe fn solo_fft2_f32(values: __m128) -> __m128 {
391    let temp = reverse_complex_elements_f32(values);
392    let temp2 = negate_hi_f32(values);
393    _mm_add_ps(temp2, temp)
394}
395
396//   ____             __   _  _   _     _ _
397//  |___ \           / /_ | || | | |__ (_) |_
398//    __) |  _____  | '_ \| || |_| '_ \| | __|
399//   / __/  |_____| | (_) |__   _| |_) | | |_
400//  |_____|          \___/   |_| |_.__/|_|\__|
401//
402
/// SSE implementation of a length-2 FFT butterfly over f64 data.
pub struct SseF64Butterfly2<T> {
    direction: FftDirection,
    // Zero-sized marker tying the struct to the generic scalar type T.
    _phantom: std::marker::PhantomData<T>,
}

boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly2);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly2, 2, |this: &SseF64Butterfly2<_>| this
    .direction);
411impl<T: FftNum> SseF64Butterfly2<T> {
412    #[inline(always)]
413    pub fn new(direction: FftDirection) -> Self {
414        assert_f64::<T>();
415        Self {
416            direction,
417            _phantom: std::marker::PhantomData,
418        }
419    }
420
421    #[inline(always)]
422    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
423        let value0 = buffer.load_complex(0);
424        let value1 = buffer.load_complex(1);
425
426        let out = self.perform_fft_direct(value0, value1);
427
428        buffer.store_complex(out[0], 0);
429        buffer.store_complex(out[1], 1);
430    }
431
432    #[inline(always)]
433    pub(crate) unsafe fn perform_fft_direct(
434        &self,
435        value0: __m128d,
436        value1: __m128d,
437    ) -> [__m128d; 2] {
438        solo_fft2_f64(value0, value1)
439    }
440}
441
// Length-2 fft over f64 complex values: returns [left + right, left - right].
#[inline(always)]
pub(crate) unsafe fn solo_fft2_f64(left: __m128d, right: __m128d) -> [__m128d; 2] {
    [_mm_add_pd(left, right), _mm_sub_pd(left, right)]
}
448
449//   _____            _________  _     _ _
450//  |___ /           |___ /___ \| |__ (_) |_
451//    |_ \    _____    |_ \ __) | '_ \| | __|
452//   ___) |  |_____|  ___) / __/| |_) | | |_
453//  |____/           |____/_____|_.__/|_|\__|
454//
455
/// SSE implementation of a length-3 FFT butterfly over f32 data.
pub struct SseF32Butterfly3<T> {
    direction: FftDirection,
    // Zero-sized marker tying the struct to the generic scalar type T.
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F32,
    // Mixed twiddle vector, lanes (low -> high) [tw1.re, tw1.re, -tw1.im, -tw1.im];
    // used by the solo kernel.
    twiddle: __m128,
    // tw1.re broadcast to all lanes; used by the parallel kernel.
    twiddle1re: __m128,
    // tw1.im broadcast to all lanes; used by the parallel kernel.
    twiddle1im: __m128,
}

boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly3);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly3, 3, |this: &SseF32Butterfly3<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly3<T> {
    /// Creates a new length-3 butterfly, precomputing the twiddle-factor
    /// vectors for the chosen direction. Panics if `T` is not `f32`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let rotate = Rotate90F32::new(true);
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 3, direction);
        // Lanes (low -> high): [tw1.re, tw1.re, -tw1.im, -tw1.im].
        let twiddle = unsafe { _mm_set_ps(-tw1.im, -tw1.im, tw1.re, tw1.re) };
        let twiddle1re = unsafe { _mm_set_ps(tw1.re, tw1.re, tw1.re, tw1.re) };
        let twiddle1im = unsafe { _mm_set_ps(tw1.im, tw1.im, tw1.im, tw1.im) };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle,
            twiddle1re,
            twiddle1im,
        }
    }
    // Single length-3 FFT. Element 0 is loaded/stored alone in the low half
    // of a vector; elements 1 and 2 share the second vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let value0x = buffer.load_partial_lo_complex(0);
        let value12 = buffer.load_complex(1);

        let out = self.perform_fft_direct(value0x, value12);

        buffer.store_partial_lo_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
    }

    // Two length-3 FFTs at once. The six complex inputs a0 a1 a2 b0 b1 b2 are
    // regrouped into interleaved pairs (a0,b0), (a1,b1), (a2,b2) for the dual
    // kernel, and the outputs are regrouped back into storage order.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let valuea0a1 = buffer.load_complex(0);
        let valuea2b0 = buffer.load_complex(2);
        let valueb1b2 = buffer.load_complex(4);

        let value0 = extract_lo_hi_f32(valuea0a1, valuea2b0);
        let value1 = extract_hi_lo_f32(valuea0a1, valueb1b2);
        let value2 = extract_lo_hi_f32(valuea2b0, valueb1b2);

        let out = self.perform_parallel_fft_direct(value0, value1, value2);

        let out0 = extract_lo_lo_f32(out[0], out[1]);
        let out1 = extract_lo_hi_f32(out[2], out[0]);
        let out2 = extract_hi_hi_f32(out[1], out[2]);

        buffer.store_complex(out0, 0);
        buffer.store_complex(out1, 2);
        buffer.store_complex(out2, 4);
    }

    // length 3 fft of a, given as [x0, 0.0], [x1, x2]
    // result is [X0, Z], [X1, X2]
    // The value Z should be discarded.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value0x: __m128,
        value12: __m128,
    ) -> [__m128; 2] {
        // This is a SSE translation of the scalar 3-point butterfly
        // Build (x1 + x2) in the low half and (x1 - x2) in the high half,
        // with the high half rotated by +/-i.
        let rev12 = negate_hi_f32(reverse_complex_elements_f32(value12));
        let temp12pn = self.rotate.rotate_hi(_mm_add_ps(value12, rev12));
        let twiddled = _mm_mul_ps(temp12pn, self.twiddle);
        let temp = _mm_add_ps(value0x, twiddled);
        let out12 = solo_fft2_f32(temp);
        // X0 = x0 + (x1 + x2); the high lane of out0x is the discarded Z.
        let out0x = _mm_add_ps(value0x, temp12pn);
        [out0x, out12]
    }

    // length 3 dual fft of a, given as (x0, y0), (x1, y1), (x2, y2).
    // result is [(X0, Y0), (X1, Y1), (X2, Y2)]
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(
        &self,
        value0: __m128,
        value1: __m128,
        value2: __m128,
    ) -> [__m128; 3] {
        // This is a SSE translation of the scalar 3-point butterfly
        let x12p = _mm_add_ps(value1, value2);
        let x12n = _mm_sub_ps(value1, value2);
        let sum = _mm_add_ps(value0, x12p);

        // Real-part contribution: x0 + tw1.re * (x1 + x2).
        let temp_a = _mm_mul_ps(self.twiddle1re, x12p);
        let temp_a = _mm_add_ps(temp_a, value0);

        // Imaginary-part contribution: tw1.im * rotate(x1 - x2).
        let n_rot = self.rotate.rotate_both(x12n);
        let temp_b = _mm_mul_ps(self.twiddle1im, n_rot);

        let x1 = _mm_add_ps(temp_a, temp_b);
        let x2 = _mm_sub_ps(temp_a, temp_b);
        [sum, x1, x2]
    }
}
562
563//   _____             __   _  _   _     _ _
564//  |___ /            / /_ | || | | |__ (_) |_
565//    |_ \    _____  | '_ \| || |_| '_ \| | __|
566//   ___) |  |_____| | (_) |__   _| |_) | | |_
567//  |____/            \___/   |_| |_.__/|_|\__|
568//
569
/// SSE implementation of a length-3 FFT butterfly over f64 data.
pub struct SseF64Butterfly3<T> {
    direction: FftDirection,
    // Zero-sized marker tying the struct to the generic scalar type T.
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F64,
    // tw1.re broadcast to both lanes.
    twiddle1re: __m128d,
    // tw1.im broadcast to both lanes.
    twiddle1im: __m128d,
}

boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly3);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly3, 3, |this: &SseF64Butterfly3<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly3<T> {
    /// Creates a new length-3 butterfly, precomputing the twiddle-factor
    /// vectors for the chosen direction. Panics if `T` is not `f64`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let rotate = Rotate90F64::new(true);
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 3, direction);
        let twiddle1re = unsafe { _mm_set_pd(tw1.re, tw1.re) };
        let twiddle1im = unsafe { _mm_set_pd(tw1.im, tw1.im) };

        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle1re,
            twiddle1im,
        }
    }

    // Single length-3 FFT: load all three elements, transform, store.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let value0 = buffer.load_complex(0);
        let value1 = buffer.load_complex(1);
        let value2 = buffer.load_complex(2);

        let out = self.perform_fft_direct(value0, value1, value2);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
        buffer.store_complex(out[2], 2);
    }

    // length 3 fft of x, given as x0, x1, x2.
    // result is [X0, X1, X2]
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value0: __m128d,
        value1: __m128d,
        value2: __m128d,
    ) -> [__m128d; 3] {
        // This is a SSE translation of the scalar 3-point butterfly
        let x12p = _mm_add_pd(value1, value2);
        let x12n = _mm_sub_pd(value1, value2);
        let sum = _mm_add_pd(value0, x12p);

        // Real-part contribution: x0 + tw1.re * (x1 + x2).
        let temp_a = _mm_mul_pd(self.twiddle1re, x12p);
        let temp_a = _mm_add_pd(temp_a, value0);

        // Imaginary-part contribution: tw1.im * rotate(x1 - x2).
        let n_rot = self.rotate.rotate(x12n);
        let temp_b = _mm_mul_pd(self.twiddle1im, n_rot);

        let x1 = _mm_add_pd(temp_a, temp_b);
        let x2 = _mm_sub_pd(temp_a, temp_b);
        [sum, x1, x2]
    }
}
637
638//   _  _             _________  _     _ _
639//  | || |           |___ /___ \| |__ (_) |_
640//  | || |_   _____    |_ \ __) | '_ \| | __|
641//  |__   _| |_____|  ___) / __/| |_) | | |_
642//     |_|           |____/_____|_.__/|_|\__|
643//
644
/// SSE implementation of a length-4 FFT butterfly over f32 data,
/// hardcoded as one step of the mixed-radix (six-step) algorithm.
pub struct SseF32Butterfly4<T> {
    direction: FftDirection,
    // Zero-sized marker tying the struct to the generic scalar type T.
    _phantom: std::marker::PhantomData<T>,
    // Rotation by +i or -i depending on FFT direction (chosen in `new`).
    rotate: Rotate90F32,
}

boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly4);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly4, 4, |this: &SseF32Butterfly4<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly4<T> {
    /// Creates a new length-4 butterfly. Panics if `T` is not `f32`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        // The only twiddle for size 4 is +/-i, so the rotation direction
        // is chosen from the FFT direction.
        let rotate = if direction == FftDirection::Inverse {
            Rotate90F32::new(true)
        } else {
            Rotate90F32::new(false)
        };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
        }
    }
    // Single length-4 FFT: the four elements fit in two vectors.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let value01 = buffer.load_complex(0);
        let value23 = buffer.load_complex(2);

        let out = self.perform_fft_direct(value01, value23);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 2);
    }

    // Two length-4 FFTs at once: transpose the two inputs into interleaved
    // (a_i, b_i) pairs, run the dual kernel, and transpose back for storage.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let value01a = buffer.load_complex(0);
        let value23a = buffer.load_complex(2);
        let value01b = buffer.load_complex(4);
        let value23b = buffer.load_complex(6);

        let [value0ab, value1ab] = transpose_complex_2x2_f32(value01a, value01b);
        let [value2ab, value3ab] = transpose_complex_2x2_f32(value23a, value23b);

        let out = self.perform_parallel_fft_direct([value0ab, value1ab, value2ab, value3ab]);

        let [out0, out1] = transpose_complex_2x2_f32(out[0], out[1]);
        let [out2, out3] = transpose_complex_2x2_f32(out[2], out[3]);

        buffer.store_complex(out0, 0);
        buffer.store_complex(out1, 4);
        buffer.store_complex(out2, 2);
        buffer.store_complex(out3, 6);
    }

    // length 4 fft of a, given as [x0, x1], [x2, x3]
    // result is [[X0, X1], [X2, X3]]
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value01: __m128,
        value23: __m128,
    ) -> [__m128; 2] {
        //we're going to hardcode a step of mixed radix
        //aka we're going to do the six step algorithm

        // step 1: transpose
        // and
        // step 2: column FFTs
        let mut temp = parallel_fft2_interleaved_f32(value01, value23);

        // step 3: apply twiddle factors (only one in this case, and it's either 0 + i or 0 - i)
        temp[1] = self.rotate.rotate_hi(temp[1]);

        // step 4: transpose, which we're skipping because the previous FFTs were non-contiguous

        // step 5: row FFTs
        // and
        // step 6: transpose by swapping index 1 and 2
        parallel_fft2_contiguous_f32(temp[0], temp[1])
    }

    // Dual length-4 fft: values[i] holds (a_i, b_i) for the two FFTs a and b.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 4]) -> [__m128; 4] {
        //we're going to hardcode a step of mixed radix
        //aka we're going to do the six step algorithm

        // step 1: transpose
        // and
        // step 2: column FFTs
        let temp0 = parallel_fft2_interleaved_f32(values[0], values[2]);
        let mut temp1 = parallel_fft2_interleaved_f32(values[1], values[3]);

        // step 3: apply twiddle factors (only one in this case, and it's either 0 + i or 0 - i)
        temp1[1] = self.rotate.rotate_both(temp1[1]);

        // step 4: transpose, which we're skipping because the previous FFTs were non-contiguous

        // step 5: row FFTs
        let out0 = parallel_fft2_interleaved_f32(temp0[0], temp1[0]);
        let out2 = parallel_fft2_interleaved_f32(temp0[1], temp1[1]);

        // step 6: transpose by swapping index 1 and 2
        [out0[0], out2[0], out0[1], out2[1]]
    }
}
752
753//   _  _              __   _  _   _     _ _
754//  | || |            / /_ | || | | |__ (_) |_
755//  | || |_   _____  | '_ \| || |_| '_ \| | __|
756//  |__   _| |_____| | (_) |__   _| |_) | | |_
757//     |_|            \___/   |_| |_.__/|_|\__|
758//
759
/// SSE implementation of a length-4 FFT butterfly over f64 data,
/// hardcoded as one step of the mixed-radix (six-step) algorithm.
pub struct SseF64Butterfly4<T> {
    direction: FftDirection,
    // Zero-sized marker tying the struct to the generic scalar type T.
    _phantom: std::marker::PhantomData<T>,
    // Rotation by +i or -i depending on FFT direction (chosen in `new`).
    rotate: Rotate90F64,
}

boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly4);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly4, 4, |this: &SseF64Butterfly4<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly4<T> {
    /// Creates a new length-4 butterfly. Panics if `T` is not `f64`.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        // The only twiddle for size 4 is +/-i, so the rotation direction
        // is chosen from the FFT direction.
        let rotate = if direction == FftDirection::Inverse {
            Rotate90F64::new(true)
        } else {
            Rotate90F64::new(false)
        };

        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
        }
    }

    // Single length-4 FFT: load all four elements, transform, store.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let value0 = buffer.load_complex(0);
        let value1 = buffer.load_complex(1);
        let value2 = buffer.load_complex(2);
        let value3 = buffer.load_complex(3);

        let out = self.perform_fft_direct([value0, value1, value2, value3]);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
        buffer.store_complex(out[2], 2);
        buffer.store_complex(out[3], 3);
    }

    // Length-4 fft of the four input elements, returned in output order.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 4]) -> [__m128d; 4] {
        //we're going to hardcode a step of mixed radix
        //aka we're going to do the six step algorithm

        // step 1: transpose
        // and
        // step 2: column FFTs
        let temp0 = solo_fft2_f64(values[0], values[2]);
        let mut temp1 = solo_fft2_f64(values[1], values[3]);

        // step 3: apply twiddle factors (only one in this case, and it's either 0 + i or 0 - i)
        temp1[1] = self.rotate.rotate(temp1[1]);

        // step 4: transpose, which we're skipping because the previous FFTs were non-contiguous

        // step 5: row FFTs
        let out0 = solo_fft2_f64(temp0[0], temp1[0]);
        let out2 = solo_fft2_f64(temp0[1], temp1[1]);

        // step 6: transpose by swapping index 1 and 2
        [out0[0], out2[0], out0[1], out2[1]]
    }
}
825
826//   ____             _________  _     _ _
827//  | ___|           |___ /___ \| |__ (_) |_
828//  |___ \    _____    |_ \ __) | '_ \| | __|
829//   ___) |  |_____|  ___) / __/| |_) | | |_
830//  |____/           |____/_____|_.__/|_|\__|
831//
832
/// SSE-accelerated length-5 FFT butterfly for f32 (two complex values per `__m128`).
pub struct SseF32Butterfly5<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F32,
    // Precomputed twiddle-factor vectors. The "12"/"21" variants pack the real
    // or imaginary parts of twiddles 1 and 2 in the order the single-FFT kernel
    // consumes them; the single-index variants broadcast one twiddle component.
    twiddle12re: __m128,
    twiddle21re: __m128,
    twiddle12im: __m128,
    twiddle21im: __m128,
    twiddle1re: __m128,
    twiddle1im: __m128,
    twiddle2re: __m128,
    twiddle2im: __m128,
}

// Generate the shared butterfly entry points and the Fft/Length/Direction impls.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly5);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly5, 5, |this: &SseF32Butterfly5<_>| this
    .direction);
850impl<T: FftNum> SseF32Butterfly5<T> {
851    #[inline(always)]
852    pub fn new(direction: FftDirection) -> Self {
853        assert_f32::<T>();
854        let rotate = Rotate90F32::new(true);
855        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 5, direction);
856        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 5, direction);
857        let twiddle12re = unsafe { _mm_set_ps(tw2.re, tw2.re, tw1.re, tw1.re) };
858        let twiddle21re = unsafe { _mm_set_ps(tw1.re, tw1.re, tw2.re, tw2.re) };
859        let twiddle12im = unsafe { _mm_set_ps(tw2.im, tw2.im, tw1.im, tw1.im) };
860        let twiddle21im = unsafe { _mm_set_ps(-tw1.im, -tw1.im, tw2.im, tw2.im) };
861        let twiddle1re = unsafe { _mm_set_ps(tw1.re, tw1.re, tw1.re, tw1.re) };
862        let twiddle1im = unsafe { _mm_set_ps(tw1.im, tw1.im, tw1.im, tw1.im) };
863        let twiddle2re = unsafe { _mm_set_ps(tw2.re, tw2.re, tw2.re, tw2.re) };
864        let twiddle2im = unsafe { _mm_set_ps(tw2.im, tw2.im, tw2.im, tw2.im) };
865
866        Self {
867            direction,
868            _phantom: std::marker::PhantomData,
869            rotate,
870            twiddle12re,
871            twiddle21re,
872            twiddle12im,
873            twiddle21im,
874            twiddle1re,
875            twiddle1im,
876            twiddle2re,
877            twiddle2im,
878        }
879    }
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // Input layout: [x0, x0] in one vector, then [x1, x2] and [x3, x4].
        let value00 = buffer.load1_complex(0);
        let value12 = buffer.load_complex(1);
        let value34 = buffer.load_complex(3);

        let out = self.perform_fft_direct(value00, value12, value34);

        // out[0] holds [X0, Z] where Z is unspecified, so only the low half is stored.
        buffer.store_partial_lo_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
        buffer.store_complex(out[2], 3);
    }

    // Computes two interleaved 5-point FFTs from 10 contiguous f32 complex values.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4 ,6, 8});

        // Deinterleave so that each vector holds element n of both transforms.
        let value0 = extract_lo_hi_f32(input_packed[0], input_packed[2]);
        let value1 = extract_hi_lo_f32(input_packed[0], input_packed[3]);
        let value2 = extract_lo_hi_f32(input_packed[1], input_packed[3]);
        let value3 = extract_hi_lo_f32(input_packed[1], input_packed[4]);
        let value4 = extract_lo_hi_f32(input_packed[2], input_packed[4]);

        let out = self.perform_parallel_fft_direct(value0, value1, value2, value3, value4);

        // Re-interleave the two result sets back into contiguous order.
        let out_packed = [
            extract_lo_lo_f32(out[0], out[1]),
            extract_lo_lo_f32(out[2], out[3]),
            extract_lo_hi_f32(out[4], out[0]),
            extract_hi_hi_f32(out[1], out[2]),
            extract_hi_hi_f32(out[3], out[4]),
        ];

        write_complex_to_array_strided!(out_packed, buffer, 2, {0, 1, 2, 3, 4});
    }

    // length 5 fft of a, given as [x0, x0], [x1, x2], [x3, x4].
    // result is [[X0, Z], [X1, X2], [X3, X4]]
    // Note that Z should not be used.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value00: __m128,
        value12: __m128,
        value34: __m128,
    ) -> [__m128; 3] {
        // This is a SSE translation of the scalar 5-point butterfly
        let temp43 = reverse_complex_elements_f32(value34);
        let x1423p = _mm_add_ps(value12, temp43);
        let x1423n = _mm_sub_ps(value12, temp43);

        let x1414p = duplicate_lo_f32(x1423p);
        let x2323p = duplicate_hi_f32(x1423p);
        let x1414n = duplicate_lo_f32(x1423n);
        let x2323n = duplicate_hi_f32(x1423n);

        let temp_a1 = _mm_mul_ps(self.twiddle12re, x1414p);
        let temp_a2 = _mm_mul_ps(self.twiddle21re, x2323p);

        let temp_b1 = _mm_mul_ps(self.twiddle12im, x1414n);
        let temp_b2 = _mm_mul_ps(self.twiddle21im, x2323n);

        let temp_a = _mm_add_ps(temp_a1, temp_a2);
        let temp_a = _mm_add_ps(value00, temp_a);

        let temp_b = _mm_add_ps(temp_b1, temp_b2);

        // Rotate the accumulated imaginary-part terms by 90 degrees.
        let b_rot = self.rotate.rotate_both(temp_b);

        // X0 is the plain sum of all five inputs.
        let x00 = _mm_add_ps(value00, _mm_add_ps(x1414p, x2323p));

        let x12 = _mm_add_ps(temp_a, b_rot);
        let x34 = reverse_complex_elements_f32(_mm_sub_ps(temp_a, b_rot));
        [x00, x12, x34]
    }

    // length 5 dual fft of x and y, given as (x0, y0), (x1, y1) ... (x4, y4).
    // result is [(X0, Y0), (X1, Y1) ... (X4, Y4)]
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(
        &self,
        value0: __m128,
        value1: __m128,
        value2: __m128,
        value3: __m128,
        value4: __m128,
    ) -> [__m128; 5] {
        // This is a SSE translation of the scalar 5-point butterfly
        let x14p = _mm_add_ps(value1, value4);
        let x14n = _mm_sub_ps(value1, value4);
        let x23p = _mm_add_ps(value2, value3);
        let x23n = _mm_sub_ps(value2, value3);

        let temp_a1_1 = _mm_mul_ps(self.twiddle1re, x14p);
        let temp_a1_2 = _mm_mul_ps(self.twiddle2re, x23p);
        let temp_b1_1 = _mm_mul_ps(self.twiddle1im, x14n);
        let temp_b1_2 = _mm_mul_ps(self.twiddle2im, x23n);
        let temp_a2_1 = _mm_mul_ps(self.twiddle1re, x23p);
        let temp_a2_2 = _mm_mul_ps(self.twiddle2re, x14p);
        let temp_b2_1 = _mm_mul_ps(self.twiddle2im, x14n);
        let temp_b2_2 = _mm_mul_ps(self.twiddle1im, x23n);

        let temp_a1 = _mm_add_ps(value0, _mm_add_ps(temp_a1_1, temp_a1_2));
        let temp_b1 = _mm_add_ps(temp_b1_1, temp_b1_2);
        let temp_a2 = _mm_add_ps(value0, _mm_add_ps(temp_a2_1, temp_a2_2));
        let temp_b2 = _mm_sub_ps(temp_b2_1, temp_b2_2);

        [
            _mm_add_ps(value0, _mm_add_ps(x14p, x23p)),
            _mm_add_ps(temp_a1, self.rotate.rotate_both(temp_b1)),
            _mm_add_ps(temp_a2, self.rotate.rotate_both(temp_b2)),
            _mm_sub_ps(temp_a2, self.rotate.rotate_both(temp_b2)),
            _mm_sub_ps(temp_a1, self.rotate.rotate_both(temp_b1)),
        ]
    }
995}
996
997//   ____              __   _  _   _     _ _
998//  | ___|            / /_ | || | | |__ (_) |_
999//  |___ \    _____  | '_ \| || |_| '_ \| | __|
1000//   ___) |  |_____| | (_) |__   _| |_) | | |_
1001//  |____/            \___/   |_| |_.__/|_|\__|
1002//
1003
/// SSE-accelerated length-5 FFT butterfly for f64 (one complex value per `__m128d`).
pub struct SseF64Butterfly5<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F64,
    // Real and imaginary parts of twiddles 1 and 2, each broadcast to both lanes.
    twiddle1re: __m128d,
    twiddle1im: __m128d,
    twiddle2re: __m128d,
    twiddle2im: __m128d,
}

// Generate the shared butterfly entry points and the Fft/Length/Direction impls.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly5);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly5, 5, |this: &SseF64Butterfly5<_>| this
    .direction);
1017impl<T: FftNum> SseF64Butterfly5<T> {
1018    #[inline(always)]
1019    pub fn new(direction: FftDirection) -> Self {
1020        assert_f64::<T>();
1021        let rotate = Rotate90F64::new(true);
1022        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 5, direction);
1023        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 5, direction);
1024        let twiddle1re = unsafe { _mm_set_pd(tw1.re, tw1.re) };
1025        let twiddle1im = unsafe { _mm_set_pd(tw1.im, tw1.im) };
1026        let twiddle2re = unsafe { _mm_set_pd(tw2.re, tw2.re) };
1027        let twiddle2im = unsafe { _mm_set_pd(tw2.im, tw2.im) };
1028
1029        Self {
1030            direction,
1031            _phantom: std::marker::PhantomData,
1032            rotate,
1033            twiddle1re,
1034            twiddle1im,
1035            twiddle2re,
1036            twiddle2im,
1037        }
1038    }
1039
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        // Load the five complex inputs, transform them, and store the results in place.
        let value0 = buffer.load_complex(0);
        let value1 = buffer.load_complex(1);
        let value2 = buffer.load_complex(2);
        let value3 = buffer.load_complex(3);
        let value4 = buffer.load_complex(4);

        let out = self.perform_fft_direct(value0, value1, value2, value3, value4);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
        buffer.store_complex(out[2], 2);
        buffer.store_complex(out[3], 3);
        buffer.store_complex(out[4], 4);
    }

    // length 5 fft of x, given as x0, x1, x2, x3, x4.
    // result is [X0, X1, X2, X3, X4]
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value0: __m128d,
        value1: __m128d,
        value2: __m128d,
        value3: __m128d,
        value4: __m128d,
    ) -> [__m128d; 5] {
        // This is a SSE translation of the scalar 5-point butterfly
        // Sums and differences of the input pairs that mirror around element 0.
        let x14p = _mm_add_pd(value1, value4);
        let x14n = _mm_sub_pd(value1, value4);
        let x23p = _mm_add_pd(value2, value3);
        let x23n = _mm_sub_pd(value2, value3);

        let temp_a1_1 = _mm_mul_pd(self.twiddle1re, x14p);
        let temp_a1_2 = _mm_mul_pd(self.twiddle2re, x23p);
        let temp_a2_1 = _mm_mul_pd(self.twiddle2re, x14p);
        let temp_a2_2 = _mm_mul_pd(self.twiddle1re, x23p);

        let temp_b1_1 = _mm_mul_pd(self.twiddle1im, x14n);
        let temp_b1_2 = _mm_mul_pd(self.twiddle2im, x23n);
        let temp_b2_1 = _mm_mul_pd(self.twiddle2im, x14n);
        let temp_b2_2 = _mm_mul_pd(self.twiddle1im, x23n);

        let temp_a1 = _mm_add_pd(value0, _mm_add_pd(temp_a1_1, temp_a1_2));
        let temp_a2 = _mm_add_pd(value0, _mm_add_pd(temp_a2_1, temp_a2_2));

        let temp_b1 = _mm_add_pd(temp_b1_1, temp_b1_2);
        let temp_b2 = _mm_sub_pd(temp_b2_1, temp_b2_2);

        // Rotate the imaginary-part accumulators by 90 degrees before combining.
        let temp_b1_rot = self.rotate.rotate(temp_b1);
        let temp_b2_rot = self.rotate.rotate(temp_b2);
        [
            _mm_add_pd(value0, _mm_add_pd(x14p, x23p)),
            _mm_add_pd(temp_a1, temp_b1_rot),
            _mm_add_pd(temp_a2, temp_b2_rot),
            _mm_sub_pd(temp_a2, temp_b2_rot),
            _mm_sub_pd(temp_a1, temp_b1_rot),
        ]
    }
1100}
1101
1102//    __             _________  _     _ _
1103//   / /_           |___ /___ \| |__ (_) |_
1104//  | '_ \   _____    |_ \ __) | '_ \| | __|
1105//  | (_) | |_____|  ___) / __/| |_) | | |_
1106//   \___/          |____/_____|_.__/|_|\__|
1107//
1108
/// SSE-accelerated length-6 FFT butterfly for f32, built from 3-point FFTs
/// via the Good-Thomas algorithm (so no twiddle factors are needed).
pub struct SseF32Butterfly6<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    bf3: SseF32Butterfly3<T>,
}

// Generate the shared butterfly entry points and the Fft/Length/Direction impls.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly6);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly6, 6, |this: &SseF32Butterfly6<_>| this
    .direction);
1118impl<T: FftNum> SseF32Butterfly6<T> {
1119    #[inline(always)]
1120    pub fn new(direction: FftDirection) -> Self {
1121        assert_f32::<T>();
1122        let bf3 = SseF32Butterfly3::new(direction);
1123
1124        Self {
1125            direction,
1126            _phantom: std::marker::PhantomData,
1127            bf3,
1128        }
1129    }
1130
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // Each vector holds two consecutive complex values.
        let value01 = buffer.load_complex(0);
        let value23 = buffer.load_complex(2);
        let value45 = buffer.load_complex(4);

        let out = self.perform_fft_direct(value01, value23, value45);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 2);
        buffer.store_complex(out[2], 4);
    }

    // Computes two interleaved 6-point FFTs from 12 contiguous f32 complex values.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer,  {0, 2, 4, 6, 8, 10});

        let values = interleave_complex_f32!(input_packed, 3, {0, 1, 2});

        let out = self.perform_parallel_fft_direct(
            values[0], values[1], values[2], values[3], values[4], values[5],
        );

        let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4});
        write_complex_to_array_strided!(out_sorted, buffer, 2, {0, 1, 2, 3, 4, 5});
    }

    // Length-6 FFT of one signal packed two complex values per vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value01: __m128,
        value23: __m128,
        value45: __m128,
    ) -> [__m128; 3] {
        // Algorithm: 3x2 good-thomas

        // Size-3 FFTs down the columns of our reordered array
        let reord0 = extract_lo_hi_f32(value01, value23);
        let reord1 = extract_lo_hi_f32(value23, value45);
        let reord2 = extract_lo_hi_f32(value45, value01);

        let mid = self.bf3.perform_parallel_fft_direct(reord0, reord1, reord2);

        // We normally would put twiddle factors right here, but since this is good-thomas algorithm, we don't need twiddle factors

        // Transpose the data and do size-2 FFTs down the columns
        let [output0, output1] = parallel_fft2_contiguous_f32(mid[0], mid[1]);
        let output2 = solo_fft2_f32(mid[2]);

        // Reorder into output
        [
            extract_lo_hi_f32(output0, output1),
            extract_lo_lo_f32(output2, output1),
            extract_hi_hi_f32(output0, output2),
        ]
    }

    // Computes two 6-point FFTs in parallel, one in each complex lane.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(
        &self,
        value0: __m128,
        value1: __m128,
        value2: __m128,
        value3: __m128,
        value4: __m128,
        value5: __m128,
    ) -> [__m128; 6] {
        // Algorithm: 3x2 good-thomas

        // Size-3 FFTs down the columns of our reordered array
        let mid0 = self.bf3.perform_parallel_fft_direct(value0, value2, value4);
        let mid1 = self.bf3.perform_parallel_fft_direct(value3, value5, value1);

        // We normally would put twiddle factors right here, but since this is good-thomas algorithm, we don't need twiddle factors

        // Transpose the data and do size-2 FFTs down the columns
        let [output0, output1] = parallel_fft2_interleaved_f32(mid0[0], mid1[0]);
        let [output2, output3] = parallel_fft2_interleaved_f32(mid0[1], mid1[1]);
        let [output4, output5] = parallel_fft2_interleaved_f32(mid0[2], mid1[2]);

        // Reorder into output
        [output0, output3, output4, output1, output2, output5]
    }
1214}
1215
1216//    __              __   _  _   _     _ _
1217//   / /_            / /_ | || | | |__ (_) |_
1218//  | '_ \   _____  | '_ \| || |_| '_ \| | __|
1219//  | (_) | |_____| | (_) |__   _| |_) | | |_
1220//   \___/           \___/   |_| |_.__/|_|\__|
1221//
1222
/// SSE-accelerated length-6 FFT butterfly for f64, built from 3-point FFTs
/// via the Good-Thomas algorithm (so no twiddle factors are needed).
pub struct SseF64Butterfly6<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    bf3: SseF64Butterfly3<T>,
}

// Generate the shared butterfly entry points and the Fft/Length/Direction impls.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly6);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly6, 6, |this: &SseF64Butterfly6<_>| this
    .direction);
1232impl<T: FftNum> SseF64Butterfly6<T> {
1233    #[inline(always)]
1234    pub fn new(direction: FftDirection) -> Self {
1235        assert_f64::<T>();
1236        let bf3 = SseF64Butterfly3::new(direction);
1237
1238        Self {
1239            direction,
1240            _phantom: std::marker::PhantomData,
1241            bf3,
1242        }
1243    }
1244
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        // Load all six complex inputs, transform them, and store the results in place.
        let value0 = buffer.load_complex(0);
        let value1 = buffer.load_complex(1);
        let value2 = buffer.load_complex(2);
        let value3 = buffer.load_complex(3);
        let value4 = buffer.load_complex(4);
        let value5 = buffer.load_complex(5);

        let out = self.perform_fft_direct([value0, value1, value2, value3, value4, value5]);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
        buffer.store_complex(out[2], 2);
        buffer.store_complex(out[3], 3);
        buffer.store_complex(out[4], 4);
        buffer.store_complex(out[5], 5);
    }

    // Length-6 FFT where each __m128d holds one complex value.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 6]) -> [__m128d; 6] {
        // Algorithm: 3x2 good-thomas

        // Size-3 FFTs down the columns of our reordered array
        let mid0 = self.bf3.perform_fft_direct(values[0], values[2], values[4]);
        let mid1 = self.bf3.perform_fft_direct(values[3], values[5], values[1]);

        // We normally would put twiddle factors right here, but since this is good-thomas algorithm, we don't need twiddle factors

        // Transpose the data and do size-2 FFTs down the columns
        let [output0, output1] = solo_fft2_f64(mid0[0], mid1[0]);
        let [output2, output3] = solo_fft2_f64(mid0[1], mid1[1]);
        let [output4, output5] = solo_fft2_f64(mid0[2], mid1[2]);

        // Reorder into output
        [output0, output3, output4, output1, output2, output5]
    }
1282}
1283
1284//    ___            _________  _     _ _
1285//   ( _ )          |___ /___ \| |__ (_) |_
1286//   / _ \   _____    |_ \ __) | '_ \| | __|
1287//  | (_) | |_____|  ___) / __/| |_) | | |_
1288//   \___/          |____/_____|_.__/|_|\__|
1289//
1290
/// SSE-accelerated length-8 FFT butterfly for f32, implemented as a 4x2 mixed radix.
pub struct SseF32Butterfly8<T> {
    // [1.0, 1.0, sqrt(0.5), sqrt(0.5)] (low to high lanes), for the single-FFT twiddle step.
    root2: __m128,
    // sqrt(0.5) broadcast to all four lanes, for the dual-FFT twiddle step.
    root2_dual: __m128,
    direction: FftDirection,
    bf4: SseF32Butterfly4<T>,
    rotate90: Rotate90F32,
}

// Generate the shared butterfly entry points and the Fft/Length/Direction impls.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly8);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly8, 8, |this: &SseF32Butterfly8<_>| this
    .direction);
1302impl<T: FftNum> SseF32Butterfly8<T> {
1303    #[inline(always)]
1304    pub fn new(direction: FftDirection) -> Self {
1305        assert_f32::<T>();
1306        let bf4 = SseF32Butterfly4::new(direction);
1307        let root2 = unsafe { _mm_set_ps(0.5f32.sqrt(), 0.5f32.sqrt(), 1.0, 1.0) };
1308        let root2_dual = unsafe { _mm_load1_ps(&0.5f32.sqrt()) };
1309        let rotate90 = if direction == FftDirection::Inverse {
1310            Rotate90F32::new(true)
1311        } else {
1312            Rotate90F32::new(false)
1313        };
1314        Self {
1315            root2,
1316            root2_dual,
1317            direction,
1318            bf4,
1319            rotate90,
1320        }
1321    }
1322
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // Each vector holds two consecutive complex values.
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6});

        let out = self.perform_fft_direct(input_packed);

        write_complex_to_array_strided!(out, buffer, 2, {0,1,2,3});
    }

    // Computes two interleaved 8-point FFTs from 16 contiguous f32 complex values.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14});

        let values = interleave_complex_f32!(input_packed, 4, {0, 1, 2, 3});

        let out = self.perform_parallel_fft_direct(values);

        let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6});

        write_complex_to_array_strided!(out_sorted, buffer, 2, {0,1,2,3,4,5,6,7});
    }

    // Length-8 FFT of one signal packed two complex values per vector.
    #[inline(always)]
    unsafe fn perform_fft_direct(&self, values: [__m128; 4]) -> [__m128; 4] {
        // we're going to hardcode a step of mixed radix
        // step 1: copy and reorder the input into the scratch
        let [in02, in13] = transpose_complex_2x2_f32(values[0], values[1]);
        let [in46, in57] = transpose_complex_2x2_f32(values[2], values[3]);

        // step 2: column FFTs
        let val0 = self.bf4.perform_fft_direct(in02, in46);
        let mut val2 = self.bf4.perform_fft_direct(in13, in57);

        // step 3: apply twiddle factors
        let val2b = self.rotate90.rotate_hi(val2[0]);
        let val2c = _mm_add_ps(val2b, val2[0]);
        let val2d = _mm_mul_ps(val2c, self.root2);
        val2[0] = extract_lo_hi_f32(val2[0], val2d);

        let val3b = self.rotate90.rotate_both(val2[1]);
        let val3c = _mm_sub_ps(val3b, val2[1]);
        let val3d = _mm_mul_ps(val3c, self.root2);
        val2[1] = extract_lo_hi_f32(val3b, val3d);

        // step 4: transpose -- skipped because we're going to do the next FFTs non-contiguously

        // step 5: row FFTs
        let out0 = parallel_fft2_interleaved_f32(val0[0], val2[0]);
        let out1 = parallel_fft2_interleaved_f32(val0[1], val2[1]);

        // step 6: rearrange and copy to buffer
        [out0[0], out1[0], out0[1], out1[1]]
    }

    // Computes two 8-point FFTs in parallel, one per complex lane.
    #[inline(always)]
    unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 8]) -> [__m128; 8] {
        // we're going to hardcode a step of mixed radix
        // step 1: copy and reorder the input into the scratch
        // and
        // step 2: column FFTs
        let val03 = self
            .bf4
            .perform_parallel_fft_direct([values[0], values[2], values[4], values[6]]);
        let mut val47 = self
            .bf4
            .perform_parallel_fft_direct([values[1], values[3], values[5], values[7]]);

        // step 3: apply twiddle factors
        let val5b = self.rotate90.rotate_both(val47[1]);
        let val5c = _mm_add_ps(val5b, val47[1]);
        val47[1] = _mm_mul_ps(val5c, self.root2_dual);
        val47[2] = self.rotate90.rotate_both(val47[2]);
        let val7b = self.rotate90.rotate_both(val47[3]);
        let val7c = _mm_sub_ps(val7b, val47[3]);
        val47[3] = _mm_mul_ps(val7c, self.root2_dual);

        // step 4: transpose -- skipped because we're going to do the next FFTs non-contiguously

        // step 5: row FFTs
        let out0 = parallel_fft2_interleaved_f32(val03[0], val47[0]);
        let out1 = parallel_fft2_interleaved_f32(val03[1], val47[1]);
        let out2 = parallel_fft2_interleaved_f32(val03[2], val47[2]);
        let out3 = parallel_fft2_interleaved_f32(val03[3], val47[3]);

        // step 6: rearrange and copy to buffer
        [
            out0[0], out1[0], out2[0], out3[0], out0[1], out1[1], out2[1], out3[1],
        ]
    }
1412}
1413
1414//    ___             __   _  _   _     _ _
1415//   ( _ )           / /_ | || | | |__ (_) |_
1416//   / _ \   _____  | '_ \| || |_| '_ \| | __|
1417//  | (_) | |_____| | (_) |__   _| |_) | | |_
1418//   \___/           \___/   |_| |_.__/|_|\__|
1419//
1420
/// SSE-accelerated length-8 FFT butterfly for f64, implemented as a 4x2 mixed radix.
pub struct SseF64Butterfly8<T> {
    // Inner size-4 butterfly; it also owns the direction and the rotation helper.
    bf4: SseF64Butterfly4<T>,
}

// Generate the shared butterfly entry points and the Fft/Length/Direction impls.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly8);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly8, 8, |this: &SseF64Butterfly8<_>| this
    .bf4
    .direction);
1429impl<T: FftNum> SseF64Butterfly8<T> {
1430    #[inline(always)]
1431    pub fn new(direction: FftDirection) -> Self {
1432        assert_f64::<T>();
1433        Self {
1434            bf4: SseF64Butterfly4::new(direction),
1435        }
1436    }
1437
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        // Load all eight complex inputs, transform them, and store in place.
        let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7});

        let out = self.perform_fft_direct(values);

        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7});
    }

    // Length-8 FFT where each __m128d holds one complex value.
    #[inline(always)]
    unsafe fn perform_fft_direct(&self, values: [__m128d; 8]) -> [__m128d; 8] {
        // we're going to hardcode a step of mixed radix
        // step 1: copy and reorder the input into the scratch
        // and
        // step 2: column FFTs
        let val03 = self
            .bf4
            .perform_fft_direct([values[0], values[2], values[4], values[6]]);
        let mut val47 = self
            .bf4
            .perform_fft_direct([values[1], values[3], values[5], values[7]]);

        // step 3: apply twiddle factors (45, 90 and 135 degree rotations)
        val47[1] = self.bf4.rotate.rotate_45(val47[1]);
        val47[2] = self.bf4.rotate.rotate(val47[2]);
        val47[3] = self.bf4.rotate.rotate_135(val47[3]);

        // step 4: transpose -- skipped because we're going to do the next FFTs non-contiguously

        // step 5: row FFTs
        let out0 = solo_fft2_f64(val03[0], val47[0]);
        let out1 = solo_fft2_f64(val03[1], val47[1]);
        let out2 = solo_fft2_f64(val03[2], val47[2]);
        let out3 = solo_fft2_f64(val03[3], val47[3]);

        // step 6: rearrange and copy to buffer
        [
            out0[0], out1[0], out2[0], out3[0], out0[1], out1[1], out2[1], out3[1],
        ]
    }
1478}
1479
1480//    ___            _________  _     _ _
1481//   / _ \          |___ /___ \| |__ (_) |_
1482//  | (_) |  _____    |_ \ __) | '_ \| | __|
1483//   \__, | |_____|  ___) / __/| |_) | | |_
1484//     /_/          |____/_____|_.__/|_|\__|
1485//
/// SSE-accelerated length-9 FFT butterfly for f32, implemented as a 3x3 mixed radix.
pub struct SseF32Butterfly9<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    bf3: SseF32Butterfly3<T>,
    // Twiddles 1, 2 and 4 of order 9, each broadcast across both complex lanes.
    twiddle1: __m128,
    twiddle2: __m128,
    twiddle4: __m128,
}

// Generate the shared butterfly entry points and the Fft/Length/Direction impls.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly9);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly9, 9, |this: &SseF32Butterfly9<_>| this
    .direction);
1498impl<T: FftNum> SseF32Butterfly9<T> {
1499    #[inline(always)]
1500    pub fn new(direction: FftDirection) -> Self {
1501        assert_f32::<T>();
1502        let bf3 = SseF32Butterfly3::new(direction);
1503        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 9, direction);
1504        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 9, direction);
1505        let tw4: Complex<f32> = twiddles::compute_twiddle(4, 9, direction);
1506        let twiddle1 = unsafe { _mm_set_ps(tw1.im, tw1.re, tw1.im, tw1.re) };
1507        let twiddle2 = unsafe { _mm_set_ps(tw2.im, tw2.re, tw2.im, tw2.re) };
1508        let twiddle4 = unsafe { _mm_set_ps(tw4.im, tw4.re, tw4.im, tw4.re) };
1509        Self {
1510            direction,
1511            _phantom: std::marker::PhantomData,
1512            bf3,
1513            twiddle1,
1514            twiddle2,
1515            twiddle4,
1516        }
1517    }
1518
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // A single Sse 9-point will need a lot of shuffling, let's just reuse the dual one
        let values = read_partial1_complex_to_array!(buffer, {0,1,2,3,4,5,6,7,8});

        let out = self.perform_parallel_fft_direct(values);

        // Only the low complex lane of each result is meaningful here.
        for n in 0..9 {
            buffer.store_partial_lo_complex(out[n], n);
        }
    }

    // Computes two interleaved 9-point FFTs from 18 contiguous f32 complex values.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16});

        // Deinterleave so that each vector holds element n of both transforms.
        let values = [
            extract_lo_hi_f32(input_packed[0], input_packed[4]),
            extract_hi_lo_f32(input_packed[0], input_packed[5]),
            extract_lo_hi_f32(input_packed[1], input_packed[5]),
            extract_hi_lo_f32(input_packed[1], input_packed[6]),
            extract_lo_hi_f32(input_packed[2], input_packed[6]),
            extract_hi_lo_f32(input_packed[2], input_packed[7]),
            extract_lo_hi_f32(input_packed[3], input_packed[7]),
            extract_hi_lo_f32(input_packed[3], input_packed[8]),
            extract_lo_hi_f32(input_packed[4], input_packed[8]),
        ];

        let out = self.perform_parallel_fft_direct(values);

        // Re-interleave the two result sets back into contiguous order.
        let out_packed = [
            extract_lo_lo_f32(out[0], out[1]),
            extract_lo_lo_f32(out[2], out[3]),
            extract_lo_lo_f32(out[4], out[5]),
            extract_lo_lo_f32(out[6], out[7]),
            extract_lo_hi_f32(out[8], out[0]),
            extract_hi_hi_f32(out[1], out[2]),
            extract_hi_hi_f32(out[3], out[4]),
            extract_hi_hi_f32(out[5], out[6]),
            extract_hi_hi_f32(out[7], out[8]),
        ];

        write_complex_to_array_strided!(out_packed, buffer, 2, {0,1,2,3,4,5,6,7,8});
    }

    // Computes two 9-point FFTs in parallel, one per complex lane.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 9]) -> [__m128; 9] {
        // Algorithm: 3x3 mixed radix

        // Size-3 FFTs down the columns
        let mid0 = self
            .bf3
            .perform_parallel_fft_direct(values[0], values[3], values[6]);
        let mut mid1 = self
            .bf3
            .perform_parallel_fft_direct(values[1], values[4], values[7]);
        let mut mid2 = self
            .bf3
            .perform_parallel_fft_direct(values[2], values[5], values[8]);

        // Apply twiddle factors. Note that we're re-using twiddle2
        mid1[1] = SseVector::mul_complex(self.twiddle1, mid1[1]);
        mid1[2] = SseVector::mul_complex(self.twiddle2, mid1[2]);
        mid2[1] = SseVector::mul_complex(self.twiddle2, mid2[1]);
        mid2[2] = SseVector::mul_complex(self.twiddle4, mid2[2]);

        // Size-3 FFTs down the rows
        let [output0, output1, output2] = self
            .bf3
            .perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self
            .bf3
            .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self
            .bf3
            .perform_parallel_fft_direct(mid0[2], mid1[2], mid2[2]);

        // Transpose the result into the final output order.
        [
            output0, output3, output6, output1, output4, output7, output2, output5, output8,
        ]
    }
1599}
1600
1601//    ___             __   _  _   _     _ _
1602//   / _ \           / /_ | || | | |__ (_) |_
1603//  | (_) |  _____  | '_ \| || |_| '_ \| | __|
1604//   \__, | |_____| | (_) |__   _| |_) | | |_
1605//     /_/           \___/   |_| |_.__/|_|\__|
1606//
1607
/// SSE-accelerated 9-point FFT butterfly for f64.
///
/// Each `__m128d` holds exactly one `Complex<f64>`, so this processes one
/// transform at a time, via a 3x3 mixed-radix decomposition.
pub struct SseF64Butterfly9<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    // Inner size-3 butterfly used for both passes of the 3x3 mixed radix
    bf3: SseF64Butterfly3<T>,
    // Twiddle factors for len 9; only w^1, w^2 and w^4 are needed, and
    // twiddle2 is used in two positions (see perform_fft_direct)
    twiddle1: __m128d,
    twiddle2: __m128d,
    twiddle4: __m128d,
}

// Generate the butterfly entry points and the common trait plumbing
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly9);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly9, 9, |this: &SseF64Butterfly9<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly9<T> {
    /// Constructs the butterfly, precomputing the packed twiddle factors.
    /// `assert_f64` checks that `T` is actually f64.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let bf3 = SseF64Butterfly3::new(direction);
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 9, direction);
        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 9, direction);
        let tw4: Complex<f64> = twiddles::compute_twiddle(4, 9, direction);
        // Pack each twiddle as [re, im] into a single __m128d
        let twiddle1 = unsafe { _mm_set_pd(tw1.im, tw1.re) };
        let twiddle2 = unsafe { _mm_set_pd(tw2.im, tw2.re) };
        let twiddle4 = unsafe { _mm_set_pd(tw4.im, tw4.re) };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
            twiddle1,
            twiddle2,
            twiddle4,
        }
    }

    /// Performs one in-place 9-point FFT on `buffer`.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8});

        let out = self.perform_fft_direct(values);

        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8});
    }

    /// Computes a 9-point FFT on 9 register-resident values.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 9]) -> [__m128d; 9] {
        // Algorithm: 3x3 mixed radix

        // Size-3 FFTs down the columns
        let mid0 = self.bf3.perform_fft_direct(values[0], values[3], values[6]);
        let mut mid1 = self.bf3.perform_fft_direct(values[1], values[4], values[7]);
        let mut mid2 = self.bf3.perform_fft_direct(values[2], values[5], values[8]);

        // Apply twiddle factors. Note that we're re-using twiddle2
        // (w^(1*2) and w^(2*1) are the same factor; mid0 needs no twiddles)
        mid1[1] = SseVector::mul_complex(self.twiddle1, mid1[1]);
        mid1[2] = SseVector::mul_complex(self.twiddle2, mid1[2]);
        mid2[1] = SseVector::mul_complex(self.twiddle2, mid2[1]);
        mid2[2] = SseVector::mul_complex(self.twiddle4, mid2[2]);

        // Size-3 FFTs across the rows
        let [output0, output1, output2] = self.bf3.perform_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self.bf3.perform_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self.bf3.perform_fft_direct(mid0[2], mid1[2], mid2[2]);

        // Reorder (transpose) into the final output order
        [
            output0, output3, output6, output1, output4, output7, output2, output5, output8,
        ]
    }
}
1674
1675//   _  ___            _________  _     _ _
1676//  / |/ _ \          |___ /___ \| |__ (_) |_
1677//  | | | | |  _____    |_ \ __) | '_ \| | __|
1678//  | | |_| | |_____|  ___) / __/| |_) | | |_
1679//  |_|\___/          |____/_____|_.__/|_|\__|
1680//
1681
/// SSE-accelerated 10-point FFT butterfly for f32.
///
/// Each `__m128` holds two `Complex<f32>` values, so the "parallel" methods
/// run two interleaved transforms at once.
pub struct SseF32Butterfly10<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    // Inner size-5 butterfly for the 5x2 Good-Thomas decomposition
    bf5: SseF32Butterfly5<T>,
}

// Generate the butterfly entry points and the common trait plumbing
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly10);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly10, 10, |this: &SseF32Butterfly10<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly10<T> {
    /// Constructs the butterfly. `assert_f32` checks that `T` is actually f32.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let bf5 = SseF32Butterfly5::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf5,
        }
    }

    /// Performs one in-place 10-point FFT on `buffer`.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // Load the 10 elements two-at-a-time into 5 vectors
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8});

        let out = self.perform_fft_direct(input_packed);

        write_complex_to_array_strided!(out, buffer, 2, {0,1,2,3,4});
    }

    /// Performs two interleaved 10-point FFTs on the 20 complex values in `buffer`.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18});

        // Interleave so each vector holds one element from each transform
        let values = interleave_complex_f32!(input_packed, 5, {0, 1, 2, 3, 4});

        let out = self.perform_parallel_fft_direct(values);

        // Undo the interleaving before storing
        let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6, 8});

        write_complex_to_array_strided!(out_sorted, buffer, 2, {0,1,2,3,4,5,6,7,8,9});
    }

    /// Computes a single 10-point FFT on 5 vectors of 2 packed elements each.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128; 5]) -> [__m128; 5] {
        // Algorithm: 5x2 good-thomas
        // Reorder and pack (Good-Thomas input permutation)
        let reord0 = extract_lo_hi_f32(values[0], values[2]);
        let reord1 = extract_lo_hi_f32(values[1], values[3]);
        let reord2 = extract_lo_hi_f32(values[2], values[4]);
        let reord3 = extract_lo_hi_f32(values[3], values[0]);
        let reord4 = extract_lo_hi_f32(values[4], values[1]);

        // Size-5 FFTs down the columns of our reordered array
        let mids = self
            .bf5
            .perform_parallel_fft_direct(reord0, reord1, reord2, reord3, reord4);

        // Since this is good-thomas algorithm, we don't need twiddle factors

        // Transpose the data and do size-2 FFTs down the columns
        let [temp01, temp23] = parallel_fft2_contiguous_f32(mids[0], mids[1]);
        let [temp45, temp67] = parallel_fft2_contiguous_f32(mids[2], mids[3]);
        let temp89 = solo_fft2_f32(mids[4]);

        // Reorder (Good-Thomas output permutation)
        let out01 = extract_lo_hi_f32(temp01, temp23);
        let out23 = extract_lo_hi_f32(temp45, temp67);
        let out45 = extract_lo_lo_f32(temp89, temp23);
        let out67 = extract_hi_lo_f32(temp01, temp67);
        let out89 = extract_hi_hi_f32(temp45, temp89);

        [out01, out23, out45, out67, out89]
    }

    /// Computes two interleaved 10-point FFTs; each `__m128` in `values` holds
    /// one element from each transform.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 10]) -> [__m128; 10] {
        // Algorithm: 5x2 good-thomas

        // Size-5 FFTs down the columns of our reordered array
        // (the index patterns are the Good-Thomas input permutation)
        let mid0 = self
            .bf5
            .perform_parallel_fft_direct(values[0], values[2], values[4], values[6], values[8]);
        let mid1 = self
            .bf5
            .perform_parallel_fft_direct(values[5], values[7], values[9], values[1], values[3]);

        // Since this is good-thomas algorithm, we don't need twiddle factors

        // Transpose the data and do size-2 FFTs down the columns
        let [output0, output1] = parallel_fft2_interleaved_f32(mid0[0], mid1[0]);
        let [output2, output3] = parallel_fft2_interleaved_f32(mid0[1], mid1[1]);
        let [output4, output5] = parallel_fft2_interleaved_f32(mid0[2], mid1[2]);
        let [output6, output7] = parallel_fft2_interleaved_f32(mid0[3], mid1[3]);
        let [output8, output9] = parallel_fft2_interleaved_f32(mid0[4], mid1[4]);

        // Reorder and return (Good-Thomas output permutation)
        [
            output0, output3, output4, output7, output8, output1, output2, output5, output6,
            output9,
        ]
    }
}
1785
1786//   _  ___             __   _  _   _     _ _
1787//  / |/ _ \           / /_ | || | | |__ (_) |_
1788//  | | | | |  _____  | '_ \| || |_| '_ \| | __|
1789//  | | |_| | |_____| | (_) |__   _| |_) | | |_
1790//  |_|\___/           \___/   |_| |_.__/|_|\__|
1791//
1792
/// SSE-accelerated 10-point FFT butterfly for f64 (one transform per call,
/// since each `__m128d` holds a single `Complex<f64>`).
pub struct SseF64Butterfly10<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    // Inner size-2 and size-5 butterflies for the 5x2 Good-Thomas decomposition
    bf2: SseF64Butterfly2<T>,
    bf5: SseF64Butterfly5<T>,
}

// Generate the butterfly entry points and the common trait plumbing
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly10);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly10, 10, |this: &SseF64Butterfly10<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly10<T> {
    /// Constructs the butterfly. `assert_f64` checks that `T` is actually f64.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let bf2 = SseF64Butterfly2::new(direction);
        let bf5 = SseF64Butterfly5::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf2,
            bf5,
        }
    }

    /// Performs one in-place 10-point FFT on `buffer`.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9});

        let out = self.perform_fft_direct(values);

        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
    }

    /// Computes a 10-point FFT on 10 register-resident values.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 10]) -> [__m128d; 10] {
        // Algorithm: 5x2 good-thomas

        // Size-5 FFTs down the columns of our reordered array
        // (the index patterns are the Good-Thomas input permutation)
        let mid0 = self
            .bf5
            .perform_fft_direct(values[0], values[2], values[4], values[6], values[8]);
        let mid1 = self
            .bf5
            .perform_fft_direct(values[5], values[7], values[9], values[1], values[3]);

        // Since this is good-thomas algorithm, we don't need twiddle factors

        // Transpose the data and do size-2 FFTs down the columns
        let [output0, output1] = self.bf2.perform_fft_direct(mid0[0], mid1[0]);
        let [output2, output3] = self.bf2.perform_fft_direct(mid0[1], mid1[1]);
        let [output4, output5] = self.bf2.perform_fft_direct(mid0[2], mid1[2]);
        let [output6, output7] = self.bf2.perform_fft_direct(mid0[3], mid1[3]);
        let [output8, output9] = self.bf2.perform_fft_direct(mid0[4], mid1[4]);

        // Reorder and return (Good-Thomas output permutation)
        [
            output0, output3, output4, output7, output8, output1, output2, output5, output6,
            output9,
        ]
    }
}
1854
1855//   _ ____            _________  _     _ _
1856//  / |___ \          |___ /___ \| |__ (_) |_
1857//  | | __) |  _____    |_ \ __) | '_ \| | __|
1858//  | |/ __/  |_____|  ___) / __/| |_) | | |_
1859//  |_|_____|         |____/_____|_.__/|_|\__|
1860//
1861
/// SSE-accelerated 12-point FFT butterfly for f32.
///
/// Each `__m128` holds two `Complex<f32>` values, so the "parallel" methods
/// run two interleaved transforms at once.
pub struct SseF32Butterfly12<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    // Inner size-3 and size-4 butterflies for the 4x3 Good-Thomas decomposition
    bf3: SseF32Butterfly3<T>,
    bf4: SseF32Butterfly4<T>,
}

// Generate the butterfly entry points and the common trait plumbing
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly12);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly12, 12, |this: &SseF32Butterfly12<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly12<T> {
    /// Constructs the butterfly. `assert_f32` checks that `T` is actually f32.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let bf3 = SseF32Butterfly3::new(direction);
        let bf4 = SseF32Butterfly4::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
            bf4,
        }
    }

    /// Performs one in-place 12-point FFT on `buffer`.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10 });

        let out = self.perform_fft_direct(input_packed);

        write_complex_to_array_strided!(out, buffer, 2, {0,1,2,3,4,5});
    }

    /// Performs two interleaved 12-point FFTs on the 24 complex values in `buffer`.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed =
            read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22});

        // Interleave so each vector holds one element from each transform
        let values = interleave_complex_f32!(input_packed, 6, {0, 1, 2, 3, 4, 5});

        let out = self.perform_parallel_fft_direct(values);

        // Undo the interleaving before storing
        let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6, 8, 10});

        write_complex_to_array_strided!(out_sorted, buffer, 2, {0,1,2,3,4,5,6,7,8,9, 10, 11});
    }

    /// Computes a single 12-point FFT on 6 vectors of 2 packed elements each.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128; 6]) -> [__m128; 6] {
        // Algorithm: 4x3 good-thomas

        // Reorder and pack (variable names list the two element indices each
        // vector carries, e.g. packed03 holds elements 0 and 3)
        let packed03 = extract_lo_hi_f32(values[0], values[1]);
        let packed47 = extract_lo_hi_f32(values[2], values[3]);
        let packed69 = extract_lo_hi_f32(values[3], values[4]);
        let packed101 = extract_lo_hi_f32(values[5], values[0]);
        let packed811 = extract_lo_hi_f32(values[4], values[5]);
        let packed25 = extract_lo_hi_f32(values[1], values[2]);

        // Size-4 FFTs down the columns of our reordered array
        let mid0 = self.bf4.perform_fft_direct(packed03, packed69);
        let mid1 = self.bf4.perform_fft_direct(packed47, packed101);
        let mid2 = self.bf4.perform_fft_direct(packed811, packed25);

        // Since this is good-thomas algorithm, we don't need twiddle factors

        // Transpose the data and do size-3 FFTs down the columns
        let [temp03, temp14, temp25] = self
            .bf3
            .perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [temp69, temp710, temp811] = self
            .bf3
            .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]);

        // Reorder and return (Good-Thomas output permutation)
        [
            extract_lo_hi_f32(temp03, temp14),
            extract_lo_hi_f32(temp811, temp69),
            extract_lo_hi_f32(temp14, temp25),
            extract_lo_hi_f32(temp69, temp710),
            extract_lo_hi_f32(temp25, temp03),
            extract_lo_hi_f32(temp710, temp811),
        ]
    }

    /// Computes two interleaved 12-point FFTs; each `__m128` in `values` holds
    /// one element from each transform.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 12]) -> [__m128; 12] {
        // Algorithm: 4x3 good-thomas

        // Size-4 FFTs down the columns of our reordered array
        // (the index patterns are the Good-Thomas input permutation)
        let mid0 = self
            .bf4
            .perform_parallel_fft_direct([values[0], values[3], values[6], values[9]]);
        let mid1 = self
            .bf4
            .perform_parallel_fft_direct([values[4], values[7], values[10], values[1]]);
        let mid2 = self
            .bf4
            .perform_parallel_fft_direct([values[8], values[11], values[2], values[5]]);

        // Since this is good-thomas algorithm, we don't need twiddle factors

        // Transpose the data and do size-3 FFTs down the columns
        let [output0, output1, output2] = self
            .bf3
            .perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self
            .bf3
            .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self
            .bf3
            .perform_parallel_fft_direct(mid0[2], mid1[2], mid2[2]);
        let [output9, output10, output11] = self
            .bf3
            .perform_parallel_fft_direct(mid0[3], mid1[3], mid2[3]);

        // Reorder and return (Good-Thomas output permutation)
        [
            output0, output4, output8, output9, output1, output5, output6, output10, output2,
            output3, output7, output11,
        ]
    }
}
1985
1986//   _ ____             __   _  _   _     _ _
1987//  / |___ \           / /_ | || | | |__ (_) |_
1988//  | | __) |  _____  | '_ \| || |_| '_ \| | __|
1989//  | |/ __/  |_____| | (_) |__   _| |_) | | |_
1990//  |_|_____|          \___/   |_| |_.__/|_|\__|
1991//
1992
/// SSE-accelerated 12-point FFT butterfly for f64 (one transform per call,
/// since each `__m128d` holds a single `Complex<f64>`).
pub struct SseF64Butterfly12<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    // Inner size-3 and size-4 butterflies for the 4x3 Good-Thomas decomposition
    bf3: SseF64Butterfly3<T>,
    bf4: SseF64Butterfly4<T>,
}

// Generate the butterfly entry points and the common trait plumbing
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly12);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly12, 12, |this: &SseF64Butterfly12<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly12<T> {
    /// Constructs the butterfly. `assert_f64` checks that `T` is actually f64.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let bf3 = SseF64Butterfly3::new(direction);
        let bf4 = SseF64Butterfly4::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
            bf4,
        }
    }

    /// Performs one in-place 12-point FFT on `buffer`.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});

        let out = self.perform_fft_direct(values);

        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
    }

    /// Computes a 12-point FFT on 12 register-resident values.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 12]) -> [__m128d; 12] {
        // Algorithm: 4x3 good-thomas

        // Size-4 FFTs down the columns of our reordered array
        // (the index patterns are the Good-Thomas input permutation)
        let mid0 = self
            .bf4
            .perform_fft_direct([values[0], values[3], values[6], values[9]]);
        let mid1 = self
            .bf4
            .perform_fft_direct([values[4], values[7], values[10], values[1]]);
        let mid2 = self
            .bf4
            .perform_fft_direct([values[8], values[11], values[2], values[5]]);

        // Since this is good-thomas algorithm, we don't need twiddle factors

        // Transpose the data and do size-3 FFTs down the columns
        let [output0, output1, output2] = self.bf3.perform_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self.bf3.perform_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self.bf3.perform_fft_direct(mid0[2], mid1[2], mid2[2]);
        let [output9, output10, output11] = self.bf3.perform_fft_direct(mid0[3], mid1[3], mid2[3]);

        // Good-Thomas output permutation
        [
            output0, output4, output8, output9, output1, output5, output6, output10, output2,
            output3, output7, output11,
        ]
    }
}
2055
2056//   _ ____            _________  _     _ _
2057//  / | ___|          |___ /___ \| |__ (_) |_
2058//  | |___ \   _____    |_ \ __) | '_ \| | __|
2059//  | |___) | |_____|  ___) / __/| |_) | | |_
2060//  |_|____/          |____/_____|_.__/|_|\__|
2061//
/// SSE-accelerated 15-point FFT butterfly for f32.
///
/// Each `__m128` holds two `Complex<f32>` values. There is no dedicated
/// single-FFT kernel; the single-FFT path reuses the dual-FFT kernel.
pub struct SseF32Butterfly15<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    // Inner size-3 and size-5 butterflies for the 5x3 Good-Thomas decomposition
    bf3: SseF32Butterfly3<T>,
    bf5: SseF32Butterfly5<T>,
}

// Generate the butterfly entry points and the common trait plumbing
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly15);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly15, 15, |this: &SseF32Butterfly15<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly15<T> {
    /// Constructs the butterfly. `assert_f32` checks that `T` is actually f32.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let bf3 = SseF32Butterfly3::new(direction);
        let bf5 = SseF32Butterfly5::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
            bf5,
        }
    }

    /// Performs one in-place 15-point FFT on `buffer`.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // A single Sse 15-point will need a lot of shuffling, let's just reuse the dual one
        let values = read_partial1_complex_to_array!(buffer, {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14});

        let out = self.perform_parallel_fft_direct(values);

        // Only the low lane of each result is valid for this single transform
        for n in 0..15 {
            buffer.store_partial_lo_complex(out[n], n);
        }
    }

    /// Performs two interleaved 15-point FFTs on the 30 complex values in `buffer`:
    /// one on indices 0..15 and one on indices 15..30.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed =
            read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28});

        // Pair element i of the first transform (low lane) with element i of
        // the second transform (high lane) in each vector
        let values = [
            extract_lo_hi_f32(input_packed[0], input_packed[7]),
            extract_hi_lo_f32(input_packed[0], input_packed[8]),
            extract_lo_hi_f32(input_packed[1], input_packed[8]),
            extract_hi_lo_f32(input_packed[1], input_packed[9]),
            extract_lo_hi_f32(input_packed[2], input_packed[9]),
            extract_hi_lo_f32(input_packed[2], input_packed[10]),
            extract_lo_hi_f32(input_packed[3], input_packed[10]),
            extract_hi_lo_f32(input_packed[3], input_packed[11]),
            extract_lo_hi_f32(input_packed[4], input_packed[11]),
            extract_hi_lo_f32(input_packed[4], input_packed[12]),
            extract_lo_hi_f32(input_packed[5], input_packed[12]),
            extract_hi_lo_f32(input_packed[5], input_packed[13]),
            extract_lo_hi_f32(input_packed[6], input_packed[13]),
            extract_hi_lo_f32(input_packed[6], input_packed[14]),
            extract_lo_hi_f32(input_packed[7], input_packed[14]),
        ];

        let out = self.perform_parallel_fft_direct(values);

        // De-interleave the lane pairs back into the two contiguous halves
        let out_packed = [
            extract_lo_lo_f32(out[0], out[1]),
            extract_lo_lo_f32(out[2], out[3]),
            extract_lo_lo_f32(out[4], out[5]),
            extract_lo_lo_f32(out[6], out[7]),
            extract_lo_lo_f32(out[8], out[9]),
            extract_lo_lo_f32(out[10], out[11]),
            extract_lo_lo_f32(out[12], out[13]),
            extract_lo_hi_f32(out[14], out[0]),
            extract_hi_hi_f32(out[1], out[2]),
            extract_hi_hi_f32(out[3], out[4]),
            extract_hi_hi_f32(out[5], out[6]),
            extract_hi_hi_f32(out[7], out[8]),
            extract_hi_hi_f32(out[9], out[10]),
            extract_hi_hi_f32(out[11], out[12]),
            extract_hi_hi_f32(out[13], out[14]),
        ];

        write_complex_to_array_strided!(out_packed, buffer, 2, {0,1,2,3,4,5,6,7,8,9, 10, 11, 12, 13, 14});
    }

    /// Computes two interleaved 15-point FFTs; each `__m128` in `values` holds
    /// one element from each transform.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 15]) -> [__m128; 15] {
        // Algorithm: 5x3 good-thomas

        // Size-5 FFTs down the columns of our reordered array
        // (the index patterns are the Good-Thomas input permutation)
        let mid0 = self
            .bf5
            .perform_parallel_fft_direct(values[0], values[3], values[6], values[9], values[12]);
        let mid1 = self
            .bf5
            .perform_parallel_fft_direct(values[5], values[8], values[11], values[14], values[2]);
        let mid2 = self
            .bf5
            .perform_parallel_fft_direct(values[10], values[13], values[1], values[4], values[7]);

        // Since this is good-thomas algorithm, we don't need twiddle factors

        // Transpose the data and do size-3 FFTs down the columns
        let [output0, output1, output2] = self
            .bf3
            .perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self
            .bf3
            .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self
            .bf3
            .perform_parallel_fft_direct(mid0[2], mid1[2], mid2[2]);
        let [output9, output10, output11] = self
            .bf3
            .perform_parallel_fft_direct(mid0[3], mid1[3], mid2[3]);
        let [output12, output13, output14] = self
            .bf3
            .perform_parallel_fft_direct(mid0[4], mid1[4], mid2[4]);

        // Good-Thomas output permutation
        [
            output0, output4, output8, output9, output13, output2, output3, output7, output11,
            output12, output1, output5, output6, output10, output14,
        ]
    }
}
2184
2185//   _ ____             __   _  _   _     _ _
2186//  / | ___|           / /_ | || | | |__ (_) |_
2187//  | |___ \   _____  | '_ \| || |_| '_ \| | __|
2188//  | |___) | |_____| | (_) |__   _| |_) | | |_
2189//  |_|____/           \___/   |_| |_.__/|_|\__|
2190//
2191
/// SSE-accelerated 15-point FFT butterfly for f64 (one transform per call,
/// since each `__m128d` holds a single `Complex<f64>`).
pub struct SseF64Butterfly15<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    // Inner size-3 and size-5 butterflies for the 5x3 Good-Thomas decomposition
    bf3: SseF64Butterfly3<T>,
    bf5: SseF64Butterfly5<T>,
}

// Generate the butterfly entry points and the common trait plumbing
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly15);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly15, 15, |this: &SseF64Butterfly15<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly15<T> {
    /// Constructs the butterfly. `assert_f64` checks that `T` is actually f64.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let bf3 = SseF64Butterfly3::new(direction);
        let bf5 = SseF64Butterfly5::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
            bf5,
        }
    }

    /// Performs one in-place 15-point FFT on `buffer`.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let values =
            read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});

        let out = self.perform_fft_direct(values);

        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
    }

    /// Computes a 15-point FFT on 15 register-resident values.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 15]) -> [__m128d; 15] {
        // Algorithm: 5x3 good-thomas

        // Size-5 FFTs down the columns of our reordered array
        // (the index patterns are the Good-Thomas input permutation)
        let mid0 = self
            .bf5
            .perform_fft_direct(values[0], values[3], values[6], values[9], values[12]);
        let mid1 = self
            .bf5
            .perform_fft_direct(values[5], values[8], values[11], values[14], values[2]);
        let mid2 = self
            .bf5
            .perform_fft_direct(values[10], values[13], values[1], values[4], values[7]);

        // Since this is good-thomas algorithm, we don't need twiddle factors

        // Transpose the data and do size-3 FFTs down the columns
        let [output0, output1, output2] = self.bf3.perform_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self.bf3.perform_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self.bf3.perform_fft_direct(mid0[2], mid1[2], mid2[2]);
        let [output9, output10, output11] = self.bf3.perform_fft_direct(mid0[3], mid1[3], mid2[3]);
        let [output12, output13, output14] = self.bf3.perform_fft_direct(mid0[4], mid1[4], mid2[4]);

        // Good-Thomas output permutation
        [
            output0, output4, output8, output9, output13, output2, output3, output7, output11,
            output12, output1, output5, output6, output10, output14,
        ]
    }
}
2256
2257//   _  __             _________  _     _ _
2258//  / |/ /_           |___ /___ \| |__ (_) |_
2259//  | | '_ \   _____    |_ \ __) | '_ \| | __|
2260//  | | (_) | |_____|  ___) / __/| |_) | | |_
2261//  |_|\___/          |____/_____|_.__/|_|\__|
2262//
2263
/// SSE-accelerated 16-point FFT butterfly for f32 (4x4 mixed radix).
///
/// Note the "noparallel" boilerplate below: per the benchmarking note further
/// down, the single-FFT kernel is always used; there is no dual-FFT fast path.
pub struct SseF32Butterfly16<T> {
    bf4: SseF32Butterfly4<T>,
    // Twiddle pairs packed two per vector, in the order consumed by
    // perform_fft_contiguous
    twiddles_packed: [__m128; 6],
    // Single twiddles duplicated into both lanes of a vector
    twiddle1: __m128,
    twiddle3: __m128,
    twiddle9: __m128,
}

// Generate the butterfly entry points and the common trait plumbing;
// direction is stored on the inner size-4 butterfly rather than on Self
boilerplate_fft_sse_f32_butterfly_noparallel!(SseF32Butterfly16);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly16, 16, |this: &SseF32Butterfly16<_>| this
    .bf4
    .direction);
2276impl<T: FftNum> SseF32Butterfly16<T> {
2277    pub fn new(direction: FftDirection) -> Self {
2278        assert_f32::<T>();
2279        let tw0: Complex<f32> = Complex { re: 1.0, im: 0.0 };
2280        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 16, direction);
2281        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 16, direction);
2282        let tw3: Complex<f32> = twiddles::compute_twiddle(3, 16, direction);
2283        let tw4: Complex<f32> = twiddles::compute_twiddle(4, 16, direction);
2284        let tw6: Complex<f32> = twiddles::compute_twiddle(6, 16, direction);
2285        let tw9: Complex<f32> = twiddles::compute_twiddle(9, 16, direction);
2286
2287        unsafe {
2288            Self {
2289                bf4: SseF32Butterfly4::new(direction),
2290                twiddles_packed: [
2291                    pack_32(tw0, tw1),
2292                    pack_32(tw0, tw2),
2293                    pack_32(tw0, tw3),
2294                    pack_32(tw2, tw3),
2295                    pack_32(tw4, tw6),
2296                    pack_32(tw6, tw9),
2297                ],
2298                twiddle1: pack_32(tw1, tw1),
2299                twiddle3: pack_32(tw3, tw3),
2300                twiddle9: pack_32(tw9, tw9),
2301            }
2302        }
2303    }
2304
    /// Performs one in-place 16-point FFT on `buffer` via a 4x4 mixed-radix
    /// decomposition, with loads and stores scheduled to minimize register spills.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // To make the best possible use of registers, we're going to write this algorithm in an unusual way
        // It's 4x4 mixed radix, so we're going to do the usual steps of size-4 FFTs down the columns, apply twiddle factors, then transpose and do size-4 FFTs again
        // But to reduce the number of times registers get spilled, we have these optimizations:
        // 1: Load data as late as possible, not upfront
        // 2: Once we're working with a piece of data, make as much progress as possible before moving on
        //      IE, once we load a column, we should do the FFT down the column, do twiddle factors, and do the pieces of the transpose for that column, all before starting on the next column
        // 3: Store data as soon as we're finished with it, rather than waiting for the end

        // load(i) grabs columns i and i+1 of the 4x4 grid: stride 4, with two
        // adjacent complex elements packed per vector
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 4),
                buffer.load_complex(i + 8),
                buffer.load_complex(i + 12),
            ]
        };

        // For each pair of columns: load the data, apply our size-4 FFT, apply twiddle factors, and transpose
        let mut tmp0 = self.bf4.perform_parallel_fft_direct(load(0));
        tmp0[1] = SseVector::mul_complex(tmp0[1], self.twiddles_packed[0]);
        tmp0[2] = SseVector::mul_complex(tmp0[2], self.twiddles_packed[1]);
        tmp0[3] = SseVector::mul_complex(tmp0[3], self.twiddles_packed[2]);
        let [mid0, mid1] = transpose_complex_2x2_f32(tmp0[0], tmp0[1]);
        let [mid4, mid5] = transpose_complex_2x2_f32(tmp0[2], tmp0[3]);

        let mut tmp1 = self.bf4.perform_parallel_fft_direct(load(2));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddles_packed[3]);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddles_packed[4]);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddles_packed[5]);
        let [mid2, mid3] = transpose_complex_2x2_f32(tmp1[0], tmp1[1]);
        let [mid6, mid7] = transpose_complex_2x2_f32(tmp1[2], tmp1[3]);

        ////////////////////////////////////////////////////////////
        // store(i) writes one result batch back with the same stride-4 layout
        let mut store = |i: usize, vectors: [__m128; 4]| {
            buffer.store_complex(vectors[0], i + 0);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
        };
        // Size-4 FFTs down each pair of transposed columns, storing them as soon as we're done with them
        let out0 = self
            .bf4
            .perform_parallel_fft_direct([mid0, mid1, mid2, mid3]);
        store(0, out0);

        let out1 = self
            .bf4
            .perform_parallel_fft_direct([mid4, mid5, mid6, mid7]);
        store(2, out1);
    }
2356
    // benchmarking shows it's faster to always use the nonparallel version, but this is kept around for reference
    #[allow(unused)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // Computes two size-16 FFTs at once: one over buffer[0..16] and one over
        // buffer[16..32]. The 2x2 transposes in `load` pair up corresponding
        // elements of the two FFTs into the two complex lanes of each __m128.
        //
        // To make the best possible use of registers, we're going to write this algorithm in an unusual way
        // It's 4x4 mixed radix, so we're going to do the usual steps of size-4 FFTs down the columns, apply twiddle factors, then transpose and do size-4 FFTs again
        // But to reduce the number of times registers get spilled, we have these optimizations:
        // 1: Load data as late as possible, not upfront
        // 2: Once we're working with a piece of data, make as much progress as possible before moving on
        //      IE, once we load a column, we should do the FFT down the column, do twiddle factors, and do the pieces of the transpose for that column, all before starting on the next column
        // 3: Store data as soon as we're finished with it, rather than waiting for the end

        // Load columns i and i+1 of the 4x4 layout for both FFTs (row stride is 4).
        let load = |i: usize| {
            let [a0, a1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 0), buffer.load_complex(i + 16));
            let [b0, b1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 4), buffer.load_complex(i + 20));
            let [c0, c1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 8), buffer.load_complex(i + 24));
            let [d0, d1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 12), buffer.load_complex(i + 28));
            [[a0, b0, c0, d0], [a1, b1, c1, d1]]
        };

        // For each pair of columns: load the data, apply our size-4 FFT, apply twiddle factors
        // Column 2 needs W^2, W^4, W^6 of N=16, which are exact 45/90/135-degree
        // rotations; column 3 needs W^3, W^6, W^9 (W^6 again a 135-degree rotation).
        let [in2, in3] = load(2);
        let mut tmp2 = self.bf4.perform_parallel_fft_direct(in2);
        let mut tmp3 = self.bf4.perform_parallel_fft_direct(in3);
        tmp2[1] = self.bf4.rotate.rotate_both_45(tmp2[1]);
        tmp2[2] = self.bf4.rotate.rotate_both(tmp2[2]);
        tmp2[3] = self.bf4.rotate.rotate_both_135(tmp2[3]);
        tmp3[1] = SseVector::mul_complex(tmp3[1], self.twiddle3);
        tmp3[2] = self.bf4.rotate.rotate_both_135(tmp3[2]);
        tmp3[3] = SseVector::mul_complex(tmp3[3], self.twiddle9);

        // Do these last, because fewer twiddles means fewer temporaries forcing the above data to spill
        // Column 0 needs no twiddles; column 1 needs W^1, W^2 (45 degrees), W^3.
        let [in0, in1] = load(0);
        let tmp0 = self.bf4.perform_parallel_fft_direct(in0);
        let mut tmp1 = self.bf4.perform_parallel_fft_direct(in1);
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = self.bf4.rotate.rotate_both_45(tmp1[2]);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddle3);

        ////////////////////////////////////////////////////////////
        // Un-interleave the two FFTs and write rows back (the +16 offset targets
        // the second FFT of the pair).
        let mut store = |i, values_a: [__m128; 4], values_b: [__m128; 4]| {
            for n in 0..4 {
                let [a, b] = transpose_complex_2x2_f32(values_a[n], values_b[n]);
                buffer.store_complex(a, i + n * 4);
                buffer.store_complex(b, i + n * 4 + 16);
            }
        };
        // Size-4 FFTs down each pair of transposed columns, storing them as soon as we're done with them
        let out0 = self
            .bf4
            .perform_parallel_fft_direct([tmp0[0], tmp1[0], tmp2[0], tmp3[0]]);
        let out1 = self
            .bf4
            .perform_parallel_fft_direct([tmp0[1], tmp1[1], tmp2[1], tmp3[1]]);
        store(0, out0, out1);

        let out2 = self
            .bf4
            .perform_parallel_fft_direct([tmp0[2], tmp1[2], tmp2[2], tmp3[2]]);
        let out3 = self
            .bf4
            .perform_parallel_fft_direct([tmp0[3], tmp1[3], tmp2[3], tmp3[3]]);
        store(2, out2, out3);
    }
}
2423}
2424//   _  __              __   _  _   _     _ _
2425//  / |/ /_            / /_ | || | | |__ (_) |_
2426//  | | '_ \   _____  | '_ \| || |_| '_ \| | __|
2427//  | | (_) | |_____| | (_) |__   _| |_) | | |_
2428//  |_|\___/           \___/   |_| |_.__/|_|\__|
2429//
2430
/// SSE-accelerated size-16 FFT butterfly for f64, computed as a 4x4 mixed radix.
/// Each __m128d holds one Complex<f64>.
pub struct SseF64Butterfly16<T> {
    // Inner size-4 butterfly; also supplies the direction and the rotation helpers.
    bf4: SseF64Butterfly4<T>,
    // Twiddle factors W^k = e^(±2πik/16) for k = 1, 3, 9, each packed into a __m128d.
    // The other nontrivial twiddles (k = 2, 4, 6) are applied as 45/90/135-degree
    // rotations via bf4.rotate instead.
    twiddle1: __m128d,
    twiddle3: __m128d,
    twiddle9: __m128d,
}
2437
// Macro-generated boilerplate: target_feature-gated wrapper methods plus the
// common trait plumbing (presumably the f64 analogue of the f32 macro above);
// length is 16, and the direction is read from the inner size-4 butterfly.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly16);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly16, 16, |this: &SseF64Butterfly16<_>| this
    .bf4
    .direction);
impl<T: FftNum> SseF64Butterfly16<T> {
    /// Creates a size-16 butterfly, precomputing the twiddle factors it needs.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        // Twiddles W^k = e^(±2πik/16) for k = 1, 3, 9. The remaining nontrivial
        // twiddles of the 4x4 decomposition (k = 2, 4, 6) are exact 45/90/135-degree
        // rotations, applied below via self.bf4.rotate instead of a full multiply.
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 16, direction);
        let tw3: Complex<f64> = twiddles::compute_twiddle(3, 16, direction);
        let tw9: Complex<f64> = twiddles::compute_twiddle(9, 16, direction);

        // SAFETY: pack_64 only wraps _mm_set_pd; it performs no memory access.
        unsafe {
            Self {
                bf4: SseF64Butterfly4::new(direction),
                twiddle1: pack_64(tw1),
                twiddle3: pack_64(tw3),
                twiddle9: pack_64(tw9),
            }
        }
    }

    /// In-place size-16 FFT over `buffer`, as a 4x4 mixed radix: size-4 FFTs down
    /// the columns, twiddles, then size-4 FFTs down the transposed rows.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        // To make the best possible use of registers, we're going to write this algorithm in an unusual way
        // It's 4x4 mixed radix, so we're going to do the usual steps of size-4 FFTs down the columns, apply twiddle factors, then transpose and do size-4 FFTs again
        // But to reduce the number of times registers get spilled, we have these optimizations:
        // 1: Load data as late as possible, not upfront
        // 2: Once we're working with a piece of data, make as much progress as possible before moving on
        //      IE, once we load a column, we should do the FFT down the column, do twiddle factors, and do the pieces of the transpose for that column, all before starting on the next column
        // 3: Store data as soon as we're finished with it, rather than waiting for the end

        // Load column i of the 4x4 layout (row stride is 4 complex values).
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 4),
                buffer.load_complex(i + 8),
                buffer.load_complex(i + 12),
            ]
        };

        // For each column: load the data, apply our size-4 FFT, apply twiddle factors
        // Column 1 needs W^1, W^2, W^3 of N=16; W^2 is a pure 45-degree rotation.
        let mut tmp1 = self.bf4.perform_fft_direct(load(1));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = self.bf4.rotate.rotate_45(tmp1[2]);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddle3);

        // Column 3 needs W^3, W^6, W^9; W^6 is a pure 135-degree rotation.
        let mut tmp3 = self.bf4.perform_fft_direct(load(3));
        tmp3[1] = SseVector::mul_complex(tmp3[1], self.twiddle3);
        tmp3[2] = self.bf4.rotate.rotate_135(tmp3[2]);
        tmp3[3] = SseVector::mul_complex(tmp3[3], self.twiddle9);

        // Column 2 needs W^2, W^4, W^6 — all pure rotations (45/90/135 degrees),
        // so no multiplies at all here.
        let mut tmp2 = self.bf4.perform_fft_direct(load(2));
        tmp2[1] = self.bf4.rotate.rotate_45(tmp2[1]);
        tmp2[2] = self.bf4.rotate.rotate(tmp2[2]);
        tmp2[3] = self.bf4.rotate.rotate_135(tmp2[3]);

        // Do the first column last, because no twiddles means fewer temporaries forcing the above data to spill
        let tmp0 = self.bf4.perform_fft_direct(load(0));

        ////////////////////////////////////////////////////////////
        // Store row i of the output (stride 4 mirrors the transposed layout).
        let mut store = |i: usize, vectors: [__m128d; 4]| {
            buffer.store_complex(vectors[0], i + 0);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
        };

        // Size-4 FFTs down each of our transposed columns, storing them as soon as we're done with them
        let out0 = self
            .bf4
            .perform_fft_direct([tmp0[0], tmp1[0], tmp2[0], tmp3[0]]);
        store(0, out0);

        let out1 = self
            .bf4
            .perform_fft_direct([tmp0[1], tmp1[1], tmp2[1], tmp3[1]]);
        store(1, out1);

        let out2 = self
            .bf4
            .perform_fft_direct([tmp0[2], tmp1[2], tmp2[2], tmp3[2]]);
        store(2, out2);

        let out3 = self
            .bf4
            .perform_fft_direct([tmp0[3], tmp1[3], tmp2[3], tmp3[3]]);
        store(3, out3);
    }
}
2527
2528//    ___ _  _             _________  _     _ _
2529//   |__ \ || |           |___ /___ \| |__ (_) |_
2530//    __) ||| |_   _____    |_ \ __) | '_ \| | __|
2531//   / __/__   _| |_____|  ___) / __/| |_) | | |_
2532//  |____}  |_|           |____/_____|_.__/|_|\__|
2533//
2534
/// SSE-accelerated size-24 FFT butterfly for f32, computed as a 4x6 mixed radix
/// (size-4 FFTs down the columns, twiddles, then size-6 FFTs). Each __m128 holds
/// two Complex<f32> values.
pub struct SseF32Butterfly24<T> {
    // Inner size-4 butterfly; also supplies the direction and rotation helpers.
    bf4: SseF32Butterfly4<T>,
    // Inner size-6 butterfly for the second mixed-radix pass.
    bf6: SseF32Butterfly6<T>,
    // Pairs of twiddles (one per lane) for the non-parallel path, which processes
    // two adjacent columns per vector. Layout is set up in `new`.
    twiddles_packed: [__m128; 9],
    // Individual twiddles W^k of N=24, duplicated into both lanes, for the
    // parallel (two-FFTs-at-once) path. The missing k values are applied as pure
    // rotations or a negation instead.
    twiddle1: __m128,
    twiddle2: __m128,
    twiddle4: __m128,
    twiddle5: __m128,
    twiddle8: __m128,
    twiddle10: __m128,
}
2546
// Macro-generated boilerplate: the sse4.1 target_feature wrapper methods (see the
// macro definition near the top of this file) plus the common trait plumbing;
// length is 24, and the direction is read from the inner size-4 butterfly.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly24);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly24, 24, |this: &SseF32Butterfly24<_>| this
    .bf4
    .direction);
impl<T: FftNum> SseF32Butterfly24<T> {
    /// Creates a size-24 butterfly, precomputing the twiddle factors it needs.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        // tw0 is the unit twiddle W^0 = 1; the others are W^k = e^(±2πik/24).
        let tw0: Complex<f32> = Complex { re: 1.0, im: 0.0 };
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 24, direction);
        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 24, direction);
        let tw3: Complex<f32> = twiddles::compute_twiddle(3, 24, direction);
        let tw4: Complex<f32> = twiddles::compute_twiddle(4, 24, direction);
        let tw5: Complex<f32> = twiddles::compute_twiddle(5, 24, direction);
        let tw6: Complex<f32> = twiddles::compute_twiddle(6, 24, direction);
        let tw8: Complex<f32> = twiddles::compute_twiddle(8, 24, direction);
        let tw9: Complex<f32> = twiddles::compute_twiddle(9, 24, direction);
        let tw10: Complex<f32> = twiddles::compute_twiddle(10, 24, direction);
        let tw12: Complex<f32> = twiddles::compute_twiddle(12, 24, direction);
        let tw15: Complex<f32> = twiddles::compute_twiddle(15, 24, direction);
        // SAFETY: pack_32 only wraps _mm_set_ps; it performs no memory access.
        unsafe {
            Self {
                bf4: SseF32Butterfly4::new(direction),
                bf6: SseF32Butterfly6::new(direction),
                // Entries [0..3] serve column pair (0,1): rows 1..3 need
                // (W^0,W^1), (W^0,W^2), (W^0,W^3). Entries [3..6] serve columns
                // (2,3), and [6..9] serve columns (4,5).
                twiddles_packed: [
                    pack_32(tw0, tw1),
                    pack_32(tw0, tw2),
                    pack_32(tw0, tw3),
                    pack_32(tw2, tw3),
                    pack_32(tw4, tw6),
                    pack_32(tw6, tw9),
                    pack_32(tw4, tw5),
                    pack_32(tw8, tw10),
                    pack_32(tw12, tw15),
                ],
                // Splatted single twiddles for the parallel path.
                twiddle1: pack_32(tw1, tw1),
                twiddle2: pack_32(tw2, tw2),
                twiddle4: pack_32(tw4, tw4),
                twiddle5: pack_32(tw5, tw5),
                twiddle8: pack_32(tw8, tw8),
                twiddle10: pack_32(tw10, tw10),
            }
        }
    }

    /// In-place size-24 FFT over `buffer`. Columns are processed two at a time,
    /// one pair per __m128 vector.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // To make the best possible use of registers, we're going to write this algorithm in an unusual way
        // It's 6x4 mixed radix, so we're going to do the usual steps of size-4 FFTs down the columns, apply twiddle factors, then transpose and do size-6 FFTs
        // But to reduce the number of times registers get spilled, we have these optimizations:
        // 1: Load data as late as possible, not upfront
        // 2: Once we're working with a piece of data, make as much progress as possible before moving on
        //      IE, once we load a column, we should do the FFT down the column, do twiddle factors, and do the pieces of the transpose for that column, all before starting on the next column
        // 3: Store data as soon as we're finished with it, rather than waiting for the end

        // Load columns i and i+1 of the 4-row layout (row stride is 6).
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 6),
                buffer.load_complex(i + 12),
                buffer.load_complex(i + 18),
            ]
        };

        // For each pair of columns: load the data, apply our size-4 FFT, apply twiddle factors, transpose
        // Columns 2 and 3, using packed twiddles (W^2,W^3), (W^4,W^6), (W^6,W^9).
        let mut tmp1 = self.bf4.perform_parallel_fft_direct(load(2));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddles_packed[3]);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddles_packed[4]);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddles_packed[5]);
        let [mid2, mid3] = transpose_complex_2x2_f32(tmp1[0], tmp1[1]);
        let [mid8, mid9] = transpose_complex_2x2_f32(tmp1[2], tmp1[3]);

        // Columns 4 and 5, using packed twiddles (W^4,W^5), (W^8,W^10), (W^12,W^15).
        let mut tmp2 = self.bf4.perform_parallel_fft_direct(load(4));
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddles_packed[6]);
        tmp2[2] = SseVector::mul_complex(tmp2[2], self.twiddles_packed[7]);
        tmp2[3] = SseVector::mul_complex(tmp2[3], self.twiddles_packed[8]);
        let [mid4, mid5] = transpose_complex_2x2_f32(tmp2[0], tmp2[1]);
        let [mid10, mid11] = transpose_complex_2x2_f32(tmp2[2], tmp2[3]);

        // Columns 0 and 1, using packed twiddles (W^0,W^1), (W^0,W^2), (W^0,W^3).
        let mut tmp0 = self.bf4.perform_parallel_fft_direct(load(0));
        tmp0[1] = SseVector::mul_complex(tmp0[1], self.twiddles_packed[0]);
        tmp0[2] = SseVector::mul_complex(tmp0[2], self.twiddles_packed[1]);
        tmp0[3] = SseVector::mul_complex(tmp0[3], self.twiddles_packed[2]);
        let [mid0, mid1] = transpose_complex_2x2_f32(tmp0[0], tmp0[1]);
        let [mid6, mid7] = transpose_complex_2x2_f32(tmp0[2], tmp0[3]);

        ////////////////////////////////////////////////////////////
        // Store the six output vectors of one size-6 pass (stride 4).
        let mut store = |i, vectors: [__m128; 6]| {
            buffer.store_complex(vectors[0], i);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
            buffer.store_complex(vectors[4], i + 16);
            buffer.store_complex(vectors[5], i + 20);
        };

        // Size-6 FFTs down each pair of transposed columns, storing them as soon as we're done with them
        let out0 = self
            .bf6
            .perform_parallel_fft_direct(mid0, mid1, mid2, mid3, mid4, mid5);
        store(0, out0);

        let out1 = self
            .bf6
            .perform_parallel_fft_direct(mid6, mid7, mid8, mid9, mid10, mid11);
        store(2, out1);
    }

    /// Computes two size-24 FFTs at once: one over buffer[0..24] and one over
    /// buffer[24..48], with corresponding elements paired into the two lanes of
    /// each __m128 vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // To make the best possible use of registers, we're going to write this algorithm in an unusual way
        // It's 6x4 mixed radix, so we're going to do the usual steps of size-4 FFTs down the columns, apply twiddle factors, then transpose and do size-6 FFTs
        // But to reduce the number of times registers get spilled, we have these optimizations:
        // 1: Load data as late as possible, not upfront
        // 2: Once we're working with a piece of data, make as much progress as possible before moving on
        //      IE, once we load a column, we should do the FFT down the column, do twiddle factors, and do the pieces of the transpose for that column, all before starting on the next column
        // 3: Store data as soon as we're finished with it, rather than waiting for the end

        // Load columns i and i+1 for both FFTs; the +24 offset addresses the
        // second FFT, and the transposes interleave the two into SSE lanes.
        let load = |i: usize| {
            let [a0, a1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 0), buffer.load_complex(i + 24));
            let [b0, b1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 6), buffer.load_complex(i + 30));
            let [c0, c1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 12), buffer.load_complex(i + 36));
            let [d0, d1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 18), buffer.load_complex(i + 42));
            [[a0, b0, c0, d0], [a1, b1, c1, d1]]
        };

        // For each pair of columns: load the data, apply our size-4 FFT, apply twiddle factors
        // Column 0 needs no twiddles; column 1 needs W^1, W^2, W^3 of N=24
        // (W^3 is a pure 45-degree rotation).
        let [in0, in1] = load(0);
        let tmp0 = self.bf4.perform_parallel_fft_direct(in0);
        let mut tmp1 = self.bf4.perform_parallel_fft_direct(in1);
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddle2);
        tmp1[3] = self.bf4.rotate.rotate_both_45(tmp1[3]);

        // Column 2 needs W^2, W^4, W^6 (W^6 = 90 degrees); column 3 needs
        // W^3, W^6, W^9 — all pure rotations (45/90/135 degrees).
        let [in2, in3] = load(2);
        let mut tmp2 = self.bf4.perform_parallel_fft_direct(in2);
        let mut tmp3 = self.bf4.perform_parallel_fft_direct(in3);
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddle2);
        tmp2[2] = SseVector::mul_complex(tmp2[2], self.twiddle4);
        tmp2[3] = self.bf4.rotate.rotate_both(tmp2[3]);
        tmp3[1] = self.bf4.rotate.rotate_both_45(tmp3[1]);
        tmp3[2] = self.bf4.rotate.rotate_both(tmp3[2]);
        tmp3[3] = self.bf4.rotate.rotate_both_135(tmp3[3]);

        // Column 4 needs W^4, W^8, W^12 (W^12 = -1, so a simple negation);
        // column 5 needs W^5, W^10, W^15 (W^15 = 225-degree rotation).
        let [in4, in5] = load(4);
        let mut tmp4 = self.bf4.perform_parallel_fft_direct(in4);
        let mut tmp5 = self.bf4.perform_parallel_fft_direct(in5);
        tmp4[1] = SseVector::mul_complex(tmp4[1], self.twiddle4);
        tmp4[2] = SseVector::mul_complex(tmp4[2], self.twiddle8);
        tmp4[3] = SseVector::neg(tmp4[3]);
        tmp5[1] = SseVector::mul_complex(tmp5[1], self.twiddle5);
        tmp5[2] = SseVector::mul_complex(tmp5[2], self.twiddle10);
        tmp5[3] = self.bf4.rotate.rotate_both_225(tmp5[3]);

        ////////////////////////////////////////////////////////////
        // Un-interleave the two FFTs and write rows back (the +24 offset targets
        // the second FFT of the pair).
        let mut store = |i, vectors_a: [__m128; 6], vectors_b: [__m128; 6]| {
            for n in 0..6 {
                let [a, b] = transpose_complex_2x2_f32(vectors_a[n], vectors_b[n]);
                buffer.store_complex(a, i + n * 4);
                buffer.store_complex(b, i + n * 4 + 24);
            }
        };

        // Size-6 FFTs down each pair of transposed columns, storing them as soon as we're done with them
        let out0 = self
            .bf6
            .perform_parallel_fft_direct(tmp0[0], tmp1[0], tmp2[0], tmp3[0], tmp4[0], tmp5[0]);
        let out1 = self
            .bf6
            .perform_parallel_fft_direct(tmp0[1], tmp1[1], tmp2[1], tmp3[1], tmp4[1], tmp5[1]);
        store(0, out0, out1);

        let out2 = self
            .bf6
            .perform_parallel_fft_direct(tmp0[2], tmp1[2], tmp2[2], tmp3[2], tmp4[2], tmp5[2]);
        let out3 = self
            .bf6
            .perform_parallel_fft_direct(tmp0[3], tmp1[3], tmp2[3], tmp3[3], tmp4[3], tmp5[3]);
        store(2, out2, out3);
    }
}
2730
2731//    ___ _  _              __   _  _   _     _ _
2732//   |__ \ || |            / /_ | || | | |__ (_) |_
2733//    __) ||| |_   _____  | '_ \| || |_| '_ \| | __|
2734//   / __/__   _| |_____| | (_) |__   _| |_) | | |_
2735//  |____}  |_|            \___/   |_| |_.__/|_|\__|
2736//
2737
/// SSE-accelerated size-24 FFT butterfly for f64, computed as a 4x6 mixed radix
/// (size-4 FFTs down the columns, twiddles, then size-6 FFTs). Each __m128d holds
/// one Complex<f64>.
pub struct SseF64Butterfly24<T> {
    // Inner size-4 butterfly; also supplies the direction and rotation helpers.
    bf4: SseF64Butterfly4<T>,
    // Inner size-6 butterfly for the second mixed-radix pass.
    bf6: SseF64Butterfly6<T>,
    // Twiddle factors W^k = e^(±2πik/24) for k = 1, 2, 4, 5, 8, 10. The other
    // nontrivial twiddles (k = 3, 6, 9, 12, 15) are applied as pure rotations or
    // a negation via bf4.rotate / SseVector::neg.
    twiddle1: __m128d,
    twiddle2: __m128d,
    twiddle4: __m128d,
    twiddle5: __m128d,
    twiddle8: __m128d,
    twiddle10: __m128d,
}
2748
// Macro-generated boilerplate: target_feature-gated wrapper methods plus the
// common trait plumbing; length is 24, and the direction is read from the inner
// size-4 butterfly.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly24);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly24, 24, |this: &SseF64Butterfly24<_>| this
    .bf4
    .direction);
impl<T: FftNum> SseF64Butterfly24<T> {
    /// Creates a size-24 butterfly, precomputing the twiddle factors it needs.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        // Twiddles W^k = e^(±2πik/24) that require a full complex multiply.
        // The other nontrivial twiddles (k = 3, 6, 9, 12, 15) are exact
        // 45/90/135/180/225-degree rotations, handled without multiplies below.
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 24, direction);
        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 24, direction);
        let tw4: Complex<f64> = twiddles::compute_twiddle(4, 24, direction);
        let tw5: Complex<f64> = twiddles::compute_twiddle(5, 24, direction);
        let tw8: Complex<f64> = twiddles::compute_twiddle(8, 24, direction);
        let tw10: Complex<f64> = twiddles::compute_twiddle(10, 24, direction);

        // SAFETY: pack_64 only wraps _mm_set_pd; it performs no memory access.
        unsafe {
            Self {
                bf4: SseF64Butterfly4::new(direction),
                bf6: SseF64Butterfly6::new(direction),
                twiddle1: pack_64(tw1),
                twiddle2: pack_64(tw2),
                twiddle4: pack_64(tw4),
                twiddle5: pack_64(tw5),
                twiddle8: pack_64(tw8),
                twiddle10: pack_64(tw10),
            }
        }
    }

    /// In-place size-24 FFT over `buffer`, as a 4x6 mixed radix: size-4 FFTs down
    /// the columns, twiddles, then size-6 FFTs down the transposed rows.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        // To make the best possible use of registers, we're going to write this algorithm in an unusual way
        // It's 6x4 mixed radix, so we're going to do the usual steps of size-4 FFTs down the columns, apply twiddle factors, then transpose and do size-6 FFTs
        // But to reduce the number of times registers get spilled, we have these optimizations:
        // 1: Load data as late as possible, not upfront
        // 2: Once we're working with a piece of data, make as much progress as possible before moving on
        //      IE, once we load a column, we should do the FFT down the column, do twiddle factors, and do the pieces of the transpose for that column, all before starting on the next column
        // 3: Store data as soon as we're finished with it, rather than waiting for the end

        // Load column i of the 4-row layout (row stride is 6 complex values).
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 6),
                buffer.load_complex(i + 12),
                buffer.load_complex(i + 18),
            ]
        };

        // For each column: load the data, apply our size-4 FFT, apply twiddle factors
        // Column 1 needs W^1, W^2, W^3 of N=24; W^3 is a pure 45-degree rotation.
        let mut tmp1 = self.bf4.perform_fft_direct(load(1));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddle2);
        tmp1[3] = self.bf4.rotate.rotate_45(tmp1[3]);

        // Column 2 needs W^2, W^4, W^6; W^6 is a pure 90-degree rotation.
        let mut tmp2 = self.bf4.perform_fft_direct(load(2));
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddle2);
        tmp2[2] = SseVector::mul_complex(tmp2[2], self.twiddle4);
        tmp2[3] = self.bf4.rotate.rotate(tmp2[3]);

        // Column 4 needs W^4, W^8, W^12; W^12 = -1 is a simple negation.
        let mut tmp4 = self.bf4.perform_fft_direct(load(4));
        tmp4[1] = SseVector::mul_complex(tmp4[1], self.twiddle4);
        tmp4[2] = SseVector::mul_complex(tmp4[2], self.twiddle8);
        tmp4[3] = SseVector::neg(tmp4[3]);

        // Column 5 needs W^5, W^10, W^15; W^15 is a pure 225-degree rotation.
        let mut tmp5 = self.bf4.perform_fft_direct(load(5));
        tmp5[1] = SseVector::mul_complex(tmp5[1], self.twiddle5);
        tmp5[2] = SseVector::mul_complex(tmp5[2], self.twiddle10);
        tmp5[3] = self.bf4.rotate.rotate_225(tmp5[3]);

        // Column 3 needs W^3, W^6, W^9 — all pure rotations, no multiplies at all.
        let mut tmp3 = self.bf4.perform_fft_direct(load(3));
        tmp3[1] = self.bf4.rotate.rotate_45(tmp3[1]);
        tmp3[2] = self.bf4.rotate.rotate(tmp3[2]);
        tmp3[3] = self.bf4.rotate.rotate_135(tmp3[3]);

        // Do the first column last, because no twiddles means fewer temporaries forcing the above data to spill
        let tmp0 = self.bf4.perform_fft_direct(load(0));

        ////////////////////////////////////////////////////////////
        // Store row i of the output (stride 4 mirrors the transposed layout).
        let mut store = |i, vectors: [__m128d; 6]| {
            buffer.store_complex(vectors[0], i);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
            buffer.store_complex(vectors[4], i + 16);
            buffer.store_complex(vectors[5], i + 20);
        };

        // Size-6 FFTs down each of our transposed columns, storing them as soon as we're done with them
        let out0 = self
            .bf6
            .perform_fft_direct([tmp0[0], tmp1[0], tmp2[0], tmp3[0], tmp4[0], tmp5[0]]);
        store(0, out0);

        let out1 = self
            .bf6
            .perform_fft_direct([tmp0[1], tmp1[1], tmp2[1], tmp3[1], tmp4[1], tmp5[1]]);
        store(1, out1);

        let out2 = self
            .bf6
            .perform_fft_direct([tmp0[2], tmp1[2], tmp2[2], tmp3[2], tmp4[2], tmp5[2]]);
        store(2, out2);

        let out3 = self
            .bf6
            .perform_fft_direct([tmp0[3], tmp1[3], tmp2[3], tmp3[3], tmp4[3], tmp5[3]]);
        store(3, out3);
    }
}
2857
2858//   _________            _________  _     _ _
2859//  |___ /___ \          |___ /___ \| |__ (_) |_
2860//    |_ \ __) |  _____    |_ \ __) | '_ \| | __|
2861//   ___) / __/  |_____|  ___) / __/| |_) | | |_
2862//  |____/_____|         |____/_____|_.__/|_|\__|
2863//
2864
/// SSE-accelerated size-32 FFT butterfly for f32, computed as an 8x4 mixed radix
/// (size-4 FFTs down the columns, twiddles, then size-8 FFTs). Each __m128 holds
/// two Complex<f32> values.
pub struct SseF32Butterfly32<T> {
    // Inner size-8 butterfly (which itself contains the size-4 butterfly used for
    // the column pass and for the direction/rotation helpers).
    bf8: SseF32Butterfly8<T>,
    // Pairs of twiddles W^a, W^b of N=32 (one per lane) for the non-parallel path,
    // which processes two adjacent columns per vector; layout is set up in `new`.
    twiddles_packed: [__m128; 12],
    // Individual twiddles W^k, duplicated into both lanes — presumably for the
    // parallel path, mirroring the other f32 butterflies in this file.
    twiddle1: __m128,
    twiddle2: __m128,
    twiddle3: __m128,
    twiddle5: __m128,
    twiddle6: __m128,
    twiddle7: __m128,
    twiddle9: __m128,
    twiddle10: __m128,
    twiddle14: __m128,
    twiddle15: __m128,
    twiddle18: __m128,
    twiddle21: __m128,
}
2881
// Macro-generated boilerplate: the sse4.1 target_feature wrapper methods (see the
// macro definition near the top of this file) plus the common trait plumbing;
// length is 32, and the direction is read from the innermost size-4 butterfly.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly32);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly32, 32, |this: &SseF32Butterfly32<_>| this
    .bf8
    .bf4
    .direction);
2887impl<T: FftNum> SseF32Butterfly32<T> {
2888    #[inline(always)]
2889    pub fn new(direction: FftDirection) -> Self {
2890        assert_f32::<T>();
2891        let tw0: Complex<f32> = Complex { re: 1.0, im: 0.0 };
2892        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 32, direction);
2893        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 32, direction);
2894        let tw3: Complex<f32> = twiddles::compute_twiddle(3, 32, direction);
2895        let tw4: Complex<f32> = twiddles::compute_twiddle(4, 32, direction);
2896        let tw5: Complex<f32> = twiddles::compute_twiddle(5, 32, direction);
2897        let tw6: Complex<f32> = twiddles::compute_twiddle(6, 32, direction);
2898        let tw7: Complex<f32> = twiddles::compute_twiddle(7, 32, direction);
2899        let tw8: Complex<f32> = twiddles::compute_twiddle(8, 32, direction);
2900        let tw9: Complex<f32> = twiddles::compute_twiddle(9, 32, direction);
2901        let tw10: Complex<f32> = twiddles::compute_twiddle(10, 32, direction);
2902        let tw12: Complex<f32> = twiddles::compute_twiddle(12, 32, direction);
2903        let tw14: Complex<f32> = twiddles::compute_twiddle(14, 32, direction);
2904        let tw15: Complex<f32> = twiddles::compute_twiddle(15, 32, direction);
2905        let tw18: Complex<f32> = twiddles::compute_twiddle(18, 32, direction);
2906        let tw21: Complex<f32> = twiddles::compute_twiddle(21, 32, direction);
2907        unsafe {
2908            Self {
2909                bf8: SseF32Butterfly8::new(direction),
2910                twiddles_packed: [
2911                    pack_32(tw0, tw1),
2912                    pack_32(tw0, tw2),
2913                    pack_32(tw0, tw3),
2914                    pack_32(tw2, tw3),
2915                    pack_32(tw4, tw6),
2916                    pack_32(tw6, tw9),
2917                    pack_32(tw4, tw5),
2918                    pack_32(tw8, tw10),
2919                    pack_32(tw12, tw15),
2920                    pack_32(tw6, tw7),
2921                    pack_32(tw12, tw14),
2922                    pack_32(tw18, tw21),
2923                ],
2924                twiddle1: pack_32(tw1, tw1),
2925                twiddle2: pack_32(tw2, tw2),
2926                twiddle3: pack_32(tw3, tw3),
2927                twiddle5: pack_32(tw5, tw5),
2928                twiddle6: pack_32(tw6, tw6),
2929                twiddle7: pack_32(tw7, tw7),
2930                twiddle9: pack_32(tw9, tw9),
2931                twiddle10: pack_32(tw10, tw10),
2932                twiddle14: pack_32(tw14, tw14),
2933                twiddle15: pack_32(tw15, tw15),
2934                twiddle18: pack_32(tw18, tw18),
2935                twiddle21: pack_32(tw21, tw21),
2936            }
2937        }
2938    }
2939
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // To make the best possible use of registers, we're going to write this algorithm in an unusual way
        // It's 8x4 mixed radix, so we're going to do the usual steps of size-4 FFTs down the columns, apply twiddle factors, then transpose and do size-8 FFTs
        // But to reduce the number of times registers get spilled, we have these optimizations:
        // 1: Load data as late as possible, not upfront
        // 2: Once we're working with a piece of data, make as much progress as possible before moving on
        //      IE, once we load a column, we should do the FFT down the column, do twiddle factors, and do the pieces of the transpose for that column, all before starting on the next column
        // 3: Store data as soon as we're finished with it, rather than waiting for the end
        //
        // Each __m128 holds two interleaved Complex<f32>, so every pass below
        // processes two adjacent columns of the 8-column layout at once.

        // Load columns i and i+1: the 4 rows are spaced 8 complex elements apart.
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 8),
                buffer.load_complex(i + 16),
                buffer.load_complex(i + 24),
            ]
        };

        // For each pair of columns: load the data, apply our size-4 FFT, apply twiddle factors
        // NOTE(review): twiddles_packed[k] presumably packs the twiddles for two
        // adjacent columns into one vector — confirm against the constructor.
        let mut tmp0 = self.bf8.bf4.perform_parallel_fft_direct(load(0));
        tmp0[1] = SseVector::mul_complex(tmp0[1], self.twiddles_packed[0]);
        tmp0[2] = SseVector::mul_complex(tmp0[2], self.twiddles_packed[1]);
        tmp0[3] = SseVector::mul_complex(tmp0[3], self.twiddles_packed[2]);
        // The 2x2 transposes interleave the two columns' results, finishing this
        // slice of the 8x4 -> 4x8 transpose before the next columns are loaded.
        let [mid0, mid1] = transpose_complex_2x2_f32(tmp0[0], tmp0[1]);
        let [mid8, mid9] = transpose_complex_2x2_f32(tmp0[2], tmp0[3]);

        let mut tmp1 = self.bf8.bf4.perform_parallel_fft_direct(load(2));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddles_packed[3]);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddles_packed[4]);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddles_packed[5]);
        let [mid2, mid3] = transpose_complex_2x2_f32(tmp1[0], tmp1[1]);
        let [mid10, mid11] = transpose_complex_2x2_f32(tmp1[2], tmp1[3]);

        let mut tmp2 = self.bf8.bf4.perform_parallel_fft_direct(load(4));
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddles_packed[6]);
        tmp2[2] = SseVector::mul_complex(tmp2[2], self.twiddles_packed[7]);
        tmp2[3] = SseVector::mul_complex(tmp2[3], self.twiddles_packed[8]);
        let [mid4, mid5] = transpose_complex_2x2_f32(tmp2[0], tmp2[1]);
        let [mid12, mid13] = transpose_complex_2x2_f32(tmp2[2], tmp2[3]);

        let mut tmp3 = self.bf8.bf4.perform_parallel_fft_direct(load(6));
        tmp3[1] = SseVector::mul_complex(tmp3[1], self.twiddles_packed[9]);
        tmp3[2] = SseVector::mul_complex(tmp3[2], self.twiddles_packed[10]);
        tmp3[3] = SseVector::mul_complex(tmp3[3], self.twiddles_packed[11]);
        let [mid6, mid7] = transpose_complex_2x2_f32(tmp3[0], tmp3[1]);
        let [mid14, mid15] = transpose_complex_2x2_f32(tmp3[2], tmp3[3]);

        ////////////////////////////////////////////////////////////
        // Scatter one pair of size-8 row FFTs back to the buffer: consecutive
        // outputs of the same transform land 4 complex elements apart.
        let mut store = |i, vectors: [__m128; 8]| {
            buffer.store_complex(vectors[0], i);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
            buffer.store_complex(vectors[4], i + 16);
            buffer.store_complex(vectors[5], i + 20);
            buffer.store_complex(vectors[6], i + 24);
            buffer.store_complex(vectors[7], i + 28);
        };

        // Size-8 FFTs down each pair of transposed columns, storing them as soon as we're done with them
        let out0 = self
            .bf8
            .perform_parallel_fft_direct([mid0, mid1, mid2, mid3, mid4, mid5, mid6, mid7]);
        store(0, out0);

        let out1 = self
            .bf8
            .perform_parallel_fft_direct([mid8, mid9, mid10, mid11, mid12, mid13, mid14, mid15]);
        store(2, out1);
    }
3010
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // To make the best possible use of registers, we're going to write this algorithm in an unusual way
        // It's 8x4 mixed radix, so we're going to do the usual steps of size-4 FFTs down the columns, apply twiddle factors, then transpose and do size-8 FFTs
        // But to reduce the number of times registers get spilled, we have these optimizations:
        // 1: Load data as late as possible, not upfront
        // 2: Once we're working with a piece of data, make as much progress as possible before moving on
        //      IE, once we load a column, we should do the FFT down the column, do twiddle factors, and do the pieces of the transpose for that column, all before starting on the next column
        // 3: Store data as soon as we're finished with it, rather than waiting for the end
        //
        // This "parallel" variant computes two size-32 FFTs at once: one over
        // buffer[0..32] and one over buffer[32..64]. The 2x2 transposes pair up
        // element k of the first FFT with element k of the second, so both
        // transforms share every SIMD instruction below.

        // Load columns i and i+1 (rows spaced 8 apart) from each of the two
        // FFTs, and interleave the two FFTs lane-wise.
        let load = |i: usize| {
            let [a0, a1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 0), buffer.load_complex(i + 32));
            let [b0, b1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 8), buffer.load_complex(i + 40));
            let [c0, c1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 16), buffer.load_complex(i + 48));
            let [d0, d1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 24), buffer.load_complex(i + 56));
            [[a0, b0, c0, d0], [a1, b1, c1, d1]]
        };

        // For each pair of columns: load the data, apply our size-4 FFT, apply twiddle factors
        // The twiddle index for column c, row r is c*r (mod 32); indices 4, 8
        // and 12 are exact 45/90/135-degree angles and are applied with cheap
        // rotations instead of full complex multiplies.
        let [in0, in1] = load(0);
        let tmp0 = self.bf8.bf4.perform_parallel_fft_direct(in0);
        let mut tmp1 = self.bf8.bf4.perform_parallel_fft_direct(in1);
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddle2);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddle3);

        let [in2, in3] = load(2);
        let mut tmp2 = self.bf8.bf4.perform_parallel_fft_direct(in2);
        let mut tmp3 = self.bf8.bf4.perform_parallel_fft_direct(in3);
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddle2);
        tmp2[2] = self.bf8.bf4.rotate.rotate_both_45(tmp2[2]);
        tmp2[3] = SseVector::mul_complex(tmp2[3], self.twiddle6);
        tmp3[1] = SseVector::mul_complex(tmp3[1], self.twiddle3);
        tmp3[2] = SseVector::mul_complex(tmp3[2], self.twiddle6);
        tmp3[3] = SseVector::mul_complex(tmp3[3], self.twiddle9);

        // Column 4's twiddle indices (4, 8, 12) are all 45-degree multiples,
        // so this column needs no twiddle registers at all.
        let [in4, in5] = load(4);
        let mut tmp4 = self.bf8.bf4.perform_parallel_fft_direct(in4);
        let mut tmp5 = self.bf8.bf4.perform_parallel_fft_direct(in5);
        tmp4[1] = self.bf8.bf4.rotate.rotate_both_45(tmp4[1]);
        tmp4[2] = self.bf8.bf4.rotate.rotate_both(tmp4[2]);
        tmp4[3] = self.bf8.bf4.rotate.rotate_both_135(tmp4[3]);
        tmp5[1] = SseVector::mul_complex(tmp5[1], self.twiddle5);
        tmp5[2] = SseVector::mul_complex(tmp5[2], self.twiddle10);
        tmp5[3] = SseVector::mul_complex(tmp5[3], self.twiddle15);

        let [in6, in7] = load(6);
        let mut tmp6 = self.bf8.bf4.perform_parallel_fft_direct(in6);
        let mut tmp7 = self.bf8.bf4.perform_parallel_fft_direct(in7);
        tmp6[1] = SseVector::mul_complex(tmp6[1], self.twiddle6);
        tmp6[2] = self.bf8.bf4.rotate.rotate_both_135(tmp6[2]);
        tmp6[3] = SseVector::mul_complex(tmp6[3], self.twiddle18);
        tmp7[1] = SseVector::mul_complex(tmp7[1], self.twiddle7);
        tmp7[2] = SseVector::mul_complex(tmp7[2], self.twiddle14);
        tmp7[3] = SseVector::mul_complex(tmp7[3], self.twiddle21);

        ////////////////////////////////////////////////////////////
        // De-interleave and scatter: lane 0 goes back to the first FFT's output
        // (offset i + n*4), lane 1 to the second FFT's, 32 elements later.
        let mut store = |i, vectors_a: [__m128; 8], vectors_b: [__m128; 8]| {
            for n in 0..8 {
                let [a, b] = transpose_complex_2x2_f32(vectors_a[n], vectors_b[n]);
                buffer.store_complex(a, i + n * 4);
                buffer.store_complex(b, i + n * 4 + 32);
            }
        };

        // Size-8 FFTs down each pair of transposed columns, storing them as soon as we're done with them
        let out0 = self.bf8.perform_parallel_fft_direct([
            tmp0[0], tmp1[0], tmp2[0], tmp3[0], tmp4[0], tmp5[0], tmp6[0], tmp7[0],
        ]);
        let out1 = self.bf8.perform_parallel_fft_direct([
            tmp0[1], tmp1[1], tmp2[1], tmp3[1], tmp4[1], tmp5[1], tmp6[1], tmp7[1],
        ]);
        store(0, out0, out1);

        let out2 = self.bf8.perform_parallel_fft_direct([
            tmp0[2], tmp1[2], tmp2[2], tmp3[2], tmp4[2], tmp5[2], tmp6[2], tmp7[2],
        ]);
        let out3 = self.bf8.perform_parallel_fft_direct([
            tmp0[3], tmp1[3], tmp2[3], tmp3[3], tmp4[3], tmp5[3], tmp6[3], tmp7[3],
        ]);
        store(2, out2, out3);
    }
3096}
3097
3098//   _________             __   _  _   _     _ _
3099//  |___ /___ \           / /_ | || | | |__ (_) |_
3100//    |_ \ __) |  _____  | '_ \| || |_| '_ \| | __|
3101//   ___) / __/  |_____| | (_) |__   _| |_) | | |_
3102//  |____/_____|          \___/   |_| |_.__/|_|\__|
3103//
3104
/// Size-32 FFT butterfly for f64, implemented as an 8x4 mixed-radix decomposition.
pub struct SseF64Butterfly32<T> {
    // Size-8 butterfly used for the row FFTs; its inner size-4 butterfly
    // (bf8.bf4) is reused for the column FFTs and for the 45-degree rotations.
    bf8: SseF64Butterfly8<T>,
    // Packed twiddle factors compute_twiddle(k, 32, direction), one complex
    // value per __m128d. Only the twiddle indices needed by the 8x4
    // decomposition (index = column * row) are stored; indices 4, 8 and 12 are
    // exact 45/90/135-degree angles and are applied with cheap rotations
    // instead, so they have no field here.
    twiddle1: __m128d,
    twiddle2: __m128d,
    twiddle3: __m128d,
    twiddle5: __m128d,
    twiddle6: __m128d,
    twiddle7: __m128d,
    twiddle9: __m128d,
    twiddle10: __m128d,
    twiddle14: __m128d,
    twiddle15: __m128d,
    twiddle18: __m128d,
    twiddle21: __m128d,
}
3120
// Generate the shared wrapper/trait plumbing (perform_fft_butterfly etc. and,
// presumably, the Fft/Length/Direction impls — see the f32 macro at the top of
// this file) for this butterfly. The closure tells the macro where to find the
// FftDirection, which lives on the innermost size-4 butterfly.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly32);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly32, 32, |this: &SseF64Butterfly32<_>| this
    .bf8
    .bf4
    .direction);
impl<T: FftNum> SseF64Butterfly32<T> {
    /// Precomputes the packed twiddle factors for a size-32 f64 FFT in the
    /// given direction. Twiddle indices 4, 8 and 12 are not stored: they are
    /// exact 45/90/135-degree angles and `perform_fft_contiguous` applies them
    /// with cheap rotations instead of complex multiplies.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 32, direction);
        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 32, direction);
        let tw3: Complex<f64> = twiddles::compute_twiddle(3, 32, direction);
        let tw5: Complex<f64> = twiddles::compute_twiddle(5, 32, direction);
        let tw6: Complex<f64> = twiddles::compute_twiddle(6, 32, direction);
        let tw7: Complex<f64> = twiddles::compute_twiddle(7, 32, direction);
        let tw9: Complex<f64> = twiddles::compute_twiddle(9, 32, direction);
        let tw10: Complex<f64> = twiddles::compute_twiddle(10, 32, direction);
        let tw14: Complex<f64> = twiddles::compute_twiddle(14, 32, direction);
        let tw15: Complex<f64> = twiddles::compute_twiddle(15, 32, direction);
        let tw18: Complex<f64> = twiddles::compute_twiddle(18, 32, direction);
        let tw21: Complex<f64> = twiddles::compute_twiddle(21, 32, direction);

        // SAFETY: pack_64 only uses the _mm_set_pd intrinsic, which is part of
        // SSE2 and therefore always available on x86_64.
        unsafe {
            Self {
                bf8: SseF64Butterfly8::new(direction),
                twiddle1: pack_64(tw1),
                twiddle2: pack_64(tw2),
                twiddle3: pack_64(tw3),
                twiddle5: pack_64(tw5),
                twiddle6: pack_64(tw6),
                twiddle7: pack_64(tw7),
                twiddle9: pack_64(tw9),
                twiddle10: pack_64(tw10),
                twiddle14: pack_64(tw14),
                twiddle15: pack_64(tw15),
                twiddle18: pack_64(tw18),
                twiddle21: pack_64(tw21),
            }
        }
    }

    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        // To make the best possible use of registers, we're going to write this algorithm in an unusual way
        // It's 8x4 mixed radix, so we're going to do the usual steps of size-4 FFTs down the columns, apply twiddle factors, then transpose and do size-8 FFTs
        // But to reduce the number of times registers get spilled, we have these optimizations:
        // 1: Load data as late as possible, not upfront
        // 2: Once we're working with a piece of data, make as much progress as possible before moving on
        //      IE, once we load a column, we should do the FFT down the column, do twiddle factors, and do the pieces of the transpose for that column, all before starting on the next column
        // 3: Store data as soon as we're finished with it, rather than waiting for the end
        //
        // Each __m128d holds a single Complex<f64>, so unlike the f32 version
        // every pass below handles exactly one column of the 8x4 layout.

        // Load column i: the 4 rows are spaced 8 complex elements apart.
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 8),
                buffer.load_complex(i + 16),
                buffer.load_complex(i + 24),
            ]
        };

        // For each column: load the data, apply our size-4 FFT, apply twiddle factors
        // The twiddle index for column c, row r is c*r; indices 4, 8 and 12 are
        // exact 45/90/135-degree angles and use cheap rotations instead.
        let mut tmp1 = self.bf8.bf4.perform_fft_direct(load(1));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddle2);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddle3);

        let mut tmp2 = self.bf8.bf4.perform_fft_direct(load(2));
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddle2);
        tmp2[2] = self.bf8.bf4.rotate.rotate_45(tmp2[2]);
        tmp2[3] = SseVector::mul_complex(tmp2[3], self.twiddle6);

        let mut tmp3 = self.bf8.bf4.perform_fft_direct(load(3));
        tmp3[1] = SseVector::mul_complex(tmp3[1], self.twiddle3);
        tmp3[2] = SseVector::mul_complex(tmp3[2], self.twiddle6);
        tmp3[3] = SseVector::mul_complex(tmp3[3], self.twiddle9);

        let mut tmp5 = self.bf8.bf4.perform_fft_direct(load(5));
        tmp5[1] = SseVector::mul_complex(tmp5[1], self.twiddle5);
        tmp5[2] = SseVector::mul_complex(tmp5[2], self.twiddle10);
        tmp5[3] = SseVector::mul_complex(tmp5[3], self.twiddle15);

        let mut tmp6 = self.bf8.bf4.perform_fft_direct(load(6));
        tmp6[1] = SseVector::mul_complex(tmp6[1], self.twiddle6);
        tmp6[2] = self.bf8.bf4.rotate.rotate_135(tmp6[2]);
        tmp6[3] = SseVector::mul_complex(tmp6[3], self.twiddle18);

        let mut tmp7 = self.bf8.bf4.perform_fft_direct(load(7));
        tmp7[1] = SseVector::mul_complex(tmp7[1], self.twiddle7);
        tmp7[2] = SseVector::mul_complex(tmp7[2], self.twiddle14);
        tmp7[3] = SseVector::mul_complex(tmp7[3], self.twiddle21);

        // Column 4's twiddle indices (4, 8, 12) are all 45-degree multiples,
        // so it needs only rotations and no twiddle registers at all.
        let mut tmp4 = self.bf8.bf4.perform_fft_direct(load(4));
        tmp4[1] = self.bf8.bf4.rotate.rotate_45(tmp4[1]);
        tmp4[2] = self.bf8.bf4.rotate.rotate(tmp4[2]);
        tmp4[3] = self.bf8.bf4.rotate.rotate_135(tmp4[3]);

        // Do the first column last, because no twiddles means fewer temporaries forcing the above data to spill
        let tmp0 = self.bf8.bf4.perform_fft_direct(load(0));

        ////////////////////////////////////////////////////////////
        // Scatter one size-8 row FFT's results back to the buffer: consecutive
        // outputs of the same transform land 4 complex elements apart.
        let mut store = |i, vectors: [__m128d; 8]| {
            buffer.store_complex(vectors[0], i);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
            buffer.store_complex(vectors[4], i + 16);
            buffer.store_complex(vectors[5], i + 20);
            buffer.store_complex(vectors[6], i + 24);
            buffer.store_complex(vectors[7], i + 28);
        };

        // Size-8 FFTs down each of our transposed columns, storing them as soon as we're done with them
        let out0 = self.bf8.perform_fft_direct([
            tmp0[0], tmp1[0], tmp2[0], tmp3[0], tmp4[0], tmp5[0], tmp6[0], tmp7[0],
        ]);
        store(0, out0);

        let out1 = self.bf8.perform_fft_direct([
            tmp0[1], tmp1[1], tmp2[1], tmp3[1], tmp4[1], tmp5[1], tmp6[1], tmp7[1],
        ]);
        store(1, out1);

        let out2 = self.bf8.perform_fft_direct([
            tmp0[2], tmp1[2], tmp2[2], tmp3[2], tmp4[2], tmp5[2], tmp6[2], tmp7[2],
        ]);
        store(2, out2);

        let out3 = self.bf8.perform_fft_direct([
            tmp0[3], tmp1[3], tmp2[3], tmp3[3], tmp4[3], tmp5[3], tmp6[3], tmp7[3],
        ]);
        store(3, out3);
    }
}
3253
#[cfg(test)]
mod unit_tests {
    use super::*;
    use crate::{
        algorithm::Dft,
        test_utils::{check_fft_algorithm, compare_vectors},
    };

    // Every f32 butterfly gets an identical pair of checks, differing only in
    // the type under test and its FFT length, so the tests are stamped out
    // with a macro.
    macro_rules! test_butterfly_32_func {
        ($name:ident, $butterfly:ident, $len:expr) => {
            #[test]
            fn $name() {
                // Verify the forward transform against the reference algorithm...
                let forward = $butterfly::new(FftDirection::Forward);
                check_fft_algorithm::<f32>(&forward, $len, FftDirection::Forward);

                // ...and the inverse transform as well.
                let inverse = $butterfly::new(FftDirection::Inverse);
                check_fft_algorithm::<f32>(&inverse, $len, FftDirection::Inverse);
            }
        };
    }
    test_butterfly_32_func!(test_ssef32_butterfly1, SseF32Butterfly1, 1);
    test_butterfly_32_func!(test_ssef32_butterfly2, SseF32Butterfly2, 2);
    test_butterfly_32_func!(test_ssef32_butterfly3, SseF32Butterfly3, 3);
    test_butterfly_32_func!(test_ssef32_butterfly4, SseF32Butterfly4, 4);
    test_butterfly_32_func!(test_ssef32_butterfly5, SseF32Butterfly5, 5);
    test_butterfly_32_func!(test_ssef32_butterfly6, SseF32Butterfly6, 6);
    test_butterfly_32_func!(test_ssef32_butterfly8, SseF32Butterfly8, 8);
    test_butterfly_32_func!(test_ssef32_butterfly9, SseF32Butterfly9, 9);
    test_butterfly_32_func!(test_ssef32_butterfly10, SseF32Butterfly10, 10);
    test_butterfly_32_func!(test_ssef32_butterfly12, SseF32Butterfly12, 12);
    test_butterfly_32_func!(test_ssef32_butterfly15, SseF32Butterfly15, 15);
    test_butterfly_32_func!(test_ssef32_butterfly16, SseF32Butterfly16, 16);
    test_butterfly_32_func!(test_ssef32_butterfly24, SseF32Butterfly24, 24);
    test_butterfly_32_func!(test_ssef32_butterfly32, SseF32Butterfly32, 32);

    // Same as above, but for the f64 butterflies.
    macro_rules! test_butterfly_64_func {
        ($name:ident, $butterfly:ident, $len:expr) => {
            #[test]
            fn $name() {
                // Verify the forward transform against the reference algorithm...
                let forward = $butterfly::new(FftDirection::Forward);
                check_fft_algorithm::<f64>(&forward, $len, FftDirection::Forward);

                // ...and the inverse transform as well.
                let inverse = $butterfly::new(FftDirection::Inverse);
                check_fft_algorithm::<f64>(&inverse, $len, FftDirection::Inverse);
            }
        };
    }
    test_butterfly_64_func!(test_ssef64_butterfly1, SseF64Butterfly1, 1);
    test_butterfly_64_func!(test_ssef64_butterfly2, SseF64Butterfly2, 2);
    test_butterfly_64_func!(test_ssef64_butterfly3, SseF64Butterfly3, 3);
    test_butterfly_64_func!(test_ssef64_butterfly4, SseF64Butterfly4, 4);
    test_butterfly_64_func!(test_ssef64_butterfly5, SseF64Butterfly5, 5);
    test_butterfly_64_func!(test_ssef64_butterfly6, SseF64Butterfly6, 6);
    test_butterfly_64_func!(test_ssef64_butterfly8, SseF64Butterfly8, 8);
    test_butterfly_64_func!(test_ssef64_butterfly9, SseF64Butterfly9, 9);
    test_butterfly_64_func!(test_ssef64_butterfly10, SseF64Butterfly10, 10);
    test_butterfly_64_func!(test_ssef64_butterfly12, SseF64Butterfly12, 12);
    test_butterfly_64_func!(test_ssef64_butterfly15, SseF64Butterfly15, 15);
    test_butterfly_64_func!(test_ssef64_butterfly16, SseF64Butterfly16, 16);
    test_butterfly_64_func!(test_ssef64_butterfly24, SseF64Butterfly24, 24);
    test_butterfly_64_func!(test_ssef64_butterfly32, SseF64Butterfly32, 32);

    #[test]
    fn test_parallel_fft4_32() {
        unsafe {
            // Two independent sets of four complex inputs.
            let a = [
                Complex::<f32>::new(1.0, 2.5),
                Complex::<f32>::new(3.2, 4.2),
                Complex::<f32>::new(5.6, 6.2),
                Complex::<f32>::new(7.4, 8.3),
            ];
            let b = [
                Complex::<f32>::new(6.0, 24.5),
                Complex::<f32>::new(4.2, 34.2),
                Complex::<f32>::new(9.6, 61.2),
                Complex::<f32>::new(17.4, 81.3),
            ];

            // Interleave them: lane 0 of each vector holds a[i], lane 1 holds b[i].
            let packed = [
                pack_32(a[0], b[0]),
                pack_32(a[1], b[1]),
                pack_32(a[2], b[2]),
                pack_32(a[3], b[3]),
            ];

            // Compute the reference results with the scalar Dft.
            let dft = Dft::new(4, FftDirection::Forward);
            let mut expected_a = a.to_vec();
            let mut expected_b = b.to_vec();
            dft.process(&mut expected_a);
            dft.process(&mut expected_b);

            // Run both FFTs at once through the parallel SSE butterfly.
            let bf4 = SseF32Butterfly4::<f32>::new(FftDirection::Forward);
            let res_both = bf4.perform_parallel_fft_direct(packed);

            // De-interleave the SIMD result back into the two logical FFTs.
            let res = std::mem::transmute::<[__m128; 4], [Complex<f32>; 8]>(res_both);
            let sse_res_a = [res[0], res[2], res[4], res[6]];
            let sse_res_b = [res[1], res[3], res[5], res[7]];
            assert!(compare_vectors(&expected_a, &sse_res_a));
            assert!(compare_vectors(&expected_b, &sse_res_b));
        }
    }

    #[test]
    fn test_pack() {
        unsafe {
            // Two vectors holding the values 1..=8 as four interleaved complex numbers.
            let lo_input = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
            let hi_input = _mm_set_ps(8.0, 7.0, 6.0, 5.0);

            // extract_lo_lo should gather the low complex of each input,
            // extract_hi_hi the high complex of each.
            let lo = extract_lo_lo_f32(lo_input, hi_input);
            let hi = extract_hi_hi_f32(lo_input, hi_input);
            let lo = std::mem::transmute::<__m128, [Complex<f32>; 2]>(lo);
            let hi = std::mem::transmute::<__m128, [Complex<f32>; 2]>(hi);

            assert_eq!(lo, [Complex::new(1.0, 2.0), Complex::new(5.0, 6.0)]);
            assert_eq!(hi, [Complex::new(3.0, 4.0), Complex::new(7.0, 8.0)]);
        }
    }
}