// rustfft/avx/avx64_butterflies.rs

1use std::arch::x86_64::*;
2use std::marker::PhantomData;
3use std::mem::MaybeUninit;
4
5use num_complex::Complex;
6
7use crate::array_utils;
8use crate::array_utils::workaround_transmute_mut;
9use crate::array_utils::DoubleBuf;
10use crate::common::{fft_error_inplace, fft_error_outofplace};
11use crate::{common::FftNum, twiddles};
12use crate::{Direction, Fft, FftDirection, Length};
13
14use super::avx64_utils;
15use super::avx_vector;
16use super::avx_vector::{AvxArray, AvxArrayMut, AvxVector, AvxVector128, AvxVector256, Rotation90};
17
// Safety: This macro will call `self::perform_fft_f64()` which probably has a #[target_feature(enable = "...")] annotation on it.
// Calling functions with that annotation is unsafe, because it doesn't actually check if the CPU has the required features.
// Callers of this macro must guarantee that users can't even obtain an instance of $struct_name if their CPU doesn't have the required CPU features.
//
// Generates the public constructor plus the `Fft`, `Length`, and `Direction` trait impls
// for a fixed-size AVX butterfly that needs no scratch space.
#[allow(unused)]
macro_rules! boilerplate_fft_simd_butterfly {
    ($struct_name:ident, $len:expr) => {
        impl $struct_name<f64> {
            /// Constructs this butterfly if the CPU supports both AVX and FMA, and returns Err(()) otherwise.
            #[inline]
            pub fn new(direction: FftDirection) -> Result<Self, ()> {
                // Runtime feature detection: perform_fft_f64 is compiled with "avx" and "fma" enabled,
                // so we must not construct an instance unless both are present.
                let has_avx = is_x86_feature_detected!("avx");
                let has_fma = is_x86_feature_detected!("fma");
                if has_avx && has_fma {
                    // Safety: new_internal requires the "avx" feature set. Since we know it's present, we're safe
                    Ok(unsafe { Self::new_with_avx(direction) })
                } else {
                    Err(())
                }
            }
        }
        // NOTE: implemented generically over T but only for the <f64> struct — this is part of the
        // specialization workaround described in FftPlannerAvx::new(); T is transmuted to f64 below.
        impl<T: FftNum> Fft<T> for $struct_name<f64> {
            fn process_outofplace_with_scratch(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
                _scratch: &mut [Complex<T>], // butterflies need no scratch; parameter only satisfies the Fft trait
            ) {
                if input.len() < self.len() || output.len() != input.len() {
                    // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                    return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here
                }

                // Process the buffers in chunks of self.len(), one FFT per chunk
                let result = array_utils::iter_chunks_zipped(
                    input,
                    output,
                    self.len(),
                    |in_chunk, out_chunk| {
                        unsafe {
                            // Specialization workaround: See the comments in FftPlannerAvx::new() for why we have to transmute these slices
                            let input_slice = workaround_transmute_mut(in_chunk);
                            let output_slice = workaround_transmute_mut(out_chunk);
                            self.perform_fft_f64(DoubleBuf {
                                input: input_slice,
                                output: output_slice,
                            });
                        }
                    },
                );

                if result.is_err() {
                    // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size,
                    // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                }
            }
            fn process_with_scratch(&self, buffer: &mut [Complex<T>], _scratch: &mut [Complex<T>]) {
                if buffer.len() < self.len() {
                    // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_inplace(self.len(), buffer.len(), 0, 0);
                    return; // Unreachable, because fft_error_inplace asserts, but it helps codegen to put it here
                }

                // In-place: one FFT per self.len()-sized chunk of the buffer
                let result = array_utils::iter_chunks(buffer, self.len(), |chunk| {
                    unsafe {
                        // Specialization workaround: See the comments in FftPlannerAvx::new() for why we have to transmute these slices
                        self.perform_fft_f64(workaround_transmute_mut::<_, Complex<f64>>(chunk));
                    }
                });

                if result.is_err() {
                    // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size,
                    // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_inplace(self.len(), buffer.len(), 0, 0);
                }
            }
            // These butterflies operate entirely in registers, so no scratch is needed
            #[inline(always)]
            fn get_inplace_scratch_len(&self) -> usize {
                0
            }
            #[inline(always)]
            fn get_outofplace_scratch_len(&self) -> usize {
                0
            }
        }
        impl<T> Length for $struct_name<T> {
            #[inline(always)]
            fn len(&self) -> usize {
                $len
            }
        }
        impl<T> Direction for $struct_name<T> {
            #[inline(always)]
            fn fft_direction(&self) -> FftDirection {
                self.direction
            }
        }
    };
}
116
// Safety: This macro will call `self::column_butterflies_and_transpose()` and `self::row_butterflies()` which probably has a #[target_feature(enable = "...")] annotation on it.
// Calling functions with that annotation is unsafe, because it doesn't actually check if the CPU has the required features.
// Callers of this macro must guarantee that users can't even obtain an instance of $struct_name if their CPU doesn't have the required CPU features.
//
// Like boilerplate_fft_simd_butterfly!, but for butterflies that process their data as a
// 2D array (column FFTs + transpose + row FFTs) and therefore need $len elements of scratch.
macro_rules! boilerplate_fft_simd_butterfly_with_scratch {
    ($struct_name:ident, $len:expr) => {
        impl $struct_name<f64> {
            /// Returns true if the current CPU has the instruction sets (AVX + FMA) this butterfly requires.
            #[inline]
            pub fn is_supported_by_cpu() -> bool {
                is_x86_feature_detected!("avx") && is_x86_feature_detected!("fma")
            }
            /// Constructs this butterfly if the CPU supports AVX and FMA, and returns Err(()) otherwise.
            #[inline]
            pub fn new(direction: FftDirection) -> Result<Self, ()> {
                if Self::is_supported_by_cpu() {
                    // Safety: new_internal requires the "avx" feature set. Since we know it's present, we're safe
                    Ok(unsafe { Self::new_with_avx(direction) })
                } else {
                    Err(())
                }
            }
        }
        impl<T> $struct_name<T> {
            // Computes one FFT in-place in `buffer`, using `scratch` as intermediate storage
            // for the transposed data between the column pass and the row pass.
            #[inline]
            fn perform_fft_inplace(
                &self,
                buffer: &mut [Complex<f64>],
                scratch: &mut [Complex<f64>],
            ) {
                // Perform the column FFTs
                // Safety: self.column_butterflies_and_transpose() requires the "avx" and "fma" instruction sets, and we return Err() in our constructor if the instructions aren't available
                unsafe { self.column_butterflies_and_transpose(buffer, scratch) };

                // process the row FFTs, and copy from the scratch back to the buffer as we go
                // Safety: self.row_butterflies() requires the "avx" instruction set, and we return Err() in our constructor if the instructions aren't available
                unsafe {
                    self.row_butterflies(DoubleBuf {
                        input: scratch,
                        output: buffer,
                    })
                };
            }

            // Computes one FFT from `input` into `output`; no scratch needed because
            // `output` itself serves as the intermediate buffer.
            #[inline]
            fn perform_fft_out_of_place(
                &self,
                input: &mut [Complex<f64>],
                output: &mut [Complex<f64>],
            ) {
                // Perform the column FFTs
                // Safety: self.column_butterflies_and_transpose() requires the "avx" and "fma" instruction sets, and we return Err() in our constructor if the instructions aren't available
                unsafe { self.column_butterflies_and_transpose(input, output) };

                // process the row FFTs in-place in the output buffer
                // Safety: self.row_butterflies() requires the "avx" instruction set, and we return Err() in our constructor if the instructions aren't available
                unsafe { self.row_butterflies(output) };
            }
        }
        // NOTE: implemented generically over T but only for the <f64> struct — this is part of the
        // specialization workaround described in FftPlannerAvx::new(); T is transmuted to f64 below.
        impl<T: FftNum> Fft<T> for $struct_name<f64> {
            fn process_outofplace_with_scratch(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
                _scratch: &mut [Complex<T>],
            ) {
                if input.len() < self.len() || output.len() != input.len() {
                    // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                    return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here
                }

                // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary
                let transmuted_input: &mut [Complex<f64>] =
                    unsafe { array_utils::workaround_transmute_mut(input) };
                let transmuted_output: &mut [Complex<f64>] =
                    unsafe { array_utils::workaround_transmute_mut(output) };
                let result = array_utils::iter_chunks_zipped(
                    transmuted_input,
                    transmuted_output,
                    self.len(),
                    |in_chunk, out_chunk| self.perform_fft_out_of_place(in_chunk, out_chunk),
                );

                if result.is_err() {
                    // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size,
                    // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                }
            }
            fn process_with_scratch(&self, buffer: &mut [Complex<T>], scratch: &mut [Complex<T>]) {
                // In-place processing needs $len elements of scratch for the transpose
                let required_scratch = self.len();
                if scratch.len() < required_scratch || buffer.len() < self.len() {
                    // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_inplace(self.len(), buffer.len(), self.len(), scratch.len());
                    return; // Unreachable, because fft_error_inplace asserts, but it helps codegen to put it here
                }

                let scratch = &mut scratch[..required_scratch];

                // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary
                let transmuted_buffer: &mut [Complex<f64>] =
                    unsafe { array_utils::workaround_transmute_mut(buffer) };
                let transmuted_scratch: &mut [Complex<f64>] =
                    unsafe { array_utils::workaround_transmute_mut(scratch) };
                let result = array_utils::iter_chunks(transmuted_buffer, self.len(), |chunk| {
                    self.perform_fft_inplace(chunk, transmuted_scratch)
                });

                if result.is_err() {
                    // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size,
                    // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
                    fft_error_inplace(self.len(), buffer.len(), self.len(), scratch.len());
                }
            }
            #[inline(always)]
            fn get_inplace_scratch_len(&self) -> usize {
                $len
            }
            #[inline(always)]
            fn get_outofplace_scratch_len(&self) -> usize {
                0
            }
        }
        impl<T> Length for $struct_name<T> {
            #[inline(always)]
            fn len(&self) -> usize {
                $len
            }
        }
        impl<T> Direction for $struct_name<T> {
            #[inline(always)]
            fn fft_direction(&self) -> FftDirection {
                self.direction
            }
        }
    };
}
252
// Generates the twiddle-factor array for a $num_rows x $num_cols mixed-radix butterfly.
// "Interleaved" layout: consecutive vectors walk across the column pairs of one row
// before moving to the next row. The first row and the first $skip_cols columns need
// no twiddles; each AVX vector packs two adjacent columns.
macro_rules! gen_butterfly_twiddles_interleaved_columns {
    ($num_rows:expr, $num_cols:expr, $skip_cols:expr, $direction: expr) => {{
        const FFT_LEN: usize = $num_rows * $num_cols;
        const TWIDDLE_ROWS: usize = $num_rows - 1;
        const TWIDDLE_COLS: usize = $num_cols - $skip_cols;
        const TWIDDLE_VECTOR_COLS: usize = TWIDDLE_COLS / 2;
        const TWIDDLE_VECTOR_COUNT: usize = TWIDDLE_VECTOR_COLS * TWIDDLE_ROWS;
        let mut twiddles = [AvxVector::zero(); TWIDDLE_VECTOR_COUNT];
        for (index, twiddle_chunk) in twiddles.iter_mut().enumerate() {
            // Row-major: index / vector-cols picks the row (skipping row 0),
            // index % vector-cols picks the column pair within that row
            let row = (index / TWIDDLE_VECTOR_COLS) + 1;
            let col = (index % TWIDDLE_VECTOR_COLS) * 2 + $skip_cols;

            *twiddle_chunk = AvxVector::make_mixedradix_twiddle_chunk(col, row, FFT_LEN, $direction);
        }
        twiddles
    }};
}
270
// Same as gen_butterfly_twiddles_interleaved_columns!, but with a "separated" layout:
// consecutive vectors walk down the rows of one column pair before moving on to the
// next column pair.
macro_rules! gen_butterfly_twiddles_separated_columns {
    ($num_rows:expr, $num_cols:expr, $skip_cols:expr, $direction: expr) => {{
        const FFT_LEN: usize = $num_rows * $num_cols;
        const TWIDDLE_ROWS: usize = $num_rows - 1;
        const TWIDDLE_COLS: usize = $num_cols - $skip_cols;
        const TWIDDLE_VECTOR_COLS: usize = TWIDDLE_COLS / 2;
        const TWIDDLE_VECTOR_COUNT: usize = TWIDDLE_VECTOR_COLS * TWIDDLE_ROWS;
        let mut twiddles = [AvxVector::zero(); TWIDDLE_VECTOR_COUNT];
        for (index, twiddle_chunk) in twiddles.iter_mut().enumerate() {
            // Column-major: index % rows picks the row (skipping row 0),
            // index / rows picks the column pair
            let row = (index % TWIDDLE_ROWS) + 1;
            let col = (index / TWIDDLE_ROWS) * 2 + $skip_cols;

            *twiddle_chunk = AvxVector::make_mixedradix_twiddle_chunk(col, row, FFT_LEN, $direction);
        }
        twiddles
    }};
}
288
/// AVX-accelerated FFT of size 5 for f64 data (two complex f64s per __m256d vector).
pub struct Butterfly5Avx64<T> {
    // Packed twiddle factors: [0] = twiddle1, [1] = twiddle2, [2] = twiddle1 with negated imaginary parts.
    // Each vector holds [re, re, im, im] — see new_with_avx.
    twiddles: [__m256d; 3],
    direction: FftDirection,
    // The math is done in f64; PhantomData ties the public type parameter T to the struct
    // (part of the specialization workaround — see FftPlannerAvx::new()).
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly5Avx64, 5);
295impl Butterfly5Avx64<f64> {
296    #[target_feature(enable = "avx")]
297    unsafe fn new_with_avx(direction: FftDirection) -> Self {
298        let twiddle1 = twiddles::compute_twiddle(1, 5, direction);
299        let twiddle2 = twiddles::compute_twiddle(2, 5, direction);
300        Self {
301            twiddles: [
302                _mm256_set_pd(twiddle1.im, twiddle1.im, twiddle1.re, twiddle1.re),
303                _mm256_set_pd(twiddle2.im, twiddle2.im, twiddle2.re, twiddle2.re),
304                _mm256_set_pd(-twiddle1.im, -twiddle1.im, twiddle1.re, twiddle1.re),
305            ],
306            direction,
307            _phantom_t: PhantomData,
308        }
309    }
310}
impl<T> Butterfly5Avx64<T> {
    // Computes a single size-5 FFT on `buffer`.
    // Safety: the caller must ensure the CPU supports the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // load element 0 into both 128-bit halves of a vector
        let input0 = _mm256_loadu2_m128d(
            buffer.input_ptr() as *const f64,
            buffer.input_ptr() as *const f64,
        );
        let input12 = buffer.load_complex(1);
        let input34 = buffer.load_complex(3);

        // swap elements for inputs 3 and 4
        let input43 = AvxVector::reverse_complex_elements(input34);

        // do some prep work before we can start applying twiddle factors
        let [sum12, diff43] = AvxVector::column_butterfly2([input12, input43]);

        // note: the rotation is always built with Inverse here, independent of self.direction
        let rotation = AvxVector::make_rotation90(FftDirection::Inverse);
        let rotated43 = AvxVector::rotate90(diff43, rotation);

        let [mid14, mid23] = avx64_utils::transpose_2x2_f64([sum12, rotated43]);

        // to compute the first output, compute the sum of all elements. mid14[0] and mid23[0] already have the sum of 1+4 and 2+3 respectively, so if we add them, we'll get the sum of all 4
        let sum1234 = AvxVector::add(mid14.lo(), mid23.lo());
        let output0 = AvxVector::add(input0.lo(), sum1234);

        // apply twiddle factors
        let twiddled_outer14 = AvxVector::mul(mid14, self.twiddles[0]);
        let twiddled_inner14 = AvxVector::mul(mid14, self.twiddles[1]);
        let twiddled14 = AvxVector::fmadd(mid23, self.twiddles[1], twiddled_outer14);
        let twiddled23 = AvxVector::fmadd(mid23, self.twiddles[2], twiddled_inner14);

        // unpack the data for the last butterfly 2
        let [twiddled12, twiddled43] = avx64_utils::transpose_2x2_f64([twiddled14, twiddled23]);
        let [output12, output43] = AvxVector::column_butterfly2([twiddled12, twiddled43]);

        // swap the elements in output43 before writing them out, and add the first input to everything
        let final12 = AvxVector::add(input0, output12);
        let output34 = AvxVector::reverse_complex_elements(output43);
        let final34 = AvxVector::add(input0, output34);

        // output0 is a half-width (single complex) store; the rest are full vectors
        buffer.store_partial1_complex(output0, 0);
        buffer.store_complex(final12, 1);
        buffer.store_complex(final34, 3);
    }
}
356
/// AVX-accelerated FFT of size 7 for f64 data (two complex f64s per __m256d vector).
pub struct Butterfly7Avx64<T> {
    // Packed twiddle factors: [0..=2] = twiddles 1..=3; [3] and [4] are twiddles 3 and 1
    // with negated imaginary parts. Each vector holds [re, re, im, im] — see new_with_avx.
    twiddles: [__m256d; 5],
    direction: FftDirection,
    // Specialization workaround — see FftPlannerAvx::new()
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly7Avx64, 7);
363impl Butterfly7Avx64<f64> {
364    #[target_feature(enable = "avx")]
365    unsafe fn new_with_avx(direction: FftDirection) -> Self {
366        let twiddle1 = twiddles::compute_twiddle(1, 7, direction);
367        let twiddle2 = twiddles::compute_twiddle(2, 7, direction);
368        let twiddle3 = twiddles::compute_twiddle(3, 7, direction);
369        Self {
370            twiddles: [
371                _mm256_set_pd(twiddle1.im, twiddle1.im, twiddle1.re, twiddle1.re),
372                _mm256_set_pd(twiddle2.im, twiddle2.im, twiddle2.re, twiddle2.re),
373                _mm256_set_pd(twiddle3.im, twiddle3.im, twiddle3.re, twiddle3.re),
374                _mm256_set_pd(-twiddle3.im, -twiddle3.im, twiddle3.re, twiddle3.re),
375                _mm256_set_pd(-twiddle1.im, -twiddle1.im, twiddle1.re, twiddle1.re),
376            ],
377            direction,
378            _phantom_t: PhantomData,
379        }
380    }
381}
impl<T> Butterfly7Avx64<T> {
    // Computes a single size-7 FFT on `buffer`.
    // Safety: the caller must ensure the CPU supports the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // load element 0 into both 128-bit halves of a vector
        let input0 = _mm256_loadu2_m128d(
            buffer.input_ptr() as *const f64,
            buffer.input_ptr() as *const f64,
        );
        let input12 = buffer.load_complex(1);
        // elements 3 and 4 are loaded as half-width (single complex) values
        let input3 = buffer.load_partial1_complex(3);
        let input4 = buffer.load_partial1_complex(4);
        let input56 = buffer.load_complex(5);

        // reverse the order of input56
        let input65 = AvxVector::reverse_complex_elements(input56);

        // do some prep work before we can start applying twiddle factors
        let [sum12, diff65] = AvxVector::column_butterfly2([input12, input65]);
        let [sum3, diff4] = AvxVector::column_butterfly2([input3, input4]);

        // note: the rotation is always built with Inverse here, independent of self.direction
        let rotation = AvxVector::make_rotation90(FftDirection::Inverse);
        let rotated65 = AvxVector::rotate90(diff65, rotation);
        let rotated4 = AvxVector::rotate90(diff4, rotation.lo());

        let [mid16, mid25] = avx64_utils::transpose_2x2_f64([sum12, rotated65]);
        let mid34 = AvxVector128::merge(sum3, rotated4);

        // to compute the first output, compute the sum of all elements. mid16[0], mid25[0], and mid34[0] already have the sum of 1+6, 2+5 and 3+4 respectively, so if we add them, we'll get 1+2+3+4+5+6
        let output0_left = AvxVector::add(mid16.lo(), mid25.lo());
        let output0_right = AvxVector::add(input0.lo(), mid34.lo());
        let output0 = AvxVector::add(output0_left, output0_right);
        buffer.store_partial1_complex(output0, 0);

        // apply twiddle factors: three chained mul + fmadd passes, one per "mid" vector
        let twiddled16_intermediate1 = AvxVector::mul(mid16, self.twiddles[0]);
        let twiddled25_intermediate1 = AvxVector::mul(mid16, self.twiddles[1]);
        let twiddled34_intermediate1 = AvxVector::mul(mid16, self.twiddles[2]);

        let twiddled16_intermediate2 =
            AvxVector::fmadd(mid25, self.twiddles[1], twiddled16_intermediate1);
        let twiddled25_intermediate2 =
            AvxVector::fmadd(mid25, self.twiddles[3], twiddled25_intermediate1);
        let twiddled34_intermediate2 =
            AvxVector::fmadd(mid25, self.twiddles[4], twiddled34_intermediate1);

        let twiddled16 = AvxVector::fmadd(mid34, self.twiddles[2], twiddled16_intermediate2);
        let twiddled25 = AvxVector::fmadd(mid34, self.twiddles[4], twiddled25_intermediate2);
        let twiddled34 = AvxVector::fmadd(mid34, self.twiddles[1], twiddled34_intermediate2);

        // unpack the data for the last butterfly 2
        let [twiddled12, twiddled65] = avx64_utils::transpose_2x2_f64([twiddled16, twiddled25]);

        // we can save one add if we add input0 to twiddled3 now. normally we'd add input0 to the final output, but the arrangement of data makes that a little awkward
        let twiddled03 = AvxVector::add(twiddled34.lo(), input0.lo());

        let [output12, output65] = AvxVector::column_butterfly2([twiddled12, twiddled65]);
        let final12 = AvxVector::add(output12, input0);
        let output56 = AvxVector::reverse_complex_elements(output65);
        let final56 = AvxVector::add(output56, input0);

        let [final3, final4] = AvxVector::column_butterfly2([twiddled03, twiddled34.hi()]);

        buffer.store_complex(final12, 1);
        buffer.store_partial1_complex(final3, 3);
        buffer.store_partial1_complex(final4, 4);
        buffer.store_complex(final56, 5);
    }
}
449
/// AVX-accelerated FFT of size 11 for f64 data (two complex f64s per __m256d vector).
pub struct Butterfly11Avx64<T> {
    // Packed twiddle factors: [0..=4] = twiddles 1..=5; [5..=9] = twiddles 5 down to 1
    // with negated imaginary parts. Each vector holds [re, re, im, im] — see new_with_avx.
    twiddles: [__m256d; 10],
    direction: FftDirection,
    // Specialization workaround — see FftPlannerAvx::new()
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly11Avx64, 11);
456impl Butterfly11Avx64<f64> {
457    #[target_feature(enable = "avx")]
458    unsafe fn new_with_avx(direction: FftDirection) -> Self {
459        let twiddle1 = twiddles::compute_twiddle(1, 11, direction);
460        let twiddle2 = twiddles::compute_twiddle(2, 11, direction);
461        let twiddle3 = twiddles::compute_twiddle(3, 11, direction);
462        let twiddle4 = twiddles::compute_twiddle(4, 11, direction);
463        let twiddle5 = twiddles::compute_twiddle(5, 11, direction);
464
465        let twiddles = [
466            _mm256_set_pd(twiddle1.im, twiddle1.im, twiddle1.re, twiddle1.re),
467            _mm256_set_pd(twiddle2.im, twiddle2.im, twiddle2.re, twiddle2.re),
468            _mm256_set_pd(twiddle3.im, twiddle3.im, twiddle3.re, twiddle3.re),
469            _mm256_set_pd(twiddle4.im, twiddle4.im, twiddle4.re, twiddle4.re),
470            _mm256_set_pd(twiddle5.im, twiddle5.im, twiddle5.re, twiddle5.re),
471            _mm256_set_pd(-twiddle5.im, -twiddle5.im, twiddle5.re, twiddle5.re),
472            _mm256_set_pd(-twiddle4.im, -twiddle4.im, twiddle4.re, twiddle4.re),
473            _mm256_set_pd(-twiddle3.im, -twiddle3.im, twiddle3.re, twiddle3.re),
474            _mm256_set_pd(-twiddle2.im, -twiddle2.im, twiddle2.re, twiddle2.re),
475            _mm256_set_pd(-twiddle1.im, -twiddle1.im, twiddle1.re, twiddle1.re),
476        ];
477
478        Self {
479            twiddles,
480            direction,
481            _phantom_t: PhantomData,
482        }
483    }
484}
impl<T> Butterfly11Avx64<T> {
    // Computes a single size-11 FFT on `buffer`.
    // Safety: the caller must ensure the CPU supports the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        let input0 = buffer.load_partial1_complex(0);
        let input12 = buffer.load_complex(1);
        let input34 = buffer.load_complex(3);
        let input56 = buffer.load_complex(5);
        let input78 = buffer.load_complex(7);
        let input910 = buffer.load_complex(9);

        // reverse the order of input78910, and separate
        // input55/input66 broadcast element 5 and element 6 across a whole vector each
        let [input55, input66] = AvxVector::unpack_complex([input56, input56]);
        let input87 = AvxVector::reverse_complex_elements(input78);
        let input109 = AvxVector::reverse_complex_elements(input910);

        // do some initial butterflies and rotations
        let [sum12, diff109] = AvxVector::column_butterfly2([input12, input109]);
        let [sum34, diff87] = AvxVector::column_butterfly2([input34, input87]);
        let [sum55, diff66] = AvxVector::column_butterfly2([input55, input66]);

        // note: the rotation is always built with Inverse here, independent of self.direction
        let rotation = AvxVector::make_rotation90(FftDirection::Inverse);
        let rotated109 = AvxVector::rotate90(diff109, rotation);
        let rotated87 = AvxVector::rotate90(diff87, rotation);
        let rotated66 = AvxVector::rotate90(diff66, rotation);

        // arrange the data into the format to apply twiddles
        let [mid110, mid29] = AvxVector::unpack_complex([sum12, rotated109]);
        let [mid38, mid47] = AvxVector::unpack_complex([sum34, rotated87]);
        let mid56 = AvxVector::unpacklo_complex([sum55, rotated66]);

        // to compute the first output, compute the sum of all elements. mid110[0], mid29[0], mid38[0], mid47 already have the sum of 1+10, 2+9 and so on, so if we add them, we'll get the sum of everything
        let mid12910 = AvxVector::add(mid110.lo(), mid29.lo());
        let mid3478 = AvxVector::add(mid38.lo(), mid47.lo());
        let output0_left = AvxVector::add(input0, mid56.lo());
        let output0_right = AvxVector::add(mid12910, mid3478);
        let output0 = AvxVector::add(output0_left, output0_right);
        buffer.store_partial1_complex(output0, 0);

        // we need to add the first input to each of our 5 twiddles values -- but only the first complex element of each vector. so just use zero for the other element
        let zero = _mm_setzero_pd();
        let input0 = AvxVector256::merge(input0, zero);

        // apply twiddle factors: each of the 5 mid vectors contributes to all 5 outputs
        // via one fmadd per (mid, output) pair; input0 seeds the accumulators.
        // contributions from mid110:
        let twiddled110 = AvxVector::fmadd(mid110, self.twiddles[0], input0);
        let twiddled38 = AvxVector::fmadd(mid110, self.twiddles[2], input0);
        let twiddled29 = AvxVector::fmadd(mid110, self.twiddles[1], input0);
        let twiddled47 = AvxVector::fmadd(mid110, self.twiddles[3], input0);
        let twiddled56 = AvxVector::fmadd(mid110, self.twiddles[4], input0);

        // contributions from mid29:
        let twiddled110 = AvxVector::fmadd(mid29, self.twiddles[1], twiddled110);
        let twiddled38 = AvxVector::fmadd(mid29, self.twiddles[5], twiddled38);
        let twiddled29 = AvxVector::fmadd(mid29, self.twiddles[3], twiddled29);
        let twiddled47 = AvxVector::fmadd(mid29, self.twiddles[7], twiddled47);
        let twiddled56 = AvxVector::fmadd(mid29, self.twiddles[9], twiddled56);

        // contributions from mid38:
        let twiddled110 = AvxVector::fmadd(mid38, self.twiddles[2], twiddled110);
        let twiddled38 = AvxVector::fmadd(mid38, self.twiddles[8], twiddled38);
        let twiddled29 = AvxVector::fmadd(mid38, self.twiddles[5], twiddled29);
        let twiddled47 = AvxVector::fmadd(mid38, self.twiddles[0], twiddled47);
        let twiddled56 = AvxVector::fmadd(mid38, self.twiddles[3], twiddled56);

        // contributions from mid47:
        let twiddled110 = AvxVector::fmadd(mid47, self.twiddles[3], twiddled110);
        let twiddled38 = AvxVector::fmadd(mid47, self.twiddles[0], twiddled38);
        let twiddled29 = AvxVector::fmadd(mid47, self.twiddles[7], twiddled29);
        let twiddled47 = AvxVector::fmadd(mid47, self.twiddles[4], twiddled47);
        let twiddled56 = AvxVector::fmadd(mid47, self.twiddles[8], twiddled56);

        // contributions from mid56:
        let twiddled110 = AvxVector::fmadd(mid56, self.twiddles[4], twiddled110);
        let twiddled38 = AvxVector::fmadd(mid56, self.twiddles[3], twiddled38);
        let twiddled29 = AvxVector::fmadd(mid56, self.twiddles[9], twiddled29);
        let twiddled47 = AvxVector::fmadd(mid56, self.twiddles[8], twiddled47);
        let twiddled56 = AvxVector::fmadd(mid56, self.twiddles[2], twiddled56);

        // unpack the data for the last butterfly 2
        let [twiddled12, twiddled109] = AvxVector::unpack_complex([twiddled110, twiddled29]);
        let [twiddled34, twiddled87] = AvxVector::unpack_complex([twiddled38, twiddled47]);
        let [twiddled55, twiddled66] = AvxVector::unpack_complex([twiddled56, twiddled56]);

        let [output12, output109] = AvxVector::column_butterfly2([twiddled12, twiddled109]);
        let [output34, output87] = AvxVector::column_butterfly2([twiddled34, twiddled87]);
        let [output55, output66] = AvxVector::column_butterfly2([twiddled55, twiddled66]);
        let output78 = AvxVector::reverse_complex_elements(output87);
        let output910 = AvxVector::reverse_complex_elements(output109);

        buffer.store_complex(output12, 1);
        buffer.store_complex(output34, 3);
        buffer.store_partial1_complex(output55.lo(), 5);
        buffer.store_partial1_complex(output66.lo(), 6);
        buffer.store_complex(output78, 7);
        buffer.store_complex(output910, 9);
    }
}
577
/// AVX-accelerated FFT of size 8 for f64 data, processed as a 2x4 mixed-radix array.
pub struct Butterfly8Avx64<T> {
    // Mixed-radix twiddles for the 2x4 decomposition — see gen_butterfly_twiddles_interleaved_columns!
    twiddles: [__m256d; 2],
    // Rotation constants used by the size-4 butterflies in the row pass
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // Specialization workaround — see FftPlannerAvx::new()
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly8Avx64, 8);
585impl Butterfly8Avx64<f64> {
586    #[target_feature(enable = "avx")]
587    unsafe fn new_with_avx(direction: FftDirection) -> Self {
588        Self {
589            twiddles: gen_butterfly_twiddles_interleaved_columns!(2, 4, 0, direction),
590            twiddles_butterfly4: AvxVector::make_rotation90(direction),
591            direction,
592            _phantom_t: PhantomData,
593        }
594    }
595}
596impl<T> Butterfly8Avx64<T> {
597    #[target_feature(enable = "avx", enable = "fma")]
598    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
599        let row0 = buffer.load_complex(0);
600        let row1 = buffer.load_complex(2);
601        let row2 = buffer.load_complex(4);
602        let row3 = buffer.load_complex(6);
603
604        // Do our butterfly 2's down the columns of a 4x2 array
605        let [mid0, mid2] = AvxVector::column_butterfly2([row0, row2]);
606        let [mid1, mid3] = AvxVector::column_butterfly2([row1, row3]);
607
608        let mid2_twiddled = AvxVector::mul_complex(mid2, self.twiddles[0]);
609        let mid3_twiddled = AvxVector::mul_complex(mid3, self.twiddles[1]);
610
611        // transpose to a 2x4 array
612        let transposed =
613            avx64_utils::transpose_4x2_to_2x4_f64([mid0, mid2_twiddled], [mid1, mid3_twiddled]);
614
615        // butterfly 4's down the transposed array
616        let output_rows = AvxVector::column_butterfly4(transposed, self.twiddles_butterfly4);
617
618        buffer.store_complex(output_rows[0], 0);
619        buffer.store_complex(output_rows[1], 2);
620        buffer.store_complex(output_rows[2], 4);
621        buffer.store_complex(output_rows[3], 6);
622    }
623}
624
/// AVX-accelerated FFT of size 9 for `Complex<f64>` data, computed as a
/// 3x3 decomposition (butterfly 3's, twiddle, transpose, butterfly 3's).
pub struct Butterfly9Avx64<T> {
    /// Twiddle factors applied between the two butterfly-3 passes.
    twiddles: [__m256d; 2],
    /// Broadcast twiddle(1, 3) used inside `column_butterfly3`.
    twiddles_butterfly3: __m256d,
    direction: FftDirection,
    /// The arithmetic here is f64-only; `T` only parameterizes the public
    /// `Fft<T>` interface provided by the boilerplate macro.
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` + the `Fft` impl with runtime AVX+FMA detection.
boilerplate_fft_simd_butterfly!(Butterfly9Avx64, 9);
impl Butterfly9Avx64<f64> {
    /// Precomputes all twiddles for a 9-point FFT in the given direction.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" feature.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(3, 3, 1, direction),
            twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly9Avx64<T> {
    /// Computes a size-9 FFT, reading and writing through `buffer`.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // we're going to load our input as a 3x3 array. We have to load 3 columns, which is a little awkward
        // We can reduce the number of multiplies we do if we load the first column as half-width and the second column as full.
        // (rows0 holds column 0 as half-width vectors; rows1 holds columns 1+2 packed into full vectors)
        let mut rows0 = [AvxVector::zero(); 3];
        let mut rows1 = [AvxVector::zero(); 3];

        for r in 0..3 {
            rows0[r] = buffer.load_partial1_complex(3 * r);
            rows1[r] = buffer.load_complex(3 * r + 1);
        }

        // do butterfly 3's down the columns
        // the half-width column uses the low half of the broadcast twiddle
        let mid0 = AvxVector::column_butterfly3(rows0, self.twiddles_butterfly3.lo());
        let mut mid1 = AvxVector::column_butterfly3(rows1, self.twiddles_butterfly3);

        // apply twiddle factors (row 0 needs none)
        for n in 1..3 {
            mid1[n] = AvxVector::mul_complex(mid1[n], self.twiddles[n - 1]);
        }

        // transpose our 3x3 array
        let (transposed0, transposed1) = avx64_utils::transpose_3x3_f64(mid0, mid1);

        // apply butterfly 3's down the columns
        let output0 = AvxVector::column_butterfly3(transposed0, self.twiddles_butterfly3.lo());
        let output1 = AvxVector::column_butterfly3(transposed1, self.twiddles_butterfly3);

        // store in the same half-width/full-width layout we loaded from
        for r in 0..3 {
            buffer.store_partial1_complex(output0[r], 3 * r);
            buffer.store_complex(output1[r], 3 * r + 1);
        }
    }
}
678
/// AVX-accelerated FFT of size 12 for `Complex<f64>` data, computed as a
/// 3x4 decomposition (butterfly 4's, twiddle, transpose, butterfly 3's).
pub struct Butterfly12Avx64<T> {
    /// Twiddle factors applied between the butterfly-4 and butterfly-3 passes.
    twiddles: [__m256d; 3],
    /// Broadcast twiddle(1, 3) used inside `column_butterfly3`.
    twiddles_butterfly3: __m256d,
    /// Precomputed rotation used by `column_butterfly4`.
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    /// The arithmetic here is f64-only; `T` only parameterizes the public
    /// `Fft<T>` interface provided by the boilerplate macro.
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` + the `Fft` impl with runtime AVX+FMA detection.
boilerplate_fft_simd_butterfly!(Butterfly12Avx64, 12);
impl Butterfly12Avx64<f64> {
    /// Precomputes all twiddles for a 12-point FFT in the given direction.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" feature.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(4, 3, 1, direction),
            twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
            twiddles_butterfly4: AvxVector::make_rotation90(direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly12Avx64<T> {
    /// Computes a size-12 FFT, reading and writing through `buffer`.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // we're going to load our input as a 3x4 array. We have to load 3 columns, which is a little awkward
        // We can reduce the number of multiplies we do if we load the first column as half-width and the second column as full.
        // (rows0 holds column 0 as half-width vectors; rows1 holds columns 1+2 packed into full vectors)
        let mut rows0 = [AvxVector::zero(); 4];
        let mut rows1 = [AvxVector::zero(); 4];

        for n in 0..4 {
            rows0[n] = buffer.load_partial1_complex(n * 3);
            rows1[n] = buffer.load_complex(n * 3 + 1);
        }

        // do butterfly 4's down the columns
        // the half-width column uses the low half of the rotation
        let mid0 = AvxVector::column_butterfly4(rows0, self.twiddles_butterfly4.lo());
        let mut mid1 = AvxVector::column_butterfly4(rows1, self.twiddles_butterfly4);

        // apply twiddle factors (row 0 needs none)
        for n in 1..4 {
            mid1[n] = AvxVector::mul_complex(mid1[n], self.twiddles[n - 1]);
        }

        // transpose our 3x4 array to a 4x3 array
        let (transposed0, transposed1) = avx64_utils::transpose_3x4_to_4x3_f64(mid0, mid1);

        // apply butterfly 3's down the columns
        let output0 = AvxVector::column_butterfly3(transposed0, self.twiddles_butterfly3);
        let output1 = AvxVector::column_butterfly3(transposed1, self.twiddles_butterfly3);

        // the transposed layout lets us store full-width vectors directly
        for r in 0..3 {
            buffer.store_complex(output0[r], 4 * r);
            buffer.store_complex(output1[r], 4 * r + 2);
        }
    }
}
734
/// AVX-accelerated FFT of size 16 for `Complex<f64>` data, computed as a
/// 4x4 decomposition (butterfly 4's, twiddle, transpose, butterfly 4's).
pub struct Butterfly16Avx64<T> {
    /// Twiddle factors applied between the two butterfly-4 passes
    /// (2 vectors per row, rows 1..=3).
    twiddles: [__m256d; 6],
    /// Precomputed rotation used by `column_butterfly4`.
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    /// The arithmetic here is f64-only; `T` only parameterizes the public
    /// `Fft<T>` interface provided by the boilerplate macro.
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` + the `Fft` impl with runtime AVX+FMA detection.
boilerplate_fft_simd_butterfly!(Butterfly16Avx64, 16);
impl Butterfly16Avx64<f64> {
    /// Precomputes all twiddles for a 16-point FFT in the given direction.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" feature.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(4, 4, 0, direction),
            twiddles_butterfly4: AvxVector::make_rotation90(direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly16Avx64<T> {
    /// Computes a size-16 FFT, reading and writing through `buffer`.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // rows0 holds columns 0-1, rows1 holds columns 2-3 (2 complex per vector)
        let mut rows0 = [AvxVector::zero(); 4];
        let mut rows1 = [AvxVector::zero(); 4];
        for r in 0..4 {
            rows0[r] = buffer.load_complex(4 * r);
            rows1[r] = buffer.load_complex(4 * r + 2);
        }

        // We're going to treat our input as a 4x4 2d array. First, do 4 butterfly 4's down the columns of that array.
        let mut mid0 = AvxVector::column_butterfly4(rows0, self.twiddles_butterfly4);
        let mut mid1 = AvxVector::column_butterfly4(rows1, self.twiddles_butterfly4);

        // apply twiddle factors (row 0 needs none; twiddles are interleaved 2 per row)
        for r in 1..4 {
            mid0[r] = AvxVector::mul_complex(mid0[r], self.twiddles[2 * r - 2]);
            mid1[r] = AvxVector::mul_complex(mid1[r], self.twiddles[2 * r - 1]);
        }

        // Transpose our 4x4 array
        let (transposed0, transposed1) = avx64_utils::transpose_4x4_f64(mid0, mid1);

        // Butterfly 4's down columns of the transposed array
        let output0 = AvxVector::column_butterfly4(transposed0, self.twiddles_butterfly4);
        let output1 = AvxVector::column_butterfly4(transposed1, self.twiddles_butterfly4);

        for r in 0..4 {
            buffer.store_complex(output0[r], 4 * r);
            buffer.store_complex(output1[r], 4 * r + 2);
        }
    }
}
786
/// AVX-accelerated FFT of size 18 for `Complex<f64>` data, computed as a
/// 3x6 decomposition (butterfly 6's, twiddle, transpose, butterfly 3's).
pub struct Butterfly18Avx64<T> {
    /// Twiddle factors applied between the butterfly-6 and butterfly-3 passes.
    twiddles: [__m256d; 5],
    /// Broadcast twiddle(1, 3) shared by the butterfly-6 and butterfly-3 steps.
    twiddles_butterfly3: __m256d,
    direction: FftDirection,
    /// The arithmetic here is f64-only; `T` only parameterizes the public
    /// `Fft<T>` interface provided by the boilerplate macro.
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` + the `Fft` impl with runtime AVX+FMA detection.
boilerplate_fft_simd_butterfly!(Butterfly18Avx64, 18);
impl Butterfly18Avx64<f64> {
    /// Precomputes all twiddles for an 18-point FFT in the given direction.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" feature.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(6, 3, 1, direction),
            twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly18Avx64<T> {
    /// Computes a size-18 FFT, reading and writing through `buffer`.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // we're going to load our input as a 3x6 array. We have to load 3 columns, which is a little awkward
        // We can reduce the number of multiplies we do if we load the first column as half-width and the second column as full.
        // (rows0 holds column 0 as half-width vectors; rows1 holds columns 1+2 packed into full vectors)
        let mut rows0 = [AvxVector::zero(); 6];
        let mut rows1 = [AvxVector::zero(); 6];
        for n in 0..6 {
            rows0[n] = buffer.load_partial1_complex(n * 3);
            rows1[n] = buffer.load_complex(n * 3 + 1);
        }

        // do butterfly 6's down the columns
        // half-width and full-width columns use different butterfly-6 implementations
        let mid0 = AvxVector128::column_butterfly6(rows0, self.twiddles_butterfly3);
        let mut mid1 = AvxVector256::column_butterfly6(rows1, self.twiddles_butterfly3);

        // apply twiddle factors (row 0 needs none)
        for n in 1..6 {
            mid1[n] = AvxVector::mul_complex(mid1[n], self.twiddles[n - 1]);
        }

        // transpose our 3x6 array to a 6x3 array
        let (transposed0, transposed1, transposed2) =
            avx64_utils::transpose_3x6_to_6x3_f64(mid0, mid1);

        // apply butterfly 3's down the columns
        let output0 = AvxVector::column_butterfly3(transposed0, self.twiddles_butterfly3);
        let output1 = AvxVector::column_butterfly3(transposed1, self.twiddles_butterfly3);
        let output2 = AvxVector::column_butterfly3(transposed2, self.twiddles_butterfly3);

        // the transposed layout lets us store full-width vectors directly
        for r in 0..3 {
            buffer.store_complex(output0[r], 6 * r);
            buffer.store_complex(output1[r], 6 * r + 2);
            buffer.store_complex(output2[r], 6 * r + 4);
        }
    }
}
842
/// AVX-accelerated FFT of size 24 for `Complex<f64>` data, computed as a
/// 6x4 decomposition (butterfly 4's, twiddle, transpose, butterfly 6's).
pub struct Butterfly24Avx64<T> {
    /// Twiddle factors applied between the butterfly-4 and butterfly-6 passes
    /// (3 vectors per row, rows 1..=3).
    twiddles: [__m256d; 9],
    /// Broadcast twiddle(1, 3) used inside `column_butterfly6`.
    twiddles_butterfly3: __m256d,
    /// Precomputed rotation used by `column_butterfly4`.
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    /// The arithmetic here is f64-only; `T` only parameterizes the public
    /// `Fft<T>` interface provided by the boilerplate macro.
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` + the `Fft` impl with runtime AVX+FMA detection.
boilerplate_fft_simd_butterfly!(Butterfly24Avx64, 24);
impl Butterfly24Avx64<f64> {
    /// Precomputes all twiddles for a 24-point FFT in the given direction.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" feature.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(4, 6, 0, direction),
            twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
            twiddles_butterfly4: AvxVector::make_rotation90(direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly24Avx64<T> {
    /// Computes a size-24 FFT, reading and writing through `buffer`.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // rows0/rows1/rows2 each hold 2 of the 6 columns (2 complex per vector)
        let mut rows0 = [AvxVector::zero(); 4];
        let mut rows1 = [AvxVector::zero(); 4];
        let mut rows2 = [AvxVector::zero(); 4];
        for r in 0..4 {
            rows0[r] = buffer.load_complex(6 * r);
            rows1[r] = buffer.load_complex(6 * r + 2);
            rows2[r] = buffer.load_complex(6 * r + 4);
        }

        // We're going to treat our input as a 6x4 2d array. First, do 6 butterfly 4's down the columns of that array.
        let mut mid0 = AvxVector::column_butterfly4(rows0, self.twiddles_butterfly4);
        let mut mid1 = AvxVector::column_butterfly4(rows1, self.twiddles_butterfly4);
        let mut mid2 = AvxVector::column_butterfly4(rows2, self.twiddles_butterfly4);

        // apply twiddle factors (row 0 needs none; twiddles are interleaved 3 per row)
        for r in 1..4 {
            mid0[r] = AvxVector::mul_complex(mid0[r], self.twiddles[3 * r - 3]);
            mid1[r] = AvxVector::mul_complex(mid1[r], self.twiddles[3 * r - 2]);
            mid2[r] = AvxVector::mul_complex(mid2[r], self.twiddles[3 * r - 1]);
        }

        // Transpose our 6x4 array
        let (transposed0, transposed1) = avx64_utils::transpose_6x4_to_4x6_f64(mid0, mid1, mid2);

        // Butterfly 6's down columns of the transposed array
        let output0 = AvxVector256::column_butterfly6(transposed0, self.twiddles_butterfly3);
        let output1 = AvxVector256::column_butterfly6(transposed1, self.twiddles_butterfly3);

        for r in 0..6 {
            buffer.store_complex(output0[r], 4 * r);
            buffer.store_complex(output1[r], 4 * r + 2);
        }
    }
}
900
/// AVX-accelerated FFT of size 27 for `Complex<f64>` data, computed as a
/// 9x3 decomposition (butterfly 3's, twiddle, transpose, butterfly 9's).
pub struct Butterfly27Avx64<T> {
    /// Twiddle factors applied between the butterfly-3 and butterfly-9 passes
    /// (4 vectors per row, rows 1..=2).
    twiddles: [__m256d; 8],
    /// Full-width twiddles (1/9, 2/9, 4/9) for the full-width butterfly-9 columns.
    twiddles_butterfly9: [__m256d; 3],
    /// Paired twiddles (1,2) and (2,4) of 9, merged for the half-width
    /// (`AvxVector128`) butterfly-9 column.
    twiddles_butterfly9_lo: [__m256d; 2],
    /// Broadcast twiddle(1, 3) used inside `column_butterfly3`/`column_butterfly9`.
    twiddles_butterfly3: __m256d,
    direction: FftDirection,
    /// The arithmetic here is f64-only; `T` only parameterizes the public
    /// `Fft<T>` interface provided by the boilerplate macro.
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` + the `Fft` impl with runtime AVX+FMA detection.
boilerplate_fft_simd_butterfly!(Butterfly27Avx64, 27);
impl Butterfly27Avx64<f64> {
    /// Precomputes all twiddles for a 27-point FFT in the given direction.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" feature.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        // Half-width (__m128d) twiddles, merged pairwise below so the
        // half-width butterfly-9 can consume them as full vectors.
        let twiddle1 = __m128d::broadcast_twiddle(1, 9, direction);
        let twiddle2 = __m128d::broadcast_twiddle(2, 9, direction);
        let twiddle4 = __m128d::broadcast_twiddle(4, 9, direction);

        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(3, 9, 1, direction),
            twiddles_butterfly9: [
                AvxVector::broadcast_twiddle(1, 9, direction),
                AvxVector::broadcast_twiddle(2, 9, direction),
                AvxVector::broadcast_twiddle(4, 9, direction),
            ],
            twiddles_butterfly9_lo: [
                AvxVector256::merge(twiddle1, twiddle2),
                AvxVector256::merge(twiddle2, twiddle4),
            ],
            twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly27Avx64<T> {
    /// Computes a size-27 FFT, reading and writing through `buffer`.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // we're going to load our input as a 9x3 array. We have to load 9 columns, which is a little awkward
        // We can reduce the number of multiplies we do if we load the first column as half-width and the remaining 4 sets of vectors as full.
        // We can't fit the whole problem into AVX registers at once, so we'll have to spill some things.
        // By computing chunks of the problem and then not referencing any of it for a while, we're making it easy for the compiler to decide what to spill
        let mut rows0 = [AvxVector::zero(); 3];
        for n in 0..3 {
            rows0[n] = buffer.load_partial1_complex(n * 9);
        }
        // half-width column 0: needs no twiddle multiply afterwards
        let mid0 = AvxVector::column_butterfly3(rows0, self.twiddles_butterfly3.lo());

        // First chunk is done and can be spilled, do 2 more chunks
        let mut rows1 = [AvxVector::zero(); 3];
        let mut rows2 = [AvxVector::zero(); 3];
        for n in 0..3 {
            rows1[n] = buffer.load_complex(n * 9 + 1);
            rows2[n] = buffer.load_complex(n * 9 + 3);
        }
        let mut mid1 = AvxVector::column_butterfly3(rows1, self.twiddles_butterfly3);
        let mut mid2 = AvxVector::column_butterfly3(rows2, self.twiddles_butterfly3);
        // twiddles are interleaved 4 per row, so stride by 4 and offset per chunk
        for r in 1..3 {
            mid1[r] = AvxVector::mul_complex(mid1[r], self.twiddles[4 * r - 4]);
            mid2[r] = AvxVector::mul_complex(mid2[r], self.twiddles[4 * r - 3]);
        }

        // First 3 chunks are done and can be spilled, do the final 2 chunks
        let mut rows3 = [AvxVector::zero(); 3];
        let mut rows4 = [AvxVector::zero(); 3];
        for n in 0..3 {
            rows3[n] = buffer.load_complex(n * 9 + 5);
            rows4[n] = buffer.load_complex(n * 9 + 7);
        }
        let mut mid3 = AvxVector::column_butterfly3(rows3, self.twiddles_butterfly3);
        let mut mid4 = AvxVector::column_butterfly3(rows4, self.twiddles_butterfly3);
        for r in 1..3 {
            mid3[r] = AvxVector::mul_complex(mid3[r], self.twiddles[4 * r - 2]);
            mid4[r] = AvxVector::mul_complex(mid4[r], self.twiddles[4 * r - 1]);
        }

        // transpose our 9x3 array to a 3x9 array
        let (transposed0, transposed1) =
            avx64_utils::transpose_9x3_to_3x9_f64(mid0, mid1, mid2, mid3, mid4);

        // apply butterfly 9's down the columns. Again, do the work in chunks to make it easier for the compiler to spill
        let output0 = AvxVector128::column_butterfly9(
            transposed0,
            self.twiddles_butterfly9_lo,
            self.twiddles_butterfly3,
        );
        for r in 0..3 {
            buffer.store_partial1_complex(output0[r * 3], 9 * r);
            buffer.store_partial1_complex(output0[r * 3 + 1], 9 * r + 3);
            buffer.store_partial1_complex(output0[r * 3 + 2], 9 * r + 6);
        }

        let output1 = AvxVector256::column_butterfly9(
            transposed1,
            self.twiddles_butterfly9,
            self.twiddles_butterfly3,
        );
        for r in 0..3 {
            buffer.store_complex(output1[r * 3], 9 * r + 1);
            buffer.store_complex(output1[r * 3 + 1], 9 * r + 4);
            buffer.store_complex(output1[r * 3 + 2], 9 * r + 7);
        }
    }
}
1003
/// AVX-accelerated FFT of size 32 for `Complex<f64>` data, computed as an
/// 8x4 decomposition (butterfly 4's, twiddle, transpose, butterfly 8's).
pub struct Butterfly32Avx64<T> {
    /// Twiddle factors applied between the butterfly-4 and butterfly-8 passes
    /// (4 vectors per row, rows 1..=3).
    twiddles: [__m256d; 12],
    /// Precomputed rotation used by `column_butterfly4`/`column_butterfly8`.
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    /// The arithmetic here is f64-only; `T` only parameterizes the public
    /// `Fft<T>` interface provided by the boilerplate macro.
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` + the `Fft` impl with runtime AVX+FMA detection.
boilerplate_fft_simd_butterfly!(Butterfly32Avx64, 32);
impl Butterfly32Avx64<f64> {
    /// Precomputes all twiddles for a 32-point FFT in the given direction.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" feature.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(4, 8, 0, direction),
            twiddles_butterfly4: AvxVector::make_rotation90(direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly32Avx64<T> {
    /// Computes a size-32 FFT, reading and writing through `buffer`.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // We're going to treat our input as a 8x4 2d array. First, do 8 butterfly 4's down the columns of that array.
        // We can't fit the whole problem into AVX registers at once, so we'll have to spill some things.
        // By computing half of the problem and then not referencing any of it for a while, we're making it easy for the compiler to decide what to spill
        let mut rows0 = [AvxVector::zero(); 4];
        let mut rows1 = [AvxVector::zero(); 4];
        for r in 0..4 {
            rows0[r] = buffer.load_complex(8 * r);
            rows1[r] = buffer.load_complex(8 * r + 2);
        }
        let mut mid0 = AvxVector::column_butterfly4(rows0, self.twiddles_butterfly4);
        let mut mid1 = AvxVector::column_butterfly4(rows1, self.twiddles_butterfly4);
        // twiddles are interleaved 4 per row (row 0 needs none)
        for r in 1..4 {
            mid0[r] = AvxVector::mul_complex(mid0[r], self.twiddles[4 * r - 4]);
            mid1[r] = AvxVector::mul_complex(mid1[r], self.twiddles[4 * r - 3]);
        }

        // One half is done, so the compiler can spill everything above this. Now do the other set of columns
        let mut rows2 = [AvxVector::zero(); 4];
        let mut rows3 = [AvxVector::zero(); 4];
        for r in 0..4 {
            rows2[r] = buffer.load_complex(8 * r + 4);
            rows3[r] = buffer.load_complex(8 * r + 6);
        }
        let mut mid2 = AvxVector::column_butterfly4(rows2, self.twiddles_butterfly4);
        let mut mid3 = AvxVector::column_butterfly4(rows3, self.twiddles_butterfly4);
        for r in 1..4 {
            mid2[r] = AvxVector::mul_complex(mid2[r], self.twiddles[4 * r - 2]);
            mid3[r] = AvxVector::mul_complex(mid3[r], self.twiddles[4 * r - 1]);
        }

        // Transpose our 8x4 array to a 4x8 array
        let (transposed0, transposed1) =
            avx64_utils::transpose_8x4_to_4x8_f64(mid0, mid1, mid2, mid3);

        // Do 4 butterfly 8's down columns of the transposed array
        // Same thing as above - Do the half of the butterfly 8's separately to give the compiler a better hint about what to spill
        let output0 = AvxVector::column_butterfly8(transposed0, self.twiddles_butterfly4);
        for r in 0..8 {
            buffer.store_complex(output0[r], 4 * r);
        }
        let output1 = AvxVector::column_butterfly8(transposed1, self.twiddles_butterfly4);
        for r in 0..8 {
            buffer.store_complex(output1[r], 4 * r + 2);
        }
    }
}
1071
/// AVX-accelerated FFT of size 36 for `Complex<f64>` data, computed as a
/// 6x6 decomposition (butterfly 6's, twiddle, transpose, butterfly 6's).
pub struct Butterfly36Avx64<T> {
    /// Twiddle factors between the two butterfly-6 passes. Note: generated
    /// with the `separated_columns` macro, so they're grouped 5 per column
    /// set rather than interleaved per row.
    twiddles: [__m256d; 15],
    /// Broadcast twiddle(1, 3) used inside `column_butterfly6`.
    twiddles_butterfly3: __m256d,
    direction: FftDirection,
    /// The arithmetic here is f64-only; `T` only parameterizes the public
    /// `Fft<T>` interface provided by the boilerplate macro.
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` + the `Fft` impl with runtime AVX+FMA detection.
boilerplate_fft_simd_butterfly!(Butterfly36Avx64, 36);
impl Butterfly36Avx64<f64> {
    /// Precomputes all twiddles for a 36-point FFT in the given direction.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" feature.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_separated_columns!(6, 6, 0, direction),
            twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly36Avx64<T> {
    /// Computes a size-36 FFT, reading and writing through `buffer`.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // we're going to load our input as a 6x6 array
        // We can't fit the whole problem into AVX registers at once, so we'll have to spill some things.
        // By computing chunks of the problem and then not referencing any of it for a while, we're making it easy for the compiler to decide what to spill
        // Chunk 1: columns 0-1
        let mut rows0 = [AvxVector::zero(); 6];
        for n in 0..6 {
            rows0[n] = buffer.load_complex(n * 6);
        }
        let mut mid0 = AvxVector256::column_butterfly6(rows0, self.twiddles_butterfly3);
        for r in 1..6 {
            mid0[r] = AvxVector::mul_complex(mid0[r], self.twiddles[r - 1]);
        }

        // Chunk 2: columns 2-3 (twiddles for this column set start at index 5)
        let mut rows1 = [AvxVector::zero(); 6];
        for n in 0..6 {
            rows1[n] = buffer.load_complex(n * 6 + 2);
        }
        let mut mid1 = AvxVector256::column_butterfly6(rows1, self.twiddles_butterfly3);
        for r in 1..6 {
            mid1[r] = AvxVector::mul_complex(mid1[r], self.twiddles[r + 4]);
        }

        // Chunk 3: columns 4-5 (twiddles for this column set start at index 10)
        let mut rows2 = [AvxVector::zero(); 6];
        for n in 0..6 {
            rows2[n] = buffer.load_complex(n * 6 + 4);
        }
        let mut mid2 = AvxVector256::column_butterfly6(rows2, self.twiddles_butterfly3);
        for r in 1..6 {
            mid2[r] = AvxVector::mul_complex(mid2[r], self.twiddles[r + 9]);
        }

        // Transpose our 6x6 array
        let (transposed0, transposed1, transposed2) =
            avx64_utils::transpose_6x6_f64(mid0, mid1, mid2);

        // Apply butterfly 6's down the columns.  Again, do the work in chunks to make it easier for the compiler to spill
        let output0 = AvxVector256::column_butterfly6(transposed0, self.twiddles_butterfly3);
        for r in 0..3 {
            buffer.store_complex(output0[r * 2], 12 * r);
            buffer.store_complex(output0[r * 2 + 1], 12 * r + 6);
        }

        let output1 = AvxVector256::column_butterfly6(transposed1, self.twiddles_butterfly3);
        for r in 0..3 {
            buffer.store_complex(output1[r * 2], 12 * r + 2);
            buffer.store_complex(output1[r * 2 + 1], 12 * r + 8);
        }

        let output2 = AvxVector256::column_butterfly6(transposed2, self.twiddles_butterfly3);
        for r in 0..3 {
            buffer.store_complex(output2[r * 2], 12 * r + 4);
            buffer.store_complex(output2[r * 2 + 1], 12 * r + 10);
        }
    }
}
1149
/// AVX-accelerated FFT of size 64 for `Complex<f64>` data, computed in two
/// phases over an 8x8 decomposition because it's too large for registers.
pub struct Butterfly64Avx64<T> {
    /// Twiddle factors applied between the two butterfly-8 phases
    /// (generated column-set by column-set: 7 vectors per set, 4 sets).
    twiddles: [__m256d; 28],
    /// Precomputed rotation used by `column_butterfly8`.
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    /// The arithmetic here is f64-only; `T` only parameterizes the public
    /// `Fft<T>` interface provided by the boilerplate macro.
    _phantom_t: std::marker::PhantomData<T>,
}
// The `_with_scratch` macro variant (defined elsewhere in this module) wires
// the two-phase methods below into the `Fft` trait — presumably routing the
// intermediate transpose through scratch space; see the macro definition.
boilerplate_fft_simd_butterfly_with_scratch!(Butterfly64Avx64, 64);
impl Butterfly64Avx64<f64> {
    /// Precomputes all twiddles for a 64-point FFT in the given direction.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" feature.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_separated_columns!(8, 8, 0, direction),
            twiddles_butterfly4: AvxVector::make_rotation90(direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly64Avx64<T> {
    /// Phase 1: butterfly 8's down the columns of the 8x8 view of `input`,
    /// twiddle, then write a transposed layout into `output`.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn column_butterflies_and_transpose(
        &self,
        input: &[Complex<f64>],
        mut output: &mut [Complex<f64>],
    ) {
        // A size-64 FFT is way too big to fit in registers, so instead we're going to compute it in two phases, storing in scratch in between.

        // First phase is to treat this size-64 array like a 8x8 2D array, and do butterfly 8's down the columns
        // Then, apply twiddle factors, and finally transpose into the scratch space

        // But again, we don't have enough registers to load it all at once, so only load one column of AVX vectors at a time
        // (each columnset covers 2 of the 8 columns, since a vector holds 2 complex f64)
        for columnset in 0..4 {
            let mut rows = [AvxVector::zero(); 8];
            for r in 0..8 {
                rows[r] = input.load_complex(columnset * 2 + 8 * r);
            }
            // apply butterfly 8
            let mut mid = AvxVector::column_butterfly8(rows, self.twiddles_butterfly4);

            // apply twiddle factors (7 twiddle vectors per column set; row 0 needs none)
            for r in 1..8 {
                mid[r] = AvxVector::mul_complex(mid[r], self.twiddles[r - 1 + 7 * columnset]);
            }

            // transpose
            let transposed = AvxVector::transpose8_packed(mid);

            // write out
            for i in 0..4 {
                output.store_complex(transposed[i * 2], columnset * 16 + i * 4);
                output.store_complex(transposed[i * 2 + 1], columnset * 16 + i * 4 + 2);
            }
        }
    }

    /// Phase 2: butterfly 8's down the columns of the already-transposed
    /// `buffer`, storing each result back where it was loaded from.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn row_butterflies(&self, mut buffer: impl AvxArrayMut<f64>) {
        // Second phase: Butterfly 8's down the columns of our transposed array.
        // Thankfully, during the first phase, we set everything up so that all we have to do here is compute the size-8 FFT columns and write them back out where we got them
        for columnset in 0usize..4 {
            let mut rows = [AvxVector::zero(); 8];
            for r in 0..8 {
                rows[r] = buffer.load_complex(columnset * 2 + 8 * r);
            }
            let mid = AvxVector::column_butterfly8(rows, self.twiddles_butterfly4);
            for r in 0..8 {
                buffer.store_complex(mid[r], columnset * 2 + 8 * r);
            }
        }
    }
}
1221
/// AVX-accelerated FFT of size 128 for `Complex<f64>` data, computed in two
/// phases over a 16x8 decomposition because it's too large for registers.
pub struct Butterfly128Avx64<T> {
    /// Twiddle factors applied between the two phases
    /// (generated column-set by column-set: 7 vectors per set, 8 sets).
    twiddles: [__m256d; 56],
    /// Twiddles (1/16, 3/16) consumed by `column_butterfly16_loadfn!`.
    twiddles_butterfly16: [__m256d; 2],
    /// Precomputed rotation used by the butterfly-8/16 steps.
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    /// The arithmetic here is f64-only; `T` only parameterizes the public
    /// `Fft<T>` interface provided by the boilerplate macro.
    _phantom_t: std::marker::PhantomData<T>,
}
// The `_with_scratch` macro variant (defined elsewhere in this module) wires
// the two-phase methods below into the `Fft` trait — presumably routing the
// intermediate transpose through scratch space; see the macro definition.
boilerplate_fft_simd_butterfly_with_scratch!(Butterfly128Avx64, 128);
impl Butterfly128Avx64<f64> {
    /// Precomputes all twiddles for a 128-point FFT in the given direction.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" feature.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_separated_columns!(8, 16, 0, direction),
            twiddles_butterfly16: [
                AvxVector::broadcast_twiddle(1, 16, direction),
                AvxVector::broadcast_twiddle(3, 16, direction),
            ],
            twiddles_butterfly4: AvxVector::make_rotation90(direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly128Avx64<T> {
    /// Phase 1: butterfly 8's down the columns of the 16x8 view of `input`,
    /// twiddle, then write a transposed layout into `output`.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn column_butterflies_and_transpose(
        &self,
        input: &[Complex<f64>],
        mut output: &mut [Complex<f64>],
    ) {
        // A size-128 FFT is way too big to fit in registers, so instead we're going to compute it in two phases, storing in scratch in between.

        // First phase is to treat this size-128 array like a 16x8 2D array, and do butterfly 8's down the columns
        // Then, apply twiddle factors, and finally transpose into the scratch space

        // But again, we don't have enough registers to load it all at once, so only load one column of AVX vectors at a time
        // (each columnset covers 2 of the 16 columns, since a vector holds 2 complex f64)
        for columnset in 0..8 {
            let mut rows = [AvxVector::zero(); 8];
            for r in 0..8 {
                rows[r] = input.load_complex(columnset * 2 + 16 * r);
            }
            // apply butterfly 8
            let mut mid = AvxVector::column_butterfly8(rows, self.twiddles_butterfly4);

            // apply twiddle factors (7 twiddle vectors per column set; row 0 needs none)
            for r in 1..8 {
                mid[r] = AvxVector::mul_complex(mid[r], self.twiddles[r - 1 + 7 * columnset]);
            }

            // transpose
            let transposed = AvxVector::transpose8_packed(mid);

            // write out
            for i in 0..4 {
                output.store_complex(transposed[i * 2], columnset * 16 + i * 4);
                output.store_complex(transposed[i * 2 + 1], columnset * 16 + i * 4 + 2);
            }
        }
    }

    /// Phase 2: size-16 butterflies down the columns of the transposed
    /// `buffer`, storing each result back where it was loaded from.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports the "avx" and "fma" features.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn row_butterflies(&self, mut buffer: impl AvxArrayMut<f64>) {
        // Second phase: Butterfly 16's down the columns of our transposed array.
        // Thankfully, during the first phase, we set everything up so that all we have to do here is compute the size-16 FFT columns and write them back out where we got them
        // We're also using a customized butterfly16 function that is smarter about when it loads/stores data, to reduce register spilling
        for columnset in 0usize..4 {
            column_butterfly16_loadfn!(
                |index: usize| buffer.load_complex(columnset * 2 + index * 8),
                |data, index| buffer.store_complex(data, columnset * 2 + index * 8),
                self.twiddles_butterfly16,
                self.twiddles_butterfly4
            );
        }
    }
}
1297
/// AVX-accelerated FFT of size 256 for f64 data.
///
/// Computed in two phases through a scratch buffer: the data is treated as a 32x8
/// 2D array, with size-8 butterflies down the columns (plus twiddles and a
/// transpose), followed by size-32 butterflies across the rows.
pub struct Butterfly256Avx64<T> {
    // Inter-phase twiddle factors: 7 vectors per columnset * 16 columnsets = 112
    // (see the `r - 1 + 7 * columnset` indexing in column_butterflies_and_transpose)
    twiddles: [__m256d; 112],
    // Broadcast twiddles 1,2,3,5,6,7 of 32 for the size-32 row butterflies.
    // NOTE(review): 4/32 is deliberately absent -- presumably supplied via the
    // rotation below inside column_butterfly32_loadfn!; confirm against that macro.
    twiddles_butterfly32: [__m256d; 6],
    // 90-degree rotation used by the size-4 sub-butterflies
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // Computation is always f64 internally; PhantomData ties the struct to the caller's T
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates the feature-detected `new()` constructor and the Fft trait plumbing for
// length 256 -- presumably analogous to boilerplate_fft_simd_butterfly at the top of
// this file, but routing through a scratch buffer; see the macro definition.
boilerplate_fft_simd_butterfly_with_scratch!(Butterfly256Avx64, 256);
impl Butterfly256Avx64<f64> {
    /// Precomputes all twiddle factors for the two-phase 256-point FFT.
    ///
    /// # Safety
    /// Requires the "avx" instruction set; callers must verify it is available
    /// before calling (the boilerplate `new()` does this via feature detection).
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            // Inter-phase twiddles for the 32x8 decomposition (8 rows, 32 columns)
            twiddles: gen_butterfly_twiddles_separated_columns!(8, 32, 0, direction),
            // Broadcast twiddles for the size-32 row butterflies.
            // NOTE(review): index 4 is skipped -- presumably handled by the
            // rotation below; confirm against column_butterfly32_loadfn!
            twiddles_butterfly32: [
                AvxVector::broadcast_twiddle(1, 32, direction),
                AvxVector::broadcast_twiddle(2, 32, direction),
                AvxVector::broadcast_twiddle(3, 32, direction),
                AvxVector::broadcast_twiddle(5, 32, direction),
                AvxVector::broadcast_twiddle(6, 32, direction),
                AvxVector::broadcast_twiddle(7, 32, direction),
            ],
            // 90-degree rotation shared by the size-4 sub-butterflies
            twiddles_butterfly4: AvxVector::make_rotation90(direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly256Avx64<T> {
    /// Phase 1 of the size-256 FFT: size-8 column butterflies, twiddle
    /// application, and a transpose from `input` into `output`.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn column_butterflies_and_transpose(
        &self,
        input: &[Complex<f64>],
        mut output: &mut [Complex<f64>],
    ) {
        // A size-256 FFT is way too big to fit in registers, so instead we're going to compute it in two phases, storing in scratch in between.

        // First phase is to treat this size-256 array like a 32x8 2D array, and do butterfly 8's down the columns
        // Then, apply twiddle factors, and finally transpose into the scratch space

        // But again, we don't have enough registers to load it all at once, so only load one column of AVX vectors at a time
        // Each __m256d holds 2 complex f64s, so each of the 16 columnsets covers 2 of the 32 columns
        for columnset in 0..16 {
            let mut rows = [AvxVector::zero(); 8];
            for r in 0..8 {
                // stride of 32 complex elements between rows of the 32x8 view
                rows[r] = input.load_complex(columnset * 2 + 32 * r);
            }
            // apply butterfly 8
            let mut mid = AvxVector::column_butterfly8(rows, self.twiddles_butterfly4);

            // apply twiddle factors (row 0 needs no twiddle, so it starts at 1;
            // 7 twiddle vectors per columnset, hence the `r - 1 + 7 * columnset` index)
            for r in 1..8 {
                mid[r] = AvxVector::mul_complex(mid[r], self.twiddles[r - 1 + 7 * columnset]);
            }

            // transpose
            let transposed = AvxVector::transpose8_packed(mid);

            // write out the transposed data so that phase 2 can process it as contiguous columns
            for i in 0..4 {
                output.store_complex(transposed[i * 2], columnset * 16 + i * 4);
                output.store_complex(transposed[i * 2 + 1], columnset * 16 + i * 4 + 2);
            }
        }
    }

    /// Phase 2 of the size-256 FFT: size-32 butterflies down the columns of the
    /// transposed data, performed in-place on `buffer`.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn row_butterflies(&self, mut buffer: impl AvxArrayMut<f64>) {
        // Second phase: Butterfly 32's down the columns of our transposed array.
        // Thankfully, during the first phase, we set everything up so that all we have to do here is compute the size-32 FFT columns and write them back out where we got them
        // We're also using a customized butterfly32 function that is smarter about when it loads/stores data, to reduce register spilling
        for columnset in 0usize..4 {
            column_butterfly32_loadfn!(
                |index: usize| buffer.load_complex(columnset * 2 + index * 8),
                |data, index| buffer.store_complex(data, columnset * 2 + index * 8),
                self.twiddles_butterfly32,
                self.twiddles_butterfly4
            );
        }
    }
}
1377
/// AVX-accelerated FFT of size 512 for f64 data.
///
/// Computed in two phases through a scratch buffer: the data is treated as a 32x16
/// 2D array, with size-16 butterflies down the columns (plus twiddles and a
/// transpose), followed by size-32 butterflies across the rows.
pub struct Butterfly512Avx64<T> {
    // Inter-phase twiddle factors: 15 vectors per columnset * 16 columnsets = 240
    // (see TWIDDLES_PER_COLUMN in column_butterflies_and_transpose)
    twiddles: [__m256d; 240],
    // Broadcast twiddles 1,2,3,5,6,7 of 32 for the size-32 row butterflies.
    // NOTE(review): 4/32 is deliberately absent -- presumably supplied via the
    // rotation below inside column_butterfly32_loadfn!; confirm against that macro.
    twiddles_butterfly32: [__m256d; 6],
    // Broadcast twiddles 1 and 3 of 16 for the size-16 column butterflies
    twiddles_butterfly16: [__m256d; 2],
    // 90-degree rotation used by the size-4 sub-butterflies
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // Computation is always f64 internally; PhantomData ties the struct to the caller's T
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates the feature-detected `new()` constructor and the Fft trait plumbing for
// length 512 -- presumably analogous to boilerplate_fft_simd_butterfly at the top of
// this file, but routing through a scratch buffer; see the macro definition.
boilerplate_fft_simd_butterfly_with_scratch!(Butterfly512Avx64, 512);
impl Butterfly512Avx64<f64> {
    /// Precomputes all twiddle factors for the two-phase 512-point FFT.
    ///
    /// # Safety
    /// Requires the "avx" instruction set; callers must verify it is available
    /// before calling (the boilerplate `new()` does this via feature detection).
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            // Inter-phase twiddles for the 32x16 decomposition (16 rows, 32 columns)
            twiddles: gen_butterfly_twiddles_separated_columns!(16, 32, 0, direction),
            // Broadcast twiddles for the size-32 row butterflies.
            // NOTE(review): index 4 is skipped -- presumably handled by the
            // rotation below; confirm against column_butterfly32_loadfn!
            twiddles_butterfly32: [
                AvxVector::broadcast_twiddle(1, 32, direction),
                AvxVector::broadcast_twiddle(2, 32, direction),
                AvxVector::broadcast_twiddle(3, 32, direction),
                AvxVector::broadcast_twiddle(5, 32, direction),
                AvxVector::broadcast_twiddle(6, 32, direction),
                AvxVector::broadcast_twiddle(7, 32, direction),
            ],
            // Broadcast twiddles for the size-16 column butterflies
            twiddles_butterfly16: [
                AvxVector::broadcast_twiddle(1, 16, direction),
                AvxVector::broadcast_twiddle(3, 16, direction),
            ],
            // 90-degree rotation shared by the size-4 sub-butterflies
            twiddles_butterfly4: AvxVector::make_rotation90(direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly512Avx64<T> {
    /// Phase 1 of the size-512 FFT: size-16 column butterflies, twiddle
    /// application, and a transpose from `input` into `output`.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn column_butterflies_and_transpose(
        &self,
        input: &[Complex<f64>],
        mut output: &mut [Complex<f64>],
    ) {
        // A size-512 FFT is way too big to fit in registers, so instead we're going to compute it in two phases, storing in scratch in between.

        // First phase is to treat this size-512 array like a 32x16 2D array, and do butterfly 16's down the columns
        // Then, apply twiddle factors, and finally transpose into the scratch space

        // But again, we don't have enough registers to load it all at once, so only load one column of AVX vectors at a time
        // We're also using a customized butterfly16 function that is smarter about when it loads/stores data, to reduce register spilling
        // 15 twiddle vectors per columnset: rows 1..16 of the 16-row view each need one (row 0's twiddles are trivial)
        const TWIDDLES_PER_COLUMN: usize = 15;
        for (columnset, twiddle_chunk) in
            self.twiddles.chunks_exact(TWIDDLES_PER_COLUMN).enumerate()
        {
            // Sadly we have to use MaybeUninit here. If we init an array like normal with AvxVector::zero(), the compiler can't seem to figure out that it can
            // eliminate the dead stores of zeroes to the stack. By using uninit here, we avoid those unnecessary writes
            let mut mid_uninit: [MaybeUninit<__m256d>; 16] = [MaybeUninit::<__m256d>::uninit(); 16];

            // Run the size-16 butterfly: loads stride 32 complex elements apart (one row of
            // the 32x16 view), and the store closure fills every slot of mid_uninit
            column_butterfly16_loadfn!(
                |index: usize| input.load_complex(columnset * 2 + 32 * index),
                |data, index: usize| {
                    mid_uninit[index].as_mut_ptr().write(data);
                },
                self.twiddles_butterfly16,
                self.twiddles_butterfly4
            );

            // Apply twiddle factors, transpose, and store. Traditionally we apply all the twiddle factors at once and then do all the transposes at once,
            // But our data is pushing the limit of what we can store in registers, so the idea here is to get the data out the door with as few spills to the stack as possible
            // SAFETY (for every assume_init below): column_butterfly16_loadfn! wrote all 16 slots of mid_uninit via its store closure above
            for chunk in 0..4 {
                let twiddled = [
                    // Element 0 of chunk 0 (overall row 0) needs no twiddle, hence the branch
                    if chunk > 0 {
                        AvxVector::mul_complex(
                            mid_uninit[4 * chunk].assume_init(),
                            twiddle_chunk[4 * chunk - 1],
                        )
                    } else {
                        mid_uninit[4 * chunk].assume_init()
                    },
                    AvxVector::mul_complex(
                        mid_uninit[4 * chunk + 1].assume_init(),
                        twiddle_chunk[4 * chunk],
                    ),
                    AvxVector::mul_complex(
                        mid_uninit[4 * chunk + 2].assume_init(),
                        twiddle_chunk[4 * chunk + 1],
                    ),
                    AvxVector::mul_complex(
                        mid_uninit[4 * chunk + 3].assume_init(),
                        twiddle_chunk[4 * chunk + 2],
                    ),
                ];

                let transposed = AvxVector::transpose4_packed(twiddled);

                // Scatter the transposed vectors so phase 2 sees contiguous columns
                output.store_complex(transposed[0], columnset * 32 + 4 * chunk);
                output.store_complex(transposed[1], columnset * 32 + 4 * chunk + 2);
                output.store_complex(transposed[2], columnset * 32 + 4 * chunk + 16);
                output.store_complex(transposed[3], columnset * 32 + 4 * chunk + 18);
            }
        }
    }

    /// Phase 2 of the size-512 FFT: size-32 butterflies down the columns of the
    /// transposed data, performed in-place on `buffer`.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn row_butterflies(&self, mut buffer: impl AvxArrayMut<f64>) {
        // Second phase: Butterfly 32's down the columns of our transposed array.
        // Thankfully, during the first phase, we set everything up so that all we have to do here is compute the size-32 FFT columns and write them back out where we got them
        // We're also using a customized butterfly32 function that is smarter about when it loads/stores data, to reduce register spilling
        for columnset in 0usize..8 {
            column_butterfly32_loadfn!(
                |index: usize| buffer.load_complex(columnset * 2 + index * 16),
                |data, index| buffer.store_complex(data, columnset * 2 + index * 16),
                self.twiddles_butterfly32,
                self.twiddles_butterfly4
            );
        }
    }
}
1492
#[cfg(test)]
mod unit_tests {
    use super::*;
    use crate::test_utils::check_fft_algorithm;

    // Generates a #[test] fn that constructs the given butterfly (skipping the
    // test with a message if this CPU lacks AVX+FMA) and validates it against
    // the reference algorithm in both the forward and inverse directions.
    macro_rules! test_avx_butterfly {
        ($test_name:ident, $struct_name:ident, $size:expr) => {
            #[test]
            fn $test_name() {
                // Forward direction
                let fwd = $struct_name::new(FftDirection::Forward).expect("Can't run test because this machine doesn't have the required instruction sets");
                check_fft_algorithm(&fwd as &dyn Fft<f64>, $size, FftDirection::Forward);

                // Inverse direction
                let inv = $struct_name::new(FftDirection::Inverse).expect("Can't run test because this machine doesn't have the required instruction sets");
                check_fft_algorithm(&inv as &dyn Fft<f64>, $size, FftDirection::Inverse);
            }
        };
    }

    test_avx_butterfly!(test_avx_butterfly5_f64, Butterfly5Avx64, 5);
    test_avx_butterfly!(test_avx_butterfly7_f64, Butterfly7Avx64, 7);
    test_avx_butterfly!(test_avx_mixedradix8_f64, Butterfly8Avx64, 8);
    test_avx_butterfly!(test_avx_mixedradix9_f64, Butterfly9Avx64, 9);
    test_avx_butterfly!(test_avx_mixedradix11_f64, Butterfly11Avx64, 11);
    test_avx_butterfly!(test_avx_mixedradix12_f64, Butterfly12Avx64, 12);
    test_avx_butterfly!(test_avx_mixedradix16_f64, Butterfly16Avx64, 16);
    test_avx_butterfly!(test_avx_mixedradix18_f64, Butterfly18Avx64, 18);
    test_avx_butterfly!(test_avx_mixedradix24_f64, Butterfly24Avx64, 24);
    test_avx_butterfly!(test_avx_mixedradix27_f64, Butterfly27Avx64, 27);
    test_avx_butterfly!(test_avx_mixedradix32_f64, Butterfly32Avx64, 32);
    test_avx_butterfly!(test_avx_mixedradix36_f64, Butterfly36Avx64, 36);
    test_avx_butterfly!(test_avx_mixedradix64_f64, Butterfly64Avx64, 64);
    test_avx_butterfly!(test_avx_mixedradix128_f64, Butterfly128Avx64, 128);
    test_avx_butterfly!(test_avx_mixedradix256_f64, Butterfly256Avx64, 256);
    test_avx_butterfly!(test_avx_mixedradix512_f64, Butterfly512Avx64, 512);
}