rustfft/sse/
sse_prime_butterflies.rs

1use core::arch::x86_64::{__m128, __m128d};
2use std::any::TypeId;
3use std::sync::Arc;
4use num_complex::Complex;
5
6use crate::{common::FftNum, FftDirection};
7
8use crate::array_utils;
9use crate::array_utils::workaround_transmute_mut;
10use crate::array_utils::DoubleBuf;
11use crate::common::{fft_error_inplace, fft_error_outofplace};
12use crate::twiddles;
13use crate::{Direction, Fft, Length};
14
15use super::sse_common::{assert_f32, assert_f64};
16use super::sse_utils::*;
17use super::sse_vector::*;
18
/*
This file contains autogenerated butterfly algorithms for small prime-sized FFTs using the SSE instruction set.

NOTE: All of the code in this file was **autogenerated** using the following command in the project root:
    cargo run --manifest-path ./tools/gen_simd_butterflies/Cargo.toml -- sse 7 11 13 17 19 23 29 31

Do not make changes directly to this file. Instead, update the autogeneration script
(located at tools/gen_simd_butterflies/src/main.rs) and then re-run the script to generate the new code.

For these sizes, we use a variant of the naive DFT algorithm. Even though this algorithm is O(n^2),
we're able to factor out some redundant computations, and they end up being faster than the fancier algorithms.

To generate more or fewer butterfly sizes, simply add or remove numbers from the command above and re-run.
The code generation script will even handle adding or removing sizes from the planner; all you need to do is run the script.
*/
34
/// Returns the prime FFT lengths for which this file provides SSE butterflies.
///
/// The list matches the lengths accepted by `construct_prime_butterfly`.
pub const fn prime_butterfly_lens() -> &'static [usize] {
    &[7, 11, 13, 17, 19, 23, 29, 31]
}
38
39/// Safety: The current machine must support the sse4.1 target feature
40#[target_feature(enable = "sse4.1")]
41pub unsafe fn construct_prime_butterfly<T: FftNum>(len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
42    let id_f32 = TypeId::of::<f32>();
43    let id_f64 = TypeId::of::<f64>();
44    let id_t = TypeId::of::<T>();
45    if id_t == id_f32 {
46        match len {
47            7 => Arc::new(SseF32Butterfly7::new(direction)) as Arc<dyn Fft<T>>,
48            11 => Arc::new(SseF32Butterfly11::new(direction)) as Arc<dyn Fft<T>>,
49            13 => Arc::new(SseF32Butterfly13::new(direction)) as Arc<dyn Fft<T>>,
50            17 => Arc::new(SseF32Butterfly17::new(direction)) as Arc<dyn Fft<T>>,
51            19 => Arc::new(SseF32Butterfly19::new(direction)) as Arc<dyn Fft<T>>,
52            23 => Arc::new(SseF32Butterfly23::new(direction)) as Arc<dyn Fft<T>>,
53            29 => Arc::new(SseF32Butterfly29::new(direction)) as Arc<dyn Fft<T>>,
54            31 => Arc::new(SseF32Butterfly31::new(direction)) as Arc<dyn Fft<T>>,
55            _ => unimplemented!("Invalid SSE prime butterfly length: {len}"),
56        }
57    } else if id_t == id_f64 {
58        match len {
59            7 => Arc::new(SseF64Butterfly7::new(direction)) as Arc<dyn Fft<T>>,
60            11 => Arc::new(SseF64Butterfly11::new(direction)) as Arc<dyn Fft<T>>,
61            13 => Arc::new(SseF64Butterfly13::new(direction)) as Arc<dyn Fft<T>>,
62            17 => Arc::new(SseF64Butterfly17::new(direction)) as Arc<dyn Fft<T>>,
63            19 => Arc::new(SseF64Butterfly19::new(direction)) as Arc<dyn Fft<T>>,
64            23 => Arc::new(SseF64Butterfly23::new(direction)) as Arc<dyn Fft<T>>,
65            29 => Arc::new(SseF64Butterfly29::new(direction)) as Arc<dyn Fft<T>>,
66            31 => Arc::new(SseF64Butterfly31::new(direction)) as Arc<dyn Fft<T>>,
67            _ => unimplemented!("Invalid SSE prime butterfly length: {len}"),
68        }
69    } else {
70        unimplemented!("Not f32 or f64");
71    }
72}
73
74#[inline(always)]
75fn make_twiddles<const TW: usize, T: FftNum>(len: usize, direction: FftDirection) -> [Complex<T>; TW] {
76    let mut i = 1;
77    [(); TW].map(|_| {
78        let twiddle = twiddles::compute_twiddle(i, len, direction);
79        i += 1;
80        twiddle
81    })
82}
83
84struct SseF32Butterfly7<T> {
85    direction: FftDirection,
86    twiddles_re: [__m128; 3],
87    twiddles_im: [__m128; 3],
88    _phantom: std::marker::PhantomData<T>,
89}
90
91boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly7);
92boilerplate_fft_sse_common_butterfly!(SseF32Butterfly7, 7, |this: &SseF32Butterfly7<_>| this.direction);
93impl<T: FftNum> SseF32Butterfly7<T> {
94    /// Safety: The current machine must support the sse4.1 instruction set
95    #[target_feature(enable = "sse4.1")]
96    unsafe fn new(direction: FftDirection) -> Self {
97        assert_f32::<T>();
98        let twiddles = make_twiddles(7, direction);
99        Self {
100            direction,
101            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
102            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
103            _phantom: std::marker::PhantomData,
104        }
105    }
106
107    #[inline(always)]
108    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
109        let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6 });
110
111        let out = self.perform_parallel_fft_direct(values);
112        
113        write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6 } ); 
114    }
115
116    #[inline(always)]
117    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
118        let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12 });
119
120        let values = [
121            extract_lo_hi_f32(input_packed[0], input_packed[3]),
122            extract_hi_lo_f32(input_packed[0], input_packed[4]),
123            extract_lo_hi_f32(input_packed[1], input_packed[4]),
124            extract_hi_lo_f32(input_packed[1], input_packed[5]),
125            extract_lo_hi_f32(input_packed[2], input_packed[5]),
126            extract_hi_lo_f32(input_packed[2], input_packed[6]),
127            extract_lo_hi_f32(input_packed[3], input_packed[6]),
128        ];
129
130        let out = self.perform_parallel_fft_direct(values);
131
132        let out_packed = [
133            extract_lo_lo_f32(out[0], out[1]),
134            extract_lo_lo_f32(out[2], out[3]),
135            extract_lo_lo_f32(out[4], out[5]),
136            extract_lo_hi_f32(out[6], out[0]),
137            extract_hi_hi_f32(out[1], out[2]),
138            extract_hi_hi_f32(out[3], out[4]),
139            extract_hi_hi_f32(out[5], out[6]),
140        ];
141
142        write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6 });
143    }
144
145    #[inline(always)]
146    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 7]) -> [__m128; 7] {
147        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
148
149        let y00 = values[0];
150        let [x1p6, x1m6] =  SseVector::column_butterfly2([values[1], values[6]]);
151        let x1m6 = SseVector::apply_rotate90(rotate, x1m6);
152        let y00 = SseVector::add(y00, x1p6);
153        let [x2p5, x2m5] =  SseVector::column_butterfly2([values[2], values[5]]);
154        let x2m5 = SseVector::apply_rotate90(rotate, x2m5);
155        let y00 = SseVector::add(y00, x2p5);
156        let [x3p4, x3m4] =  SseVector::column_butterfly2([values[3], values[4]]);
157        let x3m4 = SseVector::apply_rotate90(rotate, x3m4);
158        let y00 = SseVector::add(y00, x3p4);
159
160        let m0106a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p6);
161        let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[1], x2p5);
162        let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[2], x3p4);
163        let m0106b = SseVector::mul(self.twiddles_im[0], x1m6);
164        let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[1], x2m5);
165        let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[2], x3m4);
166        let [y01, y06] = SseVector::column_butterfly2([m0106a, m0106b]);
167
168        let m0205a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p6);
169        let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[2], x2p5);
170        let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[0], x3p4);
171        let m0205b = SseVector::mul(self.twiddles_im[1], x1m6);
172        let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[2], x2m5);
173        let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[0], x3m4);
174        let [y02, y05] = SseVector::column_butterfly2([m0205a, m0205b]);
175
176        let m0304a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p6);
177        let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[0], x2p5);
178        let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[1], x3p4);
179        let m0304b = SseVector::mul(self.twiddles_im[2], x1m6);
180        let m0304b = SseVector::nmadd(m0304b, self.twiddles_im[0], x2m5);
181        let m0304b = SseVector::fmadd(m0304b, self.twiddles_im[1], x3m4);
182        let [y03, y04] = SseVector::column_butterfly2([m0304a, m0304b]);
183
184
185        [y00, y01, y02, y03, y04, y05, y06]
186    }
187}
188
189struct SseF64Butterfly7<T> {
190    direction: FftDirection,
191    twiddles_re: [__m128d; 3],
192    twiddles_im: [__m128d; 3],
193    _phantom: std::marker::PhantomData<T>,
194}
195
196boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly7);
197boilerplate_fft_sse_common_butterfly!(SseF64Butterfly7, 7, |this: &SseF64Butterfly7<_>| this.direction);
198impl<T: FftNum> SseF64Butterfly7<T> {
199    /// Safety: The current machine must support the sse4.1 instruction set
200    #[target_feature(enable = "sse4.1")]
201    unsafe fn new(direction: FftDirection) -> Self {
202        assert_f64::<T>();
203        let twiddles = make_twiddles(7, direction);
204        unsafe {Self {
205            direction,
206            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
207            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
208            _phantom: std::marker::PhantomData,
209        }}
210    }
211
212    #[inline(always)]
213    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
214        let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6 });
215
216        let out = self.perform_fft_direct(values);
217
218        write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6 });   
219    }
220
221    #[inline(always)]
222    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 7]) -> [__m128d; 7] {
223        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
224
225        let y00 = values[0];
226        let [x1p6, x1m6] =  SseVector::column_butterfly2([values[1], values[6]]);
227        let x1m6 = SseVector::apply_rotate90(rotate, x1m6);
228        let y00 = SseVector::add(y00, x1p6);
229        let [x2p5, x2m5] =  SseVector::column_butterfly2([values[2], values[5]]);
230        let x2m5 = SseVector::apply_rotate90(rotate, x2m5);
231        let y00 = SseVector::add(y00, x2p5);
232        let [x3p4, x3m4] =  SseVector::column_butterfly2([values[3], values[4]]);
233        let x3m4 = SseVector::apply_rotate90(rotate, x3m4);
234        let y00 = SseVector::add(y00, x3p4);
235
236        let m0106a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p6);
237        let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[1], x2p5);
238        let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[2], x3p4);
239        let m0106b = SseVector::mul(self.twiddles_im[0], x1m6);
240        let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[1], x2m5);
241        let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[2], x3m4);
242        let [y01, y06] = SseVector::column_butterfly2([m0106a, m0106b]);
243
244        let m0205a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p6);
245        let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[2], x2p5);
246        let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[0], x3p4);
247        let m0205b = SseVector::mul(self.twiddles_im[1], x1m6);
248        let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[2], x2m5);
249        let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[0], x3m4);
250        let [y02, y05] = SseVector::column_butterfly2([m0205a, m0205b]);
251
252        let m0304a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p6);
253        let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[0], x2p5);
254        let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[1], x3p4);
255        let m0304b = SseVector::mul(self.twiddles_im[2], x1m6);
256        let m0304b = SseVector::nmadd(m0304b, self.twiddles_im[0], x2m5);
257        let m0304b = SseVector::fmadd(m0304b, self.twiddles_im[1], x3m4);
258        let [y03, y04] = SseVector::column_butterfly2([m0304a, m0304b]);
259
260
261        [y00, y01, y02, y03, y04, y05, y06]
262    }
263}
264
265struct SseF32Butterfly11<T> {
266    direction: FftDirection,
267    twiddles_re: [__m128; 5],
268    twiddles_im: [__m128; 5],
269    _phantom: std::marker::PhantomData<T>,
270}
271
272boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly11);
273boilerplate_fft_sse_common_butterfly!(SseF32Butterfly11, 11, |this: &SseF32Butterfly11<_>| this.direction);
274impl<T: FftNum> SseF32Butterfly11<T> {
275    /// Safety: The current machine must support the sse4.1 instruction set
276    #[target_feature(enable = "sse4.1")]
277    unsafe fn new(direction: FftDirection) -> Self {
278        assert_f32::<T>();
279        let twiddles = make_twiddles(11, direction);
280        Self {
281            direction,
282            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
283            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
284            _phantom: std::marker::PhantomData,
285        }
286    }
287
288    #[inline(always)]
289    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
290        let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10 });
291
292        let out = self.perform_parallel_fft_direct(values);
293        
294        write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10 } ); 
295    }
296
297    #[inline(always)]
298    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
299        let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20 });
300
301        let values = [
302            extract_lo_hi_f32(input_packed[0], input_packed[5]),
303            extract_hi_lo_f32(input_packed[0], input_packed[6]),
304            extract_lo_hi_f32(input_packed[1], input_packed[6]),
305            extract_hi_lo_f32(input_packed[1], input_packed[7]),
306            extract_lo_hi_f32(input_packed[2], input_packed[7]),
307            extract_hi_lo_f32(input_packed[2], input_packed[8]),
308            extract_lo_hi_f32(input_packed[3], input_packed[8]),
309            extract_hi_lo_f32(input_packed[3], input_packed[9]),
310            extract_lo_hi_f32(input_packed[4], input_packed[9]),
311            extract_hi_lo_f32(input_packed[4], input_packed[10]),
312            extract_lo_hi_f32(input_packed[5], input_packed[10]),
313        ];
314
315        let out = self.perform_parallel_fft_direct(values);
316
317        let out_packed = [
318            extract_lo_lo_f32(out[0], out[1]),
319            extract_lo_lo_f32(out[2], out[3]),
320            extract_lo_lo_f32(out[4], out[5]),
321            extract_lo_lo_f32(out[6], out[7]),
322            extract_lo_lo_f32(out[8], out[9]),
323            extract_lo_hi_f32(out[10], out[0]),
324            extract_hi_hi_f32(out[1], out[2]),
325            extract_hi_hi_f32(out[3], out[4]),
326            extract_hi_hi_f32(out[5], out[6]),
327            extract_hi_hi_f32(out[7], out[8]),
328            extract_hi_hi_f32(out[9], out[10]),
329        ];
330
331        write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10 });
332    }
333
334    #[inline(always)]
335    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 11]) -> [__m128; 11] {
336        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
337
338        let y00 = values[0];
339        let [x1p10, x1m10] =  SseVector::column_butterfly2([values[1], values[10]]);
340        let x1m10 = SseVector::apply_rotate90(rotate, x1m10);
341        let y00 = SseVector::add(y00, x1p10);
342        let [x2p9, x2m9] =  SseVector::column_butterfly2([values[2], values[9]]);
343        let x2m9 = SseVector::apply_rotate90(rotate, x2m9);
344        let y00 = SseVector::add(y00, x2p9);
345        let [x3p8, x3m8] =  SseVector::column_butterfly2([values[3], values[8]]);
346        let x3m8 = SseVector::apply_rotate90(rotate, x3m8);
347        let y00 = SseVector::add(y00, x3p8);
348        let [x4p7, x4m7] =  SseVector::column_butterfly2([values[4], values[7]]);
349        let x4m7 = SseVector::apply_rotate90(rotate, x4m7);
350        let y00 = SseVector::add(y00, x4p7);
351        let [x5p6, x5m6] =  SseVector::column_butterfly2([values[5], values[6]]);
352        let x5m6 = SseVector::apply_rotate90(rotate, x5m6);
353        let y00 = SseVector::add(y00, x5p6);
354
355        let m0110a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p10);
356        let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[1], x2p9);
357        let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[2], x3p8);
358        let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[3], x4p7);
359        let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[4], x5p6);
360        let m0110b = SseVector::mul(self.twiddles_im[0], x1m10);
361        let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[1], x2m9);
362        let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[2], x3m8);
363        let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[3], x4m7);
364        let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[4], x5m6);
365        let [y01, y10] = SseVector::column_butterfly2([m0110a, m0110b]);
366
367        let m0209a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p10);
368        let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[3], x2p9);
369        let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[4], x3p8);
370        let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[2], x4p7);
371        let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[0], x5p6);
372        let m0209b = SseVector::mul(self.twiddles_im[1], x1m10);
373        let m0209b = SseVector::fmadd(m0209b, self.twiddles_im[3], x2m9);
374        let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[4], x3m8);
375        let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[2], x4m7);
376        let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[0], x5m6);
377        let [y02, y09] = SseVector::column_butterfly2([m0209a, m0209b]);
378
379        let m0308a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p10);
380        let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[4], x2p9);
381        let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[1], x3p8);
382        let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[0], x4p7);
383        let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[3], x5p6);
384        let m0308b = SseVector::mul(self.twiddles_im[2], x1m10);
385        let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[4], x2m9);
386        let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[1], x3m8);
387        let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[0], x4m7);
388        let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[3], x5m6);
389        let [y03, y08] = SseVector::column_butterfly2([m0308a, m0308b]);
390
391        let m0407a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p10);
392        let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[2], x2p9);
393        let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[0], x3p8);
394        let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[4], x4p7);
395        let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[1], x5p6);
396        let m0407b = SseVector::mul(self.twiddles_im[3], x1m10);
397        let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[2], x2m9);
398        let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[0], x3m8);
399        let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[4], x4m7);
400        let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[1], x5m6);
401        let [y04, y07] = SseVector::column_butterfly2([m0407a, m0407b]);
402
403        let m0506a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p10);
404        let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[0], x2p9);
405        let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[3], x3p8);
406        let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[1], x4p7);
407        let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[2], x5p6);
408        let m0506b = SseVector::mul(self.twiddles_im[4], x1m10);
409        let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[0], x2m9);
410        let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[3], x3m8);
411        let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[1], x4m7);
412        let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[2], x5m6);
413        let [y05, y06] = SseVector::column_butterfly2([m0506a, m0506b]);
414
415
416        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10]
417    }
418}
419
420struct SseF64Butterfly11<T> {
421    direction: FftDirection,
422    twiddles_re: [__m128d; 5],
423    twiddles_im: [__m128d; 5],
424    _phantom: std::marker::PhantomData<T>,
425}
426
427boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly11);
428boilerplate_fft_sse_common_butterfly!(SseF64Butterfly11, 11, |this: &SseF64Butterfly11<_>| this.direction);
429impl<T: FftNum> SseF64Butterfly11<T> {
430    /// Safety: The current machine must support the sse4.1 instruction set
431    #[target_feature(enable = "sse4.1")]
432    unsafe fn new(direction: FftDirection) -> Self {
433        assert_f64::<T>();
434        let twiddles = make_twiddles(11, direction);
435        unsafe {Self {
436            direction,
437            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
438            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
439            _phantom: std::marker::PhantomData,
440        }}
441    }
442
443    #[inline(always)]
444    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
445        let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10 });
446
447        let out = self.perform_fft_direct(values);
448
449        write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10 });   
450    }
451
452    #[inline(always)]
453    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 11]) -> [__m128d; 11] {
454        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
455
456        let y00 = values[0];
457        let [x1p10, x1m10] =  SseVector::column_butterfly2([values[1], values[10]]);
458        let x1m10 = SseVector::apply_rotate90(rotate, x1m10);
459        let y00 = SseVector::add(y00, x1p10);
460        let [x2p9, x2m9] =  SseVector::column_butterfly2([values[2], values[9]]);
461        let x2m9 = SseVector::apply_rotate90(rotate, x2m9);
462        let y00 = SseVector::add(y00, x2p9);
463        let [x3p8, x3m8] =  SseVector::column_butterfly2([values[3], values[8]]);
464        let x3m8 = SseVector::apply_rotate90(rotate, x3m8);
465        let y00 = SseVector::add(y00, x3p8);
466        let [x4p7, x4m7] =  SseVector::column_butterfly2([values[4], values[7]]);
467        let x4m7 = SseVector::apply_rotate90(rotate, x4m7);
468        let y00 = SseVector::add(y00, x4p7);
469        let [x5p6, x5m6] =  SseVector::column_butterfly2([values[5], values[6]]);
470        let x5m6 = SseVector::apply_rotate90(rotate, x5m6);
471        let y00 = SseVector::add(y00, x5p6);
472
473        let m0110a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p10);
474        let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[1], x2p9);
475        let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[2], x3p8);
476        let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[3], x4p7);
477        let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[4], x5p6);
478        let m0110b = SseVector::mul(self.twiddles_im[0], x1m10);
479        let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[1], x2m9);
480        let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[2], x3m8);
481        let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[3], x4m7);
482        let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[4], x5m6);
483        let [y01, y10] = SseVector::column_butterfly2([m0110a, m0110b]);
484
485        let m0209a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p10);
486        let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[3], x2p9);
487        let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[4], x3p8);
488        let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[2], x4p7);
489        let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[0], x5p6);
490        let m0209b = SseVector::mul(self.twiddles_im[1], x1m10);
491        let m0209b = SseVector::fmadd(m0209b, self.twiddles_im[3], x2m9);
492        let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[4], x3m8);
493        let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[2], x4m7);
494        let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[0], x5m6);
495        let [y02, y09] = SseVector::column_butterfly2([m0209a, m0209b]);
496
497        let m0308a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p10);
498        let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[4], x2p9);
499        let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[1], x3p8);
500        let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[0], x4p7);
501        let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[3], x5p6);
502        let m0308b = SseVector::mul(self.twiddles_im[2], x1m10);
503        let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[4], x2m9);
504        let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[1], x3m8);
505        let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[0], x4m7);
506        let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[3], x5m6);
507        let [y03, y08] = SseVector::column_butterfly2([m0308a, m0308b]);
508
509        let m0407a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p10);
510        let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[2], x2p9);
511        let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[0], x3p8);
512        let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[4], x4p7);
513        let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[1], x5p6);
514        let m0407b = SseVector::mul(self.twiddles_im[3], x1m10);
515        let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[2], x2m9);
516        let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[0], x3m8);
517        let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[4], x4m7);
518        let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[1], x5m6);
519        let [y04, y07] = SseVector::column_butterfly2([m0407a, m0407b]);
520
521        let m0506a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p10);
522        let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[0], x2p9);
523        let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[3], x3p8);
524        let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[1], x4p7);
525        let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[2], x5p6);
526        let m0506b = SseVector::mul(self.twiddles_im[4], x1m10);
527        let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[0], x2m9);
528        let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[3], x3m8);
529        let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[1], x4m7);
530        let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[2], x5m6);
531        let [y05, y06] = SseVector::column_butterfly2([m0506a, m0506b]);
532
533
534        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10]
535    }
536}
537
538struct SseF32Butterfly13<T> {
539    direction: FftDirection,
540    twiddles_re: [__m128; 6],
541    twiddles_im: [__m128; 6],
542    _phantom: std::marker::PhantomData<T>,
543}
544
545boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly13);
546boilerplate_fft_sse_common_butterfly!(SseF32Butterfly13, 13, |this: &SseF32Butterfly13<_>| this.direction);
547impl<T: FftNum> SseF32Butterfly13<T> {
548    /// Safety: The current machine must support the sse4.1 instruction set
549    #[target_feature(enable = "sse4.1")]
550    unsafe fn new(direction: FftDirection) -> Self {
551        assert_f32::<T>();
552        let twiddles = make_twiddles(13, direction);
553        Self {
554            direction,
555            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
556            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
557            _phantom: std::marker::PhantomData,
558        }
559    }
560
561    #[inline(always)]
562    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
563        let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });
564
565        let out = self.perform_parallel_fft_direct(values);
566        
567        write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 } ); 
568    }
569
570    #[inline(always)]
571    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
572        let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24 });
573
574        let values = [
575            extract_lo_hi_f32(input_packed[0], input_packed[6]),
576            extract_hi_lo_f32(input_packed[0], input_packed[7]),
577            extract_lo_hi_f32(input_packed[1], input_packed[7]),
578            extract_hi_lo_f32(input_packed[1], input_packed[8]),
579            extract_lo_hi_f32(input_packed[2], input_packed[8]),
580            extract_hi_lo_f32(input_packed[2], input_packed[9]),
581            extract_lo_hi_f32(input_packed[3], input_packed[9]),
582            extract_hi_lo_f32(input_packed[3], input_packed[10]),
583            extract_lo_hi_f32(input_packed[4], input_packed[10]),
584            extract_hi_lo_f32(input_packed[4], input_packed[11]),
585            extract_lo_hi_f32(input_packed[5], input_packed[11]),
586            extract_hi_lo_f32(input_packed[5], input_packed[12]),
587            extract_lo_hi_f32(input_packed[6], input_packed[12]),
588        ];
589
590        let out = self.perform_parallel_fft_direct(values);
591
592        let out_packed = [
593            extract_lo_lo_f32(out[0], out[1]),
594            extract_lo_lo_f32(out[2], out[3]),
595            extract_lo_lo_f32(out[4], out[5]),
596            extract_lo_lo_f32(out[6], out[7]),
597            extract_lo_lo_f32(out[8], out[9]),
598            extract_lo_lo_f32(out[10], out[11]),
599            extract_lo_hi_f32(out[12], out[0]),
600            extract_hi_hi_f32(out[1], out[2]),
601            extract_hi_hi_f32(out[3], out[4]),
602            extract_hi_hi_f32(out[5], out[6]),
603            extract_hi_hi_f32(out[7], out[8]),
604            extract_hi_hi_f32(out[9], out[10]),
605            extract_hi_hi_f32(out[11], out[12]),
606        ];
607
608        write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });
609    }
610
611    #[inline(always)]
612    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 13]) -> [__m128; 13] {
613        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
614
615        let y00 = values[0];
616        let [x1p12, x1m12] =  SseVector::column_butterfly2([values[1], values[12]]);
617        let x1m12 = SseVector::apply_rotate90(rotate, x1m12);
618        let y00 = SseVector::add(y00, x1p12);
619        let [x2p11, x2m11] =  SseVector::column_butterfly2([values[2], values[11]]);
620        let x2m11 = SseVector::apply_rotate90(rotate, x2m11);
621        let y00 = SseVector::add(y00, x2p11);
622        let [x3p10, x3m10] =  SseVector::column_butterfly2([values[3], values[10]]);
623        let x3m10 = SseVector::apply_rotate90(rotate, x3m10);
624        let y00 = SseVector::add(y00, x3p10);
625        let [x4p9, x4m9] =  SseVector::column_butterfly2([values[4], values[9]]);
626        let x4m9 = SseVector::apply_rotate90(rotate, x4m9);
627        let y00 = SseVector::add(y00, x4p9);
628        let [x5p8, x5m8] =  SseVector::column_butterfly2([values[5], values[8]]);
629        let x5m8 = SseVector::apply_rotate90(rotate, x5m8);
630        let y00 = SseVector::add(y00, x5p8);
631        let [x6p7, x6m7] =  SseVector::column_butterfly2([values[6], values[7]]);
632        let x6m7 = SseVector::apply_rotate90(rotate, x6m7);
633        let y00 = SseVector::add(y00, x6p7);
634
635        let m0112a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p12);
636        let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[1], x2p11);
637        let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[2], x3p10);
638        let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[3], x4p9);
639        let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[4], x5p8);
640        let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[5], x6p7);
641        let m0112b = SseVector::mul(self.twiddles_im[0], x1m12);
642        let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[1], x2m11);
643        let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[2], x3m10);
644        let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[3], x4m9);
645        let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[4], x5m8);
646        let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[5], x6m7);
647        let [y01, y12] = SseVector::column_butterfly2([m0112a, m0112b]);
648
649        let m0211a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p12);
650        let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[3], x2p11);
651        let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[5], x3p10);
652        let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[4], x4p9);
653        let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[2], x5p8);
654        let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[0], x6p7);
655        let m0211b = SseVector::mul(self.twiddles_im[1], x1m12);
656        let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[3], x2m11);
657        let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[5], x3m10);
658        let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[4], x4m9);
659        let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[2], x5m8);
660        let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[0], x6m7);
661        let [y02, y11] = SseVector::column_butterfly2([m0211a, m0211b]);
662
663        let m0310a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p12);
664        let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[5], x2p11);
665        let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[3], x3p10);
666        let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[0], x4p9);
667        let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[1], x5p8);
668        let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[4], x6p7);
669        let m0310b = SseVector::mul(self.twiddles_im[2], x1m12);
670        let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[5], x2m11);
671        let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[3], x3m10);
672        let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[0], x4m9);
673        let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[1], x5m8);
674        let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[4], x6m7);
675        let [y03, y10] = SseVector::column_butterfly2([m0310a, m0310b]);
676
677        let m0409a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p12);
678        let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[4], x2p11);
679        let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[0], x3p10);
680        let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[2], x4p9);
681        let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[5], x5p8);
682        let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[1], x6p7);
683        let m0409b = SseVector::mul(self.twiddles_im[3], x1m12);
684        let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[4], x2m11);
685        let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[0], x3m10);
686        let m0409b = SseVector::fmadd(m0409b, self.twiddles_im[2], x4m9);
687        let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[5], x5m8);
688        let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[1], x6m7);
689        let [y04, y09] = SseVector::column_butterfly2([m0409a, m0409b]);
690
691        let m0508a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p12);
692        let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[2], x2p11);
693        let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[1], x3p10);
694        let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[5], x4p9);
695        let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[0], x5p8);
696        let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[3], x6p7);
697        let m0508b = SseVector::mul(self.twiddles_im[4], x1m12);
698        let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[2], x2m11);
699        let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[1], x3m10);
700        let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[5], x4m9);
701        let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[0], x5m8);
702        let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[3], x6m7);
703        let [y05, y08] = SseVector::column_butterfly2([m0508a, m0508b]);
704
705        let m0607a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p12);
706        let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[0], x2p11);
707        let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[4], x3p10);
708        let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[1], x4p9);
709        let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[3], x5p8);
710        let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[2], x6p7);
711        let m0607b = SseVector::mul(self.twiddles_im[5], x1m12);
712        let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[0], x2m11);
713        let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[4], x3m10);
714        let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[1], x4m9);
715        let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[3], x5m8);
716        let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[2], x6m7);
717        let [y06, y07] = SseVector::column_butterfly2([m0607a, m0607b]);
718
719
720        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12]
721    }
722}
723
724struct SseF64Butterfly13<T> {
725    direction: FftDirection,
726    twiddles_re: [__m128d; 6],
727    twiddles_im: [__m128d; 6],
728    _phantom: std::marker::PhantomData<T>,
729}
730
731boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly13);
732boilerplate_fft_sse_common_butterfly!(SseF64Butterfly13, 13, |this: &SseF64Butterfly13<_>| this.direction);
733impl<T: FftNum> SseF64Butterfly13<T> {
734    /// Safety: The current machine must support the sse4.1 instruction set
735    #[target_feature(enable = "sse4.1")]
736    unsafe fn new(direction: FftDirection) -> Self {
737        assert_f64::<T>();
738        let twiddles = make_twiddles(13, direction);
739        unsafe {Self {
740            direction,
741            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
742            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
743            _phantom: std::marker::PhantomData,
744        }}
745    }
746
747    #[inline(always)]
748    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
749        let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });
750
751        let out = self.perform_fft_direct(values);
752
753        write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });   
754    }
755
756    #[inline(always)]
757    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 13]) -> [__m128d; 13] {
758        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
759
760        let y00 = values[0];
761        let [x1p12, x1m12] =  SseVector::column_butterfly2([values[1], values[12]]);
762        let x1m12 = SseVector::apply_rotate90(rotate, x1m12);
763        let y00 = SseVector::add(y00, x1p12);
764        let [x2p11, x2m11] =  SseVector::column_butterfly2([values[2], values[11]]);
765        let x2m11 = SseVector::apply_rotate90(rotate, x2m11);
766        let y00 = SseVector::add(y00, x2p11);
767        let [x3p10, x3m10] =  SseVector::column_butterfly2([values[3], values[10]]);
768        let x3m10 = SseVector::apply_rotate90(rotate, x3m10);
769        let y00 = SseVector::add(y00, x3p10);
770        let [x4p9, x4m9] =  SseVector::column_butterfly2([values[4], values[9]]);
771        let x4m9 = SseVector::apply_rotate90(rotate, x4m9);
772        let y00 = SseVector::add(y00, x4p9);
773        let [x5p8, x5m8] =  SseVector::column_butterfly2([values[5], values[8]]);
774        let x5m8 = SseVector::apply_rotate90(rotate, x5m8);
775        let y00 = SseVector::add(y00, x5p8);
776        let [x6p7, x6m7] =  SseVector::column_butterfly2([values[6], values[7]]);
777        let x6m7 = SseVector::apply_rotate90(rotate, x6m7);
778        let y00 = SseVector::add(y00, x6p7);
779
780        let m0112a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p12);
781        let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[1], x2p11);
782        let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[2], x3p10);
783        let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[3], x4p9);
784        let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[4], x5p8);
785        let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[5], x6p7);
786        let m0112b = SseVector::mul(self.twiddles_im[0], x1m12);
787        let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[1], x2m11);
788        let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[2], x3m10);
789        let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[3], x4m9);
790        let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[4], x5m8);
791        let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[5], x6m7);
792        let [y01, y12] = SseVector::column_butterfly2([m0112a, m0112b]);
793
794        let m0211a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p12);
795        let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[3], x2p11);
796        let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[5], x3p10);
797        let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[4], x4p9);
798        let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[2], x5p8);
799        let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[0], x6p7);
800        let m0211b = SseVector::mul(self.twiddles_im[1], x1m12);
801        let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[3], x2m11);
802        let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[5], x3m10);
803        let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[4], x4m9);
804        let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[2], x5m8);
805        let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[0], x6m7);
806        let [y02, y11] = SseVector::column_butterfly2([m0211a, m0211b]);
807
808        let m0310a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p12);
809        let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[5], x2p11);
810        let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[3], x3p10);
811        let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[0], x4p9);
812        let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[1], x5p8);
813        let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[4], x6p7);
814        let m0310b = SseVector::mul(self.twiddles_im[2], x1m12);
815        let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[5], x2m11);
816        let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[3], x3m10);
817        let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[0], x4m9);
818        let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[1], x5m8);
819        let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[4], x6m7);
820        let [y03, y10] = SseVector::column_butterfly2([m0310a, m0310b]);
821
822        let m0409a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p12);
823        let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[4], x2p11);
824        let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[0], x3p10);
825        let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[2], x4p9);
826        let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[5], x5p8);
827        let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[1], x6p7);
828        let m0409b = SseVector::mul(self.twiddles_im[3], x1m12);
829        let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[4], x2m11);
830        let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[0], x3m10);
831        let m0409b = SseVector::fmadd(m0409b, self.twiddles_im[2], x4m9);
832        let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[5], x5m8);
833        let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[1], x6m7);
834        let [y04, y09] = SseVector::column_butterfly2([m0409a, m0409b]);
835
836        let m0508a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p12);
837        let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[2], x2p11);
838        let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[1], x3p10);
839        let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[5], x4p9);
840        let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[0], x5p8);
841        let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[3], x6p7);
842        let m0508b = SseVector::mul(self.twiddles_im[4], x1m12);
843        let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[2], x2m11);
844        let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[1], x3m10);
845        let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[5], x4m9);
846        let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[0], x5m8);
847        let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[3], x6m7);
848        let [y05, y08] = SseVector::column_butterfly2([m0508a, m0508b]);
849
850        let m0607a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p12);
851        let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[0], x2p11);
852        let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[4], x3p10);
853        let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[1], x4p9);
854        let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[3], x5p8);
855        let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[2], x6p7);
856        let m0607b = SseVector::mul(self.twiddles_im[5], x1m12);
857        let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[0], x2m11);
858        let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[4], x3m10);
859        let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[1], x4m9);
860        let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[3], x5m8);
861        let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[2], x6m7);
862        let [y06, y07] = SseVector::column_butterfly2([m0607a, m0607b]);
863
864
865        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12]
866    }
867}
868
869struct SseF32Butterfly17<T> {
870    direction: FftDirection,
871    twiddles_re: [__m128; 8],
872    twiddles_im: [__m128; 8],
873    _phantom: std::marker::PhantomData<T>,
874}
875
876boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly17);
877boilerplate_fft_sse_common_butterfly!(SseF32Butterfly17, 17, |this: &SseF32Butterfly17<_>| this.direction);
878impl<T: FftNum> SseF32Butterfly17<T> {
879    /// Safety: The current machine must support the sse4.1 instruction set
880    #[target_feature(enable = "sse4.1")]
881    unsafe fn new(direction: FftDirection) -> Self {
882        assert_f32::<T>();
883        let twiddles = make_twiddles(17, direction);
884        Self {
885            direction,
886            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
887            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
888            _phantom: std::marker::PhantomData,
889        }
890    }
891
892    #[inline(always)]
893    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
894        let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
895
896        let out = self.perform_parallel_fft_direct(values);
897        
898        write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 } ); 
899    }
900
901    #[inline(always)]
902    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
903        let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 });
904
905        let values = [
906            extract_lo_hi_f32(input_packed[0], input_packed[8]),
907            extract_hi_lo_f32(input_packed[0], input_packed[9]),
908            extract_lo_hi_f32(input_packed[1], input_packed[9]),
909            extract_hi_lo_f32(input_packed[1], input_packed[10]),
910            extract_lo_hi_f32(input_packed[2], input_packed[10]),
911            extract_hi_lo_f32(input_packed[2], input_packed[11]),
912            extract_lo_hi_f32(input_packed[3], input_packed[11]),
913            extract_hi_lo_f32(input_packed[3], input_packed[12]),
914            extract_lo_hi_f32(input_packed[4], input_packed[12]),
915            extract_hi_lo_f32(input_packed[4], input_packed[13]),
916            extract_lo_hi_f32(input_packed[5], input_packed[13]),
917            extract_hi_lo_f32(input_packed[5], input_packed[14]),
918            extract_lo_hi_f32(input_packed[6], input_packed[14]),
919            extract_hi_lo_f32(input_packed[6], input_packed[15]),
920            extract_lo_hi_f32(input_packed[7], input_packed[15]),
921            extract_hi_lo_f32(input_packed[7], input_packed[16]),
922            extract_lo_hi_f32(input_packed[8], input_packed[16]),
923        ];
924
925        let out = self.perform_parallel_fft_direct(values);
926
927        let out_packed = [
928            extract_lo_lo_f32(out[0], out[1]),
929            extract_lo_lo_f32(out[2], out[3]),
930            extract_lo_lo_f32(out[4], out[5]),
931            extract_lo_lo_f32(out[6], out[7]),
932            extract_lo_lo_f32(out[8], out[9]),
933            extract_lo_lo_f32(out[10], out[11]),
934            extract_lo_lo_f32(out[12], out[13]),
935            extract_lo_lo_f32(out[14], out[15]),
936            extract_lo_hi_f32(out[16], out[0]),
937            extract_hi_hi_f32(out[1], out[2]),
938            extract_hi_hi_f32(out[3], out[4]),
939            extract_hi_hi_f32(out[5], out[6]),
940            extract_hi_hi_f32(out[7], out[8]),
941            extract_hi_hi_f32(out[9], out[10]),
942            extract_hi_hi_f32(out[11], out[12]),
943            extract_hi_hi_f32(out[13], out[14]),
944            extract_hi_hi_f32(out[15], out[16]),
945        ];
946
947        write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
948    }
949
950    #[inline(always)]
951    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 17]) -> [__m128; 17] {
952        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
953
954        let y00 = values[0];
955        let [x1p16, x1m16] =  SseVector::column_butterfly2([values[1], values[16]]);
956        let x1m16 = SseVector::apply_rotate90(rotate, x1m16);
957        let y00 = SseVector::add(y00, x1p16);
958        let [x2p15, x2m15] =  SseVector::column_butterfly2([values[2], values[15]]);
959        let x2m15 = SseVector::apply_rotate90(rotate, x2m15);
960        let y00 = SseVector::add(y00, x2p15);
961        let [x3p14, x3m14] =  SseVector::column_butterfly2([values[3], values[14]]);
962        let x3m14 = SseVector::apply_rotate90(rotate, x3m14);
963        let y00 = SseVector::add(y00, x3p14);
964        let [x4p13, x4m13] =  SseVector::column_butterfly2([values[4], values[13]]);
965        let x4m13 = SseVector::apply_rotate90(rotate, x4m13);
966        let y00 = SseVector::add(y00, x4p13);
967        let [x5p12, x5m12] =  SseVector::column_butterfly2([values[5], values[12]]);
968        let x5m12 = SseVector::apply_rotate90(rotate, x5m12);
969        let y00 = SseVector::add(y00, x5p12);
970        let [x6p11, x6m11] =  SseVector::column_butterfly2([values[6], values[11]]);
971        let x6m11 = SseVector::apply_rotate90(rotate, x6m11);
972        let y00 = SseVector::add(y00, x6p11);
973        let [x7p10, x7m10] =  SseVector::column_butterfly2([values[7], values[10]]);
974        let x7m10 = SseVector::apply_rotate90(rotate, x7m10);
975        let y00 = SseVector::add(y00, x7p10);
976        let [x8p9, x8m9] =  SseVector::column_butterfly2([values[8], values[9]]);
977        let x8m9 = SseVector::apply_rotate90(rotate, x8m9);
978        let y00 = SseVector::add(y00, x8p9);
979
980        let m0116a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p16);
981        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[1], x2p15);
982        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[2], x3p14);
983        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[3], x4p13);
984        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[4], x5p12);
985        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[5], x6p11);
986        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[6], x7p10);
987        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[7], x8p9);
988        let m0116b = SseVector::mul(self.twiddles_im[0], x1m16);
989        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[1], x2m15);
990        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[2], x3m14);
991        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[3], x4m13);
992        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[4], x5m12);
993        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[5], x6m11);
994        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[6], x7m10);
995        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[7], x8m9);
996        let [y01, y16] = SseVector::column_butterfly2([m0116a, m0116b]);
997
998        let m0215a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p16);
999        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[3], x2p15);
1000        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[5], x3p14);
1001        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[7], x4p13);
1002        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[6], x5p12);
1003        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[4], x6p11);
1004        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[2], x7p10);
1005        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[0], x8p9);
1006        let m0215b = SseVector::mul(self.twiddles_im[1], x1m16);
1007        let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[3], x2m15);
1008        let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[5], x3m14);
1009        let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[7], x4m13);
1010        let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[6], x5m12);
1011        let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[4], x6m11);
1012        let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[2], x7m10);
1013        let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[0], x8m9);
1014        let [y02, y15] = SseVector::column_butterfly2([m0215a, m0215b]);
1015
1016        let m0314a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p16);
1017        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[5], x2p15);
1018        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[7], x3p14);
1019        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[4], x4p13);
1020        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[1], x5p12);
1021        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[0], x6p11);
1022        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[3], x7p10);
1023        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[6], x8p9);
1024        let m0314b = SseVector::mul(self.twiddles_im[2], x1m16);
1025        let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[5], x2m15);
1026        let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[7], x3m14);
1027        let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[4], x4m13);
1028        let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[1], x5m12);
1029        let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[0], x6m11);
1030        let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[3], x7m10);
1031        let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[6], x8m9);
1032        let [y03, y14] = SseVector::column_butterfly2([m0314a, m0314b]);
1033
1034        let m0413a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p16);
1035        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[7], x2p15);
1036        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[4], x3p14);
1037        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[0], x4p13);
1038        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[2], x5p12);
1039        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[6], x6p11);
1040        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[5], x7p10);
1041        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[1], x8p9);
1042        let m0413b = SseVector::mul(self.twiddles_im[3], x1m16);
1043        let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[7], x2m15);
1044        let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[4], x3m14);
1045        let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[0], x4m13);
1046        let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[2], x5m12);
1047        let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[6], x6m11);
1048        let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[5], x7m10);
1049        let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[1], x8m9);
1050        let [y04, y13] = SseVector::column_butterfly2([m0413a, m0413b]);
1051
1052        let m0512a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p16);
1053        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[6], x2p15);
1054        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[1], x3p14);
1055        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[2], x4p13);
1056        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[7], x5p12);
1057        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[3], x6p11);
1058        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[0], x7p10);
1059        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[5], x8p9);
1060        let m0512b = SseVector::mul(self.twiddles_im[4], x1m16);
1061        let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[6], x2m15);
1062        let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[1], x3m14);
1063        let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[2], x4m13);
1064        let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[7], x5m12);
1065        let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[3], x6m11);
1066        let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[0], x7m10);
1067        let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[5], x8m9);
1068        let [y05, y12] = SseVector::column_butterfly2([m0512a, m0512b]);
1069
1070        let m0611a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p16);
1071        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[4], x2p15);
1072        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[0], x3p14);
1073        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[6], x4p13);
1074        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[3], x5p12);
1075        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[1], x6p11);
1076        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[7], x7p10);
1077        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[2], x8p9);
1078        let m0611b = SseVector::mul(self.twiddles_im[5], x1m16);
1079        let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[4], x2m15);
1080        let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[0], x3m14);
1081        let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[6], x4m13);
1082        let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[3], x5m12);
1083        let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[1], x6m11);
1084        let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[7], x7m10);
1085        let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[2], x8m9);
1086        let [y06, y11] = SseVector::column_butterfly2([m0611a, m0611b]);
1087
1088        let m0710a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p16);
1089        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[2], x2p15);
1090        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[3], x3p14);
1091        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[5], x4p13);
1092        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[0], x5p12);
1093        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[7], x6p11);
1094        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[1], x7p10);
1095        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[4], x8p9);
1096        let m0710b = SseVector::mul(self.twiddles_im[6], x1m16);
1097        let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[2], x2m15);
1098        let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[3], x3m14);
1099        let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[5], x4m13);
1100        let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[0], x5m12);
1101        let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[7], x6m11);
1102        let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[1], x7m10);
1103        let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[4], x8m9);
1104        let [y07, y10] = SseVector::column_butterfly2([m0710a, m0710b]);
1105
1106        let m0809a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p16);
1107        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[0], x2p15);
1108        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[6], x3p14);
1109        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[1], x4p13);
1110        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[5], x5p12);
1111        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[2], x6p11);
1112        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[4], x7p10);
1113        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[3], x8p9);
1114        let m0809b = SseVector::mul(self.twiddles_im[7], x1m16);
1115        let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[0], x2m15);
1116        let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[6], x3m14);
1117        let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[1], x4m13);
1118        let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[5], x5m12);
1119        let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[2], x6m11);
1120        let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[4], x7m10);
1121        let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[3], x8m9);
1122        let [y08, y09] = SseVector::column_butterfly2([m0809a, m0809b]);
1123
1124
1125        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16]
1126    }
1127}
1128
1129struct SseF64Butterfly17<T> {
1130    direction: FftDirection,
1131    twiddles_re: [__m128d; 8],
1132    twiddles_im: [__m128d; 8],
1133    _phantom: std::marker::PhantomData<T>,
1134}
1135
1136boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly17);
1137boilerplate_fft_sse_common_butterfly!(SseF64Butterfly17, 17, |this: &SseF64Butterfly17<_>| this.direction);
1138impl<T: FftNum> SseF64Butterfly17<T> {
1139    /// Safety: The current machine must support the sse4.1 instruction set
1140    #[target_feature(enable = "sse4.1")]
1141    unsafe fn new(direction: FftDirection) -> Self {
1142        assert_f64::<T>();
1143        let twiddles = make_twiddles(17, direction);
1144        unsafe {Self {
1145            direction,
1146            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
1147            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
1148            _phantom: std::marker::PhantomData,
1149        }}
1150    }
1151
1152    #[inline(always)]
1153    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
1154        let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
1155
1156        let out = self.perform_fft_direct(values);
1157
1158        write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });   
1159    }
1160
1161    #[inline(always)]
1162    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 17]) -> [__m128d; 17] {
1163        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
1164
1165        let y00 = values[0];
1166        let [x1p16, x1m16] =  SseVector::column_butterfly2([values[1], values[16]]);
1167        let x1m16 = SseVector::apply_rotate90(rotate, x1m16);
1168        let y00 = SseVector::add(y00, x1p16);
1169        let [x2p15, x2m15] =  SseVector::column_butterfly2([values[2], values[15]]);
1170        let x2m15 = SseVector::apply_rotate90(rotate, x2m15);
1171        let y00 = SseVector::add(y00, x2p15);
1172        let [x3p14, x3m14] =  SseVector::column_butterfly2([values[3], values[14]]);
1173        let x3m14 = SseVector::apply_rotate90(rotate, x3m14);
1174        let y00 = SseVector::add(y00, x3p14);
1175        let [x4p13, x4m13] =  SseVector::column_butterfly2([values[4], values[13]]);
1176        let x4m13 = SseVector::apply_rotate90(rotate, x4m13);
1177        let y00 = SseVector::add(y00, x4p13);
1178        let [x5p12, x5m12] =  SseVector::column_butterfly2([values[5], values[12]]);
1179        let x5m12 = SseVector::apply_rotate90(rotate, x5m12);
1180        let y00 = SseVector::add(y00, x5p12);
1181        let [x6p11, x6m11] =  SseVector::column_butterfly2([values[6], values[11]]);
1182        let x6m11 = SseVector::apply_rotate90(rotate, x6m11);
1183        let y00 = SseVector::add(y00, x6p11);
1184        let [x7p10, x7m10] =  SseVector::column_butterfly2([values[7], values[10]]);
1185        let x7m10 = SseVector::apply_rotate90(rotate, x7m10);
1186        let y00 = SseVector::add(y00, x7p10);
1187        let [x8p9, x8m9] =  SseVector::column_butterfly2([values[8], values[9]]);
1188        let x8m9 = SseVector::apply_rotate90(rotate, x8m9);
1189        let y00 = SseVector::add(y00, x8p9);
1190
1191        let m0116a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p16);
1192        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[1], x2p15);
1193        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[2], x3p14);
1194        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[3], x4p13);
1195        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[4], x5p12);
1196        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[5], x6p11);
1197        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[6], x7p10);
1198        let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[7], x8p9);
1199        let m0116b = SseVector::mul(self.twiddles_im[0], x1m16);
1200        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[1], x2m15);
1201        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[2], x3m14);
1202        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[3], x4m13);
1203        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[4], x5m12);
1204        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[5], x6m11);
1205        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[6], x7m10);
1206        let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[7], x8m9);
1207        let [y01, y16] = SseVector::column_butterfly2([m0116a, m0116b]);
1208
1209        let m0215a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p16);
1210        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[3], x2p15);
1211        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[5], x3p14);
1212        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[7], x4p13);
1213        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[6], x5p12);
1214        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[4], x6p11);
1215        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[2], x7p10);
1216        let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[0], x8p9);
1217        let m0215b = SseVector::mul(self.twiddles_im[1], x1m16);
1218        let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[3], x2m15);
1219        let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[5], x3m14);
1220        let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[7], x4m13);
1221        let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[6], x5m12);
1222        let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[4], x6m11);
1223        let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[2], x7m10);
1224        let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[0], x8m9);
1225        let [y02, y15] = SseVector::column_butterfly2([m0215a, m0215b]);
1226
1227        let m0314a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p16);
1228        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[5], x2p15);
1229        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[7], x3p14);
1230        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[4], x4p13);
1231        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[1], x5p12);
1232        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[0], x6p11);
1233        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[3], x7p10);
1234        let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[6], x8p9);
1235        let m0314b = SseVector::mul(self.twiddles_im[2], x1m16);
1236        let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[5], x2m15);
1237        let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[7], x3m14);
1238        let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[4], x4m13);
1239        let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[1], x5m12);
1240        let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[0], x6m11);
1241        let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[3], x7m10);
1242        let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[6], x8m9);
1243        let [y03, y14] = SseVector::column_butterfly2([m0314a, m0314b]);
1244
1245        let m0413a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p16);
1246        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[7], x2p15);
1247        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[4], x3p14);
1248        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[0], x4p13);
1249        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[2], x5p12);
1250        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[6], x6p11);
1251        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[5], x7p10);
1252        let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[1], x8p9);
1253        let m0413b = SseVector::mul(self.twiddles_im[3], x1m16);
1254        let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[7], x2m15);
1255        let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[4], x3m14);
1256        let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[0], x4m13);
1257        let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[2], x5m12);
1258        let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[6], x6m11);
1259        let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[5], x7m10);
1260        let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[1], x8m9);
1261        let [y04, y13] = SseVector::column_butterfly2([m0413a, m0413b]);
1262
1263        let m0512a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p16);
1264        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[6], x2p15);
1265        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[1], x3p14);
1266        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[2], x4p13);
1267        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[7], x5p12);
1268        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[3], x6p11);
1269        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[0], x7p10);
1270        let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[5], x8p9);
1271        let m0512b = SseVector::mul(self.twiddles_im[4], x1m16);
1272        let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[6], x2m15);
1273        let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[1], x3m14);
1274        let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[2], x4m13);
1275        let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[7], x5m12);
1276        let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[3], x6m11);
1277        let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[0], x7m10);
1278        let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[5], x8m9);
1279        let [y05, y12] = SseVector::column_butterfly2([m0512a, m0512b]);
1280
1281        let m0611a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p16);
1282        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[4], x2p15);
1283        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[0], x3p14);
1284        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[6], x4p13);
1285        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[3], x5p12);
1286        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[1], x6p11);
1287        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[7], x7p10);
1288        let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[2], x8p9);
1289        let m0611b = SseVector::mul(self.twiddles_im[5], x1m16);
1290        let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[4], x2m15);
1291        let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[0], x3m14);
1292        let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[6], x4m13);
1293        let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[3], x5m12);
1294        let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[1], x6m11);
1295        let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[7], x7m10);
1296        let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[2], x8m9);
1297        let [y06, y11] = SseVector::column_butterfly2([m0611a, m0611b]);
1298
1299        let m0710a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p16);
1300        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[2], x2p15);
1301        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[3], x3p14);
1302        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[5], x4p13);
1303        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[0], x5p12);
1304        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[7], x6p11);
1305        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[1], x7p10);
1306        let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[4], x8p9);
1307        let m0710b = SseVector::mul(self.twiddles_im[6], x1m16);
1308        let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[2], x2m15);
1309        let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[3], x3m14);
1310        let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[5], x4m13);
1311        let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[0], x5m12);
1312        let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[7], x6m11);
1313        let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[1], x7m10);
1314        let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[4], x8m9);
1315        let [y07, y10] = SseVector::column_butterfly2([m0710a, m0710b]);
1316
1317        let m0809a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p16);
1318        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[0], x2p15);
1319        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[6], x3p14);
1320        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[1], x4p13);
1321        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[5], x5p12);
1322        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[2], x6p11);
1323        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[4], x7p10);
1324        let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[3], x8p9);
1325        let m0809b = SseVector::mul(self.twiddles_im[7], x1m16);
1326        let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[0], x2m15);
1327        let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[6], x3m14);
1328        let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[1], x4m13);
1329        let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[5], x5m12);
1330        let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[2], x6m11);
1331        let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[4], x7m10);
1332        let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[3], x8m9);
1333        let [y08, y09] = SseVector::column_butterfly2([m0809a, m0809b]);
1334
1335
1336        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16]
1337    }
1338}
1339
1340struct SseF32Butterfly19<T> {
1341    direction: FftDirection,
1342    twiddles_re: [__m128; 9],
1343    twiddles_im: [__m128; 9],
1344    _phantom: std::marker::PhantomData<T>,
1345}
1346
1347boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly19);
1348boilerplate_fft_sse_common_butterfly!(SseF32Butterfly19, 19, |this: &SseF32Butterfly19<_>| this.direction);
1349impl<T: FftNum> SseF32Butterfly19<T> {
1350    /// Safety: The current machine must support the sse4.1 instruction set
1351    #[target_feature(enable = "sse4.1")]
1352    unsafe fn new(direction: FftDirection) -> Self {
1353        assert_f32::<T>();
1354        let twiddles = make_twiddles(19, direction);
1355        Self {
1356            direction,
1357            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
1358            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
1359            _phantom: std::marker::PhantomData,
1360        }
1361    }
1362
1363    #[inline(always)]
1364    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
1365        let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 });
1366
1367        let out = self.perform_parallel_fft_direct(values);
1368        
1369        write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 } ); 
1370    }
1371
1372    #[inline(always)]
1373    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
1374        let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36 });
1375
1376        let values = [
1377            extract_lo_hi_f32(input_packed[0], input_packed[9]),
1378            extract_hi_lo_f32(input_packed[0], input_packed[10]),
1379            extract_lo_hi_f32(input_packed[1], input_packed[10]),
1380            extract_hi_lo_f32(input_packed[1], input_packed[11]),
1381            extract_lo_hi_f32(input_packed[2], input_packed[11]),
1382            extract_hi_lo_f32(input_packed[2], input_packed[12]),
1383            extract_lo_hi_f32(input_packed[3], input_packed[12]),
1384            extract_hi_lo_f32(input_packed[3], input_packed[13]),
1385            extract_lo_hi_f32(input_packed[4], input_packed[13]),
1386            extract_hi_lo_f32(input_packed[4], input_packed[14]),
1387            extract_lo_hi_f32(input_packed[5], input_packed[14]),
1388            extract_hi_lo_f32(input_packed[5], input_packed[15]),
1389            extract_lo_hi_f32(input_packed[6], input_packed[15]),
1390            extract_hi_lo_f32(input_packed[6], input_packed[16]),
1391            extract_lo_hi_f32(input_packed[7], input_packed[16]),
1392            extract_hi_lo_f32(input_packed[7], input_packed[17]),
1393            extract_lo_hi_f32(input_packed[8], input_packed[17]),
1394            extract_hi_lo_f32(input_packed[8], input_packed[18]),
1395            extract_lo_hi_f32(input_packed[9], input_packed[18]),
1396        ];
1397
1398        let out = self.perform_parallel_fft_direct(values);
1399
1400        let out_packed = [
1401            extract_lo_lo_f32(out[0], out[1]),
1402            extract_lo_lo_f32(out[2], out[3]),
1403            extract_lo_lo_f32(out[4], out[5]),
1404            extract_lo_lo_f32(out[6], out[7]),
1405            extract_lo_lo_f32(out[8], out[9]),
1406            extract_lo_lo_f32(out[10], out[11]),
1407            extract_lo_lo_f32(out[12], out[13]),
1408            extract_lo_lo_f32(out[14], out[15]),
1409            extract_lo_lo_f32(out[16], out[17]),
1410            extract_lo_hi_f32(out[18], out[0]),
1411            extract_hi_hi_f32(out[1], out[2]),
1412            extract_hi_hi_f32(out[3], out[4]),
1413            extract_hi_hi_f32(out[5], out[6]),
1414            extract_hi_hi_f32(out[7], out[8]),
1415            extract_hi_hi_f32(out[9], out[10]),
1416            extract_hi_hi_f32(out[11], out[12]),
1417            extract_hi_hi_f32(out[13], out[14]),
1418            extract_hi_hi_f32(out[15], out[16]),
1419            extract_hi_hi_f32(out[17], out[18]),
1420        ];
1421
1422        write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 });
1423    }
1424
1425    #[inline(always)]
1426    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 19]) -> [__m128; 19] {
1427        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
1428
1429        let y00 = values[0];
1430        let [x1p18, x1m18] =  SseVector::column_butterfly2([values[1], values[18]]);
1431        let x1m18 = SseVector::apply_rotate90(rotate, x1m18);
1432        let y00 = SseVector::add(y00, x1p18);
1433        let [x2p17, x2m17] =  SseVector::column_butterfly2([values[2], values[17]]);
1434        let x2m17 = SseVector::apply_rotate90(rotate, x2m17);
1435        let y00 = SseVector::add(y00, x2p17);
1436        let [x3p16, x3m16] =  SseVector::column_butterfly2([values[3], values[16]]);
1437        let x3m16 = SseVector::apply_rotate90(rotate, x3m16);
1438        let y00 = SseVector::add(y00, x3p16);
1439        let [x4p15, x4m15] =  SseVector::column_butterfly2([values[4], values[15]]);
1440        let x4m15 = SseVector::apply_rotate90(rotate, x4m15);
1441        let y00 = SseVector::add(y00, x4p15);
1442        let [x5p14, x5m14] =  SseVector::column_butterfly2([values[5], values[14]]);
1443        let x5m14 = SseVector::apply_rotate90(rotate, x5m14);
1444        let y00 = SseVector::add(y00, x5p14);
1445        let [x6p13, x6m13] =  SseVector::column_butterfly2([values[6], values[13]]);
1446        let x6m13 = SseVector::apply_rotate90(rotate, x6m13);
1447        let y00 = SseVector::add(y00, x6p13);
1448        let [x7p12, x7m12] =  SseVector::column_butterfly2([values[7], values[12]]);
1449        let x7m12 = SseVector::apply_rotate90(rotate, x7m12);
1450        let y00 = SseVector::add(y00, x7p12);
1451        let [x8p11, x8m11] =  SseVector::column_butterfly2([values[8], values[11]]);
1452        let x8m11 = SseVector::apply_rotate90(rotate, x8m11);
1453        let y00 = SseVector::add(y00, x8p11);
1454        let [x9p10, x9m10] =  SseVector::column_butterfly2([values[9], values[10]]);
1455        let x9m10 = SseVector::apply_rotate90(rotate, x9m10);
1456        let y00 = SseVector::add(y00, x9p10);
1457
1458        let m0118a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p18);
1459        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[1], x2p17);
1460        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[2], x3p16);
1461        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[3], x4p15);
1462        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[4], x5p14);
1463        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[5], x6p13);
1464        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[6], x7p12);
1465        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[7], x8p11);
1466        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[8], x9p10);
1467        let m0118b = SseVector::mul(self.twiddles_im[0], x1m18);
1468        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[1], x2m17);
1469        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[2], x3m16);
1470        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[3], x4m15);
1471        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[4], x5m14);
1472        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[5], x6m13);
1473        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[6], x7m12);
1474        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[7], x8m11);
1475        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[8], x9m10);
1476        let [y01, y18] = SseVector::column_butterfly2([m0118a, m0118b]);
1477
1478        let m0217a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p18);
1479        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[3], x2p17);
1480        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[5], x3p16);
1481        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[7], x4p15);
1482        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[8], x5p14);
1483        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[6], x6p13);
1484        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[4], x7p12);
1485        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[2], x8p11);
1486        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[0], x9p10);
1487        let m0217b = SseVector::mul(self.twiddles_im[1], x1m18);
1488        let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[3], x2m17);
1489        let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[5], x3m16);
1490        let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[7], x4m15);
1491        let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[8], x5m14);
1492        let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[6], x6m13);
1493        let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[4], x7m12);
1494        let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[2], x8m11);
1495        let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[0], x9m10);
1496        let [y02, y17] = SseVector::column_butterfly2([m0217a, m0217b]);
1497
1498        let m0316a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p18);
1499        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[5], x2p17);
1500        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[8], x3p16);
1501        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[6], x4p15);
1502        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[3], x5p14);
1503        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[0], x6p13);
1504        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[1], x7p12);
1505        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[4], x8p11);
1506        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[7], x9p10);
1507        let m0316b = SseVector::mul(self.twiddles_im[2], x1m18);
1508        let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[5], x2m17);
1509        let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[8], x3m16);
1510        let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[6], x4m15);
1511        let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[3], x5m14);
1512        let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[0], x6m13);
1513        let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[1], x7m12);
1514        let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[4], x8m11);
1515        let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[7], x9m10);
1516        let [y03, y16] = SseVector::column_butterfly2([m0316a, m0316b]);
1517
1518        let m0415a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p18);
1519        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[7], x2p17);
1520        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[6], x3p16);
1521        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[2], x4p15);
1522        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[0], x5p14);
1523        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[4], x6p13);
1524        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[8], x7p12);
1525        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[5], x8p11);
1526        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[1], x9p10);
1527        let m0415b = SseVector::mul(self.twiddles_im[3], x1m18);
1528        let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[7], x2m17);
1529        let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[6], x3m16);
1530        let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[2], x4m15);
1531        let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[0], x5m14);
1532        let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[4], x6m13);
1533        let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[8], x7m12);
1534        let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[5], x8m11);
1535        let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[1], x9m10);
1536        let [y04, y15] = SseVector::column_butterfly2([m0415a, m0415b]);
1537
1538        let m0514a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p18);
1539        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[8], x2p17);
1540        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[3], x3p16);
1541        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[0], x4p15);
1542        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[5], x5p14);
1543        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[7], x6p13);
1544        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[2], x7p12);
1545        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[1], x8p11);
1546        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[6], x9p10);
1547        let m0514b = SseVector::mul(self.twiddles_im[4], x1m18);
1548        let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[8], x2m17);
1549        let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[3], x3m16);
1550        let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[0], x4m15);
1551        let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[5], x5m14);
1552        let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[7], x6m13);
1553        let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[2], x7m12);
1554        let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[1], x8m11);
1555        let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[6], x9m10);
1556        let [y05, y14] = SseVector::column_butterfly2([m0514a, m0514b]);
1557
1558        let m0613a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p18);
1559        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[6], x2p17);
1560        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[0], x3p16);
1561        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[4], x4p15);
1562        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[7], x5p14);
1563        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[1], x6p13);
1564        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[3], x7p12);
1565        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[8], x8p11);
1566        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[2], x9p10);
1567        let m0613b = SseVector::mul(self.twiddles_im[5], x1m18);
1568        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[6], x2m17);
1569        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[0], x3m16);
1570        let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[4], x4m15);
1571        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[7], x5m14);
1572        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[1], x6m13);
1573        let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[3], x7m12);
1574        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[8], x8m11);
1575        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[2], x9m10);
1576        let [y06, y13] = SseVector::column_butterfly2([m0613a, m0613b]);
1577
1578        let m0712a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p18);
1579        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[4], x2p17);
1580        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[1], x3p16);
1581        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[8], x4p15);
1582        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[2], x5p14);
1583        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[3], x6p13);
1584        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[7], x7p12);
1585        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[0], x8p11);
1586        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[5], x9p10);
1587        let m0712b = SseVector::mul(self.twiddles_im[6], x1m18);
1588        let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[4], x2m17);
1589        let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[1], x3m16);
1590        let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[8], x4m15);
1591        let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[2], x5m14);
1592        let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[3], x6m13);
1593        let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[7], x7m12);
1594        let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[0], x8m11);
1595        let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[5], x9m10);
1596        let [y07, y12] = SseVector::column_butterfly2([m0712a, m0712b]);
1597
1598        let m0811a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p18);
1599        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[2], x2p17);
1600        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[4], x3p16);
1601        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[5], x4p15);
1602        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[1], x5p14);
1603        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[8], x6p13);
1604        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[0], x7p12);
1605        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[6], x8p11);
1606        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[3], x9p10);
1607        let m0811b = SseVector::mul(self.twiddles_im[7], x1m18);
1608        let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[2], x2m17);
1609        let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[4], x3m16);
1610        let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[5], x4m15);
1611        let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[1], x5m14);
1612        let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[8], x6m13);
1613        let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[0], x7m12);
1614        let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[6], x8m11);
1615        let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[3], x9m10);
1616        let [y08, y11] = SseVector::column_butterfly2([m0811a, m0811b]);
1617
1618        let m0910a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p18);
1619        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[0], x2p17);
1620        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[7], x3p16);
1621        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[1], x4p15);
1622        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[6], x5p14);
1623        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[2], x6p13);
1624        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[5], x7p12);
1625        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[3], x8p11);
1626        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[4], x9p10);
1627        let m0910b = SseVector::mul(self.twiddles_im[8], x1m18);
1628        let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[0], x2m17);
1629        let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[7], x3m16);
1630        let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[1], x4m15);
1631        let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[6], x5m14);
1632        let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[2], x6m13);
1633        let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[5], x7m12);
1634        let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[3], x8m11);
1635        let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[4], x9m10);
1636        let [y09, y10] = SseVector::column_butterfly2([m0910a, m0910b]);
1637
1638
1639        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18]
1640    }
1641}
1642
/// SSE butterfly state for a length-19 FFT on `f64` data.
///
/// Instances are built by `SseF64Butterfly19::new`, which fills both twiddle arrays from
/// `make_twiddles(19, direction)`.
struct SseF64Butterfly19<T> {
    /// Direction this instance was constructed with.
    direction: FftDirection,
    /// Real parts of the nine twiddles, each passed through `SseVector::broadcast_scalar`.
    twiddles_re: [__m128d; 9],
    /// Imaginary parts of the same nine twiddles, stored the same way.
    twiddles_im: [__m128d; 9],
    /// Zero-sized marker that lets the struct carry the otherwise unused type parameter `T`.
    _phantom: std::marker::PhantomData<T>,
}
1649
// Macro-generated glue for `SseF64Butterfly19`; both macros are defined elsewhere in the crate.
// NOTE(review): presumably these expand to the FFT trait impls and buffer-processing entry
// points for this type — confirm against the macro definitions.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly19);
// Passes the FFT length (19) and a closure that reads the direction stored on the instance.
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly19, 19, |this: &SseF64Butterfly19<_>| this.direction);
1652impl<T: FftNum> SseF64Butterfly19<T> {
1653    /// Safety: The current machine must support the sse4.1 instruction set
1654    #[target_feature(enable = "sse4.1")]
1655    unsafe fn new(direction: FftDirection) -> Self {
1656        assert_f64::<T>();
1657        let twiddles = make_twiddles(19, direction);
1658        unsafe {Self {
1659            direction,
1660            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
1661            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
1662            _phantom: std::marker::PhantomData,
1663        }}
1664    }
1665
1666    #[inline(always)]
1667    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
1668        let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 });
1669
1670        let out = self.perform_fft_direct(values);
1671
1672        write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 });   
1673    }
1674
    /// Computes the 19-point butterfly of `values` and returns the 19 outputs in index order
    /// (`y00` through `y18`).
    ///
    /// Structure, as written below:
    /// 1. Inputs `n` and `19 - n` (for `n = 1..=9`) are paired with `column_butterfly2`; the
    ///    second result of each pair is then passed through `apply_rotate90`. Output 0 is
    ///    `values[0]` accumulated with the first result of every pair.
    /// 2. For each output pair `(k, 19 - k)`, an "a" accumulator starts from `values[0]` and
    ///    folds in `twiddles_re[..]` with the `x{n}p{19-n}` terms, while a "b" accumulator
    ///    folds in `twiddles_im[..]` with the rotated `x{n}m{19-n}` terms. A final
    ///    `column_butterfly2([a, b])` yields the two outputs of the pair.
    ///
    /// In row `k`, the twiddle index used for input pair `n` is `min(m, 19 - m) - 1` with
    /// `m = k * n mod 19`; the "b" accumulator uses `nmadd` exactly when `m > 9` and
    /// `mul`/`fmadd` otherwise.
    ///
    /// NOTE(review): presumably `column_butterfly2([a, b])` is `[a + b, a - b]`,
    /// `fmadd(acc, w, x)` is `acc + w * x` and `nmadd(acc, w, x)` is `acc - w * x` — confirm
    /// against the `SseVector` implementation.
    ///
    /// Safety: NOTE(review): presumably requires sse4.1 support like `new` — confirm.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 19]) -> [__m128d; 19] {
        // The rotation is always built with `FftDirection::Inverse`, independent of
        // `self.direction`. NOTE(review): presumably the direction dependence is carried by the
        // twiddles created in `new` — confirm.
        let rotate = SseVector::make_rotate90(FftDirection::Inverse);

        // Pair up inputs n and 19 - n, rotate each "minus" result, and accumulate output 0.
        let y00 = values[0];
        let [x1p18, x1m18] =  SseVector::column_butterfly2([values[1], values[18]]);
        let x1m18 = SseVector::apply_rotate90(rotate, x1m18);
        let y00 = SseVector::add(y00, x1p18);
        let [x2p17, x2m17] =  SseVector::column_butterfly2([values[2], values[17]]);
        let x2m17 = SseVector::apply_rotate90(rotate, x2m17);
        let y00 = SseVector::add(y00, x2p17);
        let [x3p16, x3m16] =  SseVector::column_butterfly2([values[3], values[16]]);
        let x3m16 = SseVector::apply_rotate90(rotate, x3m16);
        let y00 = SseVector::add(y00, x3p16);
        let [x4p15, x4m15] =  SseVector::column_butterfly2([values[4], values[15]]);
        let x4m15 = SseVector::apply_rotate90(rotate, x4m15);
        let y00 = SseVector::add(y00, x4p15);
        let [x5p14, x5m14] =  SseVector::column_butterfly2([values[5], values[14]]);
        let x5m14 = SseVector::apply_rotate90(rotate, x5m14);
        let y00 = SseVector::add(y00, x5p14);
        let [x6p13, x6m13] =  SseVector::column_butterfly2([values[6], values[13]]);
        let x6m13 = SseVector::apply_rotate90(rotate, x6m13);
        let y00 = SseVector::add(y00, x6p13);
        let [x7p12, x7m12] =  SseVector::column_butterfly2([values[7], values[12]]);
        let x7m12 = SseVector::apply_rotate90(rotate, x7m12);
        let y00 = SseVector::add(y00, x7p12);
        let [x8p11, x8m11] =  SseVector::column_butterfly2([values[8], values[11]]);
        let x8m11 = SseVector::apply_rotate90(rotate, x8m11);
        let y00 = SseVector::add(y00, x8p11);
        let [x9p10, x9m10] =  SseVector::column_butterfly2([values[9], values[10]]);
        let x9m10 = SseVector::apply_rotate90(rotate, x9m10);
        let y00 = SseVector::add(y00, x9p10);

        // Outputs 1 and 18 (row k = 1).
        let m0118a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p18);
        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[1], x2p17);
        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[2], x3p16);
        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[3], x4p15);
        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[4], x5p14);
        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[5], x6p13);
        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[6], x7p12);
        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[7], x8p11);
        let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[8], x9p10);
        let m0118b = SseVector::mul(self.twiddles_im[0], x1m18);
        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[1], x2m17);
        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[2], x3m16);
        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[3], x4m15);
        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[4], x5m14);
        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[5], x6m13);
        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[6], x7m12);
        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[7], x8m11);
        let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[8], x9m10);
        let [y01, y18] = SseVector::column_butterfly2([m0118a, m0118b]);

        // Outputs 2 and 17 (row k = 2).
        let m0217a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p18);
        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[3], x2p17);
        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[5], x3p16);
        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[7], x4p15);
        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[8], x5p14);
        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[6], x6p13);
        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[4], x7p12);
        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[2], x8p11);
        let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[0], x9p10);
        let m0217b = SseVector::mul(self.twiddles_im[1], x1m18);
        let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[3], x2m17);
        let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[5], x3m16);
        let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[7], x4m15);
        let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[8], x5m14);
        let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[6], x6m13);
        let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[4], x7m12);
        let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[2], x8m11);
        let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[0], x9m10);
        let [y02, y17] = SseVector::column_butterfly2([m0217a, m0217b]);

        // Outputs 3 and 16 (row k = 3).
        let m0316a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p18);
        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[5], x2p17);
        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[8], x3p16);
        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[6], x4p15);
        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[3], x5p14);
        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[0], x6p13);
        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[1], x7p12);
        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[4], x8p11);
        let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[7], x9p10);
        let m0316b = SseVector::mul(self.twiddles_im[2], x1m18);
        let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[5], x2m17);
        let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[8], x3m16);
        let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[6], x4m15);
        let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[3], x5m14);
        let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[0], x6m13);
        let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[1], x7m12);
        let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[4], x8m11);
        let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[7], x9m10);
        let [y03, y16] = SseVector::column_butterfly2([m0316a, m0316b]);

        // Outputs 4 and 15 (row k = 4).
        let m0415a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p18);
        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[7], x2p17);
        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[6], x3p16);
        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[2], x4p15);
        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[0], x5p14);
        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[4], x6p13);
        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[8], x7p12);
        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[5], x8p11);
        let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[1], x9p10);
        let m0415b = SseVector::mul(self.twiddles_im[3], x1m18);
        let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[7], x2m17);
        let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[6], x3m16);
        let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[2], x4m15);
        let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[0], x5m14);
        let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[4], x6m13);
        let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[8], x7m12);
        let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[5], x8m11);
        let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[1], x9m10);
        let [y04, y15] = SseVector::column_butterfly2([m0415a, m0415b]);

        // Outputs 5 and 14 (row k = 5).
        let m0514a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p18);
        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[8], x2p17);
        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[3], x3p16);
        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[0], x4p15);
        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[5], x5p14);
        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[7], x6p13);
        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[2], x7p12);
        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[1], x8p11);
        let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[6], x9p10);
        let m0514b = SseVector::mul(self.twiddles_im[4], x1m18);
        let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[8], x2m17);
        let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[3], x3m16);
        let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[0], x4m15);
        let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[5], x5m14);
        let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[7], x6m13);
        let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[2], x7m12);
        let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[1], x8m11);
        let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[6], x9m10);
        let [y05, y14] = SseVector::column_butterfly2([m0514a, m0514b]);

        // Outputs 6 and 13 (row k = 6).
        let m0613a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p18);
        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[6], x2p17);
        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[0], x3p16);
        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[4], x4p15);
        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[7], x5p14);
        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[1], x6p13);
        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[3], x7p12);
        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[8], x8p11);
        let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[2], x9p10);
        let m0613b = SseVector::mul(self.twiddles_im[5], x1m18);
        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[6], x2m17);
        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[0], x3m16);
        let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[4], x4m15);
        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[7], x5m14);
        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[1], x6m13);
        let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[3], x7m12);
        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[8], x8m11);
        let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[2], x9m10);
        let [y06, y13] = SseVector::column_butterfly2([m0613a, m0613b]);

        // Outputs 7 and 12 (row k = 7).
        let m0712a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p18);
        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[4], x2p17);
        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[1], x3p16);
        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[8], x4p15);
        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[2], x5p14);
        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[3], x6p13);
        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[7], x7p12);
        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[0], x8p11);
        let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[5], x9p10);
        let m0712b = SseVector::mul(self.twiddles_im[6], x1m18);
        let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[4], x2m17);
        let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[1], x3m16);
        let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[8], x4m15);
        let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[2], x5m14);
        let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[3], x6m13);
        let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[7], x7m12);
        let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[0], x8m11);
        let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[5], x9m10);
        let [y07, y12] = SseVector::column_butterfly2([m0712a, m0712b]);

        // Outputs 8 and 11 (row k = 8).
        let m0811a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p18);
        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[2], x2p17);
        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[4], x3p16);
        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[5], x4p15);
        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[1], x5p14);
        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[8], x6p13);
        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[0], x7p12);
        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[6], x8p11);
        let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[3], x9p10);
        let m0811b = SseVector::mul(self.twiddles_im[7], x1m18);
        let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[2], x2m17);
        let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[4], x3m16);
        let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[5], x4m15);
        let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[1], x5m14);
        let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[8], x6m13);
        let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[0], x7m12);
        let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[6], x8m11);
        let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[3], x9m10);
        let [y08, y11] = SseVector::column_butterfly2([m0811a, m0811b]);

        // Outputs 9 and 10 (row k = 9).
        let m0910a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p18);
        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[0], x2p17);
        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[7], x3p16);
        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[1], x4p15);
        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[6], x5p14);
        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[2], x6p13);
        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[5], x7p12);
        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[3], x8p11);
        let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[4], x9p10);
        let m0910b = SseVector::mul(self.twiddles_im[8], x1m18);
        let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[0], x2m17);
        let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[7], x3m16);
        let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[1], x4m15);
        let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[6], x5m14);
        let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[2], x6m13);
        let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[5], x7m12);
        let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[3], x8m11);
        let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[4], x9m10);
        let [y09, y10] = SseVector::column_butterfly2([m0910a, m0910b]);


        // Return the outputs in natural index order.
        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18]
    }
1891}
1892
/// SSE butterfly state for a length-23 FFT on `f32` data.
///
/// Instances are built by `SseF32Butterfly23::new`, which fills both twiddle arrays from
/// `make_twiddles(23, direction)`.
struct SseF32Butterfly23<T> {
    /// Direction this instance was constructed with.
    direction: FftDirection,
    /// Real parts of the eleven twiddles, each passed through `SseVector::broadcast_scalar`.
    twiddles_re: [__m128; 11],
    /// Imaginary parts of the same eleven twiddles, stored the same way.
    twiddles_im: [__m128; 11],
    /// Zero-sized marker that lets the struct carry the otherwise unused type parameter `T`.
    _phantom: std::marker::PhantomData<T>,
}
1899
// Macro-generated glue for `SseF32Butterfly23`; both macros are defined elsewhere in the crate.
// NOTE(review): presumably these expand to the FFT trait impls and buffer-processing entry
// points for this type — confirm against the macro definitions.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly23);
// Passes the FFT length (23) and a closure that reads the direction stored on the instance.
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly23, 23, |this: &SseF32Butterfly23<_>| this.direction);
1902impl<T: FftNum> SseF32Butterfly23<T> {
1903    /// Safety: The current machine must support the sse4.1 instruction set
1904    #[target_feature(enable = "sse4.1")]
1905    unsafe fn new(direction: FftDirection) -> Self {
1906        assert_f32::<T>();
1907        let twiddles = make_twiddles(23, direction);
1908        Self {
1909            direction,
1910            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
1911            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
1912            _phantom: std::marker::PhantomData,
1913        }
1914    }
1915
1916    #[inline(always)]
1917    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
1918        let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 });
1919
1920        let out = self.perform_parallel_fft_direct(values);
1921        
1922        write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 } ); 
1923    }
1924
1925    #[inline(always)]
1926    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
1927        let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44 });
1928
1929        let values = [
1930            extract_lo_hi_f32(input_packed[0], input_packed[11]),
1931            extract_hi_lo_f32(input_packed[0], input_packed[12]),
1932            extract_lo_hi_f32(input_packed[1], input_packed[12]),
1933            extract_hi_lo_f32(input_packed[1], input_packed[13]),
1934            extract_lo_hi_f32(input_packed[2], input_packed[13]),
1935            extract_hi_lo_f32(input_packed[2], input_packed[14]),
1936            extract_lo_hi_f32(input_packed[3], input_packed[14]),
1937            extract_hi_lo_f32(input_packed[3], input_packed[15]),
1938            extract_lo_hi_f32(input_packed[4], input_packed[15]),
1939            extract_hi_lo_f32(input_packed[4], input_packed[16]),
1940            extract_lo_hi_f32(input_packed[5], input_packed[16]),
1941            extract_hi_lo_f32(input_packed[5], input_packed[17]),
1942            extract_lo_hi_f32(input_packed[6], input_packed[17]),
1943            extract_hi_lo_f32(input_packed[6], input_packed[18]),
1944            extract_lo_hi_f32(input_packed[7], input_packed[18]),
1945            extract_hi_lo_f32(input_packed[7], input_packed[19]),
1946            extract_lo_hi_f32(input_packed[8], input_packed[19]),
1947            extract_hi_lo_f32(input_packed[8], input_packed[20]),
1948            extract_lo_hi_f32(input_packed[9], input_packed[20]),
1949            extract_hi_lo_f32(input_packed[9], input_packed[21]),
1950            extract_lo_hi_f32(input_packed[10], input_packed[21]),
1951            extract_hi_lo_f32(input_packed[10], input_packed[22]),
1952            extract_lo_hi_f32(input_packed[11], input_packed[22]),
1953        ];
1954
1955        let out = self.perform_parallel_fft_direct(values);
1956
1957        let out_packed = [
1958            extract_lo_lo_f32(out[0], out[1]),
1959            extract_lo_lo_f32(out[2], out[3]),
1960            extract_lo_lo_f32(out[4], out[5]),
1961            extract_lo_lo_f32(out[6], out[7]),
1962            extract_lo_lo_f32(out[8], out[9]),
1963            extract_lo_lo_f32(out[10], out[11]),
1964            extract_lo_lo_f32(out[12], out[13]),
1965            extract_lo_lo_f32(out[14], out[15]),
1966            extract_lo_lo_f32(out[16], out[17]),
1967            extract_lo_lo_f32(out[18], out[19]),
1968            extract_lo_lo_f32(out[20], out[21]),
1969            extract_lo_hi_f32(out[22], out[0]),
1970            extract_hi_hi_f32(out[1], out[2]),
1971            extract_hi_hi_f32(out[3], out[4]),
1972            extract_hi_hi_f32(out[5], out[6]),
1973            extract_hi_hi_f32(out[7], out[8]),
1974            extract_hi_hi_f32(out[9], out[10]),
1975            extract_hi_hi_f32(out[11], out[12]),
1976            extract_hi_hi_f32(out[13], out[14]),
1977            extract_hi_hi_f32(out[15], out[16]),
1978            extract_hi_hi_f32(out[17], out[18]),
1979            extract_hi_hi_f32(out[19], out[20]),
1980            extract_hi_hi_f32(out[21], out[22]),
1981        ];
1982
1983        write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 });
1984    }
1985
1986    #[inline(always)]
1987    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 23]) -> [__m128; 23] {
1988        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
1989
1990        let y00 = values[0];
1991        let [x1p22, x1m22] =  SseVector::column_butterfly2([values[1], values[22]]);
1992        let x1m22 = SseVector::apply_rotate90(rotate, x1m22);
1993        let y00 = SseVector::add(y00, x1p22);
1994        let [x2p21, x2m21] =  SseVector::column_butterfly2([values[2], values[21]]);
1995        let x2m21 = SseVector::apply_rotate90(rotate, x2m21);
1996        let y00 = SseVector::add(y00, x2p21);
1997        let [x3p20, x3m20] =  SseVector::column_butterfly2([values[3], values[20]]);
1998        let x3m20 = SseVector::apply_rotate90(rotate, x3m20);
1999        let y00 = SseVector::add(y00, x3p20);
2000        let [x4p19, x4m19] =  SseVector::column_butterfly2([values[4], values[19]]);
2001        let x4m19 = SseVector::apply_rotate90(rotate, x4m19);
2002        let y00 = SseVector::add(y00, x4p19);
2003        let [x5p18, x5m18] =  SseVector::column_butterfly2([values[5], values[18]]);
2004        let x5m18 = SseVector::apply_rotate90(rotate, x5m18);
2005        let y00 = SseVector::add(y00, x5p18);
2006        let [x6p17, x6m17] =  SseVector::column_butterfly2([values[6], values[17]]);
2007        let x6m17 = SseVector::apply_rotate90(rotate, x6m17);
2008        let y00 = SseVector::add(y00, x6p17);
2009        let [x7p16, x7m16] =  SseVector::column_butterfly2([values[7], values[16]]);
2010        let x7m16 = SseVector::apply_rotate90(rotate, x7m16);
2011        let y00 = SseVector::add(y00, x7p16);
2012        let [x8p15, x8m15] =  SseVector::column_butterfly2([values[8], values[15]]);
2013        let x8m15 = SseVector::apply_rotate90(rotate, x8m15);
2014        let y00 = SseVector::add(y00, x8p15);
2015        let [x9p14, x9m14] =  SseVector::column_butterfly2([values[9], values[14]]);
2016        let x9m14 = SseVector::apply_rotate90(rotate, x9m14);
2017        let y00 = SseVector::add(y00, x9p14);
2018        let [x10p13, x10m13] =  SseVector::column_butterfly2([values[10], values[13]]);
2019        let x10m13 = SseVector::apply_rotate90(rotate, x10m13);
2020        let y00 = SseVector::add(y00, x10p13);
2021        let [x11p12, x11m12] =  SseVector::column_butterfly2([values[11], values[12]]);
2022        let x11m12 = SseVector::apply_rotate90(rotate, x11m12);
2023        let y00 = SseVector::add(y00, x11p12);
2024
2025        let m0122a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p22);
2026        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[1], x2p21);
2027        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[2], x3p20);
2028        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[3], x4p19);
2029        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[4], x5p18);
2030        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[5], x6p17);
2031        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[6], x7p16);
2032        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[7], x8p15);
2033        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[8], x9p14);
2034        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[9], x10p13);
2035        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[10], x11p12);
2036        let m0122b = SseVector::mul(self.twiddles_im[0], x1m22);
2037        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[1], x2m21);
2038        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[2], x3m20);
2039        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[3], x4m19);
2040        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[4], x5m18);
2041        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[5], x6m17);
2042        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[6], x7m16);
2043        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[7], x8m15);
2044        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[8], x9m14);
2045        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[9], x10m13);
2046        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[10], x11m12);
2047        let [y01, y22] = SseVector::column_butterfly2([m0122a, m0122b]);
2048
2049        let m0221a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p22);
2050        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[3], x2p21);
2051        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[5], x3p20);
2052        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[7], x4p19);
2053        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[9], x5p18);
2054        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[10], x6p17);
2055        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[8], x7p16);
2056        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[6], x8p15);
2057        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[4], x9p14);
2058        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[2], x10p13);
2059        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[0], x11p12);
2060        let m0221b = SseVector::mul(self.twiddles_im[1], x1m22);
2061        let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[3], x2m21);
2062        let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[5], x3m20);
2063        let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[7], x4m19);
2064        let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[9], x5m18);
2065        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[10], x6m17);
2066        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[8], x7m16);
2067        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[6], x8m15);
2068        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[4], x9m14);
2069        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[2], x10m13);
2070        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[0], x11m12);
2071        let [y02, y21] = SseVector::column_butterfly2([m0221a, m0221b]);
2072
2073        let m0320a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p22);
2074        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[5], x2p21);
2075        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[8], x3p20);
2076        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[10], x4p19);
2077        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[7], x5p18);
2078        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[4], x6p17);
2079        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[1], x7p16);
2080        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[0], x8p15);
2081        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[3], x9p14);
2082        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[6], x10p13);
2083        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[9], x11p12);
2084        let m0320b = SseVector::mul(self.twiddles_im[2], x1m22);
2085        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[5], x2m21);
2086        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[8], x3m20);
2087        let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[10], x4m19);
2088        let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[7], x5m18);
2089        let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[4], x6m17);
2090        let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[1], x7m16);
2091        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[0], x8m15);
2092        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[3], x9m14);
2093        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[6], x10m13);
2094        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[9], x11m12);
2095        let [y03, y20] = SseVector::column_butterfly2([m0320a, m0320b]);
2096
2097        let m0419a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p22);
2098        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[7], x2p21);
2099        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[10], x3p20);
2100        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[6], x4p19);
2101        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[2], x5p18);
2102        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[0], x6p17);
2103        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[4], x7p16);
2104        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[8], x8p15);
2105        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[9], x9p14);
2106        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[5], x10p13);
2107        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[1], x11p12);
2108        let m0419b = SseVector::mul(self.twiddles_im[3], x1m22);
2109        let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[7], x2m21);
2110        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[10], x3m20);
2111        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[6], x4m19);
2112        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[2], x5m18);
2113        let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[0], x6m17);
2114        let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[4], x7m16);
2115        let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[8], x8m15);
2116        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[9], x9m14);
2117        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[5], x10m13);
2118        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[1], x11m12);
2119        let [y04, y19] = SseVector::column_butterfly2([m0419a, m0419b]);
2120
2121        let m0518a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p22);
2122        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[9], x2p21);
2123        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[7], x3p20);
2124        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[2], x4p19);
2125        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[1], x5p18);
2126        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[6], x6p17);
2127        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[10], x7p16);
2128        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[5], x8p15);
2129        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[0], x9p14);
2130        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[3], x10p13);
2131        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[8], x11p12);
2132        let m0518b = SseVector::mul(self.twiddles_im[4], x1m22);
2133        let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[9], x2m21);
2134        let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[7], x3m20);
2135        let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[2], x4m19);
2136        let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[1], x5m18);
2137        let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[6], x6m17);
2138        let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[10], x7m16);
2139        let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[5], x8m15);
2140        let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[0], x9m14);
2141        let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[3], x10m13);
2142        let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[8], x11m12);
2143        let [y05, y18] = SseVector::column_butterfly2([m0518a, m0518b]);
2144
2145        let m0617a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p22);
2146        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[10], x2p21);
2147        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[4], x3p20);
2148        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[0], x4p19);
2149        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[6], x5p18);
2150        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[9], x6p17);
2151        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[3], x7p16);
2152        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[1], x8p15);
2153        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[7], x9p14);
2154        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[8], x10p13);
2155        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[2], x11p12);
2156        let m0617b = SseVector::mul(self.twiddles_im[5], x1m22);
2157        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[10], x2m21);
2158        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[4], x3m20);
2159        let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[0], x4m19);
2160        let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[6], x5m18);
2161        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[9], x6m17);
2162        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[3], x7m16);
2163        let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[1], x8m15);
2164        let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[7], x9m14);
2165        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[8], x10m13);
2166        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[2], x11m12);
2167        let [y06, y17] = SseVector::column_butterfly2([m0617a, m0617b]);
2168
2169        let m0716a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p22);
2170        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[8], x2p21);
2171        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[1], x3p20);
2172        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[4], x4p19);
2173        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[10], x5p18);
2174        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[3], x6p17);
2175        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[2], x7p16);
2176        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[9], x8p15);
2177        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[5], x9p14);
2178        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[0], x10p13);
2179        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[7], x11p12);
2180        let m0716b = SseVector::mul(self.twiddles_im[6], x1m22);
2181        let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[8], x2m21);
2182        let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[1], x3m20);
2183        let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[4], x4m19);
2184        let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[10], x5m18);
2185        let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[3], x6m17);
2186        let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[2], x7m16);
2187        let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[9], x8m15);
2188        let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[5], x9m14);
2189        let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[0], x10m13);
2190        let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[7], x11m12);
2191        let [y07, y16] = SseVector::column_butterfly2([m0716a, m0716b]);
2192
2193        let m0815a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p22);
2194        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[6], x2p21);
2195        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[0], x3p20);
2196        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[8], x4p19);
2197        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[5], x5p18);
2198        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[1], x6p17);
2199        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[9], x7p16);
2200        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[4], x8p15);
2201        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[2], x9p14);
2202        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[10], x10p13);
2203        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[3], x11p12);
2204        let m0815b = SseVector::mul(self.twiddles_im[7], x1m22);
2205        let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[6], x2m21);
2206        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[0], x3m20);
2207        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[8], x4m19);
2208        let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[5], x5m18);
2209        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[1], x6m17);
2210        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[9], x7m16);
2211        let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[4], x8m15);
2212        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[2], x9m14);
2213        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[10], x10m13);
2214        let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[3], x11m12);
2215        let [y08, y15] = SseVector::column_butterfly2([m0815a, m0815b]);
2216
2217        let m0914a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p22);
2218        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[4], x2p21);
2219        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[3], x3p20);
2220        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[9], x4p19);
2221        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[0], x5p18);
2222        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[7], x6p17);
2223        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[5], x7p16);
2224        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[2], x8p15);
2225        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[10], x9p14);
2226        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[1], x10p13);
2227        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[6], x11p12);
2228        let m0914b = SseVector::mul(self.twiddles_im[8], x1m22);
2229        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[4], x2m21);
2230        let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[3], x3m20);
2231        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[9], x4m19);
2232        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[0], x5m18);
2233        let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[7], x6m17);
2234        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[5], x7m16);
2235        let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[2], x8m15);
2236        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[10], x9m14);
2237        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[1], x10m13);
2238        let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[6], x11m12);
2239        let [y09, y14] = SseVector::column_butterfly2([m0914a, m0914b]);
2240
2241        let m1013a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p22);
2242        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[2], x2p21);
2243        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[6], x3p20);
2244        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[5], x4p19);
2245        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[3], x5p18);
2246        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[8], x6p17);
2247        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[0], x7p16);
2248        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[10], x8p15);
2249        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[1], x9p14);
2250        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[7], x10p13);
2251        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[4], x11p12);
2252        let m1013b = SseVector::mul(self.twiddles_im[9], x1m22);
2253        let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[2], x2m21);
2254        let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[6], x3m20);
2255        let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[5], x4m19);
2256        let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[3], x5m18);
2257        let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[8], x6m17);
2258        let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[0], x7m16);
2259        let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[10], x8m15);
2260        let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[1], x9m14);
2261        let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[7], x10m13);
2262        let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[4], x11m12);
2263        let [y10, y13] = SseVector::column_butterfly2([m1013a, m1013b]);
2264
2265        let m1112a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p22);
2266        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[0], x2p21);
2267        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[9], x3p20);
2268        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[1], x4p19);
2269        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[8], x5p18);
2270        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[2], x6p17);
2271        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[7], x7p16);
2272        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[3], x8p15);
2273        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[6], x9p14);
2274        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[4], x10p13);
2275        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[5], x11p12);
2276        let m1112b = SseVector::mul(self.twiddles_im[10], x1m22);
2277        let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[0], x2m21);
2278        let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[9], x3m20);
2279        let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[1], x4m19);
2280        let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[8], x5m18);
2281        let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[2], x6m17);
2282        let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[7], x7m16);
2283        let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[3], x8m15);
2284        let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[6], x9m14);
2285        let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[4], x10m13);
2286        let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[5], x11m12);
2287        let [y11, y12] = SseVector::column_butterfly2([m1112a, m1112b]);
2288
2289
2290        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22]
2291    }
2292}
2293
/// Butterfly state for FFTs of length 23 whose vector type is `__m128d`.
///
/// Instances are built by `new`, which fills both twiddle tables from
/// `make_twiddles(23, direction)`. The length (23) and the `direction` field are also
/// handed to the boilerplate macros invoked right after this definition.
struct SseF64Butterfly23<T> {
    // Direction passed to `new`; read back through the closure given to
    // `boilerplate_fft_sse_common_butterfly!`.
    direction: FftDirection,
    // Real parts of the 11 twiddles, each broadcast into its own vector in `new`.
    twiddles_re: [__m128d; 11],
    // Imaginary parts of the same 11 twiddles, broadcast the same way.
    twiddles_im: [__m128d; 11],
    // Zero-sized marker that lets the struct carry the type parameter `T` without storing
    // a value of it. `new` calls `assert_f64::<T>()` (presumably a check that `T` is `f64`).
    _phantom: std::marker::PhantomData<T>,
}
2300
2301boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly23);
2302boilerplate_fft_sse_common_butterfly!(SseF64Butterfly23, 23, |this: &SseF64Butterfly23<_>| this.direction);
2303impl<T: FftNum> SseF64Butterfly23<T> {
2304    /// Safety: The current machine must support the sse4.1 instruction set
2305    #[target_feature(enable = "sse4.1")]
2306    unsafe fn new(direction: FftDirection) -> Self {
2307        assert_f64::<T>();
2308        let twiddles = make_twiddles(23, direction);
2309        unsafe {Self {
2310            direction,
2311            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
2312            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
2313            _phantom: std::marker::PhantomData,
2314        }}
2315    }
2316
2317    #[inline(always)]
2318    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
2319        let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 });
2320
2321        let out = self.perform_fft_direct(values);
2322
2323        write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 });   
2324    }
2325
2326    #[inline(always)]
2327    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 23]) -> [__m128d; 23] {
2328        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
2329
2330        let y00 = values[0];
2331        let [x1p22, x1m22] =  SseVector::column_butterfly2([values[1], values[22]]);
2332        let x1m22 = SseVector::apply_rotate90(rotate, x1m22);
2333        let y00 = SseVector::add(y00, x1p22);
2334        let [x2p21, x2m21] =  SseVector::column_butterfly2([values[2], values[21]]);
2335        let x2m21 = SseVector::apply_rotate90(rotate, x2m21);
2336        let y00 = SseVector::add(y00, x2p21);
2337        let [x3p20, x3m20] =  SseVector::column_butterfly2([values[3], values[20]]);
2338        let x3m20 = SseVector::apply_rotate90(rotate, x3m20);
2339        let y00 = SseVector::add(y00, x3p20);
2340        let [x4p19, x4m19] =  SseVector::column_butterfly2([values[4], values[19]]);
2341        let x4m19 = SseVector::apply_rotate90(rotate, x4m19);
2342        let y00 = SseVector::add(y00, x4p19);
2343        let [x5p18, x5m18] =  SseVector::column_butterfly2([values[5], values[18]]);
2344        let x5m18 = SseVector::apply_rotate90(rotate, x5m18);
2345        let y00 = SseVector::add(y00, x5p18);
2346        let [x6p17, x6m17] =  SseVector::column_butterfly2([values[6], values[17]]);
2347        let x6m17 = SseVector::apply_rotate90(rotate, x6m17);
2348        let y00 = SseVector::add(y00, x6p17);
2349        let [x7p16, x7m16] =  SseVector::column_butterfly2([values[7], values[16]]);
2350        let x7m16 = SseVector::apply_rotate90(rotate, x7m16);
2351        let y00 = SseVector::add(y00, x7p16);
2352        let [x8p15, x8m15] =  SseVector::column_butterfly2([values[8], values[15]]);
2353        let x8m15 = SseVector::apply_rotate90(rotate, x8m15);
2354        let y00 = SseVector::add(y00, x8p15);
2355        let [x9p14, x9m14] =  SseVector::column_butterfly2([values[9], values[14]]);
2356        let x9m14 = SseVector::apply_rotate90(rotate, x9m14);
2357        let y00 = SseVector::add(y00, x9p14);
2358        let [x10p13, x10m13] =  SseVector::column_butterfly2([values[10], values[13]]);
2359        let x10m13 = SseVector::apply_rotate90(rotate, x10m13);
2360        let y00 = SseVector::add(y00, x10p13);
2361        let [x11p12, x11m12] =  SseVector::column_butterfly2([values[11], values[12]]);
2362        let x11m12 = SseVector::apply_rotate90(rotate, x11m12);
2363        let y00 = SseVector::add(y00, x11p12);
2364
2365        let m0122a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p22);
2366        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[1], x2p21);
2367        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[2], x3p20);
2368        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[3], x4p19);
2369        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[4], x5p18);
2370        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[5], x6p17);
2371        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[6], x7p16);
2372        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[7], x8p15);
2373        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[8], x9p14);
2374        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[9], x10p13);
2375        let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[10], x11p12);
2376        let m0122b = SseVector::mul(self.twiddles_im[0], x1m22);
2377        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[1], x2m21);
2378        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[2], x3m20);
2379        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[3], x4m19);
2380        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[4], x5m18);
2381        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[5], x6m17);
2382        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[6], x7m16);
2383        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[7], x8m15);
2384        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[8], x9m14);
2385        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[9], x10m13);
2386        let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[10], x11m12);
2387        let [y01, y22] = SseVector::column_butterfly2([m0122a, m0122b]);
2388
2389        let m0221a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p22);
2390        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[3], x2p21);
2391        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[5], x3p20);
2392        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[7], x4p19);
2393        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[9], x5p18);
2394        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[10], x6p17);
2395        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[8], x7p16);
2396        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[6], x8p15);
2397        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[4], x9p14);
2398        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[2], x10p13);
2399        let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[0], x11p12);
2400        let m0221b = SseVector::mul(self.twiddles_im[1], x1m22);
2401        let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[3], x2m21);
2402        let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[5], x3m20);
2403        let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[7], x4m19);
2404        let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[9], x5m18);
2405        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[10], x6m17);
2406        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[8], x7m16);
2407        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[6], x8m15);
2408        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[4], x9m14);
2409        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[2], x10m13);
2410        let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[0], x11m12);
2411        let [y02, y21] = SseVector::column_butterfly2([m0221a, m0221b]);
2412
2413        let m0320a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p22);
2414        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[5], x2p21);
2415        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[8], x3p20);
2416        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[10], x4p19);
2417        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[7], x5p18);
2418        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[4], x6p17);
2419        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[1], x7p16);
2420        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[0], x8p15);
2421        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[3], x9p14);
2422        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[6], x10p13);
2423        let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[9], x11p12);
2424        let m0320b = SseVector::mul(self.twiddles_im[2], x1m22);
2425        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[5], x2m21);
2426        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[8], x3m20);
2427        let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[10], x4m19);
2428        let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[7], x5m18);
2429        let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[4], x6m17);
2430        let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[1], x7m16);
2431        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[0], x8m15);
2432        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[3], x9m14);
2433        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[6], x10m13);
2434        let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[9], x11m12);
2435        let [y03, y20] = SseVector::column_butterfly2([m0320a, m0320b]);
2436
2437        let m0419a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p22);
2438        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[7], x2p21);
2439        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[10], x3p20);
2440        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[6], x4p19);
2441        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[2], x5p18);
2442        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[0], x6p17);
2443        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[4], x7p16);
2444        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[8], x8p15);
2445        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[9], x9p14);
2446        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[5], x10p13);
2447        let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[1], x11p12);
2448        let m0419b = SseVector::mul(self.twiddles_im[3], x1m22);
2449        let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[7], x2m21);
2450        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[10], x3m20);
2451        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[6], x4m19);
2452        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[2], x5m18);
2453        let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[0], x6m17);
2454        let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[4], x7m16);
2455        let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[8], x8m15);
2456        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[9], x9m14);
2457        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[5], x10m13);
2458        let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[1], x11m12);
2459        let [y04, y19] = SseVector::column_butterfly2([m0419a, m0419b]);
2460
2461        let m0518a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p22);
2462        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[9], x2p21);
2463        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[7], x3p20);
2464        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[2], x4p19);
2465        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[1], x5p18);
2466        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[6], x6p17);
2467        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[10], x7p16);
2468        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[5], x8p15);
2469        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[0], x9p14);
2470        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[3], x10p13);
2471        let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[8], x11p12);
2472        let m0518b = SseVector::mul(self.twiddles_im[4], x1m22);
2473        let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[9], x2m21);
2474        let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[7], x3m20);
2475        let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[2], x4m19);
2476        let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[1], x5m18);
2477        let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[6], x6m17);
2478        let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[10], x7m16);
2479        let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[5], x8m15);
2480        let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[0], x9m14);
2481        let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[3], x10m13);
2482        let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[8], x11m12);
2483        let [y05, y18] = SseVector::column_butterfly2([m0518a, m0518b]);
2484
2485        let m0617a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p22);
2486        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[10], x2p21);
2487        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[4], x3p20);
2488        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[0], x4p19);
2489        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[6], x5p18);
2490        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[9], x6p17);
2491        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[3], x7p16);
2492        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[1], x8p15);
2493        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[7], x9p14);
2494        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[8], x10p13);
2495        let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[2], x11p12);
2496        let m0617b = SseVector::mul(self.twiddles_im[5], x1m22);
2497        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[10], x2m21);
2498        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[4], x3m20);
2499        let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[0], x4m19);
2500        let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[6], x5m18);
2501        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[9], x6m17);
2502        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[3], x7m16);
2503        let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[1], x8m15);
2504        let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[7], x9m14);
2505        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[8], x10m13);
2506        let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[2], x11m12);
2507        let [y06, y17] = SseVector::column_butterfly2([m0617a, m0617b]);
2508
2509        let m0716a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p22);
2510        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[8], x2p21);
2511        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[1], x3p20);
2512        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[4], x4p19);
2513        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[10], x5p18);
2514        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[3], x6p17);
2515        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[2], x7p16);
2516        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[9], x8p15);
2517        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[5], x9p14);
2518        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[0], x10p13);
2519        let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[7], x11p12);
2520        let m0716b = SseVector::mul(self.twiddles_im[6], x1m22);
2521        let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[8], x2m21);
2522        let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[1], x3m20);
2523        let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[4], x4m19);
2524        let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[10], x5m18);
2525        let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[3], x6m17);
2526        let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[2], x7m16);
2527        let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[9], x8m15);
2528        let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[5], x9m14);
2529        let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[0], x10m13);
2530        let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[7], x11m12);
2531        let [y07, y16] = SseVector::column_butterfly2([m0716a, m0716b]);
2532
2533        let m0815a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p22);
2534        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[6], x2p21);
2535        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[0], x3p20);
2536        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[8], x4p19);
2537        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[5], x5p18);
2538        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[1], x6p17);
2539        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[9], x7p16);
2540        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[4], x8p15);
2541        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[2], x9p14);
2542        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[10], x10p13);
2543        let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[3], x11p12);
2544        let m0815b = SseVector::mul(self.twiddles_im[7], x1m22);
2545        let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[6], x2m21);
2546        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[0], x3m20);
2547        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[8], x4m19);
2548        let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[5], x5m18);
2549        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[1], x6m17);
2550        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[9], x7m16);
2551        let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[4], x8m15);
2552        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[2], x9m14);
2553        let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[10], x10m13);
2554        let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[3], x11m12);
2555        let [y08, y15] = SseVector::column_butterfly2([m0815a, m0815b]);
2556
2557        let m0914a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p22);
2558        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[4], x2p21);
2559        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[3], x3p20);
2560        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[9], x4p19);
2561        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[0], x5p18);
2562        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[7], x6p17);
2563        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[5], x7p16);
2564        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[2], x8p15);
2565        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[10], x9p14);
2566        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[1], x10p13);
2567        let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[6], x11p12);
2568        let m0914b = SseVector::mul(self.twiddles_im[8], x1m22);
2569        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[4], x2m21);
2570        let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[3], x3m20);
2571        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[9], x4m19);
2572        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[0], x5m18);
2573        let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[7], x6m17);
2574        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[5], x7m16);
2575        let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[2], x8m15);
2576        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[10], x9m14);
2577        let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[1], x10m13);
2578        let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[6], x11m12);
2579        let [y09, y14] = SseVector::column_butterfly2([m0914a, m0914b]);
2580
2581        let m1013a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p22);
2582        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[2], x2p21);
2583        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[6], x3p20);
2584        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[5], x4p19);
2585        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[3], x5p18);
2586        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[8], x6p17);
2587        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[0], x7p16);
2588        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[10], x8p15);
2589        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[1], x9p14);
2590        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[7], x10p13);
2591        let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[4], x11p12);
2592        let m1013b = SseVector::mul(self.twiddles_im[9], x1m22);
2593        let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[2], x2m21);
2594        let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[6], x3m20);
2595        let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[5], x4m19);
2596        let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[3], x5m18);
2597        let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[8], x6m17);
2598        let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[0], x7m16);
2599        let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[10], x8m15);
2600        let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[1], x9m14);
2601        let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[7], x10m13);
2602        let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[4], x11m12);
2603        let [y10, y13] = SseVector::column_butterfly2([m1013a, m1013b]);
2604
2605        let m1112a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p22);
2606        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[0], x2p21);
2607        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[9], x3p20);
2608        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[1], x4p19);
2609        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[8], x5p18);
2610        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[2], x6p17);
2611        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[7], x7p16);
2612        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[3], x8p15);
2613        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[6], x9p14);
2614        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[4], x10p13);
2615        let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[5], x11p12);
2616        let m1112b = SseVector::mul(self.twiddles_im[10], x1m22);
2617        let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[0], x2m21);
2618        let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[9], x3m20);
2619        let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[1], x4m19);
2620        let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[8], x5m18);
2621        let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[2], x6m17);
2622        let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[7], x7m16);
2623        let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[3], x8m15);
2624        let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[6], x9m14);
2625        let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[4], x10m13);
2626        let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[5], x11m12);
2627        let [y11, y12] = SseVector::column_butterfly2([m1112a, m1112b]);
2628
2629
2630        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22]
2631    }
2632}
2633
2634struct SseF32Butterfly29<T> {
2635    direction: FftDirection,
2636    twiddles_re: [__m128; 14],
2637    twiddles_im: [__m128; 14],
2638    _phantom: std::marker::PhantomData<T>,
2639}
2640
// Hook `SseF32Butterfly29` into the shared SSE butterfly boilerplate. Both macros are
// defined elsewhere in the crate.
// NOTE(review): presumably they generate the FFT trait plumbing and the buffer-level
// entry points that call the `perform_*_contiguous` methods below — confirm against
// the macro definitions.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly29);
// The common macro additionally receives the FFT length (29) and a closure that reads
// the direction stored on the instance.
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly29, 29, |this: &SseF32Butterfly29<_>| this.direction);
2643impl<T: FftNum> SseF32Butterfly29<T> {
2644    /// Safety: The current machine must support the sse4.1 instruction set
2645    #[target_feature(enable = "sse4.1")]
2646    unsafe fn new(direction: FftDirection) -> Self {
2647        assert_f32::<T>();
2648        let twiddles = make_twiddles(29, direction);
2649        Self {
2650            direction,
2651            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
2652            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
2653            _phantom: std::marker::PhantomData,
2654        }
2655    }
2656
2657    #[inline(always)]
2658    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
2659        let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });
2660
2661        let out = self.perform_parallel_fft_direct(values);
2662        
2663        write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 } ); 
2664    }
2665
2666    #[inline(always)]
2667    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
2668        let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56 });
2669
2670        let values = [
2671            extract_lo_hi_f32(input_packed[0], input_packed[14]),
2672            extract_hi_lo_f32(input_packed[0], input_packed[15]),
2673            extract_lo_hi_f32(input_packed[1], input_packed[15]),
2674            extract_hi_lo_f32(input_packed[1], input_packed[16]),
2675            extract_lo_hi_f32(input_packed[2], input_packed[16]),
2676            extract_hi_lo_f32(input_packed[2], input_packed[17]),
2677            extract_lo_hi_f32(input_packed[3], input_packed[17]),
2678            extract_hi_lo_f32(input_packed[3], input_packed[18]),
2679            extract_lo_hi_f32(input_packed[4], input_packed[18]),
2680            extract_hi_lo_f32(input_packed[4], input_packed[19]),
2681            extract_lo_hi_f32(input_packed[5], input_packed[19]),
2682            extract_hi_lo_f32(input_packed[5], input_packed[20]),
2683            extract_lo_hi_f32(input_packed[6], input_packed[20]),
2684            extract_hi_lo_f32(input_packed[6], input_packed[21]),
2685            extract_lo_hi_f32(input_packed[7], input_packed[21]),
2686            extract_hi_lo_f32(input_packed[7], input_packed[22]),
2687            extract_lo_hi_f32(input_packed[8], input_packed[22]),
2688            extract_hi_lo_f32(input_packed[8], input_packed[23]),
2689            extract_lo_hi_f32(input_packed[9], input_packed[23]),
2690            extract_hi_lo_f32(input_packed[9], input_packed[24]),
2691            extract_lo_hi_f32(input_packed[10], input_packed[24]),
2692            extract_hi_lo_f32(input_packed[10], input_packed[25]),
2693            extract_lo_hi_f32(input_packed[11], input_packed[25]),
2694            extract_hi_lo_f32(input_packed[11], input_packed[26]),
2695            extract_lo_hi_f32(input_packed[12], input_packed[26]),
2696            extract_hi_lo_f32(input_packed[12], input_packed[27]),
2697            extract_lo_hi_f32(input_packed[13], input_packed[27]),
2698            extract_hi_lo_f32(input_packed[13], input_packed[28]),
2699            extract_lo_hi_f32(input_packed[14], input_packed[28]),
2700        ];
2701
2702        let out = self.perform_parallel_fft_direct(values);
2703
2704        let out_packed = [
2705            extract_lo_lo_f32(out[0], out[1]),
2706            extract_lo_lo_f32(out[2], out[3]),
2707            extract_lo_lo_f32(out[4], out[5]),
2708            extract_lo_lo_f32(out[6], out[7]),
2709            extract_lo_lo_f32(out[8], out[9]),
2710            extract_lo_lo_f32(out[10], out[11]),
2711            extract_lo_lo_f32(out[12], out[13]),
2712            extract_lo_lo_f32(out[14], out[15]),
2713            extract_lo_lo_f32(out[16], out[17]),
2714            extract_lo_lo_f32(out[18], out[19]),
2715            extract_lo_lo_f32(out[20], out[21]),
2716            extract_lo_lo_f32(out[22], out[23]),
2717            extract_lo_lo_f32(out[24], out[25]),
2718            extract_lo_lo_f32(out[26], out[27]),
2719            extract_lo_hi_f32(out[28], out[0]),
2720            extract_hi_hi_f32(out[1], out[2]),
2721            extract_hi_hi_f32(out[3], out[4]),
2722            extract_hi_hi_f32(out[5], out[6]),
2723            extract_hi_hi_f32(out[7], out[8]),
2724            extract_hi_hi_f32(out[9], out[10]),
2725            extract_hi_hi_f32(out[11], out[12]),
2726            extract_hi_hi_f32(out[13], out[14]),
2727            extract_hi_hi_f32(out[15], out[16]),
2728            extract_hi_hi_f32(out[17], out[18]),
2729            extract_hi_hi_f32(out[19], out[20]),
2730            extract_hi_hi_f32(out[21], out[22]),
2731            extract_hi_hi_f32(out[23], out[24]),
2732            extract_hi_hi_f32(out[25], out[26]),
2733            extract_hi_hi_f32(out[27], out[28]),
2734        ];
2735
2736        write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });
2737    }
2738
2739    #[inline(always)]
2740    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 29]) -> [__m128; 29] {
2741        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
2742
2743        let y00 = values[0];
2744        let [x1p28, x1m28] =  SseVector::column_butterfly2([values[1], values[28]]);
2745        let x1m28 = SseVector::apply_rotate90(rotate, x1m28);
2746        let y00 = SseVector::add(y00, x1p28);
2747        let [x2p27, x2m27] =  SseVector::column_butterfly2([values[2], values[27]]);
2748        let x2m27 = SseVector::apply_rotate90(rotate, x2m27);
2749        let y00 = SseVector::add(y00, x2p27);
2750        let [x3p26, x3m26] =  SseVector::column_butterfly2([values[3], values[26]]);
2751        let x3m26 = SseVector::apply_rotate90(rotate, x3m26);
2752        let y00 = SseVector::add(y00, x3p26);
2753        let [x4p25, x4m25] =  SseVector::column_butterfly2([values[4], values[25]]);
2754        let x4m25 = SseVector::apply_rotate90(rotate, x4m25);
2755        let y00 = SseVector::add(y00, x4p25);
2756        let [x5p24, x5m24] =  SseVector::column_butterfly2([values[5], values[24]]);
2757        let x5m24 = SseVector::apply_rotate90(rotate, x5m24);
2758        let y00 = SseVector::add(y00, x5p24);
2759        let [x6p23, x6m23] =  SseVector::column_butterfly2([values[6], values[23]]);
2760        let x6m23 = SseVector::apply_rotate90(rotate, x6m23);
2761        let y00 = SseVector::add(y00, x6p23);
2762        let [x7p22, x7m22] =  SseVector::column_butterfly2([values[7], values[22]]);
2763        let x7m22 = SseVector::apply_rotate90(rotate, x7m22);
2764        let y00 = SseVector::add(y00, x7p22);
2765        let [x8p21, x8m21] =  SseVector::column_butterfly2([values[8], values[21]]);
2766        let x8m21 = SseVector::apply_rotate90(rotate, x8m21);
2767        let y00 = SseVector::add(y00, x8p21);
2768        let [x9p20, x9m20] =  SseVector::column_butterfly2([values[9], values[20]]);
2769        let x9m20 = SseVector::apply_rotate90(rotate, x9m20);
2770        let y00 = SseVector::add(y00, x9p20);
2771        let [x10p19, x10m19] =  SseVector::column_butterfly2([values[10], values[19]]);
2772        let x10m19 = SseVector::apply_rotate90(rotate, x10m19);
2773        let y00 = SseVector::add(y00, x10p19);
2774        let [x11p18, x11m18] =  SseVector::column_butterfly2([values[11], values[18]]);
2775        let x11m18 = SseVector::apply_rotate90(rotate, x11m18);
2776        let y00 = SseVector::add(y00, x11p18);
2777        let [x12p17, x12m17] =  SseVector::column_butterfly2([values[12], values[17]]);
2778        let x12m17 = SseVector::apply_rotate90(rotate, x12m17);
2779        let y00 = SseVector::add(y00, x12p17);
2780        let [x13p16, x13m16] =  SseVector::column_butterfly2([values[13], values[16]]);
2781        let x13m16 = SseVector::apply_rotate90(rotate, x13m16);
2782        let y00 = SseVector::add(y00, x13p16);
2783        let [x14p15, x14m15] =  SseVector::column_butterfly2([values[14], values[15]]);
2784        let x14m15 = SseVector::apply_rotate90(rotate, x14m15);
2785        let y00 = SseVector::add(y00, x14p15);
2786
2787        let m0128a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p28);
2788        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[1], x2p27);
2789        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[2], x3p26);
2790        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[3], x4p25);
2791        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[4], x5p24);
2792        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[5], x6p23);
2793        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[6], x7p22);
2794        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[7], x8p21);
2795        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[8], x9p20);
2796        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[9], x10p19);
2797        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[10], x11p18);
2798        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[11], x12p17);
2799        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[12], x13p16);
2800        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[13], x14p15);
2801        let m0128b = SseVector::mul(self.twiddles_im[0], x1m28);
2802        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[1], x2m27);
2803        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[2], x3m26);
2804        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[3], x4m25);
2805        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[4], x5m24);
2806        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[5], x6m23);
2807        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[6], x7m22);
2808        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[7], x8m21);
2809        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[8], x9m20);
2810        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[9], x10m19);
2811        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[10], x11m18);
2812        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[11], x12m17);
2813        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[12], x13m16);
2814        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[13], x14m15);
2815        let [y01, y28] = SseVector::column_butterfly2([m0128a, m0128b]);
2816
2817        let m0227a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p28);
2818        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[3], x2p27);
2819        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[5], x3p26);
2820        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[7], x4p25);
2821        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[9], x5p24);
2822        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[11], x6p23);
2823        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[13], x7p22);
2824        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[12], x8p21);
2825        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[10], x9p20);
2826        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[8], x10p19);
2827        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[6], x11p18);
2828        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[4], x12p17);
2829        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[2], x13p16);
2830        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[0], x14p15);
2831        let m0227b = SseVector::mul(self.twiddles_im[1], x1m28);
2832        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[3], x2m27);
2833        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[5], x3m26);
2834        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[7], x4m25);
2835        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[9], x5m24);
2836        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[11], x6m23);
2837        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[13], x7m22);
2838        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[12], x8m21);
2839        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[10], x9m20);
2840        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[8], x10m19);
2841        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[6], x11m18);
2842        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[4], x12m17);
2843        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[2], x13m16);
2844        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[0], x14m15);
2845        let [y02, y27] = SseVector::column_butterfly2([m0227a, m0227b]);
2846
2847        let m0326a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p28);
2848        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[5], x2p27);
2849        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[8], x3p26);
2850        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[11], x4p25);
2851        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[13], x5p24);
2852        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[10], x6p23);
2853        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[7], x7p22);
2854        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[4], x8p21);
2855        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[1], x9p20);
2856        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[0], x10p19);
2857        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[3], x11p18);
2858        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[6], x12p17);
2859        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[9], x13p16);
2860        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[12], x14p15);
2861        let m0326b = SseVector::mul(self.twiddles_im[2], x1m28);
2862        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[5], x2m27);
2863        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[8], x3m26);
2864        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[11], x4m25);
2865        let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[13], x5m24);
2866        let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[10], x6m23);
2867        let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[7], x7m22);
2868        let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[4], x8m21);
2869        let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[1], x9m20);
2870        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[0], x10m19);
2871        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[3], x11m18);
2872        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[6], x12m17);
2873        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[9], x13m16);
2874        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[12], x14m15);
2875        let [y03, y26] = SseVector::column_butterfly2([m0326a, m0326b]);
2876
2877        let m0425a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p28);
2878        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[7], x2p27);
2879        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[11], x3p26);
2880        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[12], x4p25);
2881        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[8], x5p24);
2882        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[4], x6p23);
2883        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[0], x7p22);
2884        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[2], x8p21);
2885        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[6], x9p20);
2886        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[10], x10p19);
2887        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[13], x11p18);
2888        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[9], x12p17);
2889        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[5], x13p16);
2890        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[1], x14p15);
2891        let m0425b = SseVector::mul(self.twiddles_im[3], x1m28);
2892        let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[7], x2m27);
2893        let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[11], x3m26);
2894        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[12], x4m25);
2895        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[8], x5m24);
2896        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[4], x6m23);
2897        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[0], x7m22);
2898        let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[2], x8m21);
2899        let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[6], x9m20);
2900        let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[10], x10m19);
2901        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[13], x11m18);
2902        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[9], x12m17);
2903        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[5], x13m16);
2904        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[1], x14m15);
2905        let [y04, y25] = SseVector::column_butterfly2([m0425a, m0425b]);
2906
2907        let m0524a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p28);
2908        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[9], x2p27);
2909        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[13], x3p26);
2910        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[8], x4p25);
2911        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[3], x5p24);
2912        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[0], x6p23);
2913        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[5], x7p22);
2914        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[10], x8p21);
2915        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[12], x9p20);
2916        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[7], x10p19);
2917        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[2], x11p18);
2918        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[1], x12p17);
2919        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[6], x13p16);
2920        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[11], x14p15);
2921        let m0524b = SseVector::mul(self.twiddles_im[4], x1m28);
2922        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[9], x2m27);
2923        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[13], x3m26);
2924        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[8], x4m25);
2925        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[3], x5m24);
2926        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[0], x6m23);
2927        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[5], x7m22);
2928        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[10], x8m21);
2929        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[12], x9m20);
2930        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[7], x10m19);
2931        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[2], x11m18);
2932        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[1], x12m17);
2933        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[6], x13m16);
2934        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[11], x14m15);
2935        let [y05, y24] = SseVector::column_butterfly2([m0524a, m0524b]);
2936
2937        let m0623a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p28);
2938        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[11], x2p27);
2939        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[10], x3p26);
2940        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[4], x4p25);
2941        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[0], x5p24);
2942        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[6], x6p23);
2943        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[12], x7p22);
2944        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[9], x8p21);
2945        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[3], x9p20);
2946        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[1], x10p19);
2947        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[7], x11p18);
2948        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[13], x12p17);
2949        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[8], x13p16);
2950        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[2], x14p15);
2951        let m0623b = SseVector::mul(self.twiddles_im[5], x1m28);
2952        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[11], x2m27);
2953        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[10], x3m26);
2954        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[4], x4m25);
2955        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[0], x5m24);
2956        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[6], x6m23);
2957        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[12], x7m22);
2958        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[9], x8m21);
2959        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[3], x9m20);
2960        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[1], x10m19);
2961        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[7], x11m18);
2962        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[13], x12m17);
2963        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[8], x13m16);
2964        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[2], x14m15);
2965        let [y06, y23] = SseVector::column_butterfly2([m0623a, m0623b]);
2966
2967        let m0722a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p28);
2968        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[13], x2p27);
2969        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[7], x3p26);
2970        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[0], x4p25);
2971        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[5], x5p24);
2972        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[12], x6p23);
2973        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[8], x7p22);
2974        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[1], x8p21);
2975        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[4], x9p20);
2976        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[11], x10p19);
2977        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[9], x11p18);
2978        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[2], x12p17);
2979        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[3], x13p16);
2980        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[10], x14p15);
2981        let m0722b = SseVector::mul(self.twiddles_im[6], x1m28);
2982        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[13], x2m27);
2983        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[7], x3m26);
2984        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[0], x4m25);
2985        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[5], x5m24);
2986        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[12], x6m23);
2987        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[8], x7m22);
2988        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[1], x8m21);
2989        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[4], x9m20);
2990        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[11], x10m19);
2991        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[9], x11m18);
2992        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[2], x12m17);
2993        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[3], x13m16);
2994        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[10], x14m15);
2995        let [y07, y22] = SseVector::column_butterfly2([m0722a, m0722b]);
2996
2997        let m0821a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p28);
2998        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[12], x2p27);
2999        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[4], x3p26);
3000        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[2], x4p25);
3001        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[10], x5p24);
3002        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[9], x6p23);
3003        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[1], x7p22);
3004        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[5], x8p21);
3005        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[13], x9p20);
3006        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[6], x10p19);
3007        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[0], x11p18);
3008        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[8], x12p17);
3009        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[11], x13p16);
3010        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[3], x14p15);
3011        let m0821b = SseVector::mul(self.twiddles_im[7], x1m28);
3012        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[12], x2m27);
3013        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[4], x3m26);
3014        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[2], x4m25);
3015        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[10], x5m24);
3016        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[9], x6m23);
3017        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[1], x7m22);
3018        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[5], x8m21);
3019        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[13], x9m20);
3020        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[6], x10m19);
3021        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[0], x11m18);
3022        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[8], x12m17);
3023        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[11], x13m16);
3024        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[3], x14m15);
3025        let [y08, y21] = SseVector::column_butterfly2([m0821a, m0821b]);
3026
3027        let m0920a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p28);
3028        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[10], x2p27);
3029        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[1], x3p26);
3030        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[6], x4p25);
3031        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[12], x5p24);
3032        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[3], x6p23);
3033        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[4], x7p22);
3034        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[13], x8p21);
3035        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[5], x9p20);
3036        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[2], x10p19);
3037        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[11], x11p18);
3038        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[7], x12p17);
3039        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[0], x13p16);
3040        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[9], x14p15);
3041        let m0920b = SseVector::mul(self.twiddles_im[8], x1m28);
3042        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[10], x2m27);
3043        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[1], x3m26);
3044        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[6], x4m25);
3045        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[12], x5m24);
3046        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[3], x6m23);
3047        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[4], x7m22);
3048        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[13], x8m21);
3049        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[5], x9m20);
3050        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[2], x10m19);
3051        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[11], x11m18);
3052        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[7], x12m17);
3053        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[0], x13m16);
3054        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[9], x14m15);
3055        let [y09, y20] = SseVector::column_butterfly2([m0920a, m0920b]);
3056
3057        let m1019a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p28);
3058        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[8], x2p27);
3059        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[0], x3p26);
3060        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[10], x4p25);
3061        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[7], x5p24);
3062        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[1], x6p23);
3063        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[11], x7p22);
3064        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[6], x8p21);
3065        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[2], x9p20);
3066        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[12], x10p19);
3067        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[5], x11p18);
3068        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[3], x12p17);
3069        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[13], x13p16);
3070        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[4], x14p15);
3071        let m1019b = SseVector::mul(self.twiddles_im[9], x1m28);
3072        let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[8], x2m27);
3073        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[0], x3m26);
3074        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[10], x4m25);
3075        let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[7], x5m24);
3076        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[1], x6m23);
3077        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[11], x7m22);
3078        let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[6], x8m21);
3079        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[2], x9m20);
3080        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[12], x10m19);
3081        let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[5], x11m18);
3082        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[3], x12m17);
3083        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[13], x13m16);
3084        let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[4], x14m15);
3085        let [y10, y19] = SseVector::column_butterfly2([m1019a, m1019b]);
3086
3087        let m1118a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p28);
3088        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[6], x2p27);
3089        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[3], x3p26);
3090        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[13], x4p25);
3091        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[2], x5p24);
3092        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[7], x6p23);
3093        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[9], x7p22);
3094        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[0], x8p21);
3095        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[11], x9p20);
3096        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[5], x10p19);
3097        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[4], x11p18);
3098        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[12], x12p17);
3099        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[1], x13p16);
3100        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[8], x14p15);
3101        let m1118b = SseVector::mul(self.twiddles_im[10], x1m28);
3102        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[6], x2m27);
3103        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[3], x3m26);
3104        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[13], x4m25);
3105        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[2], x5m24);
3106        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[7], x6m23);
3107        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[9], x7m22);
3108        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[0], x8m21);
3109        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[11], x9m20);
3110        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[5], x10m19);
3111        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[4], x11m18);
3112        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[12], x12m17);
3113        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[1], x13m16);
3114        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[8], x14m15);
3115        let [y11, y18] = SseVector::column_butterfly2([m1118a, m1118b]);
3116
3117        let m1217a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p28);
3118        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[4], x2p27);
3119        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[6], x3p26);
3120        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[9], x4p25);
3121        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[1], x5p24);
3122        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[13], x6p23);
3123        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[2], x7p22);
3124        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[8], x8p21);
3125        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[7], x9p20);
3126        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[3], x10p19);
3127        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[12], x11p18);
3128        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[0], x12p17);
3129        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[10], x13p16);
3130        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[5], x14p15);
3131        let m1217b = SseVector::mul(self.twiddles_im[11], x1m28);
3132        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[4], x2m27);
3133        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[6], x3m26);
3134        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[9], x4m25);
3135        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[1], x5m24);
3136        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[13], x6m23);
3137        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[2], x7m22);
3138        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[8], x8m21);
3139        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[7], x9m20);
3140        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[3], x10m19);
3141        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[12], x11m18);
3142        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[0], x12m17);
3143        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[10], x13m16);
3144        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[5], x14m15);
3145        let [y12, y17] = SseVector::column_butterfly2([m1217a, m1217b]);
3146
3147        let m1316a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p28);
3148        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[2], x2p27);
3149        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[9], x3p26);
3150        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[5], x4p25);
3151        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[6], x5p24);
3152        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[8], x6p23);
3153        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[3], x7p22);
3154        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[11], x8p21);
3155        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[0], x9p20);
3156        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[13], x10p19);
3157        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[1], x11p18);
3158        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[10], x12p17);
3159        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[4], x13p16);
3160        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[7], x14p15);
3161        let m1316b = SseVector::mul(self.twiddles_im[12], x1m28);
3162        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[2], x2m27);
3163        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[9], x3m26);
3164        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[5], x4m25);
3165        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[6], x5m24);
3166        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[8], x6m23);
3167        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[3], x7m22);
3168        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[11], x8m21);
3169        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[0], x9m20);
3170        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[13], x10m19);
3171        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[1], x11m18);
3172        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[10], x12m17);
3173        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[4], x13m16);
3174        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[7], x14m15);
3175        let [y13, y16] = SseVector::column_butterfly2([m1316a, m1316b]);
3176
3177        let m1415a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p28);
3178        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[0], x2p27);
3179        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[12], x3p26);
3180        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[1], x4p25);
3181        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[11], x5p24);
3182        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[2], x6p23);
3183        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[10], x7p22);
3184        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[3], x8p21);
3185        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[9], x9p20);
3186        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[4], x10p19);
3187        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[8], x11p18);
3188        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[5], x12p17);
3189        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[7], x13p16);
3190        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[6], x14p15);
3191        let m1415b = SseVector::mul(self.twiddles_im[13], x1m28);
3192        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[0], x2m27);
3193        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[12], x3m26);
3194        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[1], x4m25);
3195        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[11], x5m24);
3196        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[2], x6m23);
3197        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[10], x7m22);
3198        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[3], x8m21);
3199        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[9], x9m20);
3200        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[4], x10m19);
3201        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[8], x11m18);
3202        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[5], x12m17);
3203        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[7], x13m16);
3204        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[6], x14m15);
3205        let [y14, y15] = SseVector::column_butterfly2([m1415a, m1415b]);
3206
3207
3208        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28]
3209    }
3210}
3211
3212struct SseF64Butterfly29<T> {
3213    direction: FftDirection,
3214    twiddles_re: [__m128d; 14],
3215    twiddles_im: [__m128d; 14],
3216    _phantom: std::marker::PhantomData<T>,
3217}
3218
3219boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly29);
3220boilerplate_fft_sse_common_butterfly!(SseF64Butterfly29, 29, |this: &SseF64Butterfly29<_>| this.direction);
3221impl<T: FftNum> SseF64Butterfly29<T> {
3222    /// Safety: The current machine must support the sse4.1 instruction set
3223    #[target_feature(enable = "sse4.1")]
3224    unsafe fn new(direction: FftDirection) -> Self {
3225        assert_f64::<T>();
3226        let twiddles = make_twiddles(29, direction);
3227        unsafe {Self {
3228            direction,
3229            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
3230            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
3231            _phantom: std::marker::PhantomData,
3232        }}
3233    }
3234
3235    #[inline(always)]
3236    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
3237        let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });
3238
3239        let out = self.perform_fft_direct(values);
3240
3241        write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });   
3242    }
3243
3244    #[inline(always)]
3245    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 29]) -> [__m128d; 29] {
3246        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
3247
3248        let y00 = values[0];
3249        let [x1p28, x1m28] =  SseVector::column_butterfly2([values[1], values[28]]);
3250        let x1m28 = SseVector::apply_rotate90(rotate, x1m28);
3251        let y00 = SseVector::add(y00, x1p28);
3252        let [x2p27, x2m27] =  SseVector::column_butterfly2([values[2], values[27]]);
3253        let x2m27 = SseVector::apply_rotate90(rotate, x2m27);
3254        let y00 = SseVector::add(y00, x2p27);
3255        let [x3p26, x3m26] =  SseVector::column_butterfly2([values[3], values[26]]);
3256        let x3m26 = SseVector::apply_rotate90(rotate, x3m26);
3257        let y00 = SseVector::add(y00, x3p26);
3258        let [x4p25, x4m25] =  SseVector::column_butterfly2([values[4], values[25]]);
3259        let x4m25 = SseVector::apply_rotate90(rotate, x4m25);
3260        let y00 = SseVector::add(y00, x4p25);
3261        let [x5p24, x5m24] =  SseVector::column_butterfly2([values[5], values[24]]);
3262        let x5m24 = SseVector::apply_rotate90(rotate, x5m24);
3263        let y00 = SseVector::add(y00, x5p24);
3264        let [x6p23, x6m23] =  SseVector::column_butterfly2([values[6], values[23]]);
3265        let x6m23 = SseVector::apply_rotate90(rotate, x6m23);
3266        let y00 = SseVector::add(y00, x6p23);
3267        let [x7p22, x7m22] =  SseVector::column_butterfly2([values[7], values[22]]);
3268        let x7m22 = SseVector::apply_rotate90(rotate, x7m22);
3269        let y00 = SseVector::add(y00, x7p22);
3270        let [x8p21, x8m21] =  SseVector::column_butterfly2([values[8], values[21]]);
3271        let x8m21 = SseVector::apply_rotate90(rotate, x8m21);
3272        let y00 = SseVector::add(y00, x8p21);
3273        let [x9p20, x9m20] =  SseVector::column_butterfly2([values[9], values[20]]);
3274        let x9m20 = SseVector::apply_rotate90(rotate, x9m20);
3275        let y00 = SseVector::add(y00, x9p20);
3276        let [x10p19, x10m19] =  SseVector::column_butterfly2([values[10], values[19]]);
3277        let x10m19 = SseVector::apply_rotate90(rotate, x10m19);
3278        let y00 = SseVector::add(y00, x10p19);
3279        let [x11p18, x11m18] =  SseVector::column_butterfly2([values[11], values[18]]);
3280        let x11m18 = SseVector::apply_rotate90(rotate, x11m18);
3281        let y00 = SseVector::add(y00, x11p18);
3282        let [x12p17, x12m17] =  SseVector::column_butterfly2([values[12], values[17]]);
3283        let x12m17 = SseVector::apply_rotate90(rotate, x12m17);
3284        let y00 = SseVector::add(y00, x12p17);
3285        let [x13p16, x13m16] =  SseVector::column_butterfly2([values[13], values[16]]);
3286        let x13m16 = SseVector::apply_rotate90(rotate, x13m16);
3287        let y00 = SseVector::add(y00, x13p16);
3288        let [x14p15, x14m15] =  SseVector::column_butterfly2([values[14], values[15]]);
3289        let x14m15 = SseVector::apply_rotate90(rotate, x14m15);
3290        let y00 = SseVector::add(y00, x14p15);
3291
3292        let m0128a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p28);
3293        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[1], x2p27);
3294        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[2], x3p26);
3295        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[3], x4p25);
3296        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[4], x5p24);
3297        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[5], x6p23);
3298        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[6], x7p22);
3299        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[7], x8p21);
3300        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[8], x9p20);
3301        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[9], x10p19);
3302        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[10], x11p18);
3303        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[11], x12p17);
3304        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[12], x13p16);
3305        let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[13], x14p15);
3306        let m0128b = SseVector::mul(self.twiddles_im[0], x1m28);
3307        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[1], x2m27);
3308        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[2], x3m26);
3309        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[3], x4m25);
3310        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[4], x5m24);
3311        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[5], x6m23);
3312        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[6], x7m22);
3313        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[7], x8m21);
3314        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[8], x9m20);
3315        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[9], x10m19);
3316        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[10], x11m18);
3317        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[11], x12m17);
3318        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[12], x13m16);
3319        let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[13], x14m15);
3320        let [y01, y28] = SseVector::column_butterfly2([m0128a, m0128b]);
3321
3322        let m0227a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p28);
3323        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[3], x2p27);
3324        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[5], x3p26);
3325        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[7], x4p25);
3326        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[9], x5p24);
3327        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[11], x6p23);
3328        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[13], x7p22);
3329        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[12], x8p21);
3330        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[10], x9p20);
3331        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[8], x10p19);
3332        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[6], x11p18);
3333        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[4], x12p17);
3334        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[2], x13p16);
3335        let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[0], x14p15);
3336        let m0227b = SseVector::mul(self.twiddles_im[1], x1m28);
3337        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[3], x2m27);
3338        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[5], x3m26);
3339        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[7], x4m25);
3340        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[9], x5m24);
3341        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[11], x6m23);
3342        let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[13], x7m22);
3343        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[12], x8m21);
3344        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[10], x9m20);
3345        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[8], x10m19);
3346        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[6], x11m18);
3347        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[4], x12m17);
3348        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[2], x13m16);
3349        let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[0], x14m15);
3350        let [y02, y27] = SseVector::column_butterfly2([m0227a, m0227b]);
3351
3352        let m0326a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p28);
3353        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[5], x2p27);
3354        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[8], x3p26);
3355        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[11], x4p25);
3356        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[13], x5p24);
3357        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[10], x6p23);
3358        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[7], x7p22);
3359        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[4], x8p21);
3360        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[1], x9p20);
3361        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[0], x10p19);
3362        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[3], x11p18);
3363        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[6], x12p17);
3364        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[9], x13p16);
3365        let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[12], x14p15);
3366        let m0326b = SseVector::mul(self.twiddles_im[2], x1m28);
3367        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[5], x2m27);
3368        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[8], x3m26);
3369        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[11], x4m25);
3370        let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[13], x5m24);
3371        let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[10], x6m23);
3372        let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[7], x7m22);
3373        let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[4], x8m21);
3374        let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[1], x9m20);
3375        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[0], x10m19);
3376        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[3], x11m18);
3377        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[6], x12m17);
3378        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[9], x13m16);
3379        let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[12], x14m15);
3380        let [y03, y26] = SseVector::column_butterfly2([m0326a, m0326b]);
3381
3382        let m0425a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p28);
3383        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[7], x2p27);
3384        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[11], x3p26);
3385        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[12], x4p25);
3386        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[8], x5p24);
3387        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[4], x6p23);
3388        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[0], x7p22);
3389        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[2], x8p21);
3390        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[6], x9p20);
3391        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[10], x10p19);
3392        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[13], x11p18);
3393        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[9], x12p17);
3394        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[5], x13p16);
3395        let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[1], x14p15);
3396        let m0425b = SseVector::mul(self.twiddles_im[3], x1m28);
3397        let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[7], x2m27);
3398        let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[11], x3m26);
3399        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[12], x4m25);
3400        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[8], x5m24);
3401        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[4], x6m23);
3402        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[0], x7m22);
3403        let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[2], x8m21);
3404        let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[6], x9m20);
3405        let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[10], x10m19);
3406        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[13], x11m18);
3407        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[9], x12m17);
3408        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[5], x13m16);
3409        let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[1], x14m15);
3410        let [y04, y25] = SseVector::column_butterfly2([m0425a, m0425b]);
3411
3412        let m0524a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p28);
3413        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[9], x2p27);
3414        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[13], x3p26);
3415        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[8], x4p25);
3416        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[3], x5p24);
3417        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[0], x6p23);
3418        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[5], x7p22);
3419        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[10], x8p21);
3420        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[12], x9p20);
3421        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[7], x10p19);
3422        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[2], x11p18);
3423        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[1], x12p17);
3424        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[6], x13p16);
3425        let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[11], x14p15);
3426        let m0524b = SseVector::mul(self.twiddles_im[4], x1m28);
3427        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[9], x2m27);
3428        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[13], x3m26);
3429        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[8], x4m25);
3430        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[3], x5m24);
3431        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[0], x6m23);
3432        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[5], x7m22);
3433        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[10], x8m21);
3434        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[12], x9m20);
3435        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[7], x10m19);
3436        let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[2], x11m18);
3437        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[1], x12m17);
3438        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[6], x13m16);
3439        let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[11], x14m15);
3440        let [y05, y24] = SseVector::column_butterfly2([m0524a, m0524b]);
3441
3442        let m0623a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p28);
3443        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[11], x2p27);
3444        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[10], x3p26);
3445        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[4], x4p25);
3446        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[0], x5p24);
3447        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[6], x6p23);
3448        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[12], x7p22);
3449        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[9], x8p21);
3450        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[3], x9p20);
3451        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[1], x10p19);
3452        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[7], x11p18);
3453        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[13], x12p17);
3454        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[8], x13p16);
3455        let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[2], x14p15);
3456        let m0623b = SseVector::mul(self.twiddles_im[5], x1m28);
3457        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[11], x2m27);
3458        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[10], x3m26);
3459        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[4], x4m25);
3460        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[0], x5m24);
3461        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[6], x6m23);
3462        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[12], x7m22);
3463        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[9], x8m21);
3464        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[3], x9m20);
3465        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[1], x10m19);
3466        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[7], x11m18);
3467        let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[13], x12m17);
3468        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[8], x13m16);
3469        let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[2], x14m15);
3470        let [y06, y23] = SseVector::column_butterfly2([m0623a, m0623b]);
3471
3472        let m0722a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p28);
3473        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[13], x2p27);
3474        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[7], x3p26);
3475        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[0], x4p25);
3476        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[5], x5p24);
3477        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[12], x6p23);
3478        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[8], x7p22);
3479        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[1], x8p21);
3480        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[4], x9p20);
3481        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[11], x10p19);
3482        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[9], x11p18);
3483        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[2], x12p17);
3484        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[3], x13p16);
3485        let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[10], x14p15);
3486        let m0722b = SseVector::mul(self.twiddles_im[6], x1m28);
3487        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[13], x2m27);
3488        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[7], x3m26);
3489        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[0], x4m25);
3490        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[5], x5m24);
3491        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[12], x6m23);
3492        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[8], x7m22);
3493        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[1], x8m21);
3494        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[4], x9m20);
3495        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[11], x10m19);
3496        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[9], x11m18);
3497        let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[2], x12m17);
3498        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[3], x13m16);
3499        let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[10], x14m15);
3500        let [y07, y22] = SseVector::column_butterfly2([m0722a, m0722b]);
3501
3502        let m0821a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p28);
3503        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[12], x2p27);
3504        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[4], x3p26);
3505        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[2], x4p25);
3506        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[10], x5p24);
3507        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[9], x6p23);
3508        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[1], x7p22);
3509        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[5], x8p21);
3510        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[13], x9p20);
3511        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[6], x10p19);
3512        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[0], x11p18);
3513        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[8], x12p17);
3514        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[11], x13p16);
3515        let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[3], x14p15);
3516        let m0821b = SseVector::mul(self.twiddles_im[7], x1m28);
3517        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[12], x2m27);
3518        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[4], x3m26);
3519        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[2], x4m25);
3520        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[10], x5m24);
3521        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[9], x6m23);
3522        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[1], x7m22);
3523        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[5], x8m21);
3524        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[13], x9m20);
3525        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[6], x10m19);
3526        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[0], x11m18);
3527        let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[8], x12m17);
3528        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[11], x13m16);
3529        let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[3], x14m15);
3530        let [y08, y21] = SseVector::column_butterfly2([m0821a, m0821b]);
3531
3532        let m0920a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p28);
3533        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[10], x2p27);
3534        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[1], x3p26);
3535        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[6], x4p25);
3536        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[12], x5p24);
3537        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[3], x6p23);
3538        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[4], x7p22);
3539        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[13], x8p21);
3540        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[5], x9p20);
3541        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[2], x10p19);
3542        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[11], x11p18);
3543        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[7], x12p17);
3544        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[0], x13p16);
3545        let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[9], x14p15);
3546        let m0920b = SseVector::mul(self.twiddles_im[8], x1m28);
3547        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[10], x2m27);
3548        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[1], x3m26);
3549        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[6], x4m25);
3550        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[12], x5m24);
3551        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[3], x6m23);
3552        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[4], x7m22);
3553        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[13], x8m21);
3554        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[5], x9m20);
3555        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[2], x10m19);
3556        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[11], x11m18);
3557        let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[7], x12m17);
3558        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[0], x13m16);
3559        let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[9], x14m15);
3560        let [y09, y20] = SseVector::column_butterfly2([m0920a, m0920b]);
3561
3562        let m1019a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p28);
3563        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[8], x2p27);
3564        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[0], x3p26);
3565        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[10], x4p25);
3566        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[7], x5p24);
3567        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[1], x6p23);
3568        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[11], x7p22);
3569        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[6], x8p21);
3570        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[2], x9p20);
3571        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[12], x10p19);
3572        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[5], x11p18);
3573        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[3], x12p17);
3574        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[13], x13p16);
3575        let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[4], x14p15);
3576        let m1019b = SseVector::mul(self.twiddles_im[9], x1m28);
3577        let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[8], x2m27);
3578        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[0], x3m26);
3579        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[10], x4m25);
3580        let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[7], x5m24);
3581        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[1], x6m23);
3582        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[11], x7m22);
3583        let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[6], x8m21);
3584        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[2], x9m20);
3585        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[12], x10m19);
3586        let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[5], x11m18);
3587        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[3], x12m17);
3588        let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[13], x13m16);
3589        let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[4], x14m15);
3590        let [y10, y19] = SseVector::column_butterfly2([m1019a, m1019b]);
3591
3592        let m1118a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p28);
3593        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[6], x2p27);
3594        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[3], x3p26);
3595        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[13], x4p25);
3596        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[2], x5p24);
3597        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[7], x6p23);
3598        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[9], x7p22);
3599        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[0], x8p21);
3600        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[11], x9p20);
3601        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[5], x10p19);
3602        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[4], x11p18);
3603        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[12], x12p17);
3604        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[1], x13p16);
3605        let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[8], x14p15);
3606        let m1118b = SseVector::mul(self.twiddles_im[10], x1m28);
3607        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[6], x2m27);
3608        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[3], x3m26);
3609        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[13], x4m25);
3610        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[2], x5m24);
3611        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[7], x6m23);
3612        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[9], x7m22);
3613        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[0], x8m21);
3614        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[11], x9m20);
3615        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[5], x10m19);
3616        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[4], x11m18);
3617        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[12], x12m17);
3618        let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[1], x13m16);
3619        let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[8], x14m15);
3620        let [y11, y18] = SseVector::column_butterfly2([m1118a, m1118b]);
3621
3622        let m1217a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p28);
3623        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[4], x2p27);
3624        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[6], x3p26);
3625        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[9], x4p25);
3626        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[1], x5p24);
3627        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[13], x6p23);
3628        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[2], x7p22);
3629        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[8], x8p21);
3630        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[7], x9p20);
3631        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[3], x10p19);
3632        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[12], x11p18);
3633        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[0], x12p17);
3634        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[10], x13p16);
3635        let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[5], x14p15);
3636        let m1217b = SseVector::mul(self.twiddles_im[11], x1m28);
3637        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[4], x2m27);
3638        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[6], x3m26);
3639        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[9], x4m25);
3640        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[1], x5m24);
3641        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[13], x6m23);
3642        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[2], x7m22);
3643        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[8], x8m21);
3644        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[7], x9m20);
3645        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[3], x10m19);
3646        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[12], x11m18);
3647        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[0], x12m17);
3648        let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[10], x13m16);
3649        let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[5], x14m15);
3650        let [y12, y17] = SseVector::column_butterfly2([m1217a, m1217b]);
3651
3652        let m1316a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p28);
3653        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[2], x2p27);
3654        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[9], x3p26);
3655        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[5], x4p25);
3656        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[6], x5p24);
3657        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[8], x6p23);
3658        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[3], x7p22);
3659        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[11], x8p21);
3660        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[0], x9p20);
3661        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[13], x10p19);
3662        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[1], x11p18);
3663        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[10], x12p17);
3664        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[4], x13p16);
3665        let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[7], x14p15);
3666        let m1316b = SseVector::mul(self.twiddles_im[12], x1m28);
3667        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[2], x2m27);
3668        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[9], x3m26);
3669        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[5], x4m25);
3670        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[6], x5m24);
3671        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[8], x6m23);
3672        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[3], x7m22);
3673        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[11], x8m21);
3674        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[0], x9m20);
3675        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[13], x10m19);
3676        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[1], x11m18);
3677        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[10], x12m17);
3678        let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[4], x13m16);
3679        let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[7], x14m15);
3680        let [y13, y16] = SseVector::column_butterfly2([m1316a, m1316b]);
3681
3682        let m1415a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p28);
3683        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[0], x2p27);
3684        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[12], x3p26);
3685        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[1], x4p25);
3686        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[11], x5p24);
3687        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[2], x6p23);
3688        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[10], x7p22);
3689        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[3], x8p21);
3690        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[9], x9p20);
3691        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[4], x10p19);
3692        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[8], x11p18);
3693        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[5], x12p17);
3694        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[7], x13p16);
3695        let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[6], x14p15);
3696        let m1415b = SseVector::mul(self.twiddles_im[13], x1m28);
3697        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[0], x2m27);
3698        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[12], x3m26);
3699        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[1], x4m25);
3700        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[11], x5m24);
3701        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[2], x6m23);
3702        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[10], x7m22);
3703        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[3], x8m21);
3704        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[9], x9m20);
3705        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[4], x10m19);
3706        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[8], x11m18);
3707        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[5], x12m17);
3708        let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[7], x13m16);
3709        let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[6], x14m15);
3710        let [y14, y15] = SseVector::column_butterfly2([m1415a, m1415b]);
3711
3712
3713        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28]
3714    }
3715}
3716
3717struct SseF32Butterfly31<T> {
3718    direction: FftDirection,
3719    twiddles_re: [__m128; 15],
3720    twiddles_im: [__m128; 15],
3721    _phantom: std::marker::PhantomData<T>,
3722}
3723
3724boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly31);
3725boilerplate_fft_sse_common_butterfly!(SseF32Butterfly31, 31, |this: &SseF32Butterfly31<_>| this.direction);
3726impl<T: FftNum> SseF32Butterfly31<T> {
3727    /// Safety: The current machine must support the sse4.1 instruction set
3728    #[target_feature(enable = "sse4.1")]
3729    unsafe fn new(direction: FftDirection) -> Self {
3730        assert_f32::<T>();
3731        let twiddles = make_twiddles(31, direction);
3732        Self {
3733            direction,
3734            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
3735            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
3736            _phantom: std::marker::PhantomData,
3737        }
3738    }
3739
3740    #[inline(always)]
3741    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
3742        let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 });
3743
3744        let out = self.perform_parallel_fft_direct(values);
3745        
3746        write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 } ); 
3747    }
3748
3749    #[inline(always)]
3750    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
3751        let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60 });
3752
3753        let values = [
3754            extract_lo_hi_f32(input_packed[0], input_packed[15]),
3755            extract_hi_lo_f32(input_packed[0], input_packed[16]),
3756            extract_lo_hi_f32(input_packed[1], input_packed[16]),
3757            extract_hi_lo_f32(input_packed[1], input_packed[17]),
3758            extract_lo_hi_f32(input_packed[2], input_packed[17]),
3759            extract_hi_lo_f32(input_packed[2], input_packed[18]),
3760            extract_lo_hi_f32(input_packed[3], input_packed[18]),
3761            extract_hi_lo_f32(input_packed[3], input_packed[19]),
3762            extract_lo_hi_f32(input_packed[4], input_packed[19]),
3763            extract_hi_lo_f32(input_packed[4], input_packed[20]),
3764            extract_lo_hi_f32(input_packed[5], input_packed[20]),
3765            extract_hi_lo_f32(input_packed[5], input_packed[21]),
3766            extract_lo_hi_f32(input_packed[6], input_packed[21]),
3767            extract_hi_lo_f32(input_packed[6], input_packed[22]),
3768            extract_lo_hi_f32(input_packed[7], input_packed[22]),
3769            extract_hi_lo_f32(input_packed[7], input_packed[23]),
3770            extract_lo_hi_f32(input_packed[8], input_packed[23]),
3771            extract_hi_lo_f32(input_packed[8], input_packed[24]),
3772            extract_lo_hi_f32(input_packed[9], input_packed[24]),
3773            extract_hi_lo_f32(input_packed[9], input_packed[25]),
3774            extract_lo_hi_f32(input_packed[10], input_packed[25]),
3775            extract_hi_lo_f32(input_packed[10], input_packed[26]),
3776            extract_lo_hi_f32(input_packed[11], input_packed[26]),
3777            extract_hi_lo_f32(input_packed[11], input_packed[27]),
3778            extract_lo_hi_f32(input_packed[12], input_packed[27]),
3779            extract_hi_lo_f32(input_packed[12], input_packed[28]),
3780            extract_lo_hi_f32(input_packed[13], input_packed[28]),
3781            extract_hi_lo_f32(input_packed[13], input_packed[29]),
3782            extract_lo_hi_f32(input_packed[14], input_packed[29]),
3783            extract_hi_lo_f32(input_packed[14], input_packed[30]),
3784            extract_lo_hi_f32(input_packed[15], input_packed[30]),
3785        ];
3786
3787        let out = self.perform_parallel_fft_direct(values);
3788
3789        let out_packed = [
3790            extract_lo_lo_f32(out[0], out[1]),
3791            extract_lo_lo_f32(out[2], out[3]),
3792            extract_lo_lo_f32(out[4], out[5]),
3793            extract_lo_lo_f32(out[6], out[7]),
3794            extract_lo_lo_f32(out[8], out[9]),
3795            extract_lo_lo_f32(out[10], out[11]),
3796            extract_lo_lo_f32(out[12], out[13]),
3797            extract_lo_lo_f32(out[14], out[15]),
3798            extract_lo_lo_f32(out[16], out[17]),
3799            extract_lo_lo_f32(out[18], out[19]),
3800            extract_lo_lo_f32(out[20], out[21]),
3801            extract_lo_lo_f32(out[22], out[23]),
3802            extract_lo_lo_f32(out[24], out[25]),
3803            extract_lo_lo_f32(out[26], out[27]),
3804            extract_lo_lo_f32(out[28], out[29]),
3805            extract_lo_hi_f32(out[30], out[0]),
3806            extract_hi_hi_f32(out[1], out[2]),
3807            extract_hi_hi_f32(out[3], out[4]),
3808            extract_hi_hi_f32(out[5], out[6]),
3809            extract_hi_hi_f32(out[7], out[8]),
3810            extract_hi_hi_f32(out[9], out[10]),
3811            extract_hi_hi_f32(out[11], out[12]),
3812            extract_hi_hi_f32(out[13], out[14]),
3813            extract_hi_hi_f32(out[15], out[16]),
3814            extract_hi_hi_f32(out[17], out[18]),
3815            extract_hi_hi_f32(out[19], out[20]),
3816            extract_hi_hi_f32(out[21], out[22]),
3817            extract_hi_hi_f32(out[23], out[24]),
3818            extract_hi_hi_f32(out[25], out[26]),
3819            extract_hi_hi_f32(out[27], out[28]),
3820            extract_hi_hi_f32(out[29], out[30]),
3821        ];
3822
3823        write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 });
3824    }
3825
3826    #[inline(always)]
3827    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 31]) -> [__m128; 31] {
3828        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
3829
3830        let y00 = values[0];
3831        let [x1p30, x1m30] =  SseVector::column_butterfly2([values[1], values[30]]);
3832        let x1m30 = SseVector::apply_rotate90(rotate, x1m30);
3833        let y00 = SseVector::add(y00, x1p30);
3834        let [x2p29, x2m29] =  SseVector::column_butterfly2([values[2], values[29]]);
3835        let x2m29 = SseVector::apply_rotate90(rotate, x2m29);
3836        let y00 = SseVector::add(y00, x2p29);
3837        let [x3p28, x3m28] =  SseVector::column_butterfly2([values[3], values[28]]);
3838        let x3m28 = SseVector::apply_rotate90(rotate, x3m28);
3839        let y00 = SseVector::add(y00, x3p28);
3840        let [x4p27, x4m27] =  SseVector::column_butterfly2([values[4], values[27]]);
3841        let x4m27 = SseVector::apply_rotate90(rotate, x4m27);
3842        let y00 = SseVector::add(y00, x4p27);
3843        let [x5p26, x5m26] =  SseVector::column_butterfly2([values[5], values[26]]);
3844        let x5m26 = SseVector::apply_rotate90(rotate, x5m26);
3845        let y00 = SseVector::add(y00, x5p26);
3846        let [x6p25, x6m25] =  SseVector::column_butterfly2([values[6], values[25]]);
3847        let x6m25 = SseVector::apply_rotate90(rotate, x6m25);
3848        let y00 = SseVector::add(y00, x6p25);
3849        let [x7p24, x7m24] =  SseVector::column_butterfly2([values[7], values[24]]);
3850        let x7m24 = SseVector::apply_rotate90(rotate, x7m24);
3851        let y00 = SseVector::add(y00, x7p24);
3852        let [x8p23, x8m23] =  SseVector::column_butterfly2([values[8], values[23]]);
3853        let x8m23 = SseVector::apply_rotate90(rotate, x8m23);
3854        let y00 = SseVector::add(y00, x8p23);
3855        let [x9p22, x9m22] =  SseVector::column_butterfly2([values[9], values[22]]);
3856        let x9m22 = SseVector::apply_rotate90(rotate, x9m22);
3857        let y00 = SseVector::add(y00, x9p22);
3858        let [x10p21, x10m21] =  SseVector::column_butterfly2([values[10], values[21]]);
3859        let x10m21 = SseVector::apply_rotate90(rotate, x10m21);
3860        let y00 = SseVector::add(y00, x10p21);
3861        let [x11p20, x11m20] =  SseVector::column_butterfly2([values[11], values[20]]);
3862        let x11m20 = SseVector::apply_rotate90(rotate, x11m20);
3863        let y00 = SseVector::add(y00, x11p20);
3864        let [x12p19, x12m19] =  SseVector::column_butterfly2([values[12], values[19]]);
3865        let x12m19 = SseVector::apply_rotate90(rotate, x12m19);
3866        let y00 = SseVector::add(y00, x12p19);
3867        let [x13p18, x13m18] =  SseVector::column_butterfly2([values[13], values[18]]);
3868        let x13m18 = SseVector::apply_rotate90(rotate, x13m18);
3869        let y00 = SseVector::add(y00, x13p18);
3870        let [x14p17, x14m17] =  SseVector::column_butterfly2([values[14], values[17]]);
3871        let x14m17 = SseVector::apply_rotate90(rotate, x14m17);
3872        let y00 = SseVector::add(y00, x14p17);
3873        let [x15p16, x15m16] =  SseVector::column_butterfly2([values[15], values[16]]);
3874        let x15m16 = SseVector::apply_rotate90(rotate, x15m16);
3875        let y00 = SseVector::add(y00, x15p16);
3876
3877        let m0130a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p30);
3878        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[1], x2p29);
3879        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[2], x3p28);
3880        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[3], x4p27);
3881        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[4], x5p26);
3882        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[5], x6p25);
3883        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[6], x7p24);
3884        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[7], x8p23);
3885        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[8], x9p22);
3886        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[9], x10p21);
3887        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[10], x11p20);
3888        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[11], x12p19);
3889        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[12], x13p18);
3890        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[13], x14p17);
3891        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[14], x15p16);
3892        let m0130b = SseVector::mul(self.twiddles_im[0], x1m30);
3893        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[1], x2m29);
3894        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[2], x3m28);
3895        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[3], x4m27);
3896        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[4], x5m26);
3897        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[5], x6m25);
3898        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[6], x7m24);
3899        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[7], x8m23);
3900        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[8], x9m22);
3901        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[9], x10m21);
3902        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[10], x11m20);
3903        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[11], x12m19);
3904        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[12], x13m18);
3905        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[13], x14m17);
3906        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[14], x15m16);
3907        let [y01, y30] = SseVector::column_butterfly2([m0130a, m0130b]);
3908
3909        let m0229a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p30);
3910        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[3], x2p29);
3911        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[5], x3p28);
3912        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[7], x4p27);
3913        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[9], x5p26);
3914        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[11], x6p25);
3915        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[13], x7p24);
3916        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[14], x8p23);
3917        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[12], x9p22);
3918        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[10], x10p21);
3919        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[8], x11p20);
3920        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[6], x12p19);
3921        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[4], x13p18);
3922        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[2], x14p17);
3923        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[0], x15p16);
3924        let m0229b = SseVector::mul(self.twiddles_im[1], x1m30);
3925        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[3], x2m29);
3926        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[5], x3m28);
3927        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[7], x4m27);
3928        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[9], x5m26);
3929        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[11], x6m25);
3930        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[13], x7m24);
3931        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[14], x8m23);
3932        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[12], x9m22);
3933        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[10], x10m21);
3934        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[8], x11m20);
3935        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[6], x12m19);
3936        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[4], x13m18);
3937        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[2], x14m17);
3938        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[0], x15m16);
3939        let [y02, y29] = SseVector::column_butterfly2([m0229a, m0229b]);
3940
3941        let m0328a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p30);
3942        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[5], x2p29);
3943        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[8], x3p28);
3944        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[11], x4p27);
3945        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[14], x5p26);
3946        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[12], x6p25);
3947        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[9], x7p24);
3948        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[6], x8p23);
3949        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[3], x9p22);
3950        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[0], x10p21);
3951        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[1], x11p20);
3952        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[4], x12p19);
3953        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[7], x13p18);
3954        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[10], x14p17);
3955        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[13], x15p16);
3956        let m0328b = SseVector::mul(self.twiddles_im[2], x1m30);
3957        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[5], x2m29);
3958        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[8], x3m28);
3959        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[11], x4m27);
3960        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[14], x5m26);
3961        let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[12], x6m25);
3962        let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[9], x7m24);
3963        let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[6], x8m23);
3964        let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[3], x9m22);
3965        let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[0], x10m21);
3966        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[1], x11m20);
3967        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[4], x12m19);
3968        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[7], x13m18);
3969        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[10], x14m17);
3970        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[13], x15m16);
3971        let [y03, y28] = SseVector::column_butterfly2([m0328a, m0328b]);
3972
3973        let m0427a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p30);
3974        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[7], x2p29);
3975        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[11], x3p28);
3976        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[14], x4p27);
3977        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[10], x5p26);
3978        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[6], x6p25);
3979        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[2], x7p24);
3980        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[0], x8p23);
3981        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[4], x9p22);
3982        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[8], x10p21);
3983        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[12], x11p20);
3984        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[13], x12p19);
3985        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[9], x13p18);
3986        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[5], x14p17);
3987        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[1], x15p16);
3988        let m0427b = SseVector::mul(self.twiddles_im[3], x1m30);
3989        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[7], x2m29);
3990        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[11], x3m28);
3991        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[14], x4m27);
3992        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[10], x5m26);
3993        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[6], x6m25);
3994        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[2], x7m24);
3995        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[0], x8m23);
3996        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[4], x9m22);
3997        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[8], x10m21);
3998        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[12], x11m20);
3999        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[13], x12m19);
4000        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[9], x13m18);
4001        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[5], x14m17);
4002        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[1], x15m16);
4003        let [y04, y27] = SseVector::column_butterfly2([m0427a, m0427b]);
4004
4005        let m0526a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p30);
4006        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[9], x2p29);
4007        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[14], x3p28);
4008        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[10], x4p27);
4009        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[5], x5p26);
4010        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[0], x6p25);
4011        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[3], x7p24);
4012        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[8], x8p23);
4013        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[13], x9p22);
4014        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[11], x10p21);
4015        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[6], x11p20);
4016        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[1], x12p19);
4017        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[2], x13p18);
4018        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[7], x14p17);
4019        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[12], x15p16);
4020        let m0526b = SseVector::mul(self.twiddles_im[4], x1m30);
4021        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[9], x2m29);
4022        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[14], x3m28);
4023        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[10], x4m27);
4024        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[5], x5m26);
4025        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[0], x6m25);
4026        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[3], x7m24);
4027        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[8], x8m23);
4028        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[13], x9m22);
4029        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[11], x10m21);
4030        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[6], x11m20);
4031        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[1], x12m19);
4032        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[2], x13m18);
4033        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[7], x14m17);
4034        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[12], x15m16);
4035        let [y05, y26] = SseVector::column_butterfly2([m0526a, m0526b]);
4036
4037        let m0625a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p30);
4038        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[11], x2p29);
4039        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[12], x3p28);
4040        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[6], x4p27);
4041        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[0], x5p26);
4042        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[4], x6p25);
4043        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[10], x7p24);
4044        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[13], x8p23);
4045        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[7], x9p22);
4046        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[1], x10p21);
4047        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[3], x11p20);
4048        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[9], x12p19);
4049        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[14], x13p18);
4050        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[8], x14p17);
4051        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[2], x15p16);
4052        let m0625b = SseVector::mul(self.twiddles_im[5], x1m30);
4053        let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[11], x2m29);
4054        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[12], x3m28);
4055        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[6], x4m27);
4056        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[0], x5m26);
4057        let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[4], x6m25);
4058        let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[10], x7m24);
4059        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[13], x8m23);
4060        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[7], x9m22);
4061        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[1], x10m21);
4062        let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[3], x11m20);
4063        let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[9], x12m19);
4064        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[14], x13m18);
4065        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[8], x14m17);
4066        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[2], x15m16);
4067        let [y06, y25] = SseVector::column_butterfly2([m0625a, m0625b]);
4068
4069        let m0724a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p30);
4070        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[13], x2p29);
4071        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[9], x3p28);
4072        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[2], x4p27);
4073        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[3], x5p26);
4074        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[10], x6p25);
4075        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[12], x7p24);
4076        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[5], x8p23);
4077        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[0], x9p22);
4078        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[7], x10p21);
4079        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[14], x11p20);
4080        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[8], x12p19);
4081        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[1], x13p18);
4082        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[4], x14p17);
4083        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[11], x15p16);
4084        let m0724b = SseVector::mul(self.twiddles_im[6], x1m30);
4085        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[13], x2m29);
4086        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[9], x3m28);
4087        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[2], x4m27);
4088        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[3], x5m26);
4089        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[10], x6m25);
4090        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[12], x7m24);
4091        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[5], x8m23);
4092        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[0], x9m22);
4093        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[7], x10m21);
4094        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[14], x11m20);
4095        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[8], x12m19);
4096        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[1], x13m18);
4097        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[4], x14m17);
4098        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[11], x15m16);
4099        let [y07, y24] = SseVector::column_butterfly2([m0724a, m0724b]);
4100
4101        let m0823a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p30);
4102        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[14], x2p29);
4103        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[6], x3p28);
4104        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[0], x4p27);
4105        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[8], x5p26);
4106        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[13], x6p25);
4107        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[5], x7p24);
4108        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[1], x8p23);
4109        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[9], x9p22);
4110        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[12], x10p21);
4111        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[4], x11p20);
4112        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[2], x12p19);
4113        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[10], x13p18);
4114        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[11], x14p17);
4115        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[3], x15p16);
4116        let m0823b = SseVector::mul(self.twiddles_im[7], x1m30);
4117        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[14], x2m29);
4118        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[6], x3m28);
4119        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[0], x4m27);
4120        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[8], x5m26);
4121        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[13], x6m25);
4122        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[5], x7m24);
4123        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[1], x8m23);
4124        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[9], x9m22);
4125        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[12], x10m21);
4126        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[4], x11m20);
4127        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[2], x12m19);
4128        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[10], x13m18);
4129        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[11], x14m17);
4130        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[3], x15m16);
4131        let [y08, y23] = SseVector::column_butterfly2([m0823a, m0823b]);
4132
4133        let m0922a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p30);
4134        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[12], x2p29);
4135        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[3], x3p28);
4136        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[4], x4p27);
4137        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[13], x5p26);
4138        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[7], x6p25);
4139        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[0], x7p24);
4140        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[9], x8p23);
4141        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[11], x9p22);
4142        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[2], x10p21);
4143        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[5], x11p20);
4144        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[14], x12p19);
4145        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[6], x13p18);
4146        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[1], x14p17);
4147        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[10], x15p16);
4148        let m0922b = SseVector::mul(self.twiddles_im[8], x1m30);
4149        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[12], x2m29);
4150        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[3], x3m28);
4151        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[4], x4m27);
4152        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[13], x5m26);
4153        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[7], x6m25);
4154        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[0], x7m24);
4155        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[9], x8m23);
4156        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[11], x9m22);
4157        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[2], x10m21);
4158        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[5], x11m20);
4159        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[14], x12m19);
4160        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[6], x13m18);
4161        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[1], x14m17);
4162        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[10], x15m16);
4163        let [y09, y22] = SseVector::column_butterfly2([m0922a, m0922b]);
4164
4165        let m1021a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p30);
4166        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[10], x2p29);
4167        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[0], x3p28);
4168        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[8], x4p27);
4169        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[11], x5p26);
4170        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[1], x6p25);
4171        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[7], x7p24);
4172        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[12], x8p23);
4173        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[2], x9p22);
4174        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[6], x10p21);
4175        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[13], x11p20);
4176        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[3], x12p19);
4177        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[5], x13p18);
4178        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[14], x14p17);
4179        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[4], x15p16);
4180        let m1021b = SseVector::mul(self.twiddles_im[9], x1m30);
4181        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[10], x2m29);
4182        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[0], x3m28);
4183        let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[8], x4m27);
4184        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[11], x5m26);
4185        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[1], x6m25);
4186        let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[7], x7m24);
4187        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[12], x8m23);
4188        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[2], x9m22);
4189        let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[6], x10m21);
4190        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[13], x11m20);
4191        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[3], x12m19);
4192        let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[5], x13m18);
4193        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[14], x14m17);
4194        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[4], x15m16);
4195        let [y10, y21] = SseVector::column_butterfly2([m1021a, m1021b]);
4196
4197        let m1120a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p30);
4198        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[8], x2p29);
4199        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[1], x3p28);
4200        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[12], x4p27);
4201        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[6], x5p26);
4202        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[3], x6p25);
4203        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[14], x7p24);
4204        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[4], x8p23);
4205        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[5], x9p22);
4206        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[13], x10p21);
4207        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[2], x11p20);
4208        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[7], x12p19);
4209        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[11], x13p18);
4210        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[0], x14p17);
4211        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[9], x15p16);
4212        let m1120b = SseVector::mul(self.twiddles_im[10], x1m30);
4213        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[8], x2m29);
4214        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[1], x3m28);
4215        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[12], x4m27);
4216        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[6], x5m26);
4217        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[3], x6m25);
4218        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[14], x7m24);
4219        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[4], x8m23);
4220        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[5], x9m22);
4221        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[13], x10m21);
4222        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[2], x11m20);
4223        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[7], x12m19);
4224        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[11], x13m18);
4225        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[0], x14m17);
4226        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[9], x15m16);
4227        let [y11, y20] = SseVector::column_butterfly2([m1120a, m1120b]);
4228
4229        let m1219a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p30);
4230        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[6], x2p29);
4231        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[4], x3p28);
4232        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[13], x4p27);
4233        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[1], x5p26);
4234        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[9], x6p25);
4235        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[8], x7p24);
4236        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[2], x8p23);
4237        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[14], x9p22);
4238        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[3], x10p21);
4239        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[7], x11p20);
4240        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[10], x12p19);
4241        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[0], x13p18);
4242        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[12], x14p17);
4243        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[5], x15p16);
4244        let m1219b = SseVector::mul(self.twiddles_im[11], x1m30);
4245        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[6], x2m29);
4246        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[4], x3m28);
4247        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[13], x4m27);
4248        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[1], x5m26);
4249        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[9], x6m25);
4250        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[8], x7m24);
4251        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[2], x8m23);
4252        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[14], x9m22);
4253        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[3], x10m21);
4254        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[7], x11m20);
4255        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[10], x12m19);
4256        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[0], x13m18);
4257        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[12], x14m17);
4258        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[5], x15m16);
4259        let [y12, y19] = SseVector::column_butterfly2([m1219a, m1219b]);
4260
4261        let m1318a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p30);
4262        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[4], x2p29);
4263        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[7], x3p28);
4264        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[9], x4p27);
4265        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[2], x5p26);
4266        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[14], x6p25);
4267        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[1], x7p24);
4268        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[10], x8p23);
4269        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[6], x9p22);
4270        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[5], x10p21);
4271        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[11], x11p20);
4272        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[0], x12p19);
4273        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[13], x13p18);
4274        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[3], x14p17);
4275        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[8], x15p16);
4276        let m1318b = SseVector::mul(self.twiddles_im[12], x1m30);
4277        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[4], x2m29);
4278        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[7], x3m28);
4279        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[9], x4m27);
4280        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[2], x5m26);
4281        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[14], x6m25);
4282        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[1], x7m24);
4283        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[10], x8m23);
4284        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[6], x9m22);
4285        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[5], x10m21);
4286        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[11], x11m20);
4287        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[0], x12m19);
4288        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[13], x13m18);
4289        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[3], x14m17);
4290        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[8], x15m16);
4291        let [y13, y18] = SseVector::column_butterfly2([m1318a, m1318b]);
4292
4293        let m1417a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p30);
4294        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[2], x2p29);
4295        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[10], x3p28);
4296        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[5], x4p27);
4297        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[7], x5p26);
4298        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[8], x6p25);
4299        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[4], x7p24);
4300        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[11], x8p23);
4301        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[1], x9p22);
4302        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[14], x10p21);
4303        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[0], x11p20);
4304        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[12], x12p19);
4305        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[3], x13p18);
4306        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[9], x14p17);
4307        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[6], x15p16);
4308        let m1417b = SseVector::mul(self.twiddles_im[13], x1m30);
4309        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[2], x2m29);
4310        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[10], x3m28);
4311        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[5], x4m27);
4312        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[7], x5m26);
4313        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[8], x6m25);
4314        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[4], x7m24);
4315        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[11], x8m23);
4316        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[1], x9m22);
4317        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[14], x10m21);
4318        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[0], x11m20);
4319        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[12], x12m19);
4320        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[3], x13m18);
4321        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[9], x14m17);
4322        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[6], x15m16);
4323        let [y14, y17] = SseVector::column_butterfly2([m1417a, m1417b]);
4324
4325        let m1516a = SseVector::fmadd(values[0], self.twiddles_re[14], x1p30);
4326        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[0], x2p29);
4327        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[13], x3p28);
4328        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[1], x4p27);
4329        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[12], x5p26);
4330        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[2], x6p25);
4331        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[11], x7p24);
4332        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[3], x8p23);
4333        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[10], x9p22);
4334        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[4], x10p21);
4335        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[9], x11p20);
4336        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[5], x12p19);
4337        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[8], x13p18);
4338        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[6], x14p17);
4339        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[7], x15p16);
4340        let m1516b = SseVector::mul(self.twiddles_im[14], x1m30);
4341        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[0], x2m29);
4342        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[13], x3m28);
4343        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[1], x4m27);
4344        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[12], x5m26);
4345        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[2], x6m25);
4346        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[11], x7m24);
4347        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[3], x8m23);
4348        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[10], x9m22);
4349        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[4], x10m21);
4350        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[9], x11m20);
4351        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[5], x12m19);
4352        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[8], x13m18);
4353        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[6], x14m17);
4354        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[7], x15m16);
4355        let [y15, y16] = SseVector::column_butterfly2([m1516a, m1516b]);
4356
4357
4358        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28, y29, y30]
4359    }
4360}
4361
/// State for the SSE length-31 butterfly operating on `f64` data.
///
/// Stores the transform direction together with the 15 twiddle factors produced in `new`,
/// split into real and imaginary parts, each held as a broadcast `__m128d` vector.
struct SseF64Butterfly31<T> {
    /// Direction passed to `new`; also the direction the twiddles were computed for.
    direction: FftDirection,
    /// Real parts of the 15 twiddle factors, one broadcast vector per factor.
    twiddles_re: [__m128d; 15],
    /// Imaginary parts of the same 15 twiddle factors, one broadcast vector per factor.
    twiddles_im: [__m128d; 15],
    /// Marker tying the struct to the scalar type `T` without storing a value of it.
    _phantom: std::marker::PhantomData<T>,
}
4368
// Boilerplate for `SseF64Butterfly31`, generated by macros defined elsewhere in the crate.
// The second invocation is given the FFT length (31) and a closure that reads the
// butterfly's `direction` field.
// NOTE(review): presumably these expand to the `Fft` / `Length` / `Direction` trait impls that
// drive `perform_fft_contiguous` — confirm against the macro definitions.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly31);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly31, 31, |this: &SseF64Butterfly31<_>| this.direction);
4371impl<T: FftNum> SseF64Butterfly31<T> {
4372    /// Safety: The current machine must support the sse4.1 instruction set
4373    #[target_feature(enable = "sse4.1")]
4374    unsafe fn new(direction: FftDirection) -> Self {
4375        assert_f64::<T>();
4376        let twiddles = make_twiddles(31, direction);
4377        unsafe {Self {
4378            direction,
4379            twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
4380            twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
4381            _phantom: std::marker::PhantomData,
4382        }}
4383    }
4384
4385    #[inline(always)]
4386    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
4387        let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 });
4388
4389        let out = self.perform_fft_direct(values);
4390
4391        write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 });   
4392    }
4393
4394    #[inline(always)]
4395    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 31]) -> [__m128d; 31] {
4396        let rotate = SseVector::make_rotate90(FftDirection::Inverse);
4397
4398        let y00 = values[0];
4399        let [x1p30, x1m30] =  SseVector::column_butterfly2([values[1], values[30]]);
4400        let x1m30 = SseVector::apply_rotate90(rotate, x1m30);
4401        let y00 = SseVector::add(y00, x1p30);
4402        let [x2p29, x2m29] =  SseVector::column_butterfly2([values[2], values[29]]);
4403        let x2m29 = SseVector::apply_rotate90(rotate, x2m29);
4404        let y00 = SseVector::add(y00, x2p29);
4405        let [x3p28, x3m28] =  SseVector::column_butterfly2([values[3], values[28]]);
4406        let x3m28 = SseVector::apply_rotate90(rotate, x3m28);
4407        let y00 = SseVector::add(y00, x3p28);
4408        let [x4p27, x4m27] =  SseVector::column_butterfly2([values[4], values[27]]);
4409        let x4m27 = SseVector::apply_rotate90(rotate, x4m27);
4410        let y00 = SseVector::add(y00, x4p27);
4411        let [x5p26, x5m26] =  SseVector::column_butterfly2([values[5], values[26]]);
4412        let x5m26 = SseVector::apply_rotate90(rotate, x5m26);
4413        let y00 = SseVector::add(y00, x5p26);
4414        let [x6p25, x6m25] =  SseVector::column_butterfly2([values[6], values[25]]);
4415        let x6m25 = SseVector::apply_rotate90(rotate, x6m25);
4416        let y00 = SseVector::add(y00, x6p25);
4417        let [x7p24, x7m24] =  SseVector::column_butterfly2([values[7], values[24]]);
4418        let x7m24 = SseVector::apply_rotate90(rotate, x7m24);
4419        let y00 = SseVector::add(y00, x7p24);
4420        let [x8p23, x8m23] =  SseVector::column_butterfly2([values[8], values[23]]);
4421        let x8m23 = SseVector::apply_rotate90(rotate, x8m23);
4422        let y00 = SseVector::add(y00, x8p23);
4423        let [x9p22, x9m22] =  SseVector::column_butterfly2([values[9], values[22]]);
4424        let x9m22 = SseVector::apply_rotate90(rotate, x9m22);
4425        let y00 = SseVector::add(y00, x9p22);
4426        let [x10p21, x10m21] =  SseVector::column_butterfly2([values[10], values[21]]);
4427        let x10m21 = SseVector::apply_rotate90(rotate, x10m21);
4428        let y00 = SseVector::add(y00, x10p21);
4429        let [x11p20, x11m20] =  SseVector::column_butterfly2([values[11], values[20]]);
4430        let x11m20 = SseVector::apply_rotate90(rotate, x11m20);
4431        let y00 = SseVector::add(y00, x11p20);
4432        let [x12p19, x12m19] =  SseVector::column_butterfly2([values[12], values[19]]);
4433        let x12m19 = SseVector::apply_rotate90(rotate, x12m19);
4434        let y00 = SseVector::add(y00, x12p19);
4435        let [x13p18, x13m18] =  SseVector::column_butterfly2([values[13], values[18]]);
4436        let x13m18 = SseVector::apply_rotate90(rotate, x13m18);
4437        let y00 = SseVector::add(y00, x13p18);
4438        let [x14p17, x14m17] =  SseVector::column_butterfly2([values[14], values[17]]);
4439        let x14m17 = SseVector::apply_rotate90(rotate, x14m17);
4440        let y00 = SseVector::add(y00, x14p17);
4441        let [x15p16, x15m16] =  SseVector::column_butterfly2([values[15], values[16]]);
4442        let x15m16 = SseVector::apply_rotate90(rotate, x15m16);
4443        let y00 = SseVector::add(y00, x15p16);
4444
4445        let m0130a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p30);
4446        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[1], x2p29);
4447        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[2], x3p28);
4448        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[3], x4p27);
4449        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[4], x5p26);
4450        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[5], x6p25);
4451        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[6], x7p24);
4452        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[7], x8p23);
4453        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[8], x9p22);
4454        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[9], x10p21);
4455        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[10], x11p20);
4456        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[11], x12p19);
4457        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[12], x13p18);
4458        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[13], x14p17);
4459        let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[14], x15p16);
4460        let m0130b = SseVector::mul(self.twiddles_im[0], x1m30);
4461        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[1], x2m29);
4462        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[2], x3m28);
4463        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[3], x4m27);
4464        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[4], x5m26);
4465        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[5], x6m25);
4466        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[6], x7m24);
4467        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[7], x8m23);
4468        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[8], x9m22);
4469        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[9], x10m21);
4470        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[10], x11m20);
4471        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[11], x12m19);
4472        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[12], x13m18);
4473        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[13], x14m17);
4474        let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[14], x15m16);
4475        let [y01, y30] = SseVector::column_butterfly2([m0130a, m0130b]);
4476
4477        let m0229a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p30);
4478        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[3], x2p29);
4479        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[5], x3p28);
4480        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[7], x4p27);
4481        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[9], x5p26);
4482        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[11], x6p25);
4483        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[13], x7p24);
4484        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[14], x8p23);
4485        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[12], x9p22);
4486        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[10], x10p21);
4487        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[8], x11p20);
4488        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[6], x12p19);
4489        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[4], x13p18);
4490        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[2], x14p17);
4491        let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[0], x15p16);
4492        let m0229b = SseVector::mul(self.twiddles_im[1], x1m30);
4493        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[3], x2m29);
4494        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[5], x3m28);
4495        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[7], x4m27);
4496        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[9], x5m26);
4497        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[11], x6m25);
4498        let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[13], x7m24);
4499        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[14], x8m23);
4500        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[12], x9m22);
4501        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[10], x10m21);
4502        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[8], x11m20);
4503        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[6], x12m19);
4504        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[4], x13m18);
4505        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[2], x14m17);
4506        let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[0], x15m16);
4507        let [y02, y29] = SseVector::column_butterfly2([m0229a, m0229b]);
4508
4509        let m0328a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p30);
4510        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[5], x2p29);
4511        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[8], x3p28);
4512        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[11], x4p27);
4513        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[14], x5p26);
4514        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[12], x6p25);
4515        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[9], x7p24);
4516        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[6], x8p23);
4517        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[3], x9p22);
4518        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[0], x10p21);
4519        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[1], x11p20);
4520        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[4], x12p19);
4521        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[7], x13p18);
4522        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[10], x14p17);
4523        let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[13], x15p16);
4524        let m0328b = SseVector::mul(self.twiddles_im[2], x1m30);
4525        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[5], x2m29);
4526        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[8], x3m28);
4527        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[11], x4m27);
4528        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[14], x5m26);
4529        let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[12], x6m25);
4530        let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[9], x7m24);
4531        let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[6], x8m23);
4532        let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[3], x9m22);
4533        let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[0], x10m21);
4534        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[1], x11m20);
4535        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[4], x12m19);
4536        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[7], x13m18);
4537        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[10], x14m17);
4538        let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[13], x15m16);
4539        let [y03, y28] = SseVector::column_butterfly2([m0328a, m0328b]);
4540
4541        let m0427a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p30);
4542        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[7], x2p29);
4543        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[11], x3p28);
4544        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[14], x4p27);
4545        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[10], x5p26);
4546        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[6], x6p25);
4547        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[2], x7p24);
4548        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[0], x8p23);
4549        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[4], x9p22);
4550        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[8], x10p21);
4551        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[12], x11p20);
4552        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[13], x12p19);
4553        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[9], x13p18);
4554        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[5], x14p17);
4555        let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[1], x15p16);
4556        let m0427b = SseVector::mul(self.twiddles_im[3], x1m30);
4557        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[7], x2m29);
4558        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[11], x3m28);
4559        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[14], x4m27);
4560        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[10], x5m26);
4561        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[6], x6m25);
4562        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[2], x7m24);
4563        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[0], x8m23);
4564        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[4], x9m22);
4565        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[8], x10m21);
4566        let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[12], x11m20);
4567        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[13], x12m19);
4568        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[9], x13m18);
4569        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[5], x14m17);
4570        let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[1], x15m16);
4571        let [y04, y27] = SseVector::column_butterfly2([m0427a, m0427b]);
4572
4573        let m0526a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p30);
4574        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[9], x2p29);
4575        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[14], x3p28);
4576        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[10], x4p27);
4577        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[5], x5p26);
4578        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[0], x6p25);
4579        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[3], x7p24);
4580        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[8], x8p23);
4581        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[13], x9p22);
4582        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[11], x10p21);
4583        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[6], x11p20);
4584        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[1], x12p19);
4585        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[2], x13p18);
4586        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[7], x14p17);
4587        let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[12], x15p16);
4588        let m0526b = SseVector::mul(self.twiddles_im[4], x1m30);
4589        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[9], x2m29);
4590        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[14], x3m28);
4591        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[10], x4m27);
4592        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[5], x5m26);
4593        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[0], x6m25);
4594        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[3], x7m24);
4595        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[8], x8m23);
4596        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[13], x9m22);
4597        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[11], x10m21);
4598        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[6], x11m20);
4599        let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[1], x12m19);
4600        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[2], x13m18);
4601        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[7], x14m17);
4602        let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[12], x15m16);
4603        let [y05, y26] = SseVector::column_butterfly2([m0526a, m0526b]);
4604
4605        let m0625a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p30);
4606        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[11], x2p29);
4607        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[12], x3p28);
4608        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[6], x4p27);
4609        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[0], x5p26);
4610        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[4], x6p25);
4611        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[10], x7p24);
4612        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[13], x8p23);
4613        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[7], x9p22);
4614        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[1], x10p21);
4615        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[3], x11p20);
4616        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[9], x12p19);
4617        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[14], x13p18);
4618        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[8], x14p17);
4619        let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[2], x15p16);
4620        let m0625b = SseVector::mul(self.twiddles_im[5], x1m30);
4621        let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[11], x2m29);
4622        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[12], x3m28);
4623        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[6], x4m27);
4624        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[0], x5m26);
4625        let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[4], x6m25);
4626        let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[10], x7m24);
4627        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[13], x8m23);
4628        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[7], x9m22);
4629        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[1], x10m21);
4630        let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[3], x11m20);
4631        let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[9], x12m19);
4632        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[14], x13m18);
4633        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[8], x14m17);
4634        let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[2], x15m16);
4635        let [y06, y25] = SseVector::column_butterfly2([m0625a, m0625b]);
4636
4637        let m0724a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p30);
4638        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[13], x2p29);
4639        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[9], x3p28);
4640        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[2], x4p27);
4641        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[3], x5p26);
4642        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[10], x6p25);
4643        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[12], x7p24);
4644        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[5], x8p23);
4645        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[0], x9p22);
4646        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[7], x10p21);
4647        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[14], x11p20);
4648        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[8], x12p19);
4649        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[1], x13p18);
4650        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[4], x14p17);
4651        let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[11], x15p16);
4652        let m0724b = SseVector::mul(self.twiddles_im[6], x1m30);
4653        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[13], x2m29);
4654        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[9], x3m28);
4655        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[2], x4m27);
4656        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[3], x5m26);
4657        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[10], x6m25);
4658        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[12], x7m24);
4659        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[5], x8m23);
4660        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[0], x9m22);
4661        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[7], x10m21);
4662        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[14], x11m20);
4663        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[8], x12m19);
4664        let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[1], x13m18);
4665        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[4], x14m17);
4666        let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[11], x15m16);
4667        let [y07, y24] = SseVector::column_butterfly2([m0724a, m0724b]);
4668
4669        let m0823a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p30);
4670        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[14], x2p29);
4671        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[6], x3p28);
4672        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[0], x4p27);
4673        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[8], x5p26);
4674        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[13], x6p25);
4675        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[5], x7p24);
4676        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[1], x8p23);
4677        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[9], x9p22);
4678        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[12], x10p21);
4679        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[4], x11p20);
4680        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[2], x12p19);
4681        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[10], x13p18);
4682        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[11], x14p17);
4683        let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[3], x15p16);
4684        let m0823b = SseVector::mul(self.twiddles_im[7], x1m30);
4685        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[14], x2m29);
4686        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[6], x3m28);
4687        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[0], x4m27);
4688        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[8], x5m26);
4689        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[13], x6m25);
4690        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[5], x7m24);
4691        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[1], x8m23);
4692        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[9], x9m22);
4693        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[12], x10m21);
4694        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[4], x11m20);
4695        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[2], x12m19);
4696        let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[10], x13m18);
4697        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[11], x14m17);
4698        let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[3], x15m16);
4699        let [y08, y23] = SseVector::column_butterfly2([m0823a, m0823b]);
4700
4701        let m0922a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p30);
4702        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[12], x2p29);
4703        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[3], x3p28);
4704        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[4], x4p27);
4705        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[13], x5p26);
4706        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[7], x6p25);
4707        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[0], x7p24);
4708        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[9], x8p23);
4709        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[11], x9p22);
4710        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[2], x10p21);
4711        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[5], x11p20);
4712        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[14], x12p19);
4713        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[6], x13p18);
4714        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[1], x14p17);
4715        let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[10], x15p16);
4716        let m0922b = SseVector::mul(self.twiddles_im[8], x1m30);
4717        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[12], x2m29);
4718        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[3], x3m28);
4719        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[4], x4m27);
4720        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[13], x5m26);
4721        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[7], x6m25);
4722        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[0], x7m24);
4723        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[9], x8m23);
4724        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[11], x9m22);
4725        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[2], x10m21);
4726        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[5], x11m20);
4727        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[14], x12m19);
4728        let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[6], x13m18);
4729        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[1], x14m17);
4730        let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[10], x15m16);
4731        let [y09, y22] = SseVector::column_butterfly2([m0922a, m0922b]);
4732
4733        let m1021a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p30);
4734        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[10], x2p29);
4735        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[0], x3p28);
4736        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[8], x4p27);
4737        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[11], x5p26);
4738        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[1], x6p25);
4739        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[7], x7p24);
4740        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[12], x8p23);
4741        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[2], x9p22);
4742        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[6], x10p21);
4743        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[13], x11p20);
4744        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[3], x12p19);
4745        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[5], x13p18);
4746        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[14], x14p17);
4747        let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[4], x15p16);
4748        let m1021b = SseVector::mul(self.twiddles_im[9], x1m30);
4749        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[10], x2m29);
4750        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[0], x3m28);
4751        let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[8], x4m27);
4752        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[11], x5m26);
4753        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[1], x6m25);
4754        let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[7], x7m24);
4755        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[12], x8m23);
4756        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[2], x9m22);
4757        let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[6], x10m21);
4758        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[13], x11m20);
4759        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[3], x12m19);
4760        let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[5], x13m18);
4761        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[14], x14m17);
4762        let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[4], x15m16);
4763        let [y10, y21] = SseVector::column_butterfly2([m1021a, m1021b]);
4764
4765        let m1120a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p30);
4766        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[8], x2p29);
4767        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[1], x3p28);
4768        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[12], x4p27);
4769        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[6], x5p26);
4770        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[3], x6p25);
4771        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[14], x7p24);
4772        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[4], x8p23);
4773        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[5], x9p22);
4774        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[13], x10p21);
4775        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[2], x11p20);
4776        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[7], x12p19);
4777        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[11], x13p18);
4778        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[0], x14p17);
4779        let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[9], x15p16);
4780        let m1120b = SseVector::mul(self.twiddles_im[10], x1m30);
4781        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[8], x2m29);
4782        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[1], x3m28);
4783        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[12], x4m27);
4784        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[6], x5m26);
4785        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[3], x6m25);
4786        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[14], x7m24);
4787        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[4], x8m23);
4788        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[5], x9m22);
4789        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[13], x10m21);
4790        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[2], x11m20);
4791        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[7], x12m19);
4792        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[11], x13m18);
4793        let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[0], x14m17);
4794        let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[9], x15m16);
4795        let [y11, y20] = SseVector::column_butterfly2([m1120a, m1120b]);
4796
4797        let m1219a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p30);
4798        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[6], x2p29);
4799        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[4], x3p28);
4800        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[13], x4p27);
4801        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[1], x5p26);
4802        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[9], x6p25);
4803        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[8], x7p24);
4804        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[2], x8p23);
4805        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[14], x9p22);
4806        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[3], x10p21);
4807        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[7], x11p20);
4808        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[10], x12p19);
4809        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[0], x13p18);
4810        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[12], x14p17);
4811        let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[5], x15p16);
4812        let m1219b = SseVector::mul(self.twiddles_im[11], x1m30);
4813        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[6], x2m29);
4814        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[4], x3m28);
4815        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[13], x4m27);
4816        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[1], x5m26);
4817        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[9], x6m25);
4818        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[8], x7m24);
4819        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[2], x8m23);
4820        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[14], x9m22);
4821        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[3], x10m21);
4822        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[7], x11m20);
4823        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[10], x12m19);
4824        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[0], x13m18);
4825        let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[12], x14m17);
4826        let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[5], x15m16);
4827        let [y12, y19] = SseVector::column_butterfly2([m1219a, m1219b]);
4828
4829        let m1318a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p30);
4830        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[4], x2p29);
4831        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[7], x3p28);
4832        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[9], x4p27);
4833        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[2], x5p26);
4834        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[14], x6p25);
4835        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[1], x7p24);
4836        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[10], x8p23);
4837        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[6], x9p22);
4838        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[5], x10p21);
4839        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[11], x11p20);
4840        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[0], x12p19);
4841        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[13], x13p18);
4842        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[3], x14p17);
4843        let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[8], x15p16);
4844        let m1318b = SseVector::mul(self.twiddles_im[12], x1m30);
4845        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[4], x2m29);
4846        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[7], x3m28);
4847        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[9], x4m27);
4848        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[2], x5m26);
4849        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[14], x6m25);
4850        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[1], x7m24);
4851        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[10], x8m23);
4852        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[6], x9m22);
4853        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[5], x10m21);
4854        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[11], x11m20);
4855        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[0], x12m19);
4856        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[13], x13m18);
4857        let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[3], x14m17);
4858        let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[8], x15m16);
4859        let [y13, y18] = SseVector::column_butterfly2([m1318a, m1318b]);
4860
4861        let m1417a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p30);
4862        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[2], x2p29);
4863        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[10], x3p28);
4864        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[5], x4p27);
4865        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[7], x5p26);
4866        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[8], x6p25);
4867        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[4], x7p24);
4868        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[11], x8p23);
4869        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[1], x9p22);
4870        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[14], x10p21);
4871        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[0], x11p20);
4872        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[12], x12p19);
4873        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[3], x13p18);
4874        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[9], x14p17);
4875        let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[6], x15p16);
4876        let m1417b = SseVector::mul(self.twiddles_im[13], x1m30);
4877        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[2], x2m29);
4878        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[10], x3m28);
4879        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[5], x4m27);
4880        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[7], x5m26);
4881        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[8], x6m25);
4882        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[4], x7m24);
4883        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[11], x8m23);
4884        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[1], x9m22);
4885        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[14], x10m21);
4886        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[0], x11m20);
4887        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[12], x12m19);
4888        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[3], x13m18);
4889        let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[9], x14m17);
4890        let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[6], x15m16);
4891        let [y14, y17] = SseVector::column_butterfly2([m1417a, m1417b]);
4892
4893        let m1516a = SseVector::fmadd(values[0], self.twiddles_re[14], x1p30);
4894        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[0], x2p29);
4895        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[13], x3p28);
4896        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[1], x4p27);
4897        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[12], x5p26);
4898        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[2], x6p25);
4899        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[11], x7p24);
4900        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[3], x8p23);
4901        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[10], x9p22);
4902        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[4], x10p21);
4903        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[9], x11p20);
4904        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[5], x12p19);
4905        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[8], x13p18);
4906        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[6], x14p17);
4907        let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[7], x15p16);
4908        let m1516b = SseVector::mul(self.twiddles_im[14], x1m30);
4909        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[0], x2m29);
4910        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[13], x3m28);
4911        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[1], x4m27);
4912        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[12], x5m26);
4913        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[2], x6m25);
4914        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[11], x7m24);
4915        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[3], x8m23);
4916        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[10], x9m22);
4917        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[4], x10m21);
4918        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[9], x11m20);
4919        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[5], x12m19);
4920        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[8], x13m18);
4921        let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[6], x14m17);
4922        let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[7], x15m16);
4923        let [y15, y16] = SseVector::column_butterfly2([m1516a, m1516b]);
4924
4925
4926        [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28, y29, y30]
4927    }
4928}
4929
4930
// Unit tests for every SSE prime-sized butterfly declared in this file.
//
// Each generated test builds the butterfly once per direction and hands it to
// `check_fft_algorithm` together with the expected length and direction.
// NOTE: the file header states that this file is autogenerated; lasting changes
// belong in the generator script rather than here.
#[cfg(test)]
mod unit_tests {
    use super::*;
    use crate::test_utils::check_fft_algorithm;

    // Expands a table of `test_name: ButterflyStruct, len;` rows into one
    // `#[test]` function per row, all sharing the float type given before `=>`.
    macro_rules! sse_prime_butterfly_tests {
        ($float:ty => $( $test_name:ident: $struct_name:ident, $size:expr; )+) => {
            $(
                #[test]
                fn $test_name() {
                    // Fail loudly if the host CPU lacks the instruction set under test.
                    assert!(std::arch::is_x86_feature_detected!("sse4.1"));

                    // SAFETY: sse4.1 support was asserted just above; presumably that is
                    // the constructor's only precondition (confirm against `new`'s docs).
                    let forward = unsafe { $struct_name::new(FftDirection::Forward) };
                    check_fft_algorithm::<$float>(&forward, $size, FftDirection::Forward);

                    // SAFETY: same reasoning as for the forward instance.
                    let inverse = unsafe { $struct_name::new(FftDirection::Inverse) };
                    check_fft_algorithm::<$float>(&inverse, $size, FftDirection::Inverse);
                }
            )+
        };
    }

    // Single-precision butterflies.
    sse_prime_butterfly_tests! { f32 =>
        test_ssef32_butterfly7: SseF32Butterfly7, 7;
        test_ssef32_butterfly11: SseF32Butterfly11, 11;
        test_ssef32_butterfly13: SseF32Butterfly13, 13;
        test_ssef32_butterfly17: SseF32Butterfly17, 17;
        test_ssef32_butterfly19: SseF32Butterfly19, 19;
        test_ssef32_butterfly23: SseF32Butterfly23, 23;
        test_ssef32_butterfly29: SseF32Butterfly29, 29;
        test_ssef32_butterfly31: SseF32Butterfly31, 31;
    }

    // Double-precision butterflies.
    sse_prime_butterfly_tests! { f64 =>
        test_ssef64_butterfly7: SseF64Butterfly7, 7;
        test_ssef64_butterfly11: SseF64Butterfly11, 11;
        test_ssef64_butterfly13: SseF64Butterfly13, 13;
        test_ssef64_butterfly17: SseF64Butterfly17, 17;
        test_ssef64_butterfly19: SseF64Butterfly19, 19;
        test_ssef64_butterfly23: SseF64Butterfly23, 23;
        test_ssef64_butterfly29: SseF64Butterfly29, 29;
        test_ssef64_butterfly31: SseF64Butterfly31, 31;
    }
}
4981