1use core::arch::x86_64::{__m128, __m128d};
2use std::any::TypeId;
3use std::sync::Arc;
4use num_complex::Complex;
5
6use crate::{common::FftNum, FftDirection};
7
8use crate::array_utils;
9use crate::array_utils::workaround_transmute_mut;
10use crate::array_utils::DoubleBuf;
11use crate::common::{fft_error_inplace, fft_error_outofplace};
12use crate::twiddles;
13use crate::{Direction, Fft, Length};
14
15use super::sse_common::{assert_f32, assert_f64};
16use super::sse_utils::*;
17use super::sse_vector::*;
18
/// Returns the FFT lengths for which this module provides SSE prime butterflies.
///
/// Each length in the returned slice has a matching arm in
/// `construct_prime_butterfly`, for both the `f32` and the `f64` case.
pub const fn prime_butterfly_lens() -> &'static [usize] {
    &[7, 11, 13, 17, 19, 23, 29, 31]
}
38
39#[target_feature(enable = "sse4.1")]
41pub unsafe fn construct_prime_butterfly<T: FftNum>(len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
42 let id_f32 = TypeId::of::<f32>();
43 let id_f64 = TypeId::of::<f64>();
44 let id_t = TypeId::of::<T>();
45 if id_t == id_f32 {
46 match len {
47 7 => Arc::new(SseF32Butterfly7::new(direction)) as Arc<dyn Fft<T>>,
48 11 => Arc::new(SseF32Butterfly11::new(direction)) as Arc<dyn Fft<T>>,
49 13 => Arc::new(SseF32Butterfly13::new(direction)) as Arc<dyn Fft<T>>,
50 17 => Arc::new(SseF32Butterfly17::new(direction)) as Arc<dyn Fft<T>>,
51 19 => Arc::new(SseF32Butterfly19::new(direction)) as Arc<dyn Fft<T>>,
52 23 => Arc::new(SseF32Butterfly23::new(direction)) as Arc<dyn Fft<T>>,
53 29 => Arc::new(SseF32Butterfly29::new(direction)) as Arc<dyn Fft<T>>,
54 31 => Arc::new(SseF32Butterfly31::new(direction)) as Arc<dyn Fft<T>>,
55 _ => unimplemented!("Invalid SSE prime butterfly length: {len}"),
56 }
57 } else if id_t == id_f64 {
58 match len {
59 7 => Arc::new(SseF64Butterfly7::new(direction)) as Arc<dyn Fft<T>>,
60 11 => Arc::new(SseF64Butterfly11::new(direction)) as Arc<dyn Fft<T>>,
61 13 => Arc::new(SseF64Butterfly13::new(direction)) as Arc<dyn Fft<T>>,
62 17 => Arc::new(SseF64Butterfly17::new(direction)) as Arc<dyn Fft<T>>,
63 19 => Arc::new(SseF64Butterfly19::new(direction)) as Arc<dyn Fft<T>>,
64 23 => Arc::new(SseF64Butterfly23::new(direction)) as Arc<dyn Fft<T>>,
65 29 => Arc::new(SseF64Butterfly29::new(direction)) as Arc<dyn Fft<T>>,
66 31 => Arc::new(SseF64Butterfly31::new(direction)) as Arc<dyn Fft<T>>,
67 _ => unimplemented!("Invalid SSE prime butterfly length: {len}"),
68 }
69 } else {
70 unimplemented!("Not f32 or f64");
71 }
72}
73
74#[inline(always)]
75fn make_twiddles<const TW: usize, T: FftNum>(len: usize, direction: FftDirection) -> [Complex<T>; TW] {
76 let mut i = 1;
77 [(); TW].map(|_| {
78 let twiddle = twiddles::compute_twiddle(i, len, direction);
79 i += 1;
80 twiddle
81 })
82}
83
84struct SseF32Butterfly7<T> {
85 direction: FftDirection,
86 twiddles_re: [__m128; 3],
87 twiddles_im: [__m128; 3],
88 _phantom: std::marker::PhantomData<T>,
89}
90
91boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly7);
92boilerplate_fft_sse_common_butterfly!(SseF32Butterfly7, 7, |this: &SseF32Butterfly7<_>| this.direction);
93impl<T: FftNum> SseF32Butterfly7<T> {
94 #[target_feature(enable = "sse4.1")]
96 unsafe fn new(direction: FftDirection) -> Self {
97 assert_f32::<T>();
98 let twiddles = make_twiddles(7, direction);
99 Self {
100 direction,
101 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
102 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
103 _phantom: std::marker::PhantomData,
104 }
105 }
106
107 #[inline(always)]
108 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
109 let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6 });
110
111 let out = self.perform_parallel_fft_direct(values);
112
113 write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6 } );
114 }
115
116 #[inline(always)]
117 pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
118 let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12 });
119
120 let values = [
121 extract_lo_hi_f32(input_packed[0], input_packed[3]),
122 extract_hi_lo_f32(input_packed[0], input_packed[4]),
123 extract_lo_hi_f32(input_packed[1], input_packed[4]),
124 extract_hi_lo_f32(input_packed[1], input_packed[5]),
125 extract_lo_hi_f32(input_packed[2], input_packed[5]),
126 extract_hi_lo_f32(input_packed[2], input_packed[6]),
127 extract_lo_hi_f32(input_packed[3], input_packed[6]),
128 ];
129
130 let out = self.perform_parallel_fft_direct(values);
131
132 let out_packed = [
133 extract_lo_lo_f32(out[0], out[1]),
134 extract_lo_lo_f32(out[2], out[3]),
135 extract_lo_lo_f32(out[4], out[5]),
136 extract_lo_hi_f32(out[6], out[0]),
137 extract_hi_hi_f32(out[1], out[2]),
138 extract_hi_hi_f32(out[3], out[4]),
139 extract_hi_hi_f32(out[5], out[6]),
140 ];
141
142 write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6 });
143 }
144
145 #[inline(always)]
146 pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 7]) -> [__m128; 7] {
147 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
148
149 let y00 = values[0];
150 let [x1p6, x1m6] = SseVector::column_butterfly2([values[1], values[6]]);
151 let x1m6 = SseVector::apply_rotate90(rotate, x1m6);
152 let y00 = SseVector::add(y00, x1p6);
153 let [x2p5, x2m5] = SseVector::column_butterfly2([values[2], values[5]]);
154 let x2m5 = SseVector::apply_rotate90(rotate, x2m5);
155 let y00 = SseVector::add(y00, x2p5);
156 let [x3p4, x3m4] = SseVector::column_butterfly2([values[3], values[4]]);
157 let x3m4 = SseVector::apply_rotate90(rotate, x3m4);
158 let y00 = SseVector::add(y00, x3p4);
159
160 let m0106a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p6);
161 let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[1], x2p5);
162 let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[2], x3p4);
163 let m0106b = SseVector::mul(self.twiddles_im[0], x1m6);
164 let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[1], x2m5);
165 let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[2], x3m4);
166 let [y01, y06] = SseVector::column_butterfly2([m0106a, m0106b]);
167
168 let m0205a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p6);
169 let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[2], x2p5);
170 let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[0], x3p4);
171 let m0205b = SseVector::mul(self.twiddles_im[1], x1m6);
172 let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[2], x2m5);
173 let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[0], x3m4);
174 let [y02, y05] = SseVector::column_butterfly2([m0205a, m0205b]);
175
176 let m0304a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p6);
177 let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[0], x2p5);
178 let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[1], x3p4);
179 let m0304b = SseVector::mul(self.twiddles_im[2], x1m6);
180 let m0304b = SseVector::nmadd(m0304b, self.twiddles_im[0], x2m5);
181 let m0304b = SseVector::fmadd(m0304b, self.twiddles_im[1], x3m4);
182 let [y03, y04] = SseVector::column_butterfly2([m0304a, m0304b]);
183
184
185 [y00, y01, y02, y03, y04, y05, y06]
186 }
187}
188
189struct SseF64Butterfly7<T> {
190 direction: FftDirection,
191 twiddles_re: [__m128d; 3],
192 twiddles_im: [__m128d; 3],
193 _phantom: std::marker::PhantomData<T>,
194}
195
196boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly7);
197boilerplate_fft_sse_common_butterfly!(SseF64Butterfly7, 7, |this: &SseF64Butterfly7<_>| this.direction);
198impl<T: FftNum> SseF64Butterfly7<T> {
199 #[target_feature(enable = "sse4.1")]
201 unsafe fn new(direction: FftDirection) -> Self {
202 assert_f64::<T>();
203 let twiddles = make_twiddles(7, direction);
204 unsafe {Self {
205 direction,
206 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
207 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
208 _phantom: std::marker::PhantomData,
209 }}
210 }
211
212 #[inline(always)]
213 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
214 let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6 });
215
216 let out = self.perform_fft_direct(values);
217
218 write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6 });
219 }
220
221 #[inline(always)]
222 pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 7]) -> [__m128d; 7] {
223 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
224
225 let y00 = values[0];
226 let [x1p6, x1m6] = SseVector::column_butterfly2([values[1], values[6]]);
227 let x1m6 = SseVector::apply_rotate90(rotate, x1m6);
228 let y00 = SseVector::add(y00, x1p6);
229 let [x2p5, x2m5] = SseVector::column_butterfly2([values[2], values[5]]);
230 let x2m5 = SseVector::apply_rotate90(rotate, x2m5);
231 let y00 = SseVector::add(y00, x2p5);
232 let [x3p4, x3m4] = SseVector::column_butterfly2([values[3], values[4]]);
233 let x3m4 = SseVector::apply_rotate90(rotate, x3m4);
234 let y00 = SseVector::add(y00, x3p4);
235
236 let m0106a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p6);
237 let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[1], x2p5);
238 let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[2], x3p4);
239 let m0106b = SseVector::mul(self.twiddles_im[0], x1m6);
240 let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[1], x2m5);
241 let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[2], x3m4);
242 let [y01, y06] = SseVector::column_butterfly2([m0106a, m0106b]);
243
244 let m0205a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p6);
245 let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[2], x2p5);
246 let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[0], x3p4);
247 let m0205b = SseVector::mul(self.twiddles_im[1], x1m6);
248 let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[2], x2m5);
249 let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[0], x3m4);
250 let [y02, y05] = SseVector::column_butterfly2([m0205a, m0205b]);
251
252 let m0304a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p6);
253 let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[0], x2p5);
254 let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[1], x3p4);
255 let m0304b = SseVector::mul(self.twiddles_im[2], x1m6);
256 let m0304b = SseVector::nmadd(m0304b, self.twiddles_im[0], x2m5);
257 let m0304b = SseVector::fmadd(m0304b, self.twiddles_im[1], x3m4);
258 let [y03, y04] = SseVector::column_butterfly2([m0304a, m0304b]);
259
260
261 [y00, y01, y02, y03, y04, y05, y06]
262 }
263}
264
265struct SseF32Butterfly11<T> {
266 direction: FftDirection,
267 twiddles_re: [__m128; 5],
268 twiddles_im: [__m128; 5],
269 _phantom: std::marker::PhantomData<T>,
270}
271
272boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly11);
273boilerplate_fft_sse_common_butterfly!(SseF32Butterfly11, 11, |this: &SseF32Butterfly11<_>| this.direction);
274impl<T: FftNum> SseF32Butterfly11<T> {
275 #[target_feature(enable = "sse4.1")]
277 unsafe fn new(direction: FftDirection) -> Self {
278 assert_f32::<T>();
279 let twiddles = make_twiddles(11, direction);
280 Self {
281 direction,
282 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
283 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
284 _phantom: std::marker::PhantomData,
285 }
286 }
287
288 #[inline(always)]
289 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
290 let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10 });
291
292 let out = self.perform_parallel_fft_direct(values);
293
294 write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10 } );
295 }
296
297 #[inline(always)]
298 pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
299 let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20 });
300
301 let values = [
302 extract_lo_hi_f32(input_packed[0], input_packed[5]),
303 extract_hi_lo_f32(input_packed[0], input_packed[6]),
304 extract_lo_hi_f32(input_packed[1], input_packed[6]),
305 extract_hi_lo_f32(input_packed[1], input_packed[7]),
306 extract_lo_hi_f32(input_packed[2], input_packed[7]),
307 extract_hi_lo_f32(input_packed[2], input_packed[8]),
308 extract_lo_hi_f32(input_packed[3], input_packed[8]),
309 extract_hi_lo_f32(input_packed[3], input_packed[9]),
310 extract_lo_hi_f32(input_packed[4], input_packed[9]),
311 extract_hi_lo_f32(input_packed[4], input_packed[10]),
312 extract_lo_hi_f32(input_packed[5], input_packed[10]),
313 ];
314
315 let out = self.perform_parallel_fft_direct(values);
316
317 let out_packed = [
318 extract_lo_lo_f32(out[0], out[1]),
319 extract_lo_lo_f32(out[2], out[3]),
320 extract_lo_lo_f32(out[4], out[5]),
321 extract_lo_lo_f32(out[6], out[7]),
322 extract_lo_lo_f32(out[8], out[9]),
323 extract_lo_hi_f32(out[10], out[0]),
324 extract_hi_hi_f32(out[1], out[2]),
325 extract_hi_hi_f32(out[3], out[4]),
326 extract_hi_hi_f32(out[5], out[6]),
327 extract_hi_hi_f32(out[7], out[8]),
328 extract_hi_hi_f32(out[9], out[10]),
329 ];
330
331 write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10 });
332 }
333
334 #[inline(always)]
335 pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 11]) -> [__m128; 11] {
336 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
337
338 let y00 = values[0];
339 let [x1p10, x1m10] = SseVector::column_butterfly2([values[1], values[10]]);
340 let x1m10 = SseVector::apply_rotate90(rotate, x1m10);
341 let y00 = SseVector::add(y00, x1p10);
342 let [x2p9, x2m9] = SseVector::column_butterfly2([values[2], values[9]]);
343 let x2m9 = SseVector::apply_rotate90(rotate, x2m9);
344 let y00 = SseVector::add(y00, x2p9);
345 let [x3p8, x3m8] = SseVector::column_butterfly2([values[3], values[8]]);
346 let x3m8 = SseVector::apply_rotate90(rotate, x3m8);
347 let y00 = SseVector::add(y00, x3p8);
348 let [x4p7, x4m7] = SseVector::column_butterfly2([values[4], values[7]]);
349 let x4m7 = SseVector::apply_rotate90(rotate, x4m7);
350 let y00 = SseVector::add(y00, x4p7);
351 let [x5p6, x5m6] = SseVector::column_butterfly2([values[5], values[6]]);
352 let x5m6 = SseVector::apply_rotate90(rotate, x5m6);
353 let y00 = SseVector::add(y00, x5p6);
354
355 let m0110a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p10);
356 let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[1], x2p9);
357 let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[2], x3p8);
358 let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[3], x4p7);
359 let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[4], x5p6);
360 let m0110b = SseVector::mul(self.twiddles_im[0], x1m10);
361 let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[1], x2m9);
362 let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[2], x3m8);
363 let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[3], x4m7);
364 let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[4], x5m6);
365 let [y01, y10] = SseVector::column_butterfly2([m0110a, m0110b]);
366
367 let m0209a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p10);
368 let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[3], x2p9);
369 let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[4], x3p8);
370 let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[2], x4p7);
371 let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[0], x5p6);
372 let m0209b = SseVector::mul(self.twiddles_im[1], x1m10);
373 let m0209b = SseVector::fmadd(m0209b, self.twiddles_im[3], x2m9);
374 let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[4], x3m8);
375 let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[2], x4m7);
376 let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[0], x5m6);
377 let [y02, y09] = SseVector::column_butterfly2([m0209a, m0209b]);
378
379 let m0308a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p10);
380 let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[4], x2p9);
381 let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[1], x3p8);
382 let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[0], x4p7);
383 let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[3], x5p6);
384 let m0308b = SseVector::mul(self.twiddles_im[2], x1m10);
385 let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[4], x2m9);
386 let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[1], x3m8);
387 let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[0], x4m7);
388 let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[3], x5m6);
389 let [y03, y08] = SseVector::column_butterfly2([m0308a, m0308b]);
390
391 let m0407a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p10);
392 let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[2], x2p9);
393 let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[0], x3p8);
394 let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[4], x4p7);
395 let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[1], x5p6);
396 let m0407b = SseVector::mul(self.twiddles_im[3], x1m10);
397 let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[2], x2m9);
398 let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[0], x3m8);
399 let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[4], x4m7);
400 let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[1], x5m6);
401 let [y04, y07] = SseVector::column_butterfly2([m0407a, m0407b]);
402
403 let m0506a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p10);
404 let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[0], x2p9);
405 let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[3], x3p8);
406 let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[1], x4p7);
407 let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[2], x5p6);
408 let m0506b = SseVector::mul(self.twiddles_im[4], x1m10);
409 let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[0], x2m9);
410 let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[3], x3m8);
411 let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[1], x4m7);
412 let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[2], x5m6);
413 let [y05, y06] = SseVector::column_butterfly2([m0506a, m0506b]);
414
415
416 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10]
417 }
418}
419
420struct SseF64Butterfly11<T> {
421 direction: FftDirection,
422 twiddles_re: [__m128d; 5],
423 twiddles_im: [__m128d; 5],
424 _phantom: std::marker::PhantomData<T>,
425}
426
427boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly11);
428boilerplate_fft_sse_common_butterfly!(SseF64Butterfly11, 11, |this: &SseF64Butterfly11<_>| this.direction);
429impl<T: FftNum> SseF64Butterfly11<T> {
430 #[target_feature(enable = "sse4.1")]
432 unsafe fn new(direction: FftDirection) -> Self {
433 assert_f64::<T>();
434 let twiddles = make_twiddles(11, direction);
435 unsafe {Self {
436 direction,
437 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
438 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
439 _phantom: std::marker::PhantomData,
440 }}
441 }
442
443 #[inline(always)]
444 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
445 let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10 });
446
447 let out = self.perform_fft_direct(values);
448
449 write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10 });
450 }
451
452 #[inline(always)]
453 pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 11]) -> [__m128d; 11] {
454 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
455
456 let y00 = values[0];
457 let [x1p10, x1m10] = SseVector::column_butterfly2([values[1], values[10]]);
458 let x1m10 = SseVector::apply_rotate90(rotate, x1m10);
459 let y00 = SseVector::add(y00, x1p10);
460 let [x2p9, x2m9] = SseVector::column_butterfly2([values[2], values[9]]);
461 let x2m9 = SseVector::apply_rotate90(rotate, x2m9);
462 let y00 = SseVector::add(y00, x2p9);
463 let [x3p8, x3m8] = SseVector::column_butterfly2([values[3], values[8]]);
464 let x3m8 = SseVector::apply_rotate90(rotate, x3m8);
465 let y00 = SseVector::add(y00, x3p8);
466 let [x4p7, x4m7] = SseVector::column_butterfly2([values[4], values[7]]);
467 let x4m7 = SseVector::apply_rotate90(rotate, x4m7);
468 let y00 = SseVector::add(y00, x4p7);
469 let [x5p6, x5m6] = SseVector::column_butterfly2([values[5], values[6]]);
470 let x5m6 = SseVector::apply_rotate90(rotate, x5m6);
471 let y00 = SseVector::add(y00, x5p6);
472
473 let m0110a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p10);
474 let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[1], x2p9);
475 let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[2], x3p8);
476 let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[3], x4p7);
477 let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[4], x5p6);
478 let m0110b = SseVector::mul(self.twiddles_im[0], x1m10);
479 let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[1], x2m9);
480 let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[2], x3m8);
481 let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[3], x4m7);
482 let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[4], x5m6);
483 let [y01, y10] = SseVector::column_butterfly2([m0110a, m0110b]);
484
485 let m0209a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p10);
486 let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[3], x2p9);
487 let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[4], x3p8);
488 let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[2], x4p7);
489 let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[0], x5p6);
490 let m0209b = SseVector::mul(self.twiddles_im[1], x1m10);
491 let m0209b = SseVector::fmadd(m0209b, self.twiddles_im[3], x2m9);
492 let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[4], x3m8);
493 let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[2], x4m7);
494 let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[0], x5m6);
495 let [y02, y09] = SseVector::column_butterfly2([m0209a, m0209b]);
496
497 let m0308a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p10);
498 let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[4], x2p9);
499 let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[1], x3p8);
500 let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[0], x4p7);
501 let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[3], x5p6);
502 let m0308b = SseVector::mul(self.twiddles_im[2], x1m10);
503 let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[4], x2m9);
504 let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[1], x3m8);
505 let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[0], x4m7);
506 let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[3], x5m6);
507 let [y03, y08] = SseVector::column_butterfly2([m0308a, m0308b]);
508
509 let m0407a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p10);
510 let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[2], x2p9);
511 let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[0], x3p8);
512 let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[4], x4p7);
513 let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[1], x5p6);
514 let m0407b = SseVector::mul(self.twiddles_im[3], x1m10);
515 let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[2], x2m9);
516 let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[0], x3m8);
517 let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[4], x4m7);
518 let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[1], x5m6);
519 let [y04, y07] = SseVector::column_butterfly2([m0407a, m0407b]);
520
521 let m0506a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p10);
522 let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[0], x2p9);
523 let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[3], x3p8);
524 let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[1], x4p7);
525 let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[2], x5p6);
526 let m0506b = SseVector::mul(self.twiddles_im[4], x1m10);
527 let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[0], x2m9);
528 let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[3], x3m8);
529 let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[1], x4m7);
530 let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[2], x5m6);
531 let [y05, y06] = SseVector::column_butterfly2([m0506a, m0506b]);
532
533
534 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10]
535 }
536}
537
538struct SseF32Butterfly13<T> {
539 direction: FftDirection,
540 twiddles_re: [__m128; 6],
541 twiddles_im: [__m128; 6],
542 _phantom: std::marker::PhantomData<T>,
543}
544
545boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly13);
546boilerplate_fft_sse_common_butterfly!(SseF32Butterfly13, 13, |this: &SseF32Butterfly13<_>| this.direction);
547impl<T: FftNum> SseF32Butterfly13<T> {
548 #[target_feature(enable = "sse4.1")]
550 unsafe fn new(direction: FftDirection) -> Self {
551 assert_f32::<T>();
552 let twiddles = make_twiddles(13, direction);
553 Self {
554 direction,
555 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
556 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
557 _phantom: std::marker::PhantomData,
558 }
559 }
560
561 #[inline(always)]
562 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
563 let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });
564
565 let out = self.perform_parallel_fft_direct(values);
566
567 write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 } );
568 }
569
570 #[inline(always)]
571 pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
572 let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24 });
573
574 let values = [
575 extract_lo_hi_f32(input_packed[0], input_packed[6]),
576 extract_hi_lo_f32(input_packed[0], input_packed[7]),
577 extract_lo_hi_f32(input_packed[1], input_packed[7]),
578 extract_hi_lo_f32(input_packed[1], input_packed[8]),
579 extract_lo_hi_f32(input_packed[2], input_packed[8]),
580 extract_hi_lo_f32(input_packed[2], input_packed[9]),
581 extract_lo_hi_f32(input_packed[3], input_packed[9]),
582 extract_hi_lo_f32(input_packed[3], input_packed[10]),
583 extract_lo_hi_f32(input_packed[4], input_packed[10]),
584 extract_hi_lo_f32(input_packed[4], input_packed[11]),
585 extract_lo_hi_f32(input_packed[5], input_packed[11]),
586 extract_hi_lo_f32(input_packed[5], input_packed[12]),
587 extract_lo_hi_f32(input_packed[6], input_packed[12]),
588 ];
589
590 let out = self.perform_parallel_fft_direct(values);
591
592 let out_packed = [
593 extract_lo_lo_f32(out[0], out[1]),
594 extract_lo_lo_f32(out[2], out[3]),
595 extract_lo_lo_f32(out[4], out[5]),
596 extract_lo_lo_f32(out[6], out[7]),
597 extract_lo_lo_f32(out[8], out[9]),
598 extract_lo_lo_f32(out[10], out[11]),
599 extract_lo_hi_f32(out[12], out[0]),
600 extract_hi_hi_f32(out[1], out[2]),
601 extract_hi_hi_f32(out[3], out[4]),
602 extract_hi_hi_f32(out[5], out[6]),
603 extract_hi_hi_f32(out[7], out[8]),
604 extract_hi_hi_f32(out[9], out[10]),
605 extract_hi_hi_f32(out[11], out[12]),
606 ];
607
608 write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });
609 }
610
611 #[inline(always)]
612 pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 13]) -> [__m128; 13] {
613 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
614
615 let y00 = values[0];
616 let [x1p12, x1m12] = SseVector::column_butterfly2([values[1], values[12]]);
617 let x1m12 = SseVector::apply_rotate90(rotate, x1m12);
618 let y00 = SseVector::add(y00, x1p12);
619 let [x2p11, x2m11] = SseVector::column_butterfly2([values[2], values[11]]);
620 let x2m11 = SseVector::apply_rotate90(rotate, x2m11);
621 let y00 = SseVector::add(y00, x2p11);
622 let [x3p10, x3m10] = SseVector::column_butterfly2([values[3], values[10]]);
623 let x3m10 = SseVector::apply_rotate90(rotate, x3m10);
624 let y00 = SseVector::add(y00, x3p10);
625 let [x4p9, x4m9] = SseVector::column_butterfly2([values[4], values[9]]);
626 let x4m9 = SseVector::apply_rotate90(rotate, x4m9);
627 let y00 = SseVector::add(y00, x4p9);
628 let [x5p8, x5m8] = SseVector::column_butterfly2([values[5], values[8]]);
629 let x5m8 = SseVector::apply_rotate90(rotate, x5m8);
630 let y00 = SseVector::add(y00, x5p8);
631 let [x6p7, x6m7] = SseVector::column_butterfly2([values[6], values[7]]);
632 let x6m7 = SseVector::apply_rotate90(rotate, x6m7);
633 let y00 = SseVector::add(y00, x6p7);
634
635 let m0112a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p12);
636 let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[1], x2p11);
637 let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[2], x3p10);
638 let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[3], x4p9);
639 let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[4], x5p8);
640 let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[5], x6p7);
641 let m0112b = SseVector::mul(self.twiddles_im[0], x1m12);
642 let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[1], x2m11);
643 let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[2], x3m10);
644 let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[3], x4m9);
645 let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[4], x5m8);
646 let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[5], x6m7);
647 let [y01, y12] = SseVector::column_butterfly2([m0112a, m0112b]);
648
649 let m0211a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p12);
650 let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[3], x2p11);
651 let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[5], x3p10);
652 let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[4], x4p9);
653 let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[2], x5p8);
654 let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[0], x6p7);
655 let m0211b = SseVector::mul(self.twiddles_im[1], x1m12);
656 let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[3], x2m11);
657 let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[5], x3m10);
658 let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[4], x4m9);
659 let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[2], x5m8);
660 let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[0], x6m7);
661 let [y02, y11] = SseVector::column_butterfly2([m0211a, m0211b]);
662
663 let m0310a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p12);
664 let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[5], x2p11);
665 let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[3], x3p10);
666 let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[0], x4p9);
667 let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[1], x5p8);
668 let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[4], x6p7);
669 let m0310b = SseVector::mul(self.twiddles_im[2], x1m12);
670 let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[5], x2m11);
671 let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[3], x3m10);
672 let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[0], x4m9);
673 let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[1], x5m8);
674 let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[4], x6m7);
675 let [y03, y10] = SseVector::column_butterfly2([m0310a, m0310b]);
676
677 let m0409a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p12);
678 let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[4], x2p11);
679 let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[0], x3p10);
680 let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[2], x4p9);
681 let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[5], x5p8);
682 let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[1], x6p7);
683 let m0409b = SseVector::mul(self.twiddles_im[3], x1m12);
684 let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[4], x2m11);
685 let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[0], x3m10);
686 let m0409b = SseVector::fmadd(m0409b, self.twiddles_im[2], x4m9);
687 let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[5], x5m8);
688 let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[1], x6m7);
689 let [y04, y09] = SseVector::column_butterfly2([m0409a, m0409b]);
690
691 let m0508a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p12);
692 let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[2], x2p11);
693 let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[1], x3p10);
694 let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[5], x4p9);
695 let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[0], x5p8);
696 let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[3], x6p7);
697 let m0508b = SseVector::mul(self.twiddles_im[4], x1m12);
698 let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[2], x2m11);
699 let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[1], x3m10);
700 let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[5], x4m9);
701 let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[0], x5m8);
702 let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[3], x6m7);
703 let [y05, y08] = SseVector::column_butterfly2([m0508a, m0508b]);
704
705 let m0607a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p12);
706 let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[0], x2p11);
707 let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[4], x3p10);
708 let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[1], x4p9);
709 let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[3], x5p8);
710 let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[2], x6p7);
711 let m0607b = SseVector::mul(self.twiddles_im[5], x1m12);
712 let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[0], x2m11);
713 let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[4], x3m10);
714 let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[1], x4m9);
715 let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[3], x5m8);
716 let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[2], x6m7);
717 let [y06, y07] = SseVector::column_butterfly2([m0607a, m0607b]);
718
719
720 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12]
721 }
722}
723
/// SSE butterfly for length-13 FFTs over `f64` data.
///
/// Stores the transform direction together with the real and imaginary parts
/// of six twiddle factors; `new` broadcasts each scalar part into its own
/// `__m128d` so the kernel can multiply whole vectors by it.
724struct SseF64Butterfly13<T> {
/// Direction passed to `new`; also the value returned by the closure handed
/// to `boilerplate_fft_sse_common_butterfly!` below.
725 direction: FftDirection,
/// Real parts of the six values returned by `make_twiddles(13, direction)`.
726 twiddles_re: [__m128d; 6],
/// Imaginary parts of the same six values.
727 twiddles_im: [__m128d; 6],
/// Ties the struct to the scalar type `T` without storing a `T`.
728 _phantom: std::marker::PhantomData<T>,
729}
730
// Macro-generated boilerplate for this type. The macros are defined elsewhere
// in the crate; the second one receives the FFT length (13) and a closure that
// reads the stored direction.
// NOTE(review): presumably these produce the `Fft`/`Length`/`Direction` impls
// and the buffer-driving entry points — confirm against the macro definitions.
731boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly13);
732boilerplate_fft_sse_common_butterfly!(SseF64Butterfly13, 13, |this: &SseF64Butterfly13<_>| this.direction);
733impl<T: FftNum> SseF64Butterfly13<T> {
/// Builds a length-13 butterfly for `direction`.
///
/// Calls `assert_f64::<T>()` first (presumably a check that `T` is `f64` —
/// confirm in `sse_common`), then splits the six values returned by
/// `make_twiddles(13, direction)` into real and imaginary parts and
/// broadcasts each part into a vector.
///
/// # Safety
/// The function is compiled with `#[target_feature(enable = "sse4.1")]`, so
/// the caller must ensure the running CPU supports SSE4.1.
734 #[target_feature(enable = "sse4.1")]
736 unsafe fn new(direction: FftDirection) -> Self {
737 assert_f64::<T>();
738 let twiddles = make_twiddles(13, direction);
// NOTE(review): `SseF32Butterfly17::new` builds the same kind of struct
// without this inner `unsafe` block — confirm which form the crate's
// edition/lint settings expect.
739 unsafe {Self {
740 direction,
741 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
742 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
743 _phantom: std::marker::PhantomData,
744 }}
745 }
746
/// Runs one length-13 FFT in place on `buffer`.
///
/// Loads elements 0..=12 through `read_complex_to_array!`, transforms them
/// with `perform_fft_direct`, and stores the results back to the same
/// indices through `write_complex_to_array!`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 support and a buffer holding at
/// least 13 complex values — confirm against the macro definitions.
747 #[inline(always)]
748 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
749 let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });
750
751 let out = self.perform_fft_direct(values);
752
753 write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });
754 }
755
/// Computes the 13 outputs from 13 input vectors held in registers.
///
/// Inputs are paired symmetrically (`k` with `13 - k`). For every pair the
/// kernel keeps the first result of `column_butterfly2` (`x{k}p{13-k}`) and
/// the second result after a 90-degree rotation (`x{k}m{13-k}`). Each
/// output pair `(y_k, y_{13-k})` is then formed by one more
/// `column_butterfly2` over two accumulators: an `a` chain weighted by
/// `twiddles_re` and a `b` chain weighted by `twiddles_im`.
///
/// NOTE(review): presumably `column_butterfly2([a, b])` yields
/// `[a + b, a - b]`, `fmadd(acc, x, y)` is `acc + x * y` and
/// `nmadd(acc, x, y)` is `acc - x * y` — confirm in `sse_vector`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 support — confirm.
756 #[inline(always)]
757 pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 13]) -> [__m128d; 13] {
// The rotation is always built for `FftDirection::Inverse`, independent of
// `self.direction`.
// NOTE(review): presumably the direction is already encoded in the
// twiddle signs — confirm.
758 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
759
// Pairing stage. `y00` accumulates `values[0]` plus every pair's first
// butterfly output, i.e. it ends up combining all 13 inputs.
760 let y00 = values[0];
761 let [x1p12, x1m12] = SseVector::column_butterfly2([values[1], values[12]]);
762 let x1m12 = SseVector::apply_rotate90(rotate, x1m12);
763 let y00 = SseVector::add(y00, x1p12);
764 let [x2p11, x2m11] = SseVector::column_butterfly2([values[2], values[11]]);
765 let x2m11 = SseVector::apply_rotate90(rotate, x2m11);
766 let y00 = SseVector::add(y00, x2p11);
767 let [x3p10, x3m10] = SseVector::column_butterfly2([values[3], values[10]]);
768 let x3m10 = SseVector::apply_rotate90(rotate, x3m10);
769 let y00 = SseVector::add(y00, x3p10);
770 let [x4p9, x4m9] = SseVector::column_butterfly2([values[4], values[9]]);
771 let x4m9 = SseVector::apply_rotate90(rotate, x4m9);
772 let y00 = SseVector::add(y00, x4p9);
773 let [x5p8, x5m8] = SseVector::column_butterfly2([values[5], values[8]]);
774 let x5m8 = SseVector::apply_rotate90(rotate, x5m8);
775 let y00 = SseVector::add(y00, x5p8);
776 let [x6p7, x6m7] = SseVector::column_butterfly2([values[6], values[7]]);
777 let x6m7 = SseVector::apply_rotate90(rotate, x6m7);
778 let y00 = SseVector::add(y00, x6p7);
779
// Output rows. For row k and pair j the twiddle slot used is
// (k * j mod 13) - 1, folded to 13 - (k * j mod 13) - 1 when the product
// exceeds 6; in the folded case the `b` chain uses `nmadd` instead of
// `fmadd`. The statement order within each chain fixes the floating-point
// summation order, so it must not be rearranged.
// NOTE(review): this reading assumes `twiddles[i]` is the (i + 1)-th power
// of the base twiddle — confirm in `make_twiddles`.

// Row 1 -> outputs 1 and 12.
780 let m0112a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p12);
781 let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[1], x2p11);
782 let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[2], x3p10);
783 let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[3], x4p9);
784 let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[4], x5p8);
785 let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[5], x6p7);
786 let m0112b = SseVector::mul(self.twiddles_im[0], x1m12);
787 let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[1], x2m11);
788 let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[2], x3m10);
789 let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[3], x4m9);
790 let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[4], x5m8);
791 let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[5], x6m7);
792 let [y01, y12] = SseVector::column_butterfly2([m0112a, m0112b]);
793
// Row 2 -> outputs 2 and 11.
794 let m0211a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p12);
795 let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[3], x2p11);
796 let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[5], x3p10);
797 let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[4], x4p9);
798 let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[2], x5p8);
799 let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[0], x6p7);
800 let m0211b = SseVector::mul(self.twiddles_im[1], x1m12);
801 let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[3], x2m11);
802 let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[5], x3m10);
803 let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[4], x4m9);
804 let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[2], x5m8);
805 let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[0], x6m7);
806 let [y02, y11] = SseVector::column_butterfly2([m0211a, m0211b]);
807
// Row 3 -> outputs 3 and 10.
808 let m0310a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p12);
809 let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[5], x2p11);
810 let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[3], x3p10);
811 let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[0], x4p9);
812 let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[1], x5p8);
813 let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[4], x6p7);
814 let m0310b = SseVector::mul(self.twiddles_im[2], x1m12);
815 let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[5], x2m11);
816 let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[3], x3m10);
817 let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[0], x4m9);
818 let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[1], x5m8);
819 let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[4], x6m7);
820 let [y03, y10] = SseVector::column_butterfly2([m0310a, m0310b]);
821
// Row 4 -> outputs 4 and 9.
822 let m0409a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p12);
823 let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[4], x2p11);
824 let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[0], x3p10);
825 let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[2], x4p9);
826 let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[5], x5p8);
827 let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[1], x6p7);
828 let m0409b = SseVector::mul(self.twiddles_im[3], x1m12);
829 let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[4], x2m11);
830 let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[0], x3m10);
831 let m0409b = SseVector::fmadd(m0409b, self.twiddles_im[2], x4m9);
832 let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[5], x5m8);
833 let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[1], x6m7);
834 let [y04, y09] = SseVector::column_butterfly2([m0409a, m0409b]);
835
// Row 5 -> outputs 5 and 8.
836 let m0508a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p12);
837 let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[2], x2p11);
838 let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[1], x3p10);
839 let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[5], x4p9);
840 let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[0], x5p8);
841 let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[3], x6p7);
842 let m0508b = SseVector::mul(self.twiddles_im[4], x1m12);
843 let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[2], x2m11);
844 let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[1], x3m10);
845 let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[5], x4m9);
846 let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[0], x5m8);
847 let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[3], x6m7);
848 let [y05, y08] = SseVector::column_butterfly2([m0508a, m0508b]);
849
// Row 6 -> outputs 6 and 7.
850 let m0607a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p12);
851 let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[0], x2p11);
852 let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[4], x3p10);
853 let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[1], x4p9);
854 let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[3], x5p8);
855 let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[2], x6p7);
856 let m0607b = SseVector::mul(self.twiddles_im[5], x1m12);
857 let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[0], x2m11);
858 let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[4], x3m10);
859 let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[1], x4m9);
860 let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[3], x5m8);
861 let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[2], x6m7);
862 let [y06, y07] = SseVector::column_butterfly2([m0607a, m0607b]);
863
864
// Results are returned in natural index order.
865 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12]
866 }
867}
868
/// SSE butterfly for length-17 FFTs over `f32` data.
///
/// Stores the transform direction together with the real and imaginary parts
/// of eight twiddle factors; `new` broadcasts each scalar part into its own
/// `__m128` so the kernel can multiply whole vectors by it.
869struct SseF32Butterfly17<T> {
/// Direction passed to `new`; also the value returned by the closure handed
/// to `boilerplate_fft_sse_common_butterfly!` below.
870 direction: FftDirection,
/// Real parts of the eight values returned by `make_twiddles(17, direction)`.
871 twiddles_re: [__m128; 8],
/// Imaginary parts of the same eight values.
872 twiddles_im: [__m128; 8],
/// Ties the struct to the scalar type `T` without storing a `T`.
873 _phantom: std::marker::PhantomData<T>,
874}
875
// Macro-generated boilerplate for this type. The macros are defined elsewhere
// in the crate; the second one receives the FFT length (17) and a closure that
// reads the stored direction.
// NOTE(review): presumably these produce the `Fft`/`Length`/`Direction` impls
// and the buffer-driving entry points — confirm against the macro definitions.
876boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly17);
877boilerplate_fft_sse_common_butterfly!(SseF32Butterfly17, 17, |this: &SseF32Butterfly17<_>| this.direction);
878impl<T: FftNum> SseF32Butterfly17<T> {
/// Builds a length-17 butterfly for `direction`.
///
/// Calls `assert_f32::<T>()` first (presumably a check that `T` is `f32` —
/// confirm in `sse_common`), then splits the eight values returned by
/// `make_twiddles(17, direction)` into real and imaginary parts and
/// broadcasts each part into a vector.
///
/// # Safety
/// The function is compiled with `#[target_feature(enable = "sse4.1")]`, so
/// the caller must ensure the running CPU supports SSE4.1.
879 #[target_feature(enable = "sse4.1")]
881 unsafe fn new(direction: FftDirection) -> Self {
882 assert_f32::<T>();
883 let twiddles = make_twiddles(17, direction);
884 Self {
885 direction,
886 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
887 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
888 _phantom: std::marker::PhantomData,
889 }
890 }
891
/// Runs a single length-17 FFT in place on `buffer`.
///
/// Loads elements 0..=16 through `read_partial1_complex_to_array!`, feeds
/// them to the same vector kernel used by the parallel path, and writes the
/// results back through `write_partial_lo_complex_to_array!`.
///
/// NOTE(review): presumably the "partial" macros fill and store only the
/// low half of each `__m128`, so one lane pair of the kernel goes unused
/// here — confirm against the macro definitions.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 support and a buffer holding at
/// least 17 complex values — confirm.
892 #[inline(always)]
893 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
894 let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
895
896 let out = self.perform_parallel_fft_direct(values);
897
898 write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
899 }
900
/// Runs the kernel on 17 full vectors loaded from `buffer` and writes 17
/// full vectors back.
///
/// Vectors are loaded at the even offsets 0, 2, ..., 32, reshuffled with
/// the `extract_*_f32` helpers into the kernel's input layout, transformed,
/// reshuffled back, and stored with a stride of 2.
///
/// NOTE(review): presumably each `__m128` holds two consecutive complex
/// `f32` values, so the 34 values are two back-to-back length-17 FFTs that
/// are processed side by side (first FFT in the low half of every kernel
/// input, second FFT in the high half) — confirm in `sse_utils`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 support and a buffer holding at
/// least 34 complex values — confirm.
901 #[inline(always)]
902 pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
903 let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 });
904
// Kernel input j takes one half from `input_packed[j / 2]` and one half
// from `input_packed[8 + (j + 1) / 2]`; the lo/hi choice alternates with
// the parity of j because 17 is odd.
905 let values = [
906 extract_lo_hi_f32(input_packed[0], input_packed[8]),
907 extract_hi_lo_f32(input_packed[0], input_packed[9]),
908 extract_lo_hi_f32(input_packed[1], input_packed[9]),
909 extract_hi_lo_f32(input_packed[1], input_packed[10]),
910 extract_lo_hi_f32(input_packed[2], input_packed[10]),
911 extract_hi_lo_f32(input_packed[2], input_packed[11]),
912 extract_lo_hi_f32(input_packed[3], input_packed[11]),
913 extract_hi_lo_f32(input_packed[3], input_packed[12]),
914 extract_lo_hi_f32(input_packed[4], input_packed[12]),
915 extract_hi_lo_f32(input_packed[4], input_packed[13]),
916 extract_lo_hi_f32(input_packed[5], input_packed[13]),
917 extract_hi_lo_f32(input_packed[5], input_packed[14]),
918 extract_lo_hi_f32(input_packed[6], input_packed[14]),
919 extract_hi_lo_f32(input_packed[6], input_packed[15]),
920 extract_lo_hi_f32(input_packed[7], input_packed[15]),
921 extract_hi_lo_f32(input_packed[7], input_packed[16]),
922 extract_lo_hi_f32(input_packed[8], input_packed[16]),
923 ];
924
925 let out = self.perform_parallel_fft_direct(values);
926
// Repack for storing: the first eight vectors pair up low halves of
// consecutive outputs, the ninth joins the low half of `out[16]` with the
// high half of `out[0]`, and the remaining eight pair up high halves.
927 let out_packed = [
928 extract_lo_lo_f32(out[0], out[1]),
929 extract_lo_lo_f32(out[2], out[3]),
930 extract_lo_lo_f32(out[4], out[5]),
931 extract_lo_lo_f32(out[6], out[7]),
932 extract_lo_lo_f32(out[8], out[9]),
933 extract_lo_lo_f32(out[10], out[11]),
934 extract_lo_lo_f32(out[12], out[13]),
935 extract_lo_lo_f32(out[14], out[15]),
936 extract_lo_hi_f32(out[16], out[0]),
937 extract_hi_hi_f32(out[1], out[2]),
938 extract_hi_hi_f32(out[3], out[4]),
939 extract_hi_hi_f32(out[5], out[6]),
940 extract_hi_hi_f32(out[7], out[8]),
941 extract_hi_hi_f32(out[9], out[10]),
942 extract_hi_hi_f32(out[11], out[12]),
943 extract_hi_hi_f32(out[13], out[14]),
944 extract_hi_hi_f32(out[15], out[16]),
945 ];
946
947 write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
948 }
949
/// Computes the 17 outputs from 17 input vectors held in registers.
///
/// Every operation acts on whole `__m128` vectors, so whatever each vector
/// half carries is transformed alongside the other half.
///
/// Inputs are paired symmetrically (`k` with `17 - k`). For every pair the
/// kernel keeps the first result of `column_butterfly2` (`x{k}p{17-k}`) and
/// the second result after a 90-degree rotation (`x{k}m{17-k}`). Each
/// output pair `(y_k, y_{17-k})` is then formed by one more
/// `column_butterfly2` over two accumulators: an `a` chain weighted by
/// `twiddles_re` and a `b` chain weighted by `twiddles_im`.
///
/// NOTE(review): presumably `column_butterfly2([a, b])` yields
/// `[a + b, a - b]`, `fmadd(acc, x, y)` is `acc + x * y` and
/// `nmadd(acc, x, y)` is `acc - x * y` — confirm in `sse_vector`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 support — confirm.
950 #[inline(always)]
951 pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 17]) -> [__m128; 17] {
// The rotation is always built for `FftDirection::Inverse`, independent of
// `self.direction`.
// NOTE(review): presumably the direction is already encoded in the
// twiddle signs — confirm.
952 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
953
// Pairing stage. `y00` accumulates `values[0]` plus every pair's first
// butterfly output, i.e. it ends up combining all 17 inputs.
954 let y00 = values[0];
955 let [x1p16, x1m16] = SseVector::column_butterfly2([values[1], values[16]]);
956 let x1m16 = SseVector::apply_rotate90(rotate, x1m16);
957 let y00 = SseVector::add(y00, x1p16);
958 let [x2p15, x2m15] = SseVector::column_butterfly2([values[2], values[15]]);
959 let x2m15 = SseVector::apply_rotate90(rotate, x2m15);
960 let y00 = SseVector::add(y00, x2p15);
961 let [x3p14, x3m14] = SseVector::column_butterfly2([values[3], values[14]]);
962 let x3m14 = SseVector::apply_rotate90(rotate, x3m14);
963 let y00 = SseVector::add(y00, x3p14);
964 let [x4p13, x4m13] = SseVector::column_butterfly2([values[4], values[13]]);
965 let x4m13 = SseVector::apply_rotate90(rotate, x4m13);
966 let y00 = SseVector::add(y00, x4p13);
967 let [x5p12, x5m12] = SseVector::column_butterfly2([values[5], values[12]]);
968 let x5m12 = SseVector::apply_rotate90(rotate, x5m12);
969 let y00 = SseVector::add(y00, x5p12);
970 let [x6p11, x6m11] = SseVector::column_butterfly2([values[6], values[11]]);
971 let x6m11 = SseVector::apply_rotate90(rotate, x6m11);
972 let y00 = SseVector::add(y00, x6p11);
973 let [x7p10, x7m10] = SseVector::column_butterfly2([values[7], values[10]]);
974 let x7m10 = SseVector::apply_rotate90(rotate, x7m10);
975 let y00 = SseVector::add(y00, x7p10);
976 let [x8p9, x8m9] = SseVector::column_butterfly2([values[8], values[9]]);
977 let x8m9 = SseVector::apply_rotate90(rotate, x8m9);
978 let y00 = SseVector::add(y00, x8p9);
979
// Output rows. For row k and pair j the twiddle slot used is
// (k * j mod 17) - 1, folded to 17 - (k * j mod 17) - 1 when the product
// exceeds 8; in the folded case the `b` chain uses `nmadd` instead of
// `fmadd`. The statement order within each chain fixes the floating-point
// summation order, so it must not be rearranged.
// NOTE(review): this reading assumes `twiddles[i]` is the (i + 1)-th power
// of the base twiddle — confirm in `make_twiddles`.

// Row 1 -> outputs 1 and 16.
980 let m0116a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p16);
981 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[1], x2p15);
982 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[2], x3p14);
983 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[3], x4p13);
984 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[4], x5p12);
985 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[5], x6p11);
986 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[6], x7p10);
987 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[7], x8p9);
988 let m0116b = SseVector::mul(self.twiddles_im[0], x1m16);
989 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[1], x2m15);
990 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[2], x3m14);
991 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[3], x4m13);
992 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[4], x5m12);
993 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[5], x6m11);
994 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[6], x7m10);
995 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[7], x8m9);
996 let [y01, y16] = SseVector::column_butterfly2([m0116a, m0116b]);
997
// Row 2 -> outputs 2 and 15.
998 let m0215a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p16);
999 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[3], x2p15);
1000 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[5], x3p14);
1001 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[7], x4p13);
1002 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[6], x5p12);
1003 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[4], x6p11);
1004 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[2], x7p10);
1005 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[0], x8p9);
1006 let m0215b = SseVector::mul(self.twiddles_im[1], x1m16);
1007 let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[3], x2m15);
1008 let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[5], x3m14);
1009 let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[7], x4m13);
1010 let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[6], x5m12);
1011 let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[4], x6m11);
1012 let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[2], x7m10);
1013 let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[0], x8m9);
1014 let [y02, y15] = SseVector::column_butterfly2([m0215a, m0215b]);
1015
// Row 3 -> outputs 3 and 14.
1016 let m0314a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p16);
1017 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[5], x2p15);
1018 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[7], x3p14);
1019 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[4], x4p13);
1020 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[1], x5p12);
1021 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[0], x6p11);
1022 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[3], x7p10);
1023 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[6], x8p9);
1024 let m0314b = SseVector::mul(self.twiddles_im[2], x1m16);
1025 let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[5], x2m15);
1026 let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[7], x3m14);
1027 let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[4], x4m13);
1028 let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[1], x5m12);
1029 let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[0], x6m11);
1030 let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[3], x7m10);
1031 let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[6], x8m9);
1032 let [y03, y14] = SseVector::column_butterfly2([m0314a, m0314b]);
1033
// Row 4 -> outputs 4 and 13.
1034 let m0413a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p16);
1035 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[7], x2p15);
1036 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[4], x3p14);
1037 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[0], x4p13);
1038 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[2], x5p12);
1039 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[6], x6p11);
1040 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[5], x7p10);
1041 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[1], x8p9);
1042 let m0413b = SseVector::mul(self.twiddles_im[3], x1m16);
1043 let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[7], x2m15);
1044 let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[4], x3m14);
1045 let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[0], x4m13);
1046 let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[2], x5m12);
1047 let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[6], x6m11);
1048 let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[5], x7m10);
1049 let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[1], x8m9);
1050 let [y04, y13] = SseVector::column_butterfly2([m0413a, m0413b]);
1051
// Row 5 -> outputs 5 and 12.
1052 let m0512a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p16);
1053 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[6], x2p15);
1054 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[1], x3p14);
1055 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[2], x4p13);
1056 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[7], x5p12);
1057 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[3], x6p11);
1058 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[0], x7p10);
1059 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[5], x8p9);
1060 let m0512b = SseVector::mul(self.twiddles_im[4], x1m16);
1061 let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[6], x2m15);
1062 let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[1], x3m14);
1063 let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[2], x4m13);
1064 let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[7], x5m12);
1065 let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[3], x6m11);
1066 let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[0], x7m10);
1067 let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[5], x8m9);
1068 let [y05, y12] = SseVector::column_butterfly2([m0512a, m0512b]);
1069
// Row 6 -> outputs 6 and 11.
1070 let m0611a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p16);
1071 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[4], x2p15);
1072 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[0], x3p14);
1073 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[6], x4p13);
1074 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[3], x5p12);
1075 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[1], x6p11);
1076 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[7], x7p10);
1077 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[2], x8p9);
1078 let m0611b = SseVector::mul(self.twiddles_im[5], x1m16);
1079 let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[4], x2m15);
1080 let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[0], x3m14);
1081 let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[6], x4m13);
1082 let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[3], x5m12);
1083 let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[1], x6m11);
1084 let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[7], x7m10);
1085 let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[2], x8m9);
1086 let [y06, y11] = SseVector::column_butterfly2([m0611a, m0611b]);
1087
// Row 7 -> outputs 7 and 10.
1088 let m0710a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p16);
1089 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[2], x2p15);
1090 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[3], x3p14);
1091 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[5], x4p13);
1092 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[0], x5p12);
1093 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[7], x6p11);
1094 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[1], x7p10);
1095 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[4], x8p9);
1096 let m0710b = SseVector::mul(self.twiddles_im[6], x1m16);
1097 let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[2], x2m15);
1098 let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[3], x3m14);
1099 let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[5], x4m13);
1100 let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[0], x5m12);
1101 let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[7], x6m11);
1102 let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[1], x7m10);
1103 let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[4], x8m9);
1104 let [y07, y10] = SseVector::column_butterfly2([m0710a, m0710b]);
1105
// Row 8 -> outputs 8 and 9.
1106 let m0809a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p16);
1107 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[0], x2p15);
1108 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[6], x3p14);
1109 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[1], x4p13);
1110 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[5], x5p12);
1111 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[2], x6p11);
1112 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[4], x7p10);
1113 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[3], x8p9);
1114 let m0809b = SseVector::mul(self.twiddles_im[7], x1m16);
1115 let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[0], x2m15);
1116 let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[6], x3m14);
1117 let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[1], x4m13);
1118 let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[5], x5m12);
1119 let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[2], x6m11);
1120 let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[4], x7m10);
1121 let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[3], x8m9);
1122 let [y08, y09] = SseVector::column_butterfly2([m0809a, m0809b]);
1123
1124
// Results are returned in natural index order.
1125 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16]
1126 }
1127}
1128
/// SSE butterfly for length-17 FFTs over `f64` data.
///
/// Stores the transform direction together with the real and imaginary parts
/// of eight twiddle factors; `new` broadcasts each scalar part into its own
/// `__m128d` so the kernel can multiply whole vectors by it.
1129struct SseF64Butterfly17<T> {
/// Direction passed to `new`; also the value returned by the closure handed
/// to `boilerplate_fft_sse_common_butterfly!` below.
1130 direction: FftDirection,
/// Real parts of the eight values returned by `make_twiddles(17, direction)`.
1131 twiddles_re: [__m128d; 8],
/// Imaginary parts of the same eight values.
1132 twiddles_im: [__m128d; 8],
/// Ties the struct to the scalar type `T` without storing a `T`.
1133 _phantom: std::marker::PhantomData<T>,
1134}
1135
// Macro-generated boilerplate for this type. The macros are defined elsewhere
// in the crate; the second one receives the FFT length (17) and a closure that
// reads the stored direction.
// NOTE(review): presumably these produce the `Fft`/`Length`/`Direction` impls
// and the buffer-driving entry points — confirm against the macro definitions.
1136boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly17);
1137boilerplate_fft_sse_common_butterfly!(SseF64Butterfly17, 17, |this: &SseF64Butterfly17<_>| this.direction);
1138impl<T: FftNum> SseF64Butterfly17<T> {
1139 #[target_feature(enable = "sse4.1")]
1141 unsafe fn new(direction: FftDirection) -> Self {
1142 assert_f64::<T>();
1143 let twiddles = make_twiddles(17, direction);
1144 unsafe {Self {
1145 direction,
1146 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
1147 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
1148 _phantom: std::marker::PhantomData,
1149 }}
1150 }
1151
1152 #[inline(always)]
1153 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
1154 let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
1155
1156 let out = self.perform_fft_direct(values);
1157
1158 write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
1159 }
1160
1161 #[inline(always)]
1162 pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 17]) -> [__m128d; 17] {
1163 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
1164
1165 let y00 = values[0];
1166 let [x1p16, x1m16] = SseVector::column_butterfly2([values[1], values[16]]);
1167 let x1m16 = SseVector::apply_rotate90(rotate, x1m16);
1168 let y00 = SseVector::add(y00, x1p16);
1169 let [x2p15, x2m15] = SseVector::column_butterfly2([values[2], values[15]]);
1170 let x2m15 = SseVector::apply_rotate90(rotate, x2m15);
1171 let y00 = SseVector::add(y00, x2p15);
1172 let [x3p14, x3m14] = SseVector::column_butterfly2([values[3], values[14]]);
1173 let x3m14 = SseVector::apply_rotate90(rotate, x3m14);
1174 let y00 = SseVector::add(y00, x3p14);
1175 let [x4p13, x4m13] = SseVector::column_butterfly2([values[4], values[13]]);
1176 let x4m13 = SseVector::apply_rotate90(rotate, x4m13);
1177 let y00 = SseVector::add(y00, x4p13);
1178 let [x5p12, x5m12] = SseVector::column_butterfly2([values[5], values[12]]);
1179 let x5m12 = SseVector::apply_rotate90(rotate, x5m12);
1180 let y00 = SseVector::add(y00, x5p12);
1181 let [x6p11, x6m11] = SseVector::column_butterfly2([values[6], values[11]]);
1182 let x6m11 = SseVector::apply_rotate90(rotate, x6m11);
1183 let y00 = SseVector::add(y00, x6p11);
1184 let [x7p10, x7m10] = SseVector::column_butterfly2([values[7], values[10]]);
1185 let x7m10 = SseVector::apply_rotate90(rotate, x7m10);
1186 let y00 = SseVector::add(y00, x7p10);
1187 let [x8p9, x8m9] = SseVector::column_butterfly2([values[8], values[9]]);
1188 let x8m9 = SseVector::apply_rotate90(rotate, x8m9);
1189 let y00 = SseVector::add(y00, x8p9);
1190
1191 let m0116a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p16);
1192 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[1], x2p15);
1193 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[2], x3p14);
1194 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[3], x4p13);
1195 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[4], x5p12);
1196 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[5], x6p11);
1197 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[6], x7p10);
1198 let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[7], x8p9);
1199 let m0116b = SseVector::mul(self.twiddles_im[0], x1m16);
1200 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[1], x2m15);
1201 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[2], x3m14);
1202 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[3], x4m13);
1203 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[4], x5m12);
1204 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[5], x6m11);
1205 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[6], x7m10);
1206 let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[7], x8m9);
1207 let [y01, y16] = SseVector::column_butterfly2([m0116a, m0116b]);
1208
1209 let m0215a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p16);
1210 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[3], x2p15);
1211 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[5], x3p14);
1212 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[7], x4p13);
1213 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[6], x5p12);
1214 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[4], x6p11);
1215 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[2], x7p10);
1216 let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[0], x8p9);
1217 let m0215b = SseVector::mul(self.twiddles_im[1], x1m16);
1218 let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[3], x2m15);
1219 let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[5], x3m14);
1220 let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[7], x4m13);
1221 let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[6], x5m12);
1222 let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[4], x6m11);
1223 let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[2], x7m10);
1224 let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[0], x8m9);
1225 let [y02, y15] = SseVector::column_butterfly2([m0215a, m0215b]);
1226
1227 let m0314a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p16);
1228 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[5], x2p15);
1229 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[7], x3p14);
1230 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[4], x4p13);
1231 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[1], x5p12);
1232 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[0], x6p11);
1233 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[3], x7p10);
1234 let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[6], x8p9);
1235 let m0314b = SseVector::mul(self.twiddles_im[2], x1m16);
1236 let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[5], x2m15);
1237 let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[7], x3m14);
1238 let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[4], x4m13);
1239 let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[1], x5m12);
1240 let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[0], x6m11);
1241 let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[3], x7m10);
1242 let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[6], x8m9);
1243 let [y03, y14] = SseVector::column_butterfly2([m0314a, m0314b]);
1244
1245 let m0413a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p16);
1246 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[7], x2p15);
1247 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[4], x3p14);
1248 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[0], x4p13);
1249 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[2], x5p12);
1250 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[6], x6p11);
1251 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[5], x7p10);
1252 let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[1], x8p9);
1253 let m0413b = SseVector::mul(self.twiddles_im[3], x1m16);
1254 let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[7], x2m15);
1255 let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[4], x3m14);
1256 let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[0], x4m13);
1257 let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[2], x5m12);
1258 let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[6], x6m11);
1259 let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[5], x7m10);
1260 let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[1], x8m9);
1261 let [y04, y13] = SseVector::column_butterfly2([m0413a, m0413b]);
1262
1263 let m0512a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p16);
1264 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[6], x2p15);
1265 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[1], x3p14);
1266 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[2], x4p13);
1267 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[7], x5p12);
1268 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[3], x6p11);
1269 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[0], x7p10);
1270 let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[5], x8p9);
1271 let m0512b = SseVector::mul(self.twiddles_im[4], x1m16);
1272 let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[6], x2m15);
1273 let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[1], x3m14);
1274 let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[2], x4m13);
1275 let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[7], x5m12);
1276 let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[3], x6m11);
1277 let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[0], x7m10);
1278 let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[5], x8m9);
1279 let [y05, y12] = SseVector::column_butterfly2([m0512a, m0512b]);
1280
1281 let m0611a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p16);
1282 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[4], x2p15);
1283 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[0], x3p14);
1284 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[6], x4p13);
1285 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[3], x5p12);
1286 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[1], x6p11);
1287 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[7], x7p10);
1288 let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[2], x8p9);
1289 let m0611b = SseVector::mul(self.twiddles_im[5], x1m16);
1290 let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[4], x2m15);
1291 let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[0], x3m14);
1292 let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[6], x4m13);
1293 let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[3], x5m12);
1294 let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[1], x6m11);
1295 let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[7], x7m10);
1296 let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[2], x8m9);
1297 let [y06, y11] = SseVector::column_butterfly2([m0611a, m0611b]);
1298
1299 let m0710a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p16);
1300 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[2], x2p15);
1301 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[3], x3p14);
1302 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[5], x4p13);
1303 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[0], x5p12);
1304 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[7], x6p11);
1305 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[1], x7p10);
1306 let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[4], x8p9);
1307 let m0710b = SseVector::mul(self.twiddles_im[6], x1m16);
1308 let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[2], x2m15);
1309 let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[3], x3m14);
1310 let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[5], x4m13);
1311 let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[0], x5m12);
1312 let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[7], x6m11);
1313 let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[1], x7m10);
1314 let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[4], x8m9);
1315 let [y07, y10] = SseVector::column_butterfly2([m0710a, m0710b]);
1316
1317 let m0809a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p16);
1318 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[0], x2p15);
1319 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[6], x3p14);
1320 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[1], x4p13);
1321 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[5], x5p12);
1322 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[2], x6p11);
1323 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[4], x7p10);
1324 let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[3], x8p9);
1325 let m0809b = SseVector::mul(self.twiddles_im[7], x1m16);
1326 let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[0], x2m15);
1327 let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[6], x3m14);
1328 let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[1], x4m13);
1329 let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[5], x5m12);
1330 let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[2], x6m11);
1331 let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[4], x7m10);
1332 let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[3], x8m9);
1333 let [y08, y09] = SseVector::column_butterfly2([m0809a, m0809b]);
1334
1335
1336 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16]
1337 }
1338}
1339
1340struct SseF32Butterfly19<T> {
1341 direction: FftDirection,
1342 twiddles_re: [__m128; 9],
1343 twiddles_im: [__m128; 9],
1344 _phantom: std::marker::PhantomData<T>,
1345}
1346
1347boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly19);
1348boilerplate_fft_sse_common_butterfly!(SseF32Butterfly19, 19, |this: &SseF32Butterfly19<_>| this.direction);
1349impl<T: FftNum> SseF32Butterfly19<T> {
1350 #[target_feature(enable = "sse4.1")]
1352 unsafe fn new(direction: FftDirection) -> Self {
1353 assert_f32::<T>();
1354 let twiddles = make_twiddles(19, direction);
1355 Self {
1356 direction,
1357 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
1358 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
1359 _phantom: std::marker::PhantomData,
1360 }
1361 }
1362
1363 #[inline(always)]
1364 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
1365 let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 });
1366
1367 let out = self.perform_parallel_fft_direct(values);
1368
1369 write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 } );
1370 }
1371
1372 #[inline(always)]
1373 pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
1374 let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36 });
1375
1376 let values = [
1377 extract_lo_hi_f32(input_packed[0], input_packed[9]),
1378 extract_hi_lo_f32(input_packed[0], input_packed[10]),
1379 extract_lo_hi_f32(input_packed[1], input_packed[10]),
1380 extract_hi_lo_f32(input_packed[1], input_packed[11]),
1381 extract_lo_hi_f32(input_packed[2], input_packed[11]),
1382 extract_hi_lo_f32(input_packed[2], input_packed[12]),
1383 extract_lo_hi_f32(input_packed[3], input_packed[12]),
1384 extract_hi_lo_f32(input_packed[3], input_packed[13]),
1385 extract_lo_hi_f32(input_packed[4], input_packed[13]),
1386 extract_hi_lo_f32(input_packed[4], input_packed[14]),
1387 extract_lo_hi_f32(input_packed[5], input_packed[14]),
1388 extract_hi_lo_f32(input_packed[5], input_packed[15]),
1389 extract_lo_hi_f32(input_packed[6], input_packed[15]),
1390 extract_hi_lo_f32(input_packed[6], input_packed[16]),
1391 extract_lo_hi_f32(input_packed[7], input_packed[16]),
1392 extract_hi_lo_f32(input_packed[7], input_packed[17]),
1393 extract_lo_hi_f32(input_packed[8], input_packed[17]),
1394 extract_hi_lo_f32(input_packed[8], input_packed[18]),
1395 extract_lo_hi_f32(input_packed[9], input_packed[18]),
1396 ];
1397
1398 let out = self.perform_parallel_fft_direct(values);
1399
1400 let out_packed = [
1401 extract_lo_lo_f32(out[0], out[1]),
1402 extract_lo_lo_f32(out[2], out[3]),
1403 extract_lo_lo_f32(out[4], out[5]),
1404 extract_lo_lo_f32(out[6], out[7]),
1405 extract_lo_lo_f32(out[8], out[9]),
1406 extract_lo_lo_f32(out[10], out[11]),
1407 extract_lo_lo_f32(out[12], out[13]),
1408 extract_lo_lo_f32(out[14], out[15]),
1409 extract_lo_lo_f32(out[16], out[17]),
1410 extract_lo_hi_f32(out[18], out[0]),
1411 extract_hi_hi_f32(out[1], out[2]),
1412 extract_hi_hi_f32(out[3], out[4]),
1413 extract_hi_hi_f32(out[5], out[6]),
1414 extract_hi_hi_f32(out[7], out[8]),
1415 extract_hi_hi_f32(out[9], out[10]),
1416 extract_hi_hi_f32(out[11], out[12]),
1417 extract_hi_hi_f32(out[13], out[14]),
1418 extract_hi_hi_f32(out[15], out[16]),
1419 extract_hi_hi_f32(out[17], out[18]),
1420 ];
1421
1422 write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 });
1423 }
1424
1425 #[inline(always)]
1426 pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 19]) -> [__m128; 19] {
1427 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
1428
1429 let y00 = values[0];
1430 let [x1p18, x1m18] = SseVector::column_butterfly2([values[1], values[18]]);
1431 let x1m18 = SseVector::apply_rotate90(rotate, x1m18);
1432 let y00 = SseVector::add(y00, x1p18);
1433 let [x2p17, x2m17] = SseVector::column_butterfly2([values[2], values[17]]);
1434 let x2m17 = SseVector::apply_rotate90(rotate, x2m17);
1435 let y00 = SseVector::add(y00, x2p17);
1436 let [x3p16, x3m16] = SseVector::column_butterfly2([values[3], values[16]]);
1437 let x3m16 = SseVector::apply_rotate90(rotate, x3m16);
1438 let y00 = SseVector::add(y00, x3p16);
1439 let [x4p15, x4m15] = SseVector::column_butterfly2([values[4], values[15]]);
1440 let x4m15 = SseVector::apply_rotate90(rotate, x4m15);
1441 let y00 = SseVector::add(y00, x4p15);
1442 let [x5p14, x5m14] = SseVector::column_butterfly2([values[5], values[14]]);
1443 let x5m14 = SseVector::apply_rotate90(rotate, x5m14);
1444 let y00 = SseVector::add(y00, x5p14);
1445 let [x6p13, x6m13] = SseVector::column_butterfly2([values[6], values[13]]);
1446 let x6m13 = SseVector::apply_rotate90(rotate, x6m13);
1447 let y00 = SseVector::add(y00, x6p13);
1448 let [x7p12, x7m12] = SseVector::column_butterfly2([values[7], values[12]]);
1449 let x7m12 = SseVector::apply_rotate90(rotate, x7m12);
1450 let y00 = SseVector::add(y00, x7p12);
1451 let [x8p11, x8m11] = SseVector::column_butterfly2([values[8], values[11]]);
1452 let x8m11 = SseVector::apply_rotate90(rotate, x8m11);
1453 let y00 = SseVector::add(y00, x8p11);
1454 let [x9p10, x9m10] = SseVector::column_butterfly2([values[9], values[10]]);
1455 let x9m10 = SseVector::apply_rotate90(rotate, x9m10);
1456 let y00 = SseVector::add(y00, x9p10);
1457
1458 let m0118a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p18);
1459 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[1], x2p17);
1460 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[2], x3p16);
1461 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[3], x4p15);
1462 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[4], x5p14);
1463 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[5], x6p13);
1464 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[6], x7p12);
1465 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[7], x8p11);
1466 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[8], x9p10);
1467 let m0118b = SseVector::mul(self.twiddles_im[0], x1m18);
1468 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[1], x2m17);
1469 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[2], x3m16);
1470 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[3], x4m15);
1471 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[4], x5m14);
1472 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[5], x6m13);
1473 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[6], x7m12);
1474 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[7], x8m11);
1475 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[8], x9m10);
1476 let [y01, y18] = SseVector::column_butterfly2([m0118a, m0118b]);
1477
1478 let m0217a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p18);
1479 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[3], x2p17);
1480 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[5], x3p16);
1481 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[7], x4p15);
1482 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[8], x5p14);
1483 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[6], x6p13);
1484 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[4], x7p12);
1485 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[2], x8p11);
1486 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[0], x9p10);
1487 let m0217b = SseVector::mul(self.twiddles_im[1], x1m18);
1488 let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[3], x2m17);
1489 let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[5], x3m16);
1490 let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[7], x4m15);
1491 let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[8], x5m14);
1492 let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[6], x6m13);
1493 let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[4], x7m12);
1494 let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[2], x8m11);
1495 let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[0], x9m10);
1496 let [y02, y17] = SseVector::column_butterfly2([m0217a, m0217b]);
1497
1498 let m0316a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p18);
1499 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[5], x2p17);
1500 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[8], x3p16);
1501 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[6], x4p15);
1502 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[3], x5p14);
1503 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[0], x6p13);
1504 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[1], x7p12);
1505 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[4], x8p11);
1506 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[7], x9p10);
1507 let m0316b = SseVector::mul(self.twiddles_im[2], x1m18);
1508 let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[5], x2m17);
1509 let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[8], x3m16);
1510 let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[6], x4m15);
1511 let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[3], x5m14);
1512 let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[0], x6m13);
1513 let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[1], x7m12);
1514 let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[4], x8m11);
1515 let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[7], x9m10);
1516 let [y03, y16] = SseVector::column_butterfly2([m0316a, m0316b]);
1517
1518 let m0415a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p18);
1519 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[7], x2p17);
1520 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[6], x3p16);
1521 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[2], x4p15);
1522 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[0], x5p14);
1523 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[4], x6p13);
1524 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[8], x7p12);
1525 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[5], x8p11);
1526 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[1], x9p10);
1527 let m0415b = SseVector::mul(self.twiddles_im[3], x1m18);
1528 let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[7], x2m17);
1529 let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[6], x3m16);
1530 let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[2], x4m15);
1531 let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[0], x5m14);
1532 let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[4], x6m13);
1533 let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[8], x7m12);
1534 let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[5], x8m11);
1535 let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[1], x9m10);
1536 let [y04, y15] = SseVector::column_butterfly2([m0415a, m0415b]);
1537
1538 let m0514a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p18);
1539 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[8], x2p17);
1540 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[3], x3p16);
1541 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[0], x4p15);
1542 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[5], x5p14);
1543 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[7], x6p13);
1544 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[2], x7p12);
1545 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[1], x8p11);
1546 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[6], x9p10);
1547 let m0514b = SseVector::mul(self.twiddles_im[4], x1m18);
1548 let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[8], x2m17);
1549 let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[3], x3m16);
1550 let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[0], x4m15);
1551 let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[5], x5m14);
1552 let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[7], x6m13);
1553 let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[2], x7m12);
1554 let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[1], x8m11);
1555 let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[6], x9m10);
1556 let [y05, y14] = SseVector::column_butterfly2([m0514a, m0514b]);
1557
1558 let m0613a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p18);
1559 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[6], x2p17);
1560 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[0], x3p16);
1561 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[4], x4p15);
1562 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[7], x5p14);
1563 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[1], x6p13);
1564 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[3], x7p12);
1565 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[8], x8p11);
1566 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[2], x9p10);
1567 let m0613b = SseVector::mul(self.twiddles_im[5], x1m18);
1568 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[6], x2m17);
1569 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[0], x3m16);
1570 let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[4], x4m15);
1571 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[7], x5m14);
1572 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[1], x6m13);
1573 let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[3], x7m12);
1574 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[8], x8m11);
1575 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[2], x9m10);
1576 let [y06, y13] = SseVector::column_butterfly2([m0613a, m0613b]);
1577
1578 let m0712a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p18);
1579 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[4], x2p17);
1580 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[1], x3p16);
1581 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[8], x4p15);
1582 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[2], x5p14);
1583 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[3], x6p13);
1584 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[7], x7p12);
1585 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[0], x8p11);
1586 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[5], x9p10);
1587 let m0712b = SseVector::mul(self.twiddles_im[6], x1m18);
1588 let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[4], x2m17);
1589 let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[1], x3m16);
1590 let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[8], x4m15);
1591 let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[2], x5m14);
1592 let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[3], x6m13);
1593 let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[7], x7m12);
1594 let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[0], x8m11);
1595 let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[5], x9m10);
1596 let [y07, y12] = SseVector::column_butterfly2([m0712a, m0712b]);
1597
1598 let m0811a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p18);
1599 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[2], x2p17);
1600 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[4], x3p16);
1601 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[5], x4p15);
1602 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[1], x5p14);
1603 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[8], x6p13);
1604 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[0], x7p12);
1605 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[6], x8p11);
1606 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[3], x9p10);
1607 let m0811b = SseVector::mul(self.twiddles_im[7], x1m18);
1608 let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[2], x2m17);
1609 let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[4], x3m16);
1610 let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[5], x4m15);
1611 let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[1], x5m14);
1612 let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[8], x6m13);
1613 let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[0], x7m12);
1614 let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[6], x8m11);
1615 let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[3], x9m10);
1616 let [y08, y11] = SseVector::column_butterfly2([m0811a, m0811b]);
1617
1618 let m0910a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p18);
1619 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[0], x2p17);
1620 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[7], x3p16);
1621 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[1], x4p15);
1622 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[6], x5p14);
1623 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[2], x6p13);
1624 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[5], x7p12);
1625 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[3], x8p11);
1626 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[4], x9p10);
1627 let m0910b = SseVector::mul(self.twiddles_im[8], x1m18);
1628 let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[0], x2m17);
1629 let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[7], x3m16);
1630 let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[1], x4m15);
1631 let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[6], x5m14);
1632 let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[2], x6m13);
1633 let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[5], x7m12);
1634 let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[3], x8m11);
1635 let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[4], x9m10);
1636 let [y09, y10] = SseVector::column_butterfly2([m0910a, m0910b]);
1637
1638
1639 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18]
1640 }
1641}
1642
1643struct SseF64Butterfly19<T> {
1644 direction: FftDirection,
1645 twiddles_re: [__m128d; 9],
1646 twiddles_im: [__m128d; 9],
1647 _phantom: std::marker::PhantomData<T>,
1648}
1649
1650boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly19);
1651boilerplate_fft_sse_common_butterfly!(SseF64Butterfly19, 19, |this: &SseF64Butterfly19<_>| this.direction);
1652impl<T: FftNum> SseF64Butterfly19<T> {
1653 #[target_feature(enable = "sse4.1")]
1655 unsafe fn new(direction: FftDirection) -> Self {
1656 assert_f64::<T>();
1657 let twiddles = make_twiddles(19, direction);
1658 unsafe {Self {
1659 direction,
1660 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
1661 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
1662 _phantom: std::marker::PhantomData,
1663 }}
1664 }
1665
1666 #[inline(always)]
1667 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
1668 let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 });
1669
1670 let out = self.perform_fft_direct(values);
1671
1672 write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 });
1673 }
1674
1675 #[inline(always)]
1676 pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 19]) -> [__m128d; 19] {
1677 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
1678
1679 let y00 = values[0];
1680 let [x1p18, x1m18] = SseVector::column_butterfly2([values[1], values[18]]);
1681 let x1m18 = SseVector::apply_rotate90(rotate, x1m18);
1682 let y00 = SseVector::add(y00, x1p18);
1683 let [x2p17, x2m17] = SseVector::column_butterfly2([values[2], values[17]]);
1684 let x2m17 = SseVector::apply_rotate90(rotate, x2m17);
1685 let y00 = SseVector::add(y00, x2p17);
1686 let [x3p16, x3m16] = SseVector::column_butterfly2([values[3], values[16]]);
1687 let x3m16 = SseVector::apply_rotate90(rotate, x3m16);
1688 let y00 = SseVector::add(y00, x3p16);
1689 let [x4p15, x4m15] = SseVector::column_butterfly2([values[4], values[15]]);
1690 let x4m15 = SseVector::apply_rotate90(rotate, x4m15);
1691 let y00 = SseVector::add(y00, x4p15);
1692 let [x5p14, x5m14] = SseVector::column_butterfly2([values[5], values[14]]);
1693 let x5m14 = SseVector::apply_rotate90(rotate, x5m14);
1694 let y00 = SseVector::add(y00, x5p14);
1695 let [x6p13, x6m13] = SseVector::column_butterfly2([values[6], values[13]]);
1696 let x6m13 = SseVector::apply_rotate90(rotate, x6m13);
1697 let y00 = SseVector::add(y00, x6p13);
1698 let [x7p12, x7m12] = SseVector::column_butterfly2([values[7], values[12]]);
1699 let x7m12 = SseVector::apply_rotate90(rotate, x7m12);
1700 let y00 = SseVector::add(y00, x7p12);
1701 let [x8p11, x8m11] = SseVector::column_butterfly2([values[8], values[11]]);
1702 let x8m11 = SseVector::apply_rotate90(rotate, x8m11);
1703 let y00 = SseVector::add(y00, x8p11);
1704 let [x9p10, x9m10] = SseVector::column_butterfly2([values[9], values[10]]);
1705 let x9m10 = SseVector::apply_rotate90(rotate, x9m10);
1706 let y00 = SseVector::add(y00, x9p10);
1707
1708 let m0118a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p18);
1709 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[1], x2p17);
1710 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[2], x3p16);
1711 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[3], x4p15);
1712 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[4], x5p14);
1713 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[5], x6p13);
1714 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[6], x7p12);
1715 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[7], x8p11);
1716 let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[8], x9p10);
1717 let m0118b = SseVector::mul(self.twiddles_im[0], x1m18);
1718 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[1], x2m17);
1719 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[2], x3m16);
1720 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[3], x4m15);
1721 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[4], x5m14);
1722 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[5], x6m13);
1723 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[6], x7m12);
1724 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[7], x8m11);
1725 let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[8], x9m10);
1726 let [y01, y18] = SseVector::column_butterfly2([m0118a, m0118b]);
1727
1728 let m0217a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p18);
1729 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[3], x2p17);
1730 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[5], x3p16);
1731 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[7], x4p15);
1732 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[8], x5p14);
1733 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[6], x6p13);
1734 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[4], x7p12);
1735 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[2], x8p11);
1736 let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[0], x9p10);
1737 let m0217b = SseVector::mul(self.twiddles_im[1], x1m18);
1738 let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[3], x2m17);
1739 let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[5], x3m16);
1740 let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[7], x4m15);
1741 let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[8], x5m14);
1742 let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[6], x6m13);
1743 let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[4], x7m12);
1744 let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[2], x8m11);
1745 let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[0], x9m10);
1746 let [y02, y17] = SseVector::column_butterfly2([m0217a, m0217b]);
1747
1748 let m0316a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p18);
1749 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[5], x2p17);
1750 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[8], x3p16);
1751 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[6], x4p15);
1752 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[3], x5p14);
1753 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[0], x6p13);
1754 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[1], x7p12);
1755 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[4], x8p11);
1756 let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[7], x9p10);
1757 let m0316b = SseVector::mul(self.twiddles_im[2], x1m18);
1758 let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[5], x2m17);
1759 let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[8], x3m16);
1760 let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[6], x4m15);
1761 let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[3], x5m14);
1762 let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[0], x6m13);
1763 let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[1], x7m12);
1764 let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[4], x8m11);
1765 let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[7], x9m10);
1766 let [y03, y16] = SseVector::column_butterfly2([m0316a, m0316b]);
1767
1768 let m0415a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p18);
1769 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[7], x2p17);
1770 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[6], x3p16);
1771 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[2], x4p15);
1772 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[0], x5p14);
1773 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[4], x6p13);
1774 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[8], x7p12);
1775 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[5], x8p11);
1776 let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[1], x9p10);
1777 let m0415b = SseVector::mul(self.twiddles_im[3], x1m18);
1778 let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[7], x2m17);
1779 let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[6], x3m16);
1780 let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[2], x4m15);
1781 let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[0], x5m14);
1782 let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[4], x6m13);
1783 let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[8], x7m12);
1784 let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[5], x8m11);
1785 let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[1], x9m10);
1786 let [y04, y15] = SseVector::column_butterfly2([m0415a, m0415b]);
1787
1788 let m0514a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p18);
1789 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[8], x2p17);
1790 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[3], x3p16);
1791 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[0], x4p15);
1792 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[5], x5p14);
1793 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[7], x6p13);
1794 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[2], x7p12);
1795 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[1], x8p11);
1796 let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[6], x9p10);
1797 let m0514b = SseVector::mul(self.twiddles_im[4], x1m18);
1798 let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[8], x2m17);
1799 let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[3], x3m16);
1800 let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[0], x4m15);
1801 let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[5], x5m14);
1802 let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[7], x6m13);
1803 let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[2], x7m12);
1804 let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[1], x8m11);
1805 let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[6], x9m10);
1806 let [y05, y14] = SseVector::column_butterfly2([m0514a, m0514b]);
1807
1808 let m0613a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p18);
1809 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[6], x2p17);
1810 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[0], x3p16);
1811 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[4], x4p15);
1812 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[7], x5p14);
1813 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[1], x6p13);
1814 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[3], x7p12);
1815 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[8], x8p11);
1816 let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[2], x9p10);
1817 let m0613b = SseVector::mul(self.twiddles_im[5], x1m18);
1818 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[6], x2m17);
1819 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[0], x3m16);
1820 let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[4], x4m15);
1821 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[7], x5m14);
1822 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[1], x6m13);
1823 let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[3], x7m12);
1824 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[8], x8m11);
1825 let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[2], x9m10);
1826 let [y06, y13] = SseVector::column_butterfly2([m0613a, m0613b]);
1827
1828 let m0712a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p18);
1829 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[4], x2p17);
1830 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[1], x3p16);
1831 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[8], x4p15);
1832 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[2], x5p14);
1833 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[3], x6p13);
1834 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[7], x7p12);
1835 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[0], x8p11);
1836 let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[5], x9p10);
1837 let m0712b = SseVector::mul(self.twiddles_im[6], x1m18);
1838 let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[4], x2m17);
1839 let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[1], x3m16);
1840 let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[8], x4m15);
1841 let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[2], x5m14);
1842 let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[3], x6m13);
1843 let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[7], x7m12);
1844 let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[0], x8m11);
1845 let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[5], x9m10);
1846 let [y07, y12] = SseVector::column_butterfly2([m0712a, m0712b]);
1847
1848 let m0811a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p18);
1849 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[2], x2p17);
1850 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[4], x3p16);
1851 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[5], x4p15);
1852 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[1], x5p14);
1853 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[8], x6p13);
1854 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[0], x7p12);
1855 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[6], x8p11);
1856 let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[3], x9p10);
1857 let m0811b = SseVector::mul(self.twiddles_im[7], x1m18);
1858 let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[2], x2m17);
1859 let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[4], x3m16);
1860 let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[5], x4m15);
1861 let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[1], x5m14);
1862 let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[8], x6m13);
1863 let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[0], x7m12);
1864 let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[6], x8m11);
1865 let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[3], x9m10);
1866 let [y08, y11] = SseVector::column_butterfly2([m0811a, m0811b]);
1867
1868 let m0910a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p18);
1869 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[0], x2p17);
1870 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[7], x3p16);
1871 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[1], x4p15);
1872 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[6], x5p14);
1873 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[2], x6p13);
1874 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[5], x7p12);
1875 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[3], x8p11);
1876 let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[4], x9p10);
1877 let m0910b = SseVector::mul(self.twiddles_im[8], x1m18);
1878 let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[0], x2m17);
1879 let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[7], x3m16);
1880 let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[1], x4m15);
1881 let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[6], x5m14);
1882 let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[2], x6m13);
1883 let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[5], x7m12);
1884 let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[3], x8m11);
1885 let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[4], x9m10);
1886 let [y09, y10] = SseVector::column_butterfly2([m0910a, m0910b]);
1887
1888
1889 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18]
1890 }
1891}
1892
1893struct SseF32Butterfly23<T> {
1894 direction: FftDirection,
1895 twiddles_re: [__m128; 11],
1896 twiddles_im: [__m128; 11],
1897 _phantom: std::marker::PhantomData<T>,
1898}
1899
1900boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly23);
1901boilerplate_fft_sse_common_butterfly!(SseF32Butterfly23, 23, |this: &SseF32Butterfly23<_>| this.direction);
1902impl<T: FftNum> SseF32Butterfly23<T> {
1903 #[target_feature(enable = "sse4.1")]
1905 unsafe fn new(direction: FftDirection) -> Self {
1906 assert_f32::<T>();
1907 let twiddles = make_twiddles(23, direction);
1908 Self {
1909 direction,
1910 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
1911 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
1912 _phantom: std::marker::PhantomData,
1913 }
1914 }
1915
1916 #[inline(always)]
1917 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
1918 let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 });
1919
1920 let out = self.perform_parallel_fft_direct(values);
1921
1922 write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 } );
1923 }
1924
1925 #[inline(always)]
1926 pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
1927 let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44 });
1928
1929 let values = [
1930 extract_lo_hi_f32(input_packed[0], input_packed[11]),
1931 extract_hi_lo_f32(input_packed[0], input_packed[12]),
1932 extract_lo_hi_f32(input_packed[1], input_packed[12]),
1933 extract_hi_lo_f32(input_packed[1], input_packed[13]),
1934 extract_lo_hi_f32(input_packed[2], input_packed[13]),
1935 extract_hi_lo_f32(input_packed[2], input_packed[14]),
1936 extract_lo_hi_f32(input_packed[3], input_packed[14]),
1937 extract_hi_lo_f32(input_packed[3], input_packed[15]),
1938 extract_lo_hi_f32(input_packed[4], input_packed[15]),
1939 extract_hi_lo_f32(input_packed[4], input_packed[16]),
1940 extract_lo_hi_f32(input_packed[5], input_packed[16]),
1941 extract_hi_lo_f32(input_packed[5], input_packed[17]),
1942 extract_lo_hi_f32(input_packed[6], input_packed[17]),
1943 extract_hi_lo_f32(input_packed[6], input_packed[18]),
1944 extract_lo_hi_f32(input_packed[7], input_packed[18]),
1945 extract_hi_lo_f32(input_packed[7], input_packed[19]),
1946 extract_lo_hi_f32(input_packed[8], input_packed[19]),
1947 extract_hi_lo_f32(input_packed[8], input_packed[20]),
1948 extract_lo_hi_f32(input_packed[9], input_packed[20]),
1949 extract_hi_lo_f32(input_packed[9], input_packed[21]),
1950 extract_lo_hi_f32(input_packed[10], input_packed[21]),
1951 extract_hi_lo_f32(input_packed[10], input_packed[22]),
1952 extract_lo_hi_f32(input_packed[11], input_packed[22]),
1953 ];
1954
1955 let out = self.perform_parallel_fft_direct(values);
1956
1957 let out_packed = [
1958 extract_lo_lo_f32(out[0], out[1]),
1959 extract_lo_lo_f32(out[2], out[3]),
1960 extract_lo_lo_f32(out[4], out[5]),
1961 extract_lo_lo_f32(out[6], out[7]),
1962 extract_lo_lo_f32(out[8], out[9]),
1963 extract_lo_lo_f32(out[10], out[11]),
1964 extract_lo_lo_f32(out[12], out[13]),
1965 extract_lo_lo_f32(out[14], out[15]),
1966 extract_lo_lo_f32(out[16], out[17]),
1967 extract_lo_lo_f32(out[18], out[19]),
1968 extract_lo_lo_f32(out[20], out[21]),
1969 extract_lo_hi_f32(out[22], out[0]),
1970 extract_hi_hi_f32(out[1], out[2]),
1971 extract_hi_hi_f32(out[3], out[4]),
1972 extract_hi_hi_f32(out[5], out[6]),
1973 extract_hi_hi_f32(out[7], out[8]),
1974 extract_hi_hi_f32(out[9], out[10]),
1975 extract_hi_hi_f32(out[11], out[12]),
1976 extract_hi_hi_f32(out[13], out[14]),
1977 extract_hi_hi_f32(out[15], out[16]),
1978 extract_hi_hi_f32(out[17], out[18]),
1979 extract_hi_hi_f32(out[19], out[20]),
1980 extract_hi_hi_f32(out[21], out[22]),
1981 ];
1982
1983 write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 });
1984 }
1985
1986 #[inline(always)]
1987 pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 23]) -> [__m128; 23] {
1988 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
1989
1990 let y00 = values[0];
1991 let [x1p22, x1m22] = SseVector::column_butterfly2([values[1], values[22]]);
1992 let x1m22 = SseVector::apply_rotate90(rotate, x1m22);
1993 let y00 = SseVector::add(y00, x1p22);
1994 let [x2p21, x2m21] = SseVector::column_butterfly2([values[2], values[21]]);
1995 let x2m21 = SseVector::apply_rotate90(rotate, x2m21);
1996 let y00 = SseVector::add(y00, x2p21);
1997 let [x3p20, x3m20] = SseVector::column_butterfly2([values[3], values[20]]);
1998 let x3m20 = SseVector::apply_rotate90(rotate, x3m20);
1999 let y00 = SseVector::add(y00, x3p20);
2000 let [x4p19, x4m19] = SseVector::column_butterfly2([values[4], values[19]]);
2001 let x4m19 = SseVector::apply_rotate90(rotate, x4m19);
2002 let y00 = SseVector::add(y00, x4p19);
2003 let [x5p18, x5m18] = SseVector::column_butterfly2([values[5], values[18]]);
2004 let x5m18 = SseVector::apply_rotate90(rotate, x5m18);
2005 let y00 = SseVector::add(y00, x5p18);
2006 let [x6p17, x6m17] = SseVector::column_butterfly2([values[6], values[17]]);
2007 let x6m17 = SseVector::apply_rotate90(rotate, x6m17);
2008 let y00 = SseVector::add(y00, x6p17);
2009 let [x7p16, x7m16] = SseVector::column_butterfly2([values[7], values[16]]);
2010 let x7m16 = SseVector::apply_rotate90(rotate, x7m16);
2011 let y00 = SseVector::add(y00, x7p16);
2012 let [x8p15, x8m15] = SseVector::column_butterfly2([values[8], values[15]]);
2013 let x8m15 = SseVector::apply_rotate90(rotate, x8m15);
2014 let y00 = SseVector::add(y00, x8p15);
2015 let [x9p14, x9m14] = SseVector::column_butterfly2([values[9], values[14]]);
2016 let x9m14 = SseVector::apply_rotate90(rotate, x9m14);
2017 let y00 = SseVector::add(y00, x9p14);
2018 let [x10p13, x10m13] = SseVector::column_butterfly2([values[10], values[13]]);
2019 let x10m13 = SseVector::apply_rotate90(rotate, x10m13);
2020 let y00 = SseVector::add(y00, x10p13);
2021 let [x11p12, x11m12] = SseVector::column_butterfly2([values[11], values[12]]);
2022 let x11m12 = SseVector::apply_rotate90(rotate, x11m12);
2023 let y00 = SseVector::add(y00, x11p12);
2024
2025 let m0122a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p22);
2026 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[1], x2p21);
2027 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[2], x3p20);
2028 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[3], x4p19);
2029 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[4], x5p18);
2030 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[5], x6p17);
2031 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[6], x7p16);
2032 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[7], x8p15);
2033 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[8], x9p14);
2034 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[9], x10p13);
2035 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[10], x11p12);
2036 let m0122b = SseVector::mul(self.twiddles_im[0], x1m22);
2037 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[1], x2m21);
2038 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[2], x3m20);
2039 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[3], x4m19);
2040 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[4], x5m18);
2041 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[5], x6m17);
2042 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[6], x7m16);
2043 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[7], x8m15);
2044 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[8], x9m14);
2045 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[9], x10m13);
2046 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[10], x11m12);
2047 let [y01, y22] = SseVector::column_butterfly2([m0122a, m0122b]);
2048
2049 let m0221a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p22);
2050 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[3], x2p21);
2051 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[5], x3p20);
2052 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[7], x4p19);
2053 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[9], x5p18);
2054 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[10], x6p17);
2055 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[8], x7p16);
2056 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[6], x8p15);
2057 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[4], x9p14);
2058 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[2], x10p13);
2059 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[0], x11p12);
2060 let m0221b = SseVector::mul(self.twiddles_im[1], x1m22);
2061 let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[3], x2m21);
2062 let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[5], x3m20);
2063 let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[7], x4m19);
2064 let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[9], x5m18);
2065 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[10], x6m17);
2066 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[8], x7m16);
2067 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[6], x8m15);
2068 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[4], x9m14);
2069 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[2], x10m13);
2070 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[0], x11m12);
2071 let [y02, y21] = SseVector::column_butterfly2([m0221a, m0221b]);
2072
2073 let m0320a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p22);
2074 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[5], x2p21);
2075 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[8], x3p20);
2076 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[10], x4p19);
2077 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[7], x5p18);
2078 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[4], x6p17);
2079 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[1], x7p16);
2080 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[0], x8p15);
2081 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[3], x9p14);
2082 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[6], x10p13);
2083 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[9], x11p12);
2084 let m0320b = SseVector::mul(self.twiddles_im[2], x1m22);
2085 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[5], x2m21);
2086 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[8], x3m20);
2087 let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[10], x4m19);
2088 let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[7], x5m18);
2089 let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[4], x6m17);
2090 let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[1], x7m16);
2091 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[0], x8m15);
2092 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[3], x9m14);
2093 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[6], x10m13);
2094 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[9], x11m12);
2095 let [y03, y20] = SseVector::column_butterfly2([m0320a, m0320b]);
2096
2097 let m0419a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p22);
2098 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[7], x2p21);
2099 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[10], x3p20);
2100 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[6], x4p19);
2101 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[2], x5p18);
2102 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[0], x6p17);
2103 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[4], x7p16);
2104 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[8], x8p15);
2105 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[9], x9p14);
2106 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[5], x10p13);
2107 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[1], x11p12);
2108 let m0419b = SseVector::mul(self.twiddles_im[3], x1m22);
2109 let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[7], x2m21);
2110 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[10], x3m20);
2111 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[6], x4m19);
2112 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[2], x5m18);
2113 let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[0], x6m17);
2114 let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[4], x7m16);
2115 let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[8], x8m15);
2116 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[9], x9m14);
2117 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[5], x10m13);
2118 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[1], x11m12);
2119 let [y04, y19] = SseVector::column_butterfly2([m0419a, m0419b]);
2120
2121 let m0518a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p22);
2122 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[9], x2p21);
2123 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[7], x3p20);
2124 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[2], x4p19);
2125 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[1], x5p18);
2126 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[6], x6p17);
2127 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[10], x7p16);
2128 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[5], x8p15);
2129 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[0], x9p14);
2130 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[3], x10p13);
2131 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[8], x11p12);
2132 let m0518b = SseVector::mul(self.twiddles_im[4], x1m22);
2133 let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[9], x2m21);
2134 let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[7], x3m20);
2135 let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[2], x4m19);
2136 let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[1], x5m18);
2137 let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[6], x6m17);
2138 let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[10], x7m16);
2139 let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[5], x8m15);
2140 let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[0], x9m14);
2141 let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[3], x10m13);
2142 let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[8], x11m12);
2143 let [y05, y18] = SseVector::column_butterfly2([m0518a, m0518b]);
2144
2145 let m0617a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p22);
2146 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[10], x2p21);
2147 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[4], x3p20);
2148 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[0], x4p19);
2149 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[6], x5p18);
2150 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[9], x6p17);
2151 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[3], x7p16);
2152 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[1], x8p15);
2153 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[7], x9p14);
2154 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[8], x10p13);
2155 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[2], x11p12);
2156 let m0617b = SseVector::mul(self.twiddles_im[5], x1m22);
2157 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[10], x2m21);
2158 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[4], x3m20);
2159 let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[0], x4m19);
2160 let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[6], x5m18);
2161 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[9], x6m17);
2162 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[3], x7m16);
2163 let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[1], x8m15);
2164 let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[7], x9m14);
2165 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[8], x10m13);
2166 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[2], x11m12);
2167 let [y06, y17] = SseVector::column_butterfly2([m0617a, m0617b]);
2168
2169 let m0716a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p22);
2170 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[8], x2p21);
2171 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[1], x3p20);
2172 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[4], x4p19);
2173 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[10], x5p18);
2174 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[3], x6p17);
2175 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[2], x7p16);
2176 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[9], x8p15);
2177 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[5], x9p14);
2178 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[0], x10p13);
2179 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[7], x11p12);
2180 let m0716b = SseVector::mul(self.twiddles_im[6], x1m22);
2181 let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[8], x2m21);
2182 let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[1], x3m20);
2183 let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[4], x4m19);
2184 let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[10], x5m18);
2185 let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[3], x6m17);
2186 let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[2], x7m16);
2187 let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[9], x8m15);
2188 let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[5], x9m14);
2189 let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[0], x10m13);
2190 let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[7], x11m12);
2191 let [y07, y16] = SseVector::column_butterfly2([m0716a, m0716b]);
2192
2193 let m0815a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p22);
2194 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[6], x2p21);
2195 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[0], x3p20);
2196 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[8], x4p19);
2197 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[5], x5p18);
2198 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[1], x6p17);
2199 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[9], x7p16);
2200 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[4], x8p15);
2201 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[2], x9p14);
2202 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[10], x10p13);
2203 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[3], x11p12);
2204 let m0815b = SseVector::mul(self.twiddles_im[7], x1m22);
2205 let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[6], x2m21);
2206 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[0], x3m20);
2207 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[8], x4m19);
2208 let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[5], x5m18);
2209 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[1], x6m17);
2210 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[9], x7m16);
2211 let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[4], x8m15);
2212 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[2], x9m14);
2213 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[10], x10m13);
2214 let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[3], x11m12);
2215 let [y08, y15] = SseVector::column_butterfly2([m0815a, m0815b]);
2216
2217 let m0914a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p22);
2218 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[4], x2p21);
2219 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[3], x3p20);
2220 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[9], x4p19);
2221 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[0], x5p18);
2222 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[7], x6p17);
2223 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[5], x7p16);
2224 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[2], x8p15);
2225 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[10], x9p14);
2226 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[1], x10p13);
2227 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[6], x11p12);
2228 let m0914b = SseVector::mul(self.twiddles_im[8], x1m22);
2229 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[4], x2m21);
2230 let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[3], x3m20);
2231 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[9], x4m19);
2232 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[0], x5m18);
2233 let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[7], x6m17);
2234 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[5], x7m16);
2235 let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[2], x8m15);
2236 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[10], x9m14);
2237 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[1], x10m13);
2238 let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[6], x11m12);
2239 let [y09, y14] = SseVector::column_butterfly2([m0914a, m0914b]);
2240
2241 let m1013a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p22);
2242 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[2], x2p21);
2243 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[6], x3p20);
2244 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[5], x4p19);
2245 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[3], x5p18);
2246 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[8], x6p17);
2247 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[0], x7p16);
2248 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[10], x8p15);
2249 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[1], x9p14);
2250 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[7], x10p13);
2251 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[4], x11p12);
2252 let m1013b = SseVector::mul(self.twiddles_im[9], x1m22);
2253 let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[2], x2m21);
2254 let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[6], x3m20);
2255 let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[5], x4m19);
2256 let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[3], x5m18);
2257 let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[8], x6m17);
2258 let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[0], x7m16);
2259 let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[10], x8m15);
2260 let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[1], x9m14);
2261 let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[7], x10m13);
2262 let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[4], x11m12);
2263 let [y10, y13] = SseVector::column_butterfly2([m1013a, m1013b]);
2264
2265 let m1112a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p22);
2266 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[0], x2p21);
2267 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[9], x3p20);
2268 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[1], x4p19);
2269 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[8], x5p18);
2270 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[2], x6p17);
2271 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[7], x7p16);
2272 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[3], x8p15);
2273 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[6], x9p14);
2274 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[4], x10p13);
2275 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[5], x11p12);
2276 let m1112b = SseVector::mul(self.twiddles_im[10], x1m22);
2277 let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[0], x2m21);
2278 let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[9], x3m20);
2279 let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[1], x4m19);
2280 let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[8], x5m18);
2281 let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[2], x6m17);
2282 let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[7], x7m16);
2283 let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[3], x8m15);
2284 let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[6], x9m14);
2285 let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[4], x10m13);
2286 let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[5], x11m12);
2287 let [y11, y12] = SseVector::column_butterfly2([m1112a, m1112b]);
2288
2289
2290 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22]
2291 }
2292}
2293
2294struct SseF64Butterfly23<T> {
2295 direction: FftDirection,
2296 twiddles_re: [__m128d; 11],
2297 twiddles_im: [__m128d; 11],
2298 _phantom: std::marker::PhantomData<T>,
2299}
2300
2301boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly23);
2302boilerplate_fft_sse_common_butterfly!(SseF64Butterfly23, 23, |this: &SseF64Butterfly23<_>| this.direction);
2303impl<T: FftNum> SseF64Butterfly23<T> {
2304 #[target_feature(enable = "sse4.1")]
2306 unsafe fn new(direction: FftDirection) -> Self {
2307 assert_f64::<T>();
2308 let twiddles = make_twiddles(23, direction);
2309 unsafe {Self {
2310 direction,
2311 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
2312 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
2313 _phantom: std::marker::PhantomData,
2314 }}
2315 }
2316
2317 #[inline(always)]
2318 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
2319 let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 });
2320
2321 let out = self.perform_fft_direct(values);
2322
2323 write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 });
2324 }
2325
2326 #[inline(always)]
2327 pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 23]) -> [__m128d; 23] {
2328 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
2329
2330 let y00 = values[0];
2331 let [x1p22, x1m22] = SseVector::column_butterfly2([values[1], values[22]]);
2332 let x1m22 = SseVector::apply_rotate90(rotate, x1m22);
2333 let y00 = SseVector::add(y00, x1p22);
2334 let [x2p21, x2m21] = SseVector::column_butterfly2([values[2], values[21]]);
2335 let x2m21 = SseVector::apply_rotate90(rotate, x2m21);
2336 let y00 = SseVector::add(y00, x2p21);
2337 let [x3p20, x3m20] = SseVector::column_butterfly2([values[3], values[20]]);
2338 let x3m20 = SseVector::apply_rotate90(rotate, x3m20);
2339 let y00 = SseVector::add(y00, x3p20);
2340 let [x4p19, x4m19] = SseVector::column_butterfly2([values[4], values[19]]);
2341 let x4m19 = SseVector::apply_rotate90(rotate, x4m19);
2342 let y00 = SseVector::add(y00, x4p19);
2343 let [x5p18, x5m18] = SseVector::column_butterfly2([values[5], values[18]]);
2344 let x5m18 = SseVector::apply_rotate90(rotate, x5m18);
2345 let y00 = SseVector::add(y00, x5p18);
2346 let [x6p17, x6m17] = SseVector::column_butterfly2([values[6], values[17]]);
2347 let x6m17 = SseVector::apply_rotate90(rotate, x6m17);
2348 let y00 = SseVector::add(y00, x6p17);
2349 let [x7p16, x7m16] = SseVector::column_butterfly2([values[7], values[16]]);
2350 let x7m16 = SseVector::apply_rotate90(rotate, x7m16);
2351 let y00 = SseVector::add(y00, x7p16);
2352 let [x8p15, x8m15] = SseVector::column_butterfly2([values[8], values[15]]);
2353 let x8m15 = SseVector::apply_rotate90(rotate, x8m15);
2354 let y00 = SseVector::add(y00, x8p15);
2355 let [x9p14, x9m14] = SseVector::column_butterfly2([values[9], values[14]]);
2356 let x9m14 = SseVector::apply_rotate90(rotate, x9m14);
2357 let y00 = SseVector::add(y00, x9p14);
2358 let [x10p13, x10m13] = SseVector::column_butterfly2([values[10], values[13]]);
2359 let x10m13 = SseVector::apply_rotate90(rotate, x10m13);
2360 let y00 = SseVector::add(y00, x10p13);
2361 let [x11p12, x11m12] = SseVector::column_butterfly2([values[11], values[12]]);
2362 let x11m12 = SseVector::apply_rotate90(rotate, x11m12);
2363 let y00 = SseVector::add(y00, x11p12);
2364
2365 let m0122a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p22);
2366 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[1], x2p21);
2367 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[2], x3p20);
2368 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[3], x4p19);
2369 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[4], x5p18);
2370 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[5], x6p17);
2371 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[6], x7p16);
2372 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[7], x8p15);
2373 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[8], x9p14);
2374 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[9], x10p13);
2375 let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[10], x11p12);
2376 let m0122b = SseVector::mul(self.twiddles_im[0], x1m22);
2377 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[1], x2m21);
2378 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[2], x3m20);
2379 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[3], x4m19);
2380 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[4], x5m18);
2381 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[5], x6m17);
2382 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[6], x7m16);
2383 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[7], x8m15);
2384 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[8], x9m14);
2385 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[9], x10m13);
2386 let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[10], x11m12);
2387 let [y01, y22] = SseVector::column_butterfly2([m0122a, m0122b]);
2388
2389 let m0221a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p22);
2390 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[3], x2p21);
2391 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[5], x3p20);
2392 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[7], x4p19);
2393 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[9], x5p18);
2394 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[10], x6p17);
2395 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[8], x7p16);
2396 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[6], x8p15);
2397 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[4], x9p14);
2398 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[2], x10p13);
2399 let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[0], x11p12);
2400 let m0221b = SseVector::mul(self.twiddles_im[1], x1m22);
2401 let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[3], x2m21);
2402 let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[5], x3m20);
2403 let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[7], x4m19);
2404 let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[9], x5m18);
2405 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[10], x6m17);
2406 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[8], x7m16);
2407 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[6], x8m15);
2408 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[4], x9m14);
2409 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[2], x10m13);
2410 let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[0], x11m12);
2411 let [y02, y21] = SseVector::column_butterfly2([m0221a, m0221b]);
2412
2413 let m0320a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p22);
2414 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[5], x2p21);
2415 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[8], x3p20);
2416 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[10], x4p19);
2417 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[7], x5p18);
2418 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[4], x6p17);
2419 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[1], x7p16);
2420 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[0], x8p15);
2421 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[3], x9p14);
2422 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[6], x10p13);
2423 let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[9], x11p12);
2424 let m0320b = SseVector::mul(self.twiddles_im[2], x1m22);
2425 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[5], x2m21);
2426 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[8], x3m20);
2427 let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[10], x4m19);
2428 let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[7], x5m18);
2429 let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[4], x6m17);
2430 let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[1], x7m16);
2431 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[0], x8m15);
2432 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[3], x9m14);
2433 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[6], x10m13);
2434 let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[9], x11m12);
2435 let [y03, y20] = SseVector::column_butterfly2([m0320a, m0320b]);
2436
2437 let m0419a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p22);
2438 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[7], x2p21);
2439 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[10], x3p20);
2440 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[6], x4p19);
2441 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[2], x5p18);
2442 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[0], x6p17);
2443 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[4], x7p16);
2444 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[8], x8p15);
2445 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[9], x9p14);
2446 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[5], x10p13);
2447 let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[1], x11p12);
2448 let m0419b = SseVector::mul(self.twiddles_im[3], x1m22);
2449 let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[7], x2m21);
2450 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[10], x3m20);
2451 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[6], x4m19);
2452 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[2], x5m18);
2453 let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[0], x6m17);
2454 let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[4], x7m16);
2455 let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[8], x8m15);
2456 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[9], x9m14);
2457 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[5], x10m13);
2458 let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[1], x11m12);
2459 let [y04, y19] = SseVector::column_butterfly2([m0419a, m0419b]);
2460
2461 let m0518a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p22);
2462 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[9], x2p21);
2463 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[7], x3p20);
2464 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[2], x4p19);
2465 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[1], x5p18);
2466 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[6], x6p17);
2467 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[10], x7p16);
2468 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[5], x8p15);
2469 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[0], x9p14);
2470 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[3], x10p13);
2471 let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[8], x11p12);
2472 let m0518b = SseVector::mul(self.twiddles_im[4], x1m22);
2473 let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[9], x2m21);
2474 let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[7], x3m20);
2475 let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[2], x4m19);
2476 let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[1], x5m18);
2477 let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[6], x6m17);
2478 let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[10], x7m16);
2479 let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[5], x8m15);
2480 let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[0], x9m14);
2481 let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[3], x10m13);
2482 let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[8], x11m12);
2483 let [y05, y18] = SseVector::column_butterfly2([m0518a, m0518b]);
2484
2485 let m0617a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p22);
2486 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[10], x2p21);
2487 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[4], x3p20);
2488 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[0], x4p19);
2489 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[6], x5p18);
2490 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[9], x6p17);
2491 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[3], x7p16);
2492 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[1], x8p15);
2493 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[7], x9p14);
2494 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[8], x10p13);
2495 let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[2], x11p12);
2496 let m0617b = SseVector::mul(self.twiddles_im[5], x1m22);
2497 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[10], x2m21);
2498 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[4], x3m20);
2499 let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[0], x4m19);
2500 let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[6], x5m18);
2501 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[9], x6m17);
2502 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[3], x7m16);
2503 let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[1], x8m15);
2504 let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[7], x9m14);
2505 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[8], x10m13);
2506 let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[2], x11m12);
2507 let [y06, y17] = SseVector::column_butterfly2([m0617a, m0617b]);
2508
2509 let m0716a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p22);
2510 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[8], x2p21);
2511 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[1], x3p20);
2512 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[4], x4p19);
2513 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[10], x5p18);
2514 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[3], x6p17);
2515 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[2], x7p16);
2516 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[9], x8p15);
2517 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[5], x9p14);
2518 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[0], x10p13);
2519 let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[7], x11p12);
2520 let m0716b = SseVector::mul(self.twiddles_im[6], x1m22);
2521 let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[8], x2m21);
2522 let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[1], x3m20);
2523 let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[4], x4m19);
2524 let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[10], x5m18);
2525 let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[3], x6m17);
2526 let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[2], x7m16);
2527 let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[9], x8m15);
2528 let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[5], x9m14);
2529 let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[0], x10m13);
2530 let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[7], x11m12);
2531 let [y07, y16] = SseVector::column_butterfly2([m0716a, m0716b]);
2532
2533 let m0815a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p22);
2534 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[6], x2p21);
2535 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[0], x3p20);
2536 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[8], x4p19);
2537 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[5], x5p18);
2538 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[1], x6p17);
2539 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[9], x7p16);
2540 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[4], x8p15);
2541 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[2], x9p14);
2542 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[10], x10p13);
2543 let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[3], x11p12);
2544 let m0815b = SseVector::mul(self.twiddles_im[7], x1m22);
2545 let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[6], x2m21);
2546 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[0], x3m20);
2547 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[8], x4m19);
2548 let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[5], x5m18);
2549 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[1], x6m17);
2550 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[9], x7m16);
2551 let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[4], x8m15);
2552 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[2], x9m14);
2553 let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[10], x10m13);
2554 let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[3], x11m12);
2555 let [y08, y15] = SseVector::column_butterfly2([m0815a, m0815b]);
2556
2557 let m0914a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p22);
2558 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[4], x2p21);
2559 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[3], x3p20);
2560 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[9], x4p19);
2561 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[0], x5p18);
2562 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[7], x6p17);
2563 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[5], x7p16);
2564 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[2], x8p15);
2565 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[10], x9p14);
2566 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[1], x10p13);
2567 let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[6], x11p12);
2568 let m0914b = SseVector::mul(self.twiddles_im[8], x1m22);
2569 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[4], x2m21);
2570 let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[3], x3m20);
2571 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[9], x4m19);
2572 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[0], x5m18);
2573 let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[7], x6m17);
2574 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[5], x7m16);
2575 let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[2], x8m15);
2576 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[10], x9m14);
2577 let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[1], x10m13);
2578 let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[6], x11m12);
2579 let [y09, y14] = SseVector::column_butterfly2([m0914a, m0914b]);
2580
2581 let m1013a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p22);
2582 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[2], x2p21);
2583 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[6], x3p20);
2584 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[5], x4p19);
2585 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[3], x5p18);
2586 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[8], x6p17);
2587 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[0], x7p16);
2588 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[10], x8p15);
2589 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[1], x9p14);
2590 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[7], x10p13);
2591 let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[4], x11p12);
2592 let m1013b = SseVector::mul(self.twiddles_im[9], x1m22);
2593 let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[2], x2m21);
2594 let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[6], x3m20);
2595 let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[5], x4m19);
2596 let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[3], x5m18);
2597 let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[8], x6m17);
2598 let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[0], x7m16);
2599 let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[10], x8m15);
2600 let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[1], x9m14);
2601 let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[7], x10m13);
2602 let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[4], x11m12);
2603 let [y10, y13] = SseVector::column_butterfly2([m1013a, m1013b]);
2604
2605 let m1112a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p22);
2606 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[0], x2p21);
2607 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[9], x3p20);
2608 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[1], x4p19);
2609 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[8], x5p18);
2610 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[2], x6p17);
2611 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[7], x7p16);
2612 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[3], x8p15);
2613 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[6], x9p14);
2614 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[4], x10p13);
2615 let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[5], x11p12);
2616 let m1112b = SseVector::mul(self.twiddles_im[10], x1m22);
2617 let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[0], x2m21);
2618 let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[9], x3m20);
2619 let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[1], x4m19);
2620 let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[8], x5m18);
2621 let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[2], x6m17);
2622 let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[7], x7m16);
2623 let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[3], x8m15);
2624 let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[6], x9m14);
2625 let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[4], x10m13);
2626 let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[5], x11m12);
2627 let [y11, y12] = SseVector::column_butterfly2([m1112a, m1112b]);
2628
2629
2630 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22]
2631 }
2632}
2633
2634struct SseF32Butterfly29<T> {
2635 direction: FftDirection,
2636 twiddles_re: [__m128; 14],
2637 twiddles_im: [__m128; 14],
2638 _phantom: std::marker::PhantomData<T>,
2639}
2640
2641boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly29);
2642boilerplate_fft_sse_common_butterfly!(SseF32Butterfly29, 29, |this: &SseF32Butterfly29<_>| this.direction);
2643impl<T: FftNum> SseF32Butterfly29<T> {
2644 #[target_feature(enable = "sse4.1")]
2646 unsafe fn new(direction: FftDirection) -> Self {
2647 assert_f32::<T>();
2648 let twiddles = make_twiddles(29, direction);
2649 Self {
2650 direction,
2651 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
2652 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
2653 _phantom: std::marker::PhantomData,
2654 }
2655 }
2656
2657 #[inline(always)]
2658 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
2659 let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });
2660
2661 let out = self.perform_parallel_fft_direct(values);
2662
2663 write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 } );
2664 }
2665
2666 #[inline(always)]
2667 pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
2668 let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56 });
2669
2670 let values = [
2671 extract_lo_hi_f32(input_packed[0], input_packed[14]),
2672 extract_hi_lo_f32(input_packed[0], input_packed[15]),
2673 extract_lo_hi_f32(input_packed[1], input_packed[15]),
2674 extract_hi_lo_f32(input_packed[1], input_packed[16]),
2675 extract_lo_hi_f32(input_packed[2], input_packed[16]),
2676 extract_hi_lo_f32(input_packed[2], input_packed[17]),
2677 extract_lo_hi_f32(input_packed[3], input_packed[17]),
2678 extract_hi_lo_f32(input_packed[3], input_packed[18]),
2679 extract_lo_hi_f32(input_packed[4], input_packed[18]),
2680 extract_hi_lo_f32(input_packed[4], input_packed[19]),
2681 extract_lo_hi_f32(input_packed[5], input_packed[19]),
2682 extract_hi_lo_f32(input_packed[5], input_packed[20]),
2683 extract_lo_hi_f32(input_packed[6], input_packed[20]),
2684 extract_hi_lo_f32(input_packed[6], input_packed[21]),
2685 extract_lo_hi_f32(input_packed[7], input_packed[21]),
2686 extract_hi_lo_f32(input_packed[7], input_packed[22]),
2687 extract_lo_hi_f32(input_packed[8], input_packed[22]),
2688 extract_hi_lo_f32(input_packed[8], input_packed[23]),
2689 extract_lo_hi_f32(input_packed[9], input_packed[23]),
2690 extract_hi_lo_f32(input_packed[9], input_packed[24]),
2691 extract_lo_hi_f32(input_packed[10], input_packed[24]),
2692 extract_hi_lo_f32(input_packed[10], input_packed[25]),
2693 extract_lo_hi_f32(input_packed[11], input_packed[25]),
2694 extract_hi_lo_f32(input_packed[11], input_packed[26]),
2695 extract_lo_hi_f32(input_packed[12], input_packed[26]),
2696 extract_hi_lo_f32(input_packed[12], input_packed[27]),
2697 extract_lo_hi_f32(input_packed[13], input_packed[27]),
2698 extract_hi_lo_f32(input_packed[13], input_packed[28]),
2699 extract_lo_hi_f32(input_packed[14], input_packed[28]),
2700 ];
2701
2702 let out = self.perform_parallel_fft_direct(values);
2703
2704 let out_packed = [
2705 extract_lo_lo_f32(out[0], out[1]),
2706 extract_lo_lo_f32(out[2], out[3]),
2707 extract_lo_lo_f32(out[4], out[5]),
2708 extract_lo_lo_f32(out[6], out[7]),
2709 extract_lo_lo_f32(out[8], out[9]),
2710 extract_lo_lo_f32(out[10], out[11]),
2711 extract_lo_lo_f32(out[12], out[13]),
2712 extract_lo_lo_f32(out[14], out[15]),
2713 extract_lo_lo_f32(out[16], out[17]),
2714 extract_lo_lo_f32(out[18], out[19]),
2715 extract_lo_lo_f32(out[20], out[21]),
2716 extract_lo_lo_f32(out[22], out[23]),
2717 extract_lo_lo_f32(out[24], out[25]),
2718 extract_lo_lo_f32(out[26], out[27]),
2719 extract_lo_hi_f32(out[28], out[0]),
2720 extract_hi_hi_f32(out[1], out[2]),
2721 extract_hi_hi_f32(out[3], out[4]),
2722 extract_hi_hi_f32(out[5], out[6]),
2723 extract_hi_hi_f32(out[7], out[8]),
2724 extract_hi_hi_f32(out[9], out[10]),
2725 extract_hi_hi_f32(out[11], out[12]),
2726 extract_hi_hi_f32(out[13], out[14]),
2727 extract_hi_hi_f32(out[15], out[16]),
2728 extract_hi_hi_f32(out[17], out[18]),
2729 extract_hi_hi_f32(out[19], out[20]),
2730 extract_hi_hi_f32(out[21], out[22]),
2731 extract_hi_hi_f32(out[23], out[24]),
2732 extract_hi_hi_f32(out[25], out[26]),
2733 extract_hi_hi_f32(out[27], out[28]),
2734 ];
2735
2736 write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });
2737 }
2738
2739 #[inline(always)]
2740 pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 29]) -> [__m128; 29] {
2741 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
2742
2743 let y00 = values[0];
2744 let [x1p28, x1m28] = SseVector::column_butterfly2([values[1], values[28]]);
2745 let x1m28 = SseVector::apply_rotate90(rotate, x1m28);
2746 let y00 = SseVector::add(y00, x1p28);
2747 let [x2p27, x2m27] = SseVector::column_butterfly2([values[2], values[27]]);
2748 let x2m27 = SseVector::apply_rotate90(rotate, x2m27);
2749 let y00 = SseVector::add(y00, x2p27);
2750 let [x3p26, x3m26] = SseVector::column_butterfly2([values[3], values[26]]);
2751 let x3m26 = SseVector::apply_rotate90(rotate, x3m26);
2752 let y00 = SseVector::add(y00, x3p26);
2753 let [x4p25, x4m25] = SseVector::column_butterfly2([values[4], values[25]]);
2754 let x4m25 = SseVector::apply_rotate90(rotate, x4m25);
2755 let y00 = SseVector::add(y00, x4p25);
2756 let [x5p24, x5m24] = SseVector::column_butterfly2([values[5], values[24]]);
2757 let x5m24 = SseVector::apply_rotate90(rotate, x5m24);
2758 let y00 = SseVector::add(y00, x5p24);
2759 let [x6p23, x6m23] = SseVector::column_butterfly2([values[6], values[23]]);
2760 let x6m23 = SseVector::apply_rotate90(rotate, x6m23);
2761 let y00 = SseVector::add(y00, x6p23);
2762 let [x7p22, x7m22] = SseVector::column_butterfly2([values[7], values[22]]);
2763 let x7m22 = SseVector::apply_rotate90(rotate, x7m22);
2764 let y00 = SseVector::add(y00, x7p22);
2765 let [x8p21, x8m21] = SseVector::column_butterfly2([values[8], values[21]]);
2766 let x8m21 = SseVector::apply_rotate90(rotate, x8m21);
2767 let y00 = SseVector::add(y00, x8p21);
2768 let [x9p20, x9m20] = SseVector::column_butterfly2([values[9], values[20]]);
2769 let x9m20 = SseVector::apply_rotate90(rotate, x9m20);
2770 let y00 = SseVector::add(y00, x9p20);
2771 let [x10p19, x10m19] = SseVector::column_butterfly2([values[10], values[19]]);
2772 let x10m19 = SseVector::apply_rotate90(rotate, x10m19);
2773 let y00 = SseVector::add(y00, x10p19);
2774 let [x11p18, x11m18] = SseVector::column_butterfly2([values[11], values[18]]);
2775 let x11m18 = SseVector::apply_rotate90(rotate, x11m18);
2776 let y00 = SseVector::add(y00, x11p18);
2777 let [x12p17, x12m17] = SseVector::column_butterfly2([values[12], values[17]]);
2778 let x12m17 = SseVector::apply_rotate90(rotate, x12m17);
2779 let y00 = SseVector::add(y00, x12p17);
2780 let [x13p16, x13m16] = SseVector::column_butterfly2([values[13], values[16]]);
2781 let x13m16 = SseVector::apply_rotate90(rotate, x13m16);
2782 let y00 = SseVector::add(y00, x13p16);
2783 let [x14p15, x14m15] = SseVector::column_butterfly2([values[14], values[15]]);
2784 let x14m15 = SseVector::apply_rotate90(rotate, x14m15);
2785 let y00 = SseVector::add(y00, x14p15);
2786
2787 let m0128a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p28);
2788 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[1], x2p27);
2789 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[2], x3p26);
2790 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[3], x4p25);
2791 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[4], x5p24);
2792 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[5], x6p23);
2793 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[6], x7p22);
2794 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[7], x8p21);
2795 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[8], x9p20);
2796 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[9], x10p19);
2797 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[10], x11p18);
2798 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[11], x12p17);
2799 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[12], x13p16);
2800 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[13], x14p15);
2801 let m0128b = SseVector::mul(self.twiddles_im[0], x1m28);
2802 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[1], x2m27);
2803 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[2], x3m26);
2804 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[3], x4m25);
2805 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[4], x5m24);
2806 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[5], x6m23);
2807 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[6], x7m22);
2808 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[7], x8m21);
2809 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[8], x9m20);
2810 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[9], x10m19);
2811 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[10], x11m18);
2812 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[11], x12m17);
2813 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[12], x13m16);
2814 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[13], x14m15);
2815 let [y01, y28] = SseVector::column_butterfly2([m0128a, m0128b]);
2816
2817 let m0227a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p28);
2818 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[3], x2p27);
2819 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[5], x3p26);
2820 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[7], x4p25);
2821 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[9], x5p24);
2822 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[11], x6p23);
2823 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[13], x7p22);
2824 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[12], x8p21);
2825 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[10], x9p20);
2826 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[8], x10p19);
2827 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[6], x11p18);
2828 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[4], x12p17);
2829 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[2], x13p16);
2830 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[0], x14p15);
2831 let m0227b = SseVector::mul(self.twiddles_im[1], x1m28);
2832 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[3], x2m27);
2833 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[5], x3m26);
2834 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[7], x4m25);
2835 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[9], x5m24);
2836 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[11], x6m23);
2837 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[13], x7m22);
2838 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[12], x8m21);
2839 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[10], x9m20);
2840 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[8], x10m19);
2841 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[6], x11m18);
2842 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[4], x12m17);
2843 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[2], x13m16);
2844 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[0], x14m15);
2845 let [y02, y27] = SseVector::column_butterfly2([m0227a, m0227b]);
2846
2847 let m0326a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p28);
2848 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[5], x2p27);
2849 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[8], x3p26);
2850 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[11], x4p25);
2851 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[13], x5p24);
2852 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[10], x6p23);
2853 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[7], x7p22);
2854 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[4], x8p21);
2855 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[1], x9p20);
2856 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[0], x10p19);
2857 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[3], x11p18);
2858 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[6], x12p17);
2859 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[9], x13p16);
2860 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[12], x14p15);
2861 let m0326b = SseVector::mul(self.twiddles_im[2], x1m28);
2862 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[5], x2m27);
2863 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[8], x3m26);
2864 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[11], x4m25);
2865 let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[13], x5m24);
2866 let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[10], x6m23);
2867 let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[7], x7m22);
2868 let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[4], x8m21);
2869 let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[1], x9m20);
2870 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[0], x10m19);
2871 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[3], x11m18);
2872 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[6], x12m17);
2873 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[9], x13m16);
2874 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[12], x14m15);
2875 let [y03, y26] = SseVector::column_butterfly2([m0326a, m0326b]);
2876
2877 let m0425a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p28);
2878 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[7], x2p27);
2879 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[11], x3p26);
2880 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[12], x4p25);
2881 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[8], x5p24);
2882 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[4], x6p23);
2883 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[0], x7p22);
2884 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[2], x8p21);
2885 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[6], x9p20);
2886 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[10], x10p19);
2887 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[13], x11p18);
2888 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[9], x12p17);
2889 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[5], x13p16);
2890 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[1], x14p15);
2891 let m0425b = SseVector::mul(self.twiddles_im[3], x1m28);
2892 let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[7], x2m27);
2893 let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[11], x3m26);
2894 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[12], x4m25);
2895 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[8], x5m24);
2896 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[4], x6m23);
2897 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[0], x7m22);
2898 let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[2], x8m21);
2899 let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[6], x9m20);
2900 let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[10], x10m19);
2901 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[13], x11m18);
2902 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[9], x12m17);
2903 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[5], x13m16);
2904 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[1], x14m15);
2905 let [y04, y25] = SseVector::column_butterfly2([m0425a, m0425b]);
2906
2907 let m0524a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p28);
2908 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[9], x2p27);
2909 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[13], x3p26);
2910 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[8], x4p25);
2911 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[3], x5p24);
2912 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[0], x6p23);
2913 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[5], x7p22);
2914 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[10], x8p21);
2915 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[12], x9p20);
2916 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[7], x10p19);
2917 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[2], x11p18);
2918 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[1], x12p17);
2919 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[6], x13p16);
2920 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[11], x14p15);
2921 let m0524b = SseVector::mul(self.twiddles_im[4], x1m28);
2922 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[9], x2m27);
2923 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[13], x3m26);
2924 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[8], x4m25);
2925 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[3], x5m24);
2926 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[0], x6m23);
2927 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[5], x7m22);
2928 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[10], x8m21);
2929 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[12], x9m20);
2930 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[7], x10m19);
2931 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[2], x11m18);
2932 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[1], x12m17);
2933 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[6], x13m16);
2934 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[11], x14m15);
2935 let [y05, y24] = SseVector::column_butterfly2([m0524a, m0524b]);
2936
2937 let m0623a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p28);
2938 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[11], x2p27);
2939 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[10], x3p26);
2940 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[4], x4p25);
2941 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[0], x5p24);
2942 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[6], x6p23);
2943 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[12], x7p22);
2944 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[9], x8p21);
2945 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[3], x9p20);
2946 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[1], x10p19);
2947 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[7], x11p18);
2948 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[13], x12p17);
2949 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[8], x13p16);
2950 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[2], x14p15);
2951 let m0623b = SseVector::mul(self.twiddles_im[5], x1m28);
2952 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[11], x2m27);
2953 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[10], x3m26);
2954 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[4], x4m25);
2955 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[0], x5m24);
2956 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[6], x6m23);
2957 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[12], x7m22);
2958 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[9], x8m21);
2959 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[3], x9m20);
2960 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[1], x10m19);
2961 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[7], x11m18);
2962 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[13], x12m17);
2963 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[8], x13m16);
2964 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[2], x14m15);
2965 let [y06, y23] = SseVector::column_butterfly2([m0623a, m0623b]);
2966
2967 let m0722a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p28);
2968 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[13], x2p27);
2969 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[7], x3p26);
2970 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[0], x4p25);
2971 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[5], x5p24);
2972 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[12], x6p23);
2973 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[8], x7p22);
2974 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[1], x8p21);
2975 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[4], x9p20);
2976 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[11], x10p19);
2977 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[9], x11p18);
2978 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[2], x12p17);
2979 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[3], x13p16);
2980 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[10], x14p15);
2981 let m0722b = SseVector::mul(self.twiddles_im[6], x1m28);
2982 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[13], x2m27);
2983 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[7], x3m26);
2984 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[0], x4m25);
2985 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[5], x5m24);
2986 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[12], x6m23);
2987 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[8], x7m22);
2988 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[1], x8m21);
2989 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[4], x9m20);
2990 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[11], x10m19);
2991 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[9], x11m18);
2992 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[2], x12m17);
2993 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[3], x13m16);
2994 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[10], x14m15);
2995 let [y07, y22] = SseVector::column_butterfly2([m0722a, m0722b]);
2996
2997 let m0821a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p28);
2998 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[12], x2p27);
2999 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[4], x3p26);
3000 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[2], x4p25);
3001 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[10], x5p24);
3002 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[9], x6p23);
3003 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[1], x7p22);
3004 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[5], x8p21);
3005 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[13], x9p20);
3006 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[6], x10p19);
3007 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[0], x11p18);
3008 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[8], x12p17);
3009 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[11], x13p16);
3010 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[3], x14p15);
3011 let m0821b = SseVector::mul(self.twiddles_im[7], x1m28);
3012 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[12], x2m27);
3013 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[4], x3m26);
3014 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[2], x4m25);
3015 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[10], x5m24);
3016 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[9], x6m23);
3017 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[1], x7m22);
3018 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[5], x8m21);
3019 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[13], x9m20);
3020 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[6], x10m19);
3021 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[0], x11m18);
3022 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[8], x12m17);
3023 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[11], x13m16);
3024 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[3], x14m15);
3025 let [y08, y21] = SseVector::column_butterfly2([m0821a, m0821b]);
3026
3027 let m0920a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p28);
3028 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[10], x2p27);
3029 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[1], x3p26);
3030 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[6], x4p25);
3031 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[12], x5p24);
3032 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[3], x6p23);
3033 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[4], x7p22);
3034 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[13], x8p21);
3035 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[5], x9p20);
3036 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[2], x10p19);
3037 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[11], x11p18);
3038 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[7], x12p17);
3039 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[0], x13p16);
3040 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[9], x14p15);
3041 let m0920b = SseVector::mul(self.twiddles_im[8], x1m28);
3042 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[10], x2m27);
3043 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[1], x3m26);
3044 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[6], x4m25);
3045 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[12], x5m24);
3046 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[3], x6m23);
3047 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[4], x7m22);
3048 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[13], x8m21);
3049 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[5], x9m20);
3050 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[2], x10m19);
3051 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[11], x11m18);
3052 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[7], x12m17);
3053 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[0], x13m16);
3054 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[9], x14m15);
3055 let [y09, y20] = SseVector::column_butterfly2([m0920a, m0920b]);
3056
3057 let m1019a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p28);
3058 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[8], x2p27);
3059 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[0], x3p26);
3060 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[10], x4p25);
3061 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[7], x5p24);
3062 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[1], x6p23);
3063 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[11], x7p22);
3064 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[6], x8p21);
3065 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[2], x9p20);
3066 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[12], x10p19);
3067 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[5], x11p18);
3068 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[3], x12p17);
3069 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[13], x13p16);
3070 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[4], x14p15);
3071 let m1019b = SseVector::mul(self.twiddles_im[9], x1m28);
3072 let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[8], x2m27);
3073 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[0], x3m26);
3074 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[10], x4m25);
3075 let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[7], x5m24);
3076 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[1], x6m23);
3077 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[11], x7m22);
3078 let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[6], x8m21);
3079 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[2], x9m20);
3080 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[12], x10m19);
3081 let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[5], x11m18);
3082 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[3], x12m17);
3083 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[13], x13m16);
3084 let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[4], x14m15);
3085 let [y10, y19] = SseVector::column_butterfly2([m1019a, m1019b]);
3086
3087 let m1118a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p28);
3088 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[6], x2p27);
3089 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[3], x3p26);
3090 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[13], x4p25);
3091 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[2], x5p24);
3092 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[7], x6p23);
3093 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[9], x7p22);
3094 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[0], x8p21);
3095 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[11], x9p20);
3096 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[5], x10p19);
3097 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[4], x11p18);
3098 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[12], x12p17);
3099 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[1], x13p16);
3100 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[8], x14p15);
3101 let m1118b = SseVector::mul(self.twiddles_im[10], x1m28);
3102 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[6], x2m27);
3103 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[3], x3m26);
3104 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[13], x4m25);
3105 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[2], x5m24);
3106 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[7], x6m23);
3107 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[9], x7m22);
3108 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[0], x8m21);
3109 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[11], x9m20);
3110 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[5], x10m19);
3111 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[4], x11m18);
3112 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[12], x12m17);
3113 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[1], x13m16);
3114 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[8], x14m15);
3115 let [y11, y18] = SseVector::column_butterfly2([m1118a, m1118b]);
3116
3117 let m1217a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p28);
3118 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[4], x2p27);
3119 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[6], x3p26);
3120 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[9], x4p25);
3121 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[1], x5p24);
3122 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[13], x6p23);
3123 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[2], x7p22);
3124 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[8], x8p21);
3125 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[7], x9p20);
3126 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[3], x10p19);
3127 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[12], x11p18);
3128 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[0], x12p17);
3129 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[10], x13p16);
3130 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[5], x14p15);
3131 let m1217b = SseVector::mul(self.twiddles_im[11], x1m28);
3132 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[4], x2m27);
3133 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[6], x3m26);
3134 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[9], x4m25);
3135 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[1], x5m24);
3136 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[13], x6m23);
3137 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[2], x7m22);
3138 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[8], x8m21);
3139 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[7], x9m20);
3140 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[3], x10m19);
3141 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[12], x11m18);
3142 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[0], x12m17);
3143 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[10], x13m16);
3144 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[5], x14m15);
3145 let [y12, y17] = SseVector::column_butterfly2([m1217a, m1217b]);
3146
3147 let m1316a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p28);
3148 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[2], x2p27);
3149 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[9], x3p26);
3150 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[5], x4p25);
3151 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[6], x5p24);
3152 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[8], x6p23);
3153 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[3], x7p22);
3154 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[11], x8p21);
3155 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[0], x9p20);
3156 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[13], x10p19);
3157 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[1], x11p18);
3158 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[10], x12p17);
3159 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[4], x13p16);
3160 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[7], x14p15);
3161 let m1316b = SseVector::mul(self.twiddles_im[12], x1m28);
3162 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[2], x2m27);
3163 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[9], x3m26);
3164 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[5], x4m25);
3165 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[6], x5m24);
3166 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[8], x6m23);
3167 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[3], x7m22);
3168 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[11], x8m21);
3169 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[0], x9m20);
3170 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[13], x10m19);
3171 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[1], x11m18);
3172 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[10], x12m17);
3173 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[4], x13m16);
3174 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[7], x14m15);
3175 let [y13, y16] = SseVector::column_butterfly2([m1316a, m1316b]);
3176
3177 let m1415a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p28);
3178 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[0], x2p27);
3179 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[12], x3p26);
3180 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[1], x4p25);
3181 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[11], x5p24);
3182 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[2], x6p23);
3183 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[10], x7p22);
3184 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[3], x8p21);
3185 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[9], x9p20);
3186 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[4], x10p19);
3187 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[8], x11p18);
3188 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[5], x12p17);
3189 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[7], x13p16);
3190 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[6], x14p15);
3191 let m1415b = SseVector::mul(self.twiddles_im[13], x1m28);
3192 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[0], x2m27);
3193 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[12], x3m26);
3194 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[1], x4m25);
3195 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[11], x5m24);
3196 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[2], x6m23);
3197 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[10], x7m22);
3198 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[3], x8m21);
3199 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[9], x9m20);
3200 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[4], x10m19);
3201 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[8], x11m18);
3202 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[5], x12m17);
3203 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[7], x13m16);
3204 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[6], x14m15);
3205 let [y14, y15] = SseVector::column_butterfly2([m1415a, m1415b]);
3206
3207
3208 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28]
3209 }
3210}
3211
3212struct SseF64Butterfly29<T> {
3213 direction: FftDirection,
3214 twiddles_re: [__m128d; 14],
3215 twiddles_im: [__m128d; 14],
3216 _phantom: std::marker::PhantomData<T>,
3217}
3218
3219boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly29);
3220boilerplate_fft_sse_common_butterfly!(SseF64Butterfly29, 29, |this: &SseF64Butterfly29<_>| this.direction);
3221impl<T: FftNum> SseF64Butterfly29<T> {
3222 #[target_feature(enable = "sse4.1")]
3224 unsafe fn new(direction: FftDirection) -> Self {
3225 assert_f64::<T>();
3226 let twiddles = make_twiddles(29, direction);
3227 unsafe {Self {
3228 direction,
3229 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
3230 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
3231 _phantom: std::marker::PhantomData,
3232 }}
3233 }
3234
3235 #[inline(always)]
3236 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
3237 let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });
3238
3239 let out = self.perform_fft_direct(values);
3240
3241 write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });
3242 }
3243
3244 #[inline(always)]
3245 pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 29]) -> [__m128d; 29] {
3246 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
3247
3248 let y00 = values[0];
3249 let [x1p28, x1m28] = SseVector::column_butterfly2([values[1], values[28]]);
3250 let x1m28 = SseVector::apply_rotate90(rotate, x1m28);
3251 let y00 = SseVector::add(y00, x1p28);
3252 let [x2p27, x2m27] = SseVector::column_butterfly2([values[2], values[27]]);
3253 let x2m27 = SseVector::apply_rotate90(rotate, x2m27);
3254 let y00 = SseVector::add(y00, x2p27);
3255 let [x3p26, x3m26] = SseVector::column_butterfly2([values[3], values[26]]);
3256 let x3m26 = SseVector::apply_rotate90(rotate, x3m26);
3257 let y00 = SseVector::add(y00, x3p26);
3258 let [x4p25, x4m25] = SseVector::column_butterfly2([values[4], values[25]]);
3259 let x4m25 = SseVector::apply_rotate90(rotate, x4m25);
3260 let y00 = SseVector::add(y00, x4p25);
3261 let [x5p24, x5m24] = SseVector::column_butterfly2([values[5], values[24]]);
3262 let x5m24 = SseVector::apply_rotate90(rotate, x5m24);
3263 let y00 = SseVector::add(y00, x5p24);
3264 let [x6p23, x6m23] = SseVector::column_butterfly2([values[6], values[23]]);
3265 let x6m23 = SseVector::apply_rotate90(rotate, x6m23);
3266 let y00 = SseVector::add(y00, x6p23);
3267 let [x7p22, x7m22] = SseVector::column_butterfly2([values[7], values[22]]);
3268 let x7m22 = SseVector::apply_rotate90(rotate, x7m22);
3269 let y00 = SseVector::add(y00, x7p22);
3270 let [x8p21, x8m21] = SseVector::column_butterfly2([values[8], values[21]]);
3271 let x8m21 = SseVector::apply_rotate90(rotate, x8m21);
3272 let y00 = SseVector::add(y00, x8p21);
3273 let [x9p20, x9m20] = SseVector::column_butterfly2([values[9], values[20]]);
3274 let x9m20 = SseVector::apply_rotate90(rotate, x9m20);
3275 let y00 = SseVector::add(y00, x9p20);
3276 let [x10p19, x10m19] = SseVector::column_butterfly2([values[10], values[19]]);
3277 let x10m19 = SseVector::apply_rotate90(rotate, x10m19);
3278 let y00 = SseVector::add(y00, x10p19);
3279 let [x11p18, x11m18] = SseVector::column_butterfly2([values[11], values[18]]);
3280 let x11m18 = SseVector::apply_rotate90(rotate, x11m18);
3281 let y00 = SseVector::add(y00, x11p18);
3282 let [x12p17, x12m17] = SseVector::column_butterfly2([values[12], values[17]]);
3283 let x12m17 = SseVector::apply_rotate90(rotate, x12m17);
3284 let y00 = SseVector::add(y00, x12p17);
3285 let [x13p16, x13m16] = SseVector::column_butterfly2([values[13], values[16]]);
3286 let x13m16 = SseVector::apply_rotate90(rotate, x13m16);
3287 let y00 = SseVector::add(y00, x13p16);
3288 let [x14p15, x14m15] = SseVector::column_butterfly2([values[14], values[15]]);
3289 let x14m15 = SseVector::apply_rotate90(rotate, x14m15);
3290 let y00 = SseVector::add(y00, x14p15);
3291
3292 let m0128a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p28);
3293 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[1], x2p27);
3294 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[2], x3p26);
3295 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[3], x4p25);
3296 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[4], x5p24);
3297 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[5], x6p23);
3298 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[6], x7p22);
3299 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[7], x8p21);
3300 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[8], x9p20);
3301 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[9], x10p19);
3302 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[10], x11p18);
3303 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[11], x12p17);
3304 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[12], x13p16);
3305 let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[13], x14p15);
3306 let m0128b = SseVector::mul(self.twiddles_im[0], x1m28);
3307 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[1], x2m27);
3308 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[2], x3m26);
3309 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[3], x4m25);
3310 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[4], x5m24);
3311 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[5], x6m23);
3312 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[6], x7m22);
3313 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[7], x8m21);
3314 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[8], x9m20);
3315 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[9], x10m19);
3316 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[10], x11m18);
3317 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[11], x12m17);
3318 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[12], x13m16);
3319 let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[13], x14m15);
3320 let [y01, y28] = SseVector::column_butterfly2([m0128a, m0128b]);
3321
3322 let m0227a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p28);
3323 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[3], x2p27);
3324 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[5], x3p26);
3325 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[7], x4p25);
3326 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[9], x5p24);
3327 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[11], x6p23);
3328 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[13], x7p22);
3329 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[12], x8p21);
3330 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[10], x9p20);
3331 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[8], x10p19);
3332 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[6], x11p18);
3333 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[4], x12p17);
3334 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[2], x13p16);
3335 let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[0], x14p15);
3336 let m0227b = SseVector::mul(self.twiddles_im[1], x1m28);
3337 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[3], x2m27);
3338 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[5], x3m26);
3339 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[7], x4m25);
3340 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[9], x5m24);
3341 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[11], x6m23);
3342 let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[13], x7m22);
3343 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[12], x8m21);
3344 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[10], x9m20);
3345 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[8], x10m19);
3346 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[6], x11m18);
3347 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[4], x12m17);
3348 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[2], x13m16);
3349 let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[0], x14m15);
3350 let [y02, y27] = SseVector::column_butterfly2([m0227a, m0227b]);
3351
3352 let m0326a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p28);
3353 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[5], x2p27);
3354 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[8], x3p26);
3355 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[11], x4p25);
3356 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[13], x5p24);
3357 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[10], x6p23);
3358 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[7], x7p22);
3359 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[4], x8p21);
3360 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[1], x9p20);
3361 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[0], x10p19);
3362 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[3], x11p18);
3363 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[6], x12p17);
3364 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[9], x13p16);
3365 let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[12], x14p15);
3366 let m0326b = SseVector::mul(self.twiddles_im[2], x1m28);
3367 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[5], x2m27);
3368 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[8], x3m26);
3369 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[11], x4m25);
3370 let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[13], x5m24);
3371 let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[10], x6m23);
3372 let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[7], x7m22);
3373 let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[4], x8m21);
3374 let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[1], x9m20);
3375 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[0], x10m19);
3376 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[3], x11m18);
3377 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[6], x12m17);
3378 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[9], x13m16);
3379 let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[12], x14m15);
3380 let [y03, y26] = SseVector::column_butterfly2([m0326a, m0326b]);
3381
3382 let m0425a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p28);
3383 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[7], x2p27);
3384 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[11], x3p26);
3385 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[12], x4p25);
3386 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[8], x5p24);
3387 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[4], x6p23);
3388 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[0], x7p22);
3389 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[2], x8p21);
3390 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[6], x9p20);
3391 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[10], x10p19);
3392 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[13], x11p18);
3393 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[9], x12p17);
3394 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[5], x13p16);
3395 let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[1], x14p15);
3396 let m0425b = SseVector::mul(self.twiddles_im[3], x1m28);
3397 let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[7], x2m27);
3398 let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[11], x3m26);
3399 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[12], x4m25);
3400 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[8], x5m24);
3401 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[4], x6m23);
3402 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[0], x7m22);
3403 let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[2], x8m21);
3404 let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[6], x9m20);
3405 let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[10], x10m19);
3406 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[13], x11m18);
3407 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[9], x12m17);
3408 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[5], x13m16);
3409 let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[1], x14m15);
3410 let [y04, y25] = SseVector::column_butterfly2([m0425a, m0425b]);
3411
3412 let m0524a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p28);
3413 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[9], x2p27);
3414 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[13], x3p26);
3415 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[8], x4p25);
3416 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[3], x5p24);
3417 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[0], x6p23);
3418 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[5], x7p22);
3419 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[10], x8p21);
3420 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[12], x9p20);
3421 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[7], x10p19);
3422 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[2], x11p18);
3423 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[1], x12p17);
3424 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[6], x13p16);
3425 let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[11], x14p15);
3426 let m0524b = SseVector::mul(self.twiddles_im[4], x1m28);
3427 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[9], x2m27);
3428 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[13], x3m26);
3429 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[8], x4m25);
3430 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[3], x5m24);
3431 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[0], x6m23);
3432 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[5], x7m22);
3433 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[10], x8m21);
3434 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[12], x9m20);
3435 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[7], x10m19);
3436 let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[2], x11m18);
3437 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[1], x12m17);
3438 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[6], x13m16);
3439 let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[11], x14m15);
3440 let [y05, y24] = SseVector::column_butterfly2([m0524a, m0524b]);
3441
3442 let m0623a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p28);
3443 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[11], x2p27);
3444 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[10], x3p26);
3445 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[4], x4p25);
3446 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[0], x5p24);
3447 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[6], x6p23);
3448 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[12], x7p22);
3449 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[9], x8p21);
3450 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[3], x9p20);
3451 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[1], x10p19);
3452 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[7], x11p18);
3453 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[13], x12p17);
3454 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[8], x13p16);
3455 let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[2], x14p15);
3456 let m0623b = SseVector::mul(self.twiddles_im[5], x1m28);
3457 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[11], x2m27);
3458 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[10], x3m26);
3459 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[4], x4m25);
3460 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[0], x5m24);
3461 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[6], x6m23);
3462 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[12], x7m22);
3463 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[9], x8m21);
3464 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[3], x9m20);
3465 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[1], x10m19);
3466 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[7], x11m18);
3467 let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[13], x12m17);
3468 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[8], x13m16);
3469 let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[2], x14m15);
3470 let [y06, y23] = SseVector::column_butterfly2([m0623a, m0623b]);
3471
3472 let m0722a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p28);
3473 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[13], x2p27);
3474 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[7], x3p26);
3475 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[0], x4p25);
3476 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[5], x5p24);
3477 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[12], x6p23);
3478 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[8], x7p22);
3479 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[1], x8p21);
3480 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[4], x9p20);
3481 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[11], x10p19);
3482 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[9], x11p18);
3483 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[2], x12p17);
3484 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[3], x13p16);
3485 let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[10], x14p15);
3486 let m0722b = SseVector::mul(self.twiddles_im[6], x1m28);
3487 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[13], x2m27);
3488 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[7], x3m26);
3489 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[0], x4m25);
3490 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[5], x5m24);
3491 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[12], x6m23);
3492 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[8], x7m22);
3493 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[1], x8m21);
3494 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[4], x9m20);
3495 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[11], x10m19);
3496 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[9], x11m18);
3497 let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[2], x12m17);
3498 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[3], x13m16);
3499 let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[10], x14m15);
3500 let [y07, y22] = SseVector::column_butterfly2([m0722a, m0722b]);
3501
3502 let m0821a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p28);
3503 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[12], x2p27);
3504 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[4], x3p26);
3505 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[2], x4p25);
3506 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[10], x5p24);
3507 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[9], x6p23);
3508 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[1], x7p22);
3509 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[5], x8p21);
3510 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[13], x9p20);
3511 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[6], x10p19);
3512 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[0], x11p18);
3513 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[8], x12p17);
3514 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[11], x13p16);
3515 let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[3], x14p15);
3516 let m0821b = SseVector::mul(self.twiddles_im[7], x1m28);
3517 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[12], x2m27);
3518 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[4], x3m26);
3519 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[2], x4m25);
3520 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[10], x5m24);
3521 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[9], x6m23);
3522 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[1], x7m22);
3523 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[5], x8m21);
3524 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[13], x9m20);
3525 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[6], x10m19);
3526 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[0], x11m18);
3527 let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[8], x12m17);
3528 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[11], x13m16);
3529 let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[3], x14m15);
3530 let [y08, y21] = SseVector::column_butterfly2([m0821a, m0821b]);
3531
3532 let m0920a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p28);
3533 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[10], x2p27);
3534 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[1], x3p26);
3535 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[6], x4p25);
3536 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[12], x5p24);
3537 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[3], x6p23);
3538 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[4], x7p22);
3539 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[13], x8p21);
3540 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[5], x9p20);
3541 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[2], x10p19);
3542 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[11], x11p18);
3543 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[7], x12p17);
3544 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[0], x13p16);
3545 let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[9], x14p15);
3546 let m0920b = SseVector::mul(self.twiddles_im[8], x1m28);
3547 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[10], x2m27);
3548 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[1], x3m26);
3549 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[6], x4m25);
3550 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[12], x5m24);
3551 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[3], x6m23);
3552 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[4], x7m22);
3553 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[13], x8m21);
3554 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[5], x9m20);
3555 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[2], x10m19);
3556 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[11], x11m18);
3557 let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[7], x12m17);
3558 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[0], x13m16);
3559 let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[9], x14m15);
3560 let [y09, y20] = SseVector::column_butterfly2([m0920a, m0920b]);
3561
3562 let m1019a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p28);
3563 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[8], x2p27);
3564 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[0], x3p26);
3565 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[10], x4p25);
3566 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[7], x5p24);
3567 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[1], x6p23);
3568 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[11], x7p22);
3569 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[6], x8p21);
3570 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[2], x9p20);
3571 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[12], x10p19);
3572 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[5], x11p18);
3573 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[3], x12p17);
3574 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[13], x13p16);
3575 let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[4], x14p15);
3576 let m1019b = SseVector::mul(self.twiddles_im[9], x1m28);
3577 let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[8], x2m27);
3578 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[0], x3m26);
3579 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[10], x4m25);
3580 let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[7], x5m24);
3581 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[1], x6m23);
3582 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[11], x7m22);
3583 let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[6], x8m21);
3584 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[2], x9m20);
3585 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[12], x10m19);
3586 let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[5], x11m18);
3587 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[3], x12m17);
3588 let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[13], x13m16);
3589 let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[4], x14m15);
3590 let [y10, y19] = SseVector::column_butterfly2([m1019a, m1019b]);
3591
3592 let m1118a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p28);
3593 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[6], x2p27);
3594 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[3], x3p26);
3595 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[13], x4p25);
3596 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[2], x5p24);
3597 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[7], x6p23);
3598 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[9], x7p22);
3599 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[0], x8p21);
3600 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[11], x9p20);
3601 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[5], x10p19);
3602 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[4], x11p18);
3603 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[12], x12p17);
3604 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[1], x13p16);
3605 let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[8], x14p15);
3606 let m1118b = SseVector::mul(self.twiddles_im[10], x1m28);
3607 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[6], x2m27);
3608 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[3], x3m26);
3609 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[13], x4m25);
3610 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[2], x5m24);
3611 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[7], x6m23);
3612 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[9], x7m22);
3613 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[0], x8m21);
3614 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[11], x9m20);
3615 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[5], x10m19);
3616 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[4], x11m18);
3617 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[12], x12m17);
3618 let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[1], x13m16);
3619 let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[8], x14m15);
3620 let [y11, y18] = SseVector::column_butterfly2([m1118a, m1118b]);
3621
3622 let m1217a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p28);
3623 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[4], x2p27);
3624 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[6], x3p26);
3625 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[9], x4p25);
3626 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[1], x5p24);
3627 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[13], x6p23);
3628 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[2], x7p22);
3629 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[8], x8p21);
3630 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[7], x9p20);
3631 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[3], x10p19);
3632 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[12], x11p18);
3633 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[0], x12p17);
3634 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[10], x13p16);
3635 let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[5], x14p15);
3636 let m1217b = SseVector::mul(self.twiddles_im[11], x1m28);
3637 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[4], x2m27);
3638 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[6], x3m26);
3639 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[9], x4m25);
3640 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[1], x5m24);
3641 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[13], x6m23);
3642 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[2], x7m22);
3643 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[8], x8m21);
3644 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[7], x9m20);
3645 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[3], x10m19);
3646 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[12], x11m18);
3647 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[0], x12m17);
3648 let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[10], x13m16);
3649 let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[5], x14m15);
3650 let [y12, y17] = SseVector::column_butterfly2([m1217a, m1217b]);
3651
3652 let m1316a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p28);
3653 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[2], x2p27);
3654 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[9], x3p26);
3655 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[5], x4p25);
3656 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[6], x5p24);
3657 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[8], x6p23);
3658 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[3], x7p22);
3659 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[11], x8p21);
3660 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[0], x9p20);
3661 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[13], x10p19);
3662 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[1], x11p18);
3663 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[10], x12p17);
3664 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[4], x13p16);
3665 let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[7], x14p15);
3666 let m1316b = SseVector::mul(self.twiddles_im[12], x1m28);
3667 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[2], x2m27);
3668 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[9], x3m26);
3669 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[5], x4m25);
3670 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[6], x5m24);
3671 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[8], x6m23);
3672 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[3], x7m22);
3673 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[11], x8m21);
3674 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[0], x9m20);
3675 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[13], x10m19);
3676 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[1], x11m18);
3677 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[10], x12m17);
3678 let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[4], x13m16);
3679 let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[7], x14m15);
3680 let [y13, y16] = SseVector::column_butterfly2([m1316a, m1316b]);
3681
3682 let m1415a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p28);
3683 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[0], x2p27);
3684 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[12], x3p26);
3685 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[1], x4p25);
3686 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[11], x5p24);
3687 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[2], x6p23);
3688 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[10], x7p22);
3689 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[3], x8p21);
3690 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[9], x9p20);
3691 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[4], x10p19);
3692 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[8], x11p18);
3693 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[5], x12p17);
3694 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[7], x13p16);
3695 let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[6], x14p15);
3696 let m1415b = SseVector::mul(self.twiddles_im[13], x1m28);
3697 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[0], x2m27);
3698 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[12], x3m26);
3699 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[1], x4m25);
3700 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[11], x5m24);
3701 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[2], x6m23);
3702 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[10], x7m22);
3703 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[3], x8m21);
3704 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[9], x9m20);
3705 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[4], x10m19);
3706 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[8], x11m18);
3707 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[5], x12m17);
3708 let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[7], x13m16);
3709 let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[6], x14m15);
3710 let [y14, y15] = SseVector::column_butterfly2([m1415a, m1415b]);
3711
3712
3713 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28]
3714 }
3715}
3716
3717struct SseF32Butterfly31<T> {
3718 direction: FftDirection,
3719 twiddles_re: [__m128; 15],
3720 twiddles_im: [__m128; 15],
3721 _phantom: std::marker::PhantomData<T>,
3722}
3723
3724boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly31);
3725boilerplate_fft_sse_common_butterfly!(SseF32Butterfly31, 31, |this: &SseF32Butterfly31<_>| this.direction);
3726impl<T: FftNum> SseF32Butterfly31<T> {
3727 #[target_feature(enable = "sse4.1")]
3729 unsafe fn new(direction: FftDirection) -> Self {
3730 assert_f32::<T>();
3731 let twiddles = make_twiddles(31, direction);
3732 Self {
3733 direction,
3734 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
3735 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
3736 _phantom: std::marker::PhantomData,
3737 }
3738 }
3739
3740 #[inline(always)]
3741 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
3742 let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 });
3743
3744 let out = self.perform_parallel_fft_direct(values);
3745
3746 write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 } );
3747 }
3748
3749 #[inline(always)]
3750 pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
3751 let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60 });
3752
3753 let values = [
3754 extract_lo_hi_f32(input_packed[0], input_packed[15]),
3755 extract_hi_lo_f32(input_packed[0], input_packed[16]),
3756 extract_lo_hi_f32(input_packed[1], input_packed[16]),
3757 extract_hi_lo_f32(input_packed[1], input_packed[17]),
3758 extract_lo_hi_f32(input_packed[2], input_packed[17]),
3759 extract_hi_lo_f32(input_packed[2], input_packed[18]),
3760 extract_lo_hi_f32(input_packed[3], input_packed[18]),
3761 extract_hi_lo_f32(input_packed[3], input_packed[19]),
3762 extract_lo_hi_f32(input_packed[4], input_packed[19]),
3763 extract_hi_lo_f32(input_packed[4], input_packed[20]),
3764 extract_lo_hi_f32(input_packed[5], input_packed[20]),
3765 extract_hi_lo_f32(input_packed[5], input_packed[21]),
3766 extract_lo_hi_f32(input_packed[6], input_packed[21]),
3767 extract_hi_lo_f32(input_packed[6], input_packed[22]),
3768 extract_lo_hi_f32(input_packed[7], input_packed[22]),
3769 extract_hi_lo_f32(input_packed[7], input_packed[23]),
3770 extract_lo_hi_f32(input_packed[8], input_packed[23]),
3771 extract_hi_lo_f32(input_packed[8], input_packed[24]),
3772 extract_lo_hi_f32(input_packed[9], input_packed[24]),
3773 extract_hi_lo_f32(input_packed[9], input_packed[25]),
3774 extract_lo_hi_f32(input_packed[10], input_packed[25]),
3775 extract_hi_lo_f32(input_packed[10], input_packed[26]),
3776 extract_lo_hi_f32(input_packed[11], input_packed[26]),
3777 extract_hi_lo_f32(input_packed[11], input_packed[27]),
3778 extract_lo_hi_f32(input_packed[12], input_packed[27]),
3779 extract_hi_lo_f32(input_packed[12], input_packed[28]),
3780 extract_lo_hi_f32(input_packed[13], input_packed[28]),
3781 extract_hi_lo_f32(input_packed[13], input_packed[29]),
3782 extract_lo_hi_f32(input_packed[14], input_packed[29]),
3783 extract_hi_lo_f32(input_packed[14], input_packed[30]),
3784 extract_lo_hi_f32(input_packed[15], input_packed[30]),
3785 ];
3786
3787 let out = self.perform_parallel_fft_direct(values);
3788
3789 let out_packed = [
3790 extract_lo_lo_f32(out[0], out[1]),
3791 extract_lo_lo_f32(out[2], out[3]),
3792 extract_lo_lo_f32(out[4], out[5]),
3793 extract_lo_lo_f32(out[6], out[7]),
3794 extract_lo_lo_f32(out[8], out[9]),
3795 extract_lo_lo_f32(out[10], out[11]),
3796 extract_lo_lo_f32(out[12], out[13]),
3797 extract_lo_lo_f32(out[14], out[15]),
3798 extract_lo_lo_f32(out[16], out[17]),
3799 extract_lo_lo_f32(out[18], out[19]),
3800 extract_lo_lo_f32(out[20], out[21]),
3801 extract_lo_lo_f32(out[22], out[23]),
3802 extract_lo_lo_f32(out[24], out[25]),
3803 extract_lo_lo_f32(out[26], out[27]),
3804 extract_lo_lo_f32(out[28], out[29]),
3805 extract_lo_hi_f32(out[30], out[0]),
3806 extract_hi_hi_f32(out[1], out[2]),
3807 extract_hi_hi_f32(out[3], out[4]),
3808 extract_hi_hi_f32(out[5], out[6]),
3809 extract_hi_hi_f32(out[7], out[8]),
3810 extract_hi_hi_f32(out[9], out[10]),
3811 extract_hi_hi_f32(out[11], out[12]),
3812 extract_hi_hi_f32(out[13], out[14]),
3813 extract_hi_hi_f32(out[15], out[16]),
3814 extract_hi_hi_f32(out[17], out[18]),
3815 extract_hi_hi_f32(out[19], out[20]),
3816 extract_hi_hi_f32(out[21], out[22]),
3817 extract_hi_hi_f32(out[23], out[24]),
3818 extract_hi_hi_f32(out[25], out[26]),
3819 extract_hi_hi_f32(out[27], out[28]),
3820 extract_hi_hi_f32(out[29], out[30]),
3821 ];
3822
3823 write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 });
3824 }
3825
3826 #[inline(always)]
3827 pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 31]) -> [__m128; 31] {
3828 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
3829
3830 let y00 = values[0];
3831 let [x1p30, x1m30] = SseVector::column_butterfly2([values[1], values[30]]);
3832 let x1m30 = SseVector::apply_rotate90(rotate, x1m30);
3833 let y00 = SseVector::add(y00, x1p30);
3834 let [x2p29, x2m29] = SseVector::column_butterfly2([values[2], values[29]]);
3835 let x2m29 = SseVector::apply_rotate90(rotate, x2m29);
3836 let y00 = SseVector::add(y00, x2p29);
3837 let [x3p28, x3m28] = SseVector::column_butterfly2([values[3], values[28]]);
3838 let x3m28 = SseVector::apply_rotate90(rotate, x3m28);
3839 let y00 = SseVector::add(y00, x3p28);
3840 let [x4p27, x4m27] = SseVector::column_butterfly2([values[4], values[27]]);
3841 let x4m27 = SseVector::apply_rotate90(rotate, x4m27);
3842 let y00 = SseVector::add(y00, x4p27);
3843 let [x5p26, x5m26] = SseVector::column_butterfly2([values[5], values[26]]);
3844 let x5m26 = SseVector::apply_rotate90(rotate, x5m26);
3845 let y00 = SseVector::add(y00, x5p26);
3846 let [x6p25, x6m25] = SseVector::column_butterfly2([values[6], values[25]]);
3847 let x6m25 = SseVector::apply_rotate90(rotate, x6m25);
3848 let y00 = SseVector::add(y00, x6p25);
3849 let [x7p24, x7m24] = SseVector::column_butterfly2([values[7], values[24]]);
3850 let x7m24 = SseVector::apply_rotate90(rotate, x7m24);
3851 let y00 = SseVector::add(y00, x7p24);
3852 let [x8p23, x8m23] = SseVector::column_butterfly2([values[8], values[23]]);
3853 let x8m23 = SseVector::apply_rotate90(rotate, x8m23);
3854 let y00 = SseVector::add(y00, x8p23);
3855 let [x9p22, x9m22] = SseVector::column_butterfly2([values[9], values[22]]);
3856 let x9m22 = SseVector::apply_rotate90(rotate, x9m22);
3857 let y00 = SseVector::add(y00, x9p22);
3858 let [x10p21, x10m21] = SseVector::column_butterfly2([values[10], values[21]]);
3859 let x10m21 = SseVector::apply_rotate90(rotate, x10m21);
3860 let y00 = SseVector::add(y00, x10p21);
3861 let [x11p20, x11m20] = SseVector::column_butterfly2([values[11], values[20]]);
3862 let x11m20 = SseVector::apply_rotate90(rotate, x11m20);
3863 let y00 = SseVector::add(y00, x11p20);
3864 let [x12p19, x12m19] = SseVector::column_butterfly2([values[12], values[19]]);
3865 let x12m19 = SseVector::apply_rotate90(rotate, x12m19);
3866 let y00 = SseVector::add(y00, x12p19);
3867 let [x13p18, x13m18] = SseVector::column_butterfly2([values[13], values[18]]);
3868 let x13m18 = SseVector::apply_rotate90(rotate, x13m18);
3869 let y00 = SseVector::add(y00, x13p18);
3870 let [x14p17, x14m17] = SseVector::column_butterfly2([values[14], values[17]]);
3871 let x14m17 = SseVector::apply_rotate90(rotate, x14m17);
3872 let y00 = SseVector::add(y00, x14p17);
3873 let [x15p16, x15m16] = SseVector::column_butterfly2([values[15], values[16]]);
3874 let x15m16 = SseVector::apply_rotate90(rotate, x15m16);
3875 let y00 = SseVector::add(y00, x15p16);
3876
3877 let m0130a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p30);
3878 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[1], x2p29);
3879 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[2], x3p28);
3880 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[3], x4p27);
3881 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[4], x5p26);
3882 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[5], x6p25);
3883 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[6], x7p24);
3884 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[7], x8p23);
3885 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[8], x9p22);
3886 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[9], x10p21);
3887 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[10], x11p20);
3888 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[11], x12p19);
3889 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[12], x13p18);
3890 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[13], x14p17);
3891 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[14], x15p16);
3892 let m0130b = SseVector::mul(self.twiddles_im[0], x1m30);
3893 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[1], x2m29);
3894 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[2], x3m28);
3895 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[3], x4m27);
3896 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[4], x5m26);
3897 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[5], x6m25);
3898 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[6], x7m24);
3899 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[7], x8m23);
3900 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[8], x9m22);
3901 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[9], x10m21);
3902 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[10], x11m20);
3903 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[11], x12m19);
3904 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[12], x13m18);
3905 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[13], x14m17);
3906 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[14], x15m16);
3907 let [y01, y30] = SseVector::column_butterfly2([m0130a, m0130b]);
3908
3909 let m0229a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p30);
3910 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[3], x2p29);
3911 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[5], x3p28);
3912 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[7], x4p27);
3913 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[9], x5p26);
3914 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[11], x6p25);
3915 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[13], x7p24);
3916 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[14], x8p23);
3917 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[12], x9p22);
3918 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[10], x10p21);
3919 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[8], x11p20);
3920 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[6], x12p19);
3921 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[4], x13p18);
3922 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[2], x14p17);
3923 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[0], x15p16);
3924 let m0229b = SseVector::mul(self.twiddles_im[1], x1m30);
3925 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[3], x2m29);
3926 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[5], x3m28);
3927 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[7], x4m27);
3928 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[9], x5m26);
3929 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[11], x6m25);
3930 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[13], x7m24);
3931 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[14], x8m23);
3932 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[12], x9m22);
3933 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[10], x10m21);
3934 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[8], x11m20);
3935 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[6], x12m19);
3936 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[4], x13m18);
3937 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[2], x14m17);
3938 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[0], x15m16);
3939 let [y02, y29] = SseVector::column_butterfly2([m0229a, m0229b]);
3940
3941 let m0328a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p30);
3942 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[5], x2p29);
3943 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[8], x3p28);
3944 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[11], x4p27);
3945 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[14], x5p26);
3946 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[12], x6p25);
3947 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[9], x7p24);
3948 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[6], x8p23);
3949 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[3], x9p22);
3950 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[0], x10p21);
3951 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[1], x11p20);
3952 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[4], x12p19);
3953 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[7], x13p18);
3954 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[10], x14p17);
3955 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[13], x15p16);
3956 let m0328b = SseVector::mul(self.twiddles_im[2], x1m30);
3957 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[5], x2m29);
3958 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[8], x3m28);
3959 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[11], x4m27);
3960 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[14], x5m26);
3961 let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[12], x6m25);
3962 let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[9], x7m24);
3963 let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[6], x8m23);
3964 let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[3], x9m22);
3965 let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[0], x10m21);
3966 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[1], x11m20);
3967 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[4], x12m19);
3968 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[7], x13m18);
3969 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[10], x14m17);
3970 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[13], x15m16);
3971 let [y03, y28] = SseVector::column_butterfly2([m0328a, m0328b]);
3972
3973 let m0427a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p30);
3974 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[7], x2p29);
3975 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[11], x3p28);
3976 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[14], x4p27);
3977 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[10], x5p26);
3978 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[6], x6p25);
3979 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[2], x7p24);
3980 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[0], x8p23);
3981 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[4], x9p22);
3982 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[8], x10p21);
3983 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[12], x11p20);
3984 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[13], x12p19);
3985 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[9], x13p18);
3986 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[5], x14p17);
3987 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[1], x15p16);
3988 let m0427b = SseVector::mul(self.twiddles_im[3], x1m30);
3989 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[7], x2m29);
3990 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[11], x3m28);
3991 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[14], x4m27);
3992 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[10], x5m26);
3993 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[6], x6m25);
3994 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[2], x7m24);
3995 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[0], x8m23);
3996 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[4], x9m22);
3997 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[8], x10m21);
3998 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[12], x11m20);
3999 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[13], x12m19);
4000 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[9], x13m18);
4001 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[5], x14m17);
4002 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[1], x15m16);
4003 let [y04, y27] = SseVector::column_butterfly2([m0427a, m0427b]);
4004
4005 let m0526a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p30);
4006 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[9], x2p29);
4007 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[14], x3p28);
4008 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[10], x4p27);
4009 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[5], x5p26);
4010 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[0], x6p25);
4011 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[3], x7p24);
4012 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[8], x8p23);
4013 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[13], x9p22);
4014 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[11], x10p21);
4015 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[6], x11p20);
4016 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[1], x12p19);
4017 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[2], x13p18);
4018 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[7], x14p17);
4019 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[12], x15p16);
4020 let m0526b = SseVector::mul(self.twiddles_im[4], x1m30);
4021 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[9], x2m29);
4022 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[14], x3m28);
4023 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[10], x4m27);
4024 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[5], x5m26);
4025 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[0], x6m25);
4026 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[3], x7m24);
4027 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[8], x8m23);
4028 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[13], x9m22);
4029 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[11], x10m21);
4030 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[6], x11m20);
4031 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[1], x12m19);
4032 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[2], x13m18);
4033 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[7], x14m17);
4034 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[12], x15m16);
4035 let [y05, y26] = SseVector::column_butterfly2([m0526a, m0526b]);
4036
4037 let m0625a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p30);
4038 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[11], x2p29);
4039 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[12], x3p28);
4040 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[6], x4p27);
4041 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[0], x5p26);
4042 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[4], x6p25);
4043 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[10], x7p24);
4044 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[13], x8p23);
4045 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[7], x9p22);
4046 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[1], x10p21);
4047 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[3], x11p20);
4048 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[9], x12p19);
4049 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[14], x13p18);
4050 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[8], x14p17);
4051 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[2], x15p16);
4052 let m0625b = SseVector::mul(self.twiddles_im[5], x1m30);
4053 let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[11], x2m29);
4054 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[12], x3m28);
4055 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[6], x4m27);
4056 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[0], x5m26);
4057 let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[4], x6m25);
4058 let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[10], x7m24);
4059 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[13], x8m23);
4060 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[7], x9m22);
4061 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[1], x10m21);
4062 let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[3], x11m20);
4063 let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[9], x12m19);
4064 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[14], x13m18);
4065 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[8], x14m17);
4066 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[2], x15m16);
4067 let [y06, y25] = SseVector::column_butterfly2([m0625a, m0625b]);
4068
4069 let m0724a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p30);
4070 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[13], x2p29);
4071 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[9], x3p28);
4072 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[2], x4p27);
4073 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[3], x5p26);
4074 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[10], x6p25);
4075 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[12], x7p24);
4076 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[5], x8p23);
4077 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[0], x9p22);
4078 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[7], x10p21);
4079 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[14], x11p20);
4080 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[8], x12p19);
4081 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[1], x13p18);
4082 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[4], x14p17);
4083 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[11], x15p16);
4084 let m0724b = SseVector::mul(self.twiddles_im[6], x1m30);
4085 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[13], x2m29);
4086 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[9], x3m28);
4087 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[2], x4m27);
4088 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[3], x5m26);
4089 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[10], x6m25);
4090 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[12], x7m24);
4091 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[5], x8m23);
4092 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[0], x9m22);
4093 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[7], x10m21);
4094 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[14], x11m20);
4095 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[8], x12m19);
4096 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[1], x13m18);
4097 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[4], x14m17);
4098 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[11], x15m16);
4099 let [y07, y24] = SseVector::column_butterfly2([m0724a, m0724b]);
4100
4101 let m0823a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p30);
4102 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[14], x2p29);
4103 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[6], x3p28);
4104 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[0], x4p27);
4105 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[8], x5p26);
4106 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[13], x6p25);
4107 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[5], x7p24);
4108 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[1], x8p23);
4109 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[9], x9p22);
4110 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[12], x10p21);
4111 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[4], x11p20);
4112 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[2], x12p19);
4113 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[10], x13p18);
4114 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[11], x14p17);
4115 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[3], x15p16);
4116 let m0823b = SseVector::mul(self.twiddles_im[7], x1m30);
4117 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[14], x2m29);
4118 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[6], x3m28);
4119 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[0], x4m27);
4120 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[8], x5m26);
4121 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[13], x6m25);
4122 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[5], x7m24);
4123 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[1], x8m23);
4124 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[9], x9m22);
4125 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[12], x10m21);
4126 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[4], x11m20);
4127 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[2], x12m19);
4128 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[10], x13m18);
4129 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[11], x14m17);
4130 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[3], x15m16);
4131 let [y08, y23] = SseVector::column_butterfly2([m0823a, m0823b]);
4132
4133 let m0922a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p30);
4134 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[12], x2p29);
4135 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[3], x3p28);
4136 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[4], x4p27);
4137 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[13], x5p26);
4138 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[7], x6p25);
4139 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[0], x7p24);
4140 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[9], x8p23);
4141 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[11], x9p22);
4142 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[2], x10p21);
4143 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[5], x11p20);
4144 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[14], x12p19);
4145 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[6], x13p18);
4146 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[1], x14p17);
4147 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[10], x15p16);
4148 let m0922b = SseVector::mul(self.twiddles_im[8], x1m30);
4149 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[12], x2m29);
4150 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[3], x3m28);
4151 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[4], x4m27);
4152 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[13], x5m26);
4153 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[7], x6m25);
4154 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[0], x7m24);
4155 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[9], x8m23);
4156 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[11], x9m22);
4157 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[2], x10m21);
4158 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[5], x11m20);
4159 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[14], x12m19);
4160 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[6], x13m18);
4161 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[1], x14m17);
4162 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[10], x15m16);
4163 let [y09, y22] = SseVector::column_butterfly2([m0922a, m0922b]);
4164
4165 let m1021a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p30);
4166 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[10], x2p29);
4167 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[0], x3p28);
4168 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[8], x4p27);
4169 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[11], x5p26);
4170 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[1], x6p25);
4171 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[7], x7p24);
4172 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[12], x8p23);
4173 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[2], x9p22);
4174 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[6], x10p21);
4175 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[13], x11p20);
4176 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[3], x12p19);
4177 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[5], x13p18);
4178 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[14], x14p17);
4179 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[4], x15p16);
4180 let m1021b = SseVector::mul(self.twiddles_im[9], x1m30);
4181 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[10], x2m29);
4182 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[0], x3m28);
4183 let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[8], x4m27);
4184 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[11], x5m26);
4185 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[1], x6m25);
4186 let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[7], x7m24);
4187 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[12], x8m23);
4188 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[2], x9m22);
4189 let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[6], x10m21);
4190 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[13], x11m20);
4191 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[3], x12m19);
4192 let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[5], x13m18);
4193 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[14], x14m17);
4194 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[4], x15m16);
4195 let [y10, y21] = SseVector::column_butterfly2([m1021a, m1021b]);
4196
4197 let m1120a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p30);
4198 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[8], x2p29);
4199 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[1], x3p28);
4200 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[12], x4p27);
4201 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[6], x5p26);
4202 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[3], x6p25);
4203 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[14], x7p24);
4204 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[4], x8p23);
4205 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[5], x9p22);
4206 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[13], x10p21);
4207 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[2], x11p20);
4208 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[7], x12p19);
4209 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[11], x13p18);
4210 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[0], x14p17);
4211 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[9], x15p16);
4212 let m1120b = SseVector::mul(self.twiddles_im[10], x1m30);
4213 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[8], x2m29);
4214 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[1], x3m28);
4215 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[12], x4m27);
4216 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[6], x5m26);
4217 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[3], x6m25);
4218 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[14], x7m24);
4219 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[4], x8m23);
4220 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[5], x9m22);
4221 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[13], x10m21);
4222 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[2], x11m20);
4223 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[7], x12m19);
4224 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[11], x13m18);
4225 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[0], x14m17);
4226 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[9], x15m16);
4227 let [y11, y20] = SseVector::column_butterfly2([m1120a, m1120b]);
4228
4229 let m1219a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p30);
4230 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[6], x2p29);
4231 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[4], x3p28);
4232 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[13], x4p27);
4233 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[1], x5p26);
4234 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[9], x6p25);
4235 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[8], x7p24);
4236 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[2], x8p23);
4237 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[14], x9p22);
4238 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[3], x10p21);
4239 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[7], x11p20);
4240 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[10], x12p19);
4241 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[0], x13p18);
4242 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[12], x14p17);
4243 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[5], x15p16);
4244 let m1219b = SseVector::mul(self.twiddles_im[11], x1m30);
4245 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[6], x2m29);
4246 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[4], x3m28);
4247 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[13], x4m27);
4248 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[1], x5m26);
4249 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[9], x6m25);
4250 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[8], x7m24);
4251 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[2], x8m23);
4252 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[14], x9m22);
4253 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[3], x10m21);
4254 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[7], x11m20);
4255 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[10], x12m19);
4256 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[0], x13m18);
4257 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[12], x14m17);
4258 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[5], x15m16);
4259 let [y12, y19] = SseVector::column_butterfly2([m1219a, m1219b]);
4260
4261 let m1318a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p30);
4262 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[4], x2p29);
4263 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[7], x3p28);
4264 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[9], x4p27);
4265 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[2], x5p26);
4266 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[14], x6p25);
4267 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[1], x7p24);
4268 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[10], x8p23);
4269 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[6], x9p22);
4270 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[5], x10p21);
4271 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[11], x11p20);
4272 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[0], x12p19);
4273 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[13], x13p18);
4274 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[3], x14p17);
4275 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[8], x15p16);
4276 let m1318b = SseVector::mul(self.twiddles_im[12], x1m30);
4277 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[4], x2m29);
4278 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[7], x3m28);
4279 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[9], x4m27);
4280 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[2], x5m26);
4281 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[14], x6m25);
4282 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[1], x7m24);
4283 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[10], x8m23);
4284 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[6], x9m22);
4285 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[5], x10m21);
4286 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[11], x11m20);
4287 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[0], x12m19);
4288 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[13], x13m18);
4289 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[3], x14m17);
4290 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[8], x15m16);
4291 let [y13, y18] = SseVector::column_butterfly2([m1318a, m1318b]);
4292
4293 let m1417a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p30);
4294 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[2], x2p29);
4295 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[10], x3p28);
4296 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[5], x4p27);
4297 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[7], x5p26);
4298 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[8], x6p25);
4299 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[4], x7p24);
4300 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[11], x8p23);
4301 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[1], x9p22);
4302 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[14], x10p21);
4303 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[0], x11p20);
4304 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[12], x12p19);
4305 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[3], x13p18);
4306 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[9], x14p17);
4307 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[6], x15p16);
4308 let m1417b = SseVector::mul(self.twiddles_im[13], x1m30);
4309 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[2], x2m29);
4310 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[10], x3m28);
4311 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[5], x4m27);
4312 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[7], x5m26);
4313 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[8], x6m25);
4314 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[4], x7m24);
4315 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[11], x8m23);
4316 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[1], x9m22);
4317 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[14], x10m21);
4318 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[0], x11m20);
4319 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[12], x12m19);
4320 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[3], x13m18);
4321 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[9], x14m17);
4322 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[6], x15m16);
4323 let [y14, y17] = SseVector::column_butterfly2([m1417a, m1417b]);
4324
4325 let m1516a = SseVector::fmadd(values[0], self.twiddles_re[14], x1p30);
4326 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[0], x2p29);
4327 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[13], x3p28);
4328 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[1], x4p27);
4329 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[12], x5p26);
4330 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[2], x6p25);
4331 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[11], x7p24);
4332 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[3], x8p23);
4333 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[10], x9p22);
4334 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[4], x10p21);
4335 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[9], x11p20);
4336 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[5], x12p19);
4337 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[8], x13p18);
4338 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[6], x14p17);
4339 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[7], x15p16);
4340 let m1516b = SseVector::mul(self.twiddles_im[14], x1m30);
4341 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[0], x2m29);
4342 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[13], x3m28);
4343 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[1], x4m27);
4344 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[12], x5m26);
4345 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[2], x6m25);
4346 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[11], x7m24);
4347 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[3], x8m23);
4348 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[10], x9m22);
4349 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[4], x10m21);
4350 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[9], x11m20);
4351 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[5], x12m19);
4352 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[8], x13m18);
4353 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[6], x14m17);
4354 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[7], x15m16);
4355 let [y15, y16] = SseVector::column_butterfly2([m1516a, m1516b]);
4356
4357
4358 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28, y29, y30]
4359 }
4360}
4361
4362struct SseF64Butterfly31<T> {
4363 direction: FftDirection,
4364 twiddles_re: [__m128d; 15],
4365 twiddles_im: [__m128d; 15],
4366 _phantom: std::marker::PhantomData<T>,
4367}
4368
4369boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly31);
4370boilerplate_fft_sse_common_butterfly!(SseF64Butterfly31, 31, |this: &SseF64Butterfly31<_>| this.direction);
4371impl<T: FftNum> SseF64Butterfly31<T> {
4372 #[target_feature(enable = "sse4.1")]
4374 unsafe fn new(direction: FftDirection) -> Self {
4375 assert_f64::<T>();
4376 let twiddles = make_twiddles(31, direction);
4377 unsafe {Self {
4378 direction,
4379 twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
4380 twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
4381 _phantom: std::marker::PhantomData,
4382 }}
4383 }
4384
4385 #[inline(always)]
4386 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
4387 let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 });
4388
4389 let out = self.perform_fft_direct(values);
4390
4391 write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 });
4392 }
4393
4394 #[inline(always)]
4395 pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 31]) -> [__m128d; 31] {
4396 let rotate = SseVector::make_rotate90(FftDirection::Inverse);
4397
4398 let y00 = values[0];
4399 let [x1p30, x1m30] = SseVector::column_butterfly2([values[1], values[30]]);
4400 let x1m30 = SseVector::apply_rotate90(rotate, x1m30);
4401 let y00 = SseVector::add(y00, x1p30);
4402 let [x2p29, x2m29] = SseVector::column_butterfly2([values[2], values[29]]);
4403 let x2m29 = SseVector::apply_rotate90(rotate, x2m29);
4404 let y00 = SseVector::add(y00, x2p29);
4405 let [x3p28, x3m28] = SseVector::column_butterfly2([values[3], values[28]]);
4406 let x3m28 = SseVector::apply_rotate90(rotate, x3m28);
4407 let y00 = SseVector::add(y00, x3p28);
4408 let [x4p27, x4m27] = SseVector::column_butterfly2([values[4], values[27]]);
4409 let x4m27 = SseVector::apply_rotate90(rotate, x4m27);
4410 let y00 = SseVector::add(y00, x4p27);
4411 let [x5p26, x5m26] = SseVector::column_butterfly2([values[5], values[26]]);
4412 let x5m26 = SseVector::apply_rotate90(rotate, x5m26);
4413 let y00 = SseVector::add(y00, x5p26);
4414 let [x6p25, x6m25] = SseVector::column_butterfly2([values[6], values[25]]);
4415 let x6m25 = SseVector::apply_rotate90(rotate, x6m25);
4416 let y00 = SseVector::add(y00, x6p25);
4417 let [x7p24, x7m24] = SseVector::column_butterfly2([values[7], values[24]]);
4418 let x7m24 = SseVector::apply_rotate90(rotate, x7m24);
4419 let y00 = SseVector::add(y00, x7p24);
4420 let [x8p23, x8m23] = SseVector::column_butterfly2([values[8], values[23]]);
4421 let x8m23 = SseVector::apply_rotate90(rotate, x8m23);
4422 let y00 = SseVector::add(y00, x8p23);
4423 let [x9p22, x9m22] = SseVector::column_butterfly2([values[9], values[22]]);
4424 let x9m22 = SseVector::apply_rotate90(rotate, x9m22);
4425 let y00 = SseVector::add(y00, x9p22);
4426 let [x10p21, x10m21] = SseVector::column_butterfly2([values[10], values[21]]);
4427 let x10m21 = SseVector::apply_rotate90(rotate, x10m21);
4428 let y00 = SseVector::add(y00, x10p21);
4429 let [x11p20, x11m20] = SseVector::column_butterfly2([values[11], values[20]]);
4430 let x11m20 = SseVector::apply_rotate90(rotate, x11m20);
4431 let y00 = SseVector::add(y00, x11p20);
4432 let [x12p19, x12m19] = SseVector::column_butterfly2([values[12], values[19]]);
4433 let x12m19 = SseVector::apply_rotate90(rotate, x12m19);
4434 let y00 = SseVector::add(y00, x12p19);
4435 let [x13p18, x13m18] = SseVector::column_butterfly2([values[13], values[18]]);
4436 let x13m18 = SseVector::apply_rotate90(rotate, x13m18);
4437 let y00 = SseVector::add(y00, x13p18);
4438 let [x14p17, x14m17] = SseVector::column_butterfly2([values[14], values[17]]);
4439 let x14m17 = SseVector::apply_rotate90(rotate, x14m17);
4440 let y00 = SseVector::add(y00, x14p17);
4441 let [x15p16, x15m16] = SseVector::column_butterfly2([values[15], values[16]]);
4442 let x15m16 = SseVector::apply_rotate90(rotate, x15m16);
4443 let y00 = SseVector::add(y00, x15p16);
4444
4445 let m0130a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p30);
4446 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[1], x2p29);
4447 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[2], x3p28);
4448 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[3], x4p27);
4449 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[4], x5p26);
4450 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[5], x6p25);
4451 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[6], x7p24);
4452 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[7], x8p23);
4453 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[8], x9p22);
4454 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[9], x10p21);
4455 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[10], x11p20);
4456 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[11], x12p19);
4457 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[12], x13p18);
4458 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[13], x14p17);
4459 let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[14], x15p16);
4460 let m0130b = SseVector::mul(self.twiddles_im[0], x1m30);
4461 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[1], x2m29);
4462 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[2], x3m28);
4463 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[3], x4m27);
4464 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[4], x5m26);
4465 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[5], x6m25);
4466 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[6], x7m24);
4467 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[7], x8m23);
4468 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[8], x9m22);
4469 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[9], x10m21);
4470 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[10], x11m20);
4471 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[11], x12m19);
4472 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[12], x13m18);
4473 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[13], x14m17);
4474 let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[14], x15m16);
4475 let [y01, y30] = SseVector::column_butterfly2([m0130a, m0130b]);
4476
4477 let m0229a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p30);
4478 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[3], x2p29);
4479 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[5], x3p28);
4480 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[7], x4p27);
4481 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[9], x5p26);
4482 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[11], x6p25);
4483 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[13], x7p24);
4484 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[14], x8p23);
4485 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[12], x9p22);
4486 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[10], x10p21);
4487 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[8], x11p20);
4488 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[6], x12p19);
4489 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[4], x13p18);
4490 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[2], x14p17);
4491 let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[0], x15p16);
4492 let m0229b = SseVector::mul(self.twiddles_im[1], x1m30);
4493 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[3], x2m29);
4494 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[5], x3m28);
4495 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[7], x4m27);
4496 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[9], x5m26);
4497 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[11], x6m25);
4498 let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[13], x7m24);
4499 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[14], x8m23);
4500 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[12], x9m22);
4501 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[10], x10m21);
4502 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[8], x11m20);
4503 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[6], x12m19);
4504 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[4], x13m18);
4505 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[2], x14m17);
4506 let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[0], x15m16);
4507 let [y02, y29] = SseVector::column_butterfly2([m0229a, m0229b]);
4508
4509 let m0328a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p30);
4510 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[5], x2p29);
4511 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[8], x3p28);
4512 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[11], x4p27);
4513 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[14], x5p26);
4514 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[12], x6p25);
4515 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[9], x7p24);
4516 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[6], x8p23);
4517 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[3], x9p22);
4518 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[0], x10p21);
4519 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[1], x11p20);
4520 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[4], x12p19);
4521 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[7], x13p18);
4522 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[10], x14p17);
4523 let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[13], x15p16);
4524 let m0328b = SseVector::mul(self.twiddles_im[2], x1m30);
4525 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[5], x2m29);
4526 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[8], x3m28);
4527 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[11], x4m27);
4528 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[14], x5m26);
4529 let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[12], x6m25);
4530 let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[9], x7m24);
4531 let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[6], x8m23);
4532 let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[3], x9m22);
4533 let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[0], x10m21);
4534 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[1], x11m20);
4535 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[4], x12m19);
4536 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[7], x13m18);
4537 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[10], x14m17);
4538 let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[13], x15m16);
4539 let [y03, y28] = SseVector::column_butterfly2([m0328a, m0328b]);
4540
4541 let m0427a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p30);
4542 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[7], x2p29);
4543 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[11], x3p28);
4544 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[14], x4p27);
4545 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[10], x5p26);
4546 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[6], x6p25);
4547 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[2], x7p24);
4548 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[0], x8p23);
4549 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[4], x9p22);
4550 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[8], x10p21);
4551 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[12], x11p20);
4552 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[13], x12p19);
4553 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[9], x13p18);
4554 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[5], x14p17);
4555 let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[1], x15p16);
4556 let m0427b = SseVector::mul(self.twiddles_im[3], x1m30);
4557 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[7], x2m29);
4558 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[11], x3m28);
4559 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[14], x4m27);
4560 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[10], x5m26);
4561 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[6], x6m25);
4562 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[2], x7m24);
4563 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[0], x8m23);
4564 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[4], x9m22);
4565 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[8], x10m21);
4566 let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[12], x11m20);
4567 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[13], x12m19);
4568 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[9], x13m18);
4569 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[5], x14m17);
4570 let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[1], x15m16);
4571 let [y04, y27] = SseVector::column_butterfly2([m0427a, m0427b]);
4572
4573 let m0526a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p30);
4574 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[9], x2p29);
4575 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[14], x3p28);
4576 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[10], x4p27);
4577 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[5], x5p26);
4578 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[0], x6p25);
4579 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[3], x7p24);
4580 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[8], x8p23);
4581 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[13], x9p22);
4582 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[11], x10p21);
4583 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[6], x11p20);
4584 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[1], x12p19);
4585 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[2], x13p18);
4586 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[7], x14p17);
4587 let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[12], x15p16);
4588 let m0526b = SseVector::mul(self.twiddles_im[4], x1m30);
4589 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[9], x2m29);
4590 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[14], x3m28);
4591 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[10], x4m27);
4592 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[5], x5m26);
4593 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[0], x6m25);
4594 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[3], x7m24);
4595 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[8], x8m23);
4596 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[13], x9m22);
4597 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[11], x10m21);
4598 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[6], x11m20);
4599 let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[1], x12m19);
4600 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[2], x13m18);
4601 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[7], x14m17);
4602 let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[12], x15m16);
4603 let [y05, y26] = SseVector::column_butterfly2([m0526a, m0526b]);
4604
4605 let m0625a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p30);
4606 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[11], x2p29);
4607 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[12], x3p28);
4608 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[6], x4p27);
4609 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[0], x5p26);
4610 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[4], x6p25);
4611 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[10], x7p24);
4612 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[13], x8p23);
4613 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[7], x9p22);
4614 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[1], x10p21);
4615 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[3], x11p20);
4616 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[9], x12p19);
4617 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[14], x13p18);
4618 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[8], x14p17);
4619 let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[2], x15p16);
4620 let m0625b = SseVector::mul(self.twiddles_im[5], x1m30);
4621 let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[11], x2m29);
4622 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[12], x3m28);
4623 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[6], x4m27);
4624 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[0], x5m26);
4625 let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[4], x6m25);
4626 let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[10], x7m24);
4627 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[13], x8m23);
4628 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[7], x9m22);
4629 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[1], x10m21);
4630 let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[3], x11m20);
4631 let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[9], x12m19);
4632 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[14], x13m18);
4633 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[8], x14m17);
4634 let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[2], x15m16);
4635 let [y06, y25] = SseVector::column_butterfly2([m0625a, m0625b]);
4636
4637 let m0724a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p30);
4638 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[13], x2p29);
4639 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[9], x3p28);
4640 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[2], x4p27);
4641 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[3], x5p26);
4642 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[10], x6p25);
4643 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[12], x7p24);
4644 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[5], x8p23);
4645 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[0], x9p22);
4646 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[7], x10p21);
4647 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[14], x11p20);
4648 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[8], x12p19);
4649 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[1], x13p18);
4650 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[4], x14p17);
4651 let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[11], x15p16);
4652 let m0724b = SseVector::mul(self.twiddles_im[6], x1m30);
4653 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[13], x2m29);
4654 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[9], x3m28);
4655 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[2], x4m27);
4656 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[3], x5m26);
4657 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[10], x6m25);
4658 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[12], x7m24);
4659 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[5], x8m23);
4660 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[0], x9m22);
4661 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[7], x10m21);
4662 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[14], x11m20);
4663 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[8], x12m19);
4664 let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[1], x13m18);
4665 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[4], x14m17);
4666 let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[11], x15m16);
4667 let [y07, y24] = SseVector::column_butterfly2([m0724a, m0724b]);
4668
4669 let m0823a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p30);
4670 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[14], x2p29);
4671 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[6], x3p28);
4672 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[0], x4p27);
4673 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[8], x5p26);
4674 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[13], x6p25);
4675 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[5], x7p24);
4676 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[1], x8p23);
4677 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[9], x9p22);
4678 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[12], x10p21);
4679 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[4], x11p20);
4680 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[2], x12p19);
4681 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[10], x13p18);
4682 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[11], x14p17);
4683 let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[3], x15p16);
4684 let m0823b = SseVector::mul(self.twiddles_im[7], x1m30);
4685 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[14], x2m29);
4686 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[6], x3m28);
4687 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[0], x4m27);
4688 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[8], x5m26);
4689 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[13], x6m25);
4690 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[5], x7m24);
4691 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[1], x8m23);
4692 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[9], x9m22);
4693 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[12], x10m21);
4694 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[4], x11m20);
4695 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[2], x12m19);
4696 let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[10], x13m18);
4697 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[11], x14m17);
4698 let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[3], x15m16);
4699 let [y08, y23] = SseVector::column_butterfly2([m0823a, m0823b]);
4700
4701 let m0922a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p30);
4702 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[12], x2p29);
4703 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[3], x3p28);
4704 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[4], x4p27);
4705 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[13], x5p26);
4706 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[7], x6p25);
4707 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[0], x7p24);
4708 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[9], x8p23);
4709 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[11], x9p22);
4710 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[2], x10p21);
4711 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[5], x11p20);
4712 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[14], x12p19);
4713 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[6], x13p18);
4714 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[1], x14p17);
4715 let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[10], x15p16);
4716 let m0922b = SseVector::mul(self.twiddles_im[8], x1m30);
4717 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[12], x2m29);
4718 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[3], x3m28);
4719 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[4], x4m27);
4720 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[13], x5m26);
4721 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[7], x6m25);
4722 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[0], x7m24);
4723 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[9], x8m23);
4724 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[11], x9m22);
4725 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[2], x10m21);
4726 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[5], x11m20);
4727 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[14], x12m19);
4728 let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[6], x13m18);
4729 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[1], x14m17);
4730 let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[10], x15m16);
4731 let [y09, y22] = SseVector::column_butterfly2([m0922a, m0922b]);
4732
4733 let m1021a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p30);
4734 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[10], x2p29);
4735 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[0], x3p28);
4736 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[8], x4p27);
4737 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[11], x5p26);
4738 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[1], x6p25);
4739 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[7], x7p24);
4740 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[12], x8p23);
4741 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[2], x9p22);
4742 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[6], x10p21);
4743 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[13], x11p20);
4744 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[3], x12p19);
4745 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[5], x13p18);
4746 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[14], x14p17);
4747 let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[4], x15p16);
4748 let m1021b = SseVector::mul(self.twiddles_im[9], x1m30);
4749 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[10], x2m29);
4750 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[0], x3m28);
4751 let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[8], x4m27);
4752 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[11], x5m26);
4753 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[1], x6m25);
4754 let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[7], x7m24);
4755 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[12], x8m23);
4756 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[2], x9m22);
4757 let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[6], x10m21);
4758 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[13], x11m20);
4759 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[3], x12m19);
4760 let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[5], x13m18);
4761 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[14], x14m17);
4762 let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[4], x15m16);
4763 let [y10, y21] = SseVector::column_butterfly2([m1021a, m1021b]);
4764
4765 let m1120a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p30);
4766 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[8], x2p29);
4767 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[1], x3p28);
4768 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[12], x4p27);
4769 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[6], x5p26);
4770 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[3], x6p25);
4771 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[14], x7p24);
4772 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[4], x8p23);
4773 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[5], x9p22);
4774 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[13], x10p21);
4775 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[2], x11p20);
4776 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[7], x12p19);
4777 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[11], x13p18);
4778 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[0], x14p17);
4779 let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[9], x15p16);
4780 let m1120b = SseVector::mul(self.twiddles_im[10], x1m30);
4781 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[8], x2m29);
4782 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[1], x3m28);
4783 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[12], x4m27);
4784 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[6], x5m26);
4785 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[3], x6m25);
4786 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[14], x7m24);
4787 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[4], x8m23);
4788 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[5], x9m22);
4789 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[13], x10m21);
4790 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[2], x11m20);
4791 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[7], x12m19);
4792 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[11], x13m18);
4793 let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[0], x14m17);
4794 let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[9], x15m16);
4795 let [y11, y20] = SseVector::column_butterfly2([m1120a, m1120b]);
4796
4797 let m1219a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p30);
4798 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[6], x2p29);
4799 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[4], x3p28);
4800 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[13], x4p27);
4801 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[1], x5p26);
4802 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[9], x6p25);
4803 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[8], x7p24);
4804 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[2], x8p23);
4805 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[14], x9p22);
4806 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[3], x10p21);
4807 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[7], x11p20);
4808 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[10], x12p19);
4809 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[0], x13p18);
4810 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[12], x14p17);
4811 let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[5], x15p16);
4812 let m1219b = SseVector::mul(self.twiddles_im[11], x1m30);
4813 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[6], x2m29);
4814 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[4], x3m28);
4815 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[13], x4m27);
4816 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[1], x5m26);
4817 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[9], x6m25);
4818 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[8], x7m24);
4819 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[2], x8m23);
4820 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[14], x9m22);
4821 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[3], x10m21);
4822 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[7], x11m20);
4823 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[10], x12m19);
4824 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[0], x13m18);
4825 let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[12], x14m17);
4826 let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[5], x15m16);
4827 let [y12, y19] = SseVector::column_butterfly2([m1219a, m1219b]);
4828
4829 let m1318a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p30);
4830 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[4], x2p29);
4831 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[7], x3p28);
4832 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[9], x4p27);
4833 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[2], x5p26);
4834 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[14], x6p25);
4835 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[1], x7p24);
4836 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[10], x8p23);
4837 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[6], x9p22);
4838 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[5], x10p21);
4839 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[11], x11p20);
4840 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[0], x12p19);
4841 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[13], x13p18);
4842 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[3], x14p17);
4843 let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[8], x15p16);
4844 let m1318b = SseVector::mul(self.twiddles_im[12], x1m30);
4845 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[4], x2m29);
4846 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[7], x3m28);
4847 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[9], x4m27);
4848 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[2], x5m26);
4849 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[14], x6m25);
4850 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[1], x7m24);
4851 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[10], x8m23);
4852 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[6], x9m22);
4853 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[5], x10m21);
4854 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[11], x11m20);
4855 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[0], x12m19);
4856 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[13], x13m18);
4857 let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[3], x14m17);
4858 let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[8], x15m16);
4859 let [y13, y18] = SseVector::column_butterfly2([m1318a, m1318b]);
4860
4861 let m1417a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p30);
4862 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[2], x2p29);
4863 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[10], x3p28);
4864 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[5], x4p27);
4865 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[7], x5p26);
4866 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[8], x6p25);
4867 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[4], x7p24);
4868 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[11], x8p23);
4869 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[1], x9p22);
4870 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[14], x10p21);
4871 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[0], x11p20);
4872 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[12], x12p19);
4873 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[3], x13p18);
4874 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[9], x14p17);
4875 let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[6], x15p16);
4876 let m1417b = SseVector::mul(self.twiddles_im[13], x1m30);
4877 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[2], x2m29);
4878 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[10], x3m28);
4879 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[5], x4m27);
4880 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[7], x5m26);
4881 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[8], x6m25);
4882 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[4], x7m24);
4883 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[11], x8m23);
4884 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[1], x9m22);
4885 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[14], x10m21);
4886 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[0], x11m20);
4887 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[12], x12m19);
4888 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[3], x13m18);
4889 let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[9], x14m17);
4890 let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[6], x15m16);
4891 let [y14, y17] = SseVector::column_butterfly2([m1417a, m1417b]);
4892
4893 let m1516a = SseVector::fmadd(values[0], self.twiddles_re[14], x1p30);
4894 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[0], x2p29);
4895 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[13], x3p28);
4896 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[1], x4p27);
4897 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[12], x5p26);
4898 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[2], x6p25);
4899 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[11], x7p24);
4900 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[3], x8p23);
4901 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[10], x9p22);
4902 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[4], x10p21);
4903 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[9], x11p20);
4904 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[5], x12p19);
4905 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[8], x13p18);
4906 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[6], x14p17);
4907 let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[7], x15p16);
4908 let m1516b = SseVector::mul(self.twiddles_im[14], x1m30);
4909 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[0], x2m29);
4910 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[13], x3m28);
4911 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[1], x4m27);
4912 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[12], x5m26);
4913 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[2], x6m25);
4914 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[11], x7m24);
4915 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[3], x8m23);
4916 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[10], x9m22);
4917 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[4], x10m21);
4918 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[9], x11m20);
4919 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[5], x12m19);
4920 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[8], x13m18);
4921 let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[6], x14m17);
4922 let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[7], x15m16);
4923 let [y15, y16] = SseVector::column_butterfly2([m1516a, m1516b]);
4924
4925
4926 [y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28, y29, y30]
4927 }
4928}
4929
4930
#[cfg(test)]
mod unit_tests {
    use super::*;
    use crate::test_utils::check_fft_algorithm;

    /// Emits one `#[test]` function per listed butterfly for the given float type.
    ///
    /// Each generated test first asserts that the CPU reports `sse4.1`, then builds
    /// the butterfly in the forward direction and runs it through
    /// `check_fft_algorithm`, and finally repeats the same for the inverse direction.
    ///
    /// Syntax: `<float type>; <test fn name>: <butterfly struct>, <fft length>; ...`
    macro_rules! test_prime_butterflies {
        ($float:ty; $($test_name:ident: $struct_name:ident, $size:expr;)+) => {
            $(
                #[test]
                fn $test_name() {
                    assert!(std::arch::is_x86_feature_detected!("sse4.1"));

                    // The constructor is `unsafe`; presumably its requirement is the
                    // sse4.1 support that was asserted at the top of this test.
                    let forward = unsafe { $struct_name::new(FftDirection::Forward) };
                    check_fft_algorithm::<$float>(&forward, $size, FftDirection::Forward);

                    let inverse = unsafe { $struct_name::new(FftDirection::Inverse) };
                    check_fft_algorithm::<$float>(&inverse, $size, FftDirection::Inverse);
                }
            )+
        };
    }

    // Single-precision butterflies.
    test_prime_butterflies! {
        f32;
        test_ssef32_butterfly7: SseF32Butterfly7, 7;
        test_ssef32_butterfly11: SseF32Butterfly11, 11;
        test_ssef32_butterfly13: SseF32Butterfly13, 13;
        test_ssef32_butterfly17: SseF32Butterfly17, 17;
        test_ssef32_butterfly19: SseF32Butterfly19, 19;
        test_ssef32_butterfly23: SseF32Butterfly23, 23;
        test_ssef32_butterfly29: SseF32Butterfly29, 29;
        test_ssef32_butterfly31: SseF32Butterfly31, 31;
    }

    // Double-precision butterflies.
    test_prime_butterflies! {
        f64;
        test_ssef64_butterfly7: SseF64Butterfly7, 7;
        test_ssef64_butterfly11: SseF64Butterfly11, 11;
        test_ssef64_butterfly13: SseF64Butterfly13, 13;
        test_ssef64_butterfly17: SseF64Butterfly17, 17;
        test_ssef64_butterfly19: SseF64Butterfly19, 19;
        test_ssef64_butterfly23: SseF64Butterfly23, 23;
        test_ssef64_butterfly29: SseF64Butterfly29, 29;
        test_ssef64_butterfly31: SseF64Butterfly31, 31;
    }
}
4981