1use std::arch::x86_64::*;
2use std::marker::PhantomData;
3use std::mem::MaybeUninit;
4
5use num_complex::Complex;
6
7use crate::array_utils;
8use crate::array_utils::workaround_transmute_mut;
9use crate::array_utils::DoubleBuf;
10use crate::common::{fft_error_inplace, fft_error_outofplace};
11use crate::{common::FftNum, twiddles};
12use crate::{Direction, Fft, FftDirection, Length};
13
14use super::avx64_utils;
15use super::avx_vector;
16use super::avx_vector::{AvxArray, AvxArrayMut, AvxVector, AvxVector128, AvxVector256, Rotation90};
17
#[allow(unused)]
/// Generates the safe public API for a scratch-free AVX butterfly:
/// a feature-detecting constructor plus `Fft`, `Length`, and `Direction` impls.
macro_rules! boilerplate_fft_simd_butterfly {
    ($struct_name:ident, $len:expr) => {
        impl $struct_name<f64> {
            /// Preferred constructor: returns `Ok(Self)` if this machine has
            /// the required "avx" and "fma" instruction sets, `Err(())` otherwise.
            #[inline]
            pub fn new(direction: FftDirection) -> Result<Self, ()> {
                let has_avx = is_x86_feature_detected!("avx");
                let has_fma = is_x86_feature_detected!("fma");
                if has_avx && has_fma {
                    // Safety: new_with_avx requires the "avx" instruction set,
                    // and we just verified that it is available.
                    Ok(unsafe { Self::new_with_avx(direction) })
                } else {
                    Err(())
                }
            }
        }
        impl<T: FftNum> Fft<T> for $struct_name<f64> {
            fn process_outofplace_with_scratch(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
                _scratch: &mut [Complex<T>],
            ) {
                // Report an error and bail if the buffers are undersized or mismatched.
                if input.len() < self.len() || output.len() != input.len() {
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                    return;
                }

                // Process the buffers in chunks of self.len() elements each.
                let result = array_utils::iter_chunks_zipped(
                    input,
                    output,
                    self.len(),
                    |in_chunk, out_chunk| {
                        unsafe {
                            // Safety(review): reinterprets Complex<T> as Complex<f64>.
                            // Presumably sound because these types are only ever
                            // constructed for T == f64 — confirm at the planner level.
                            let input_slice = workaround_transmute_mut(in_chunk);
                            let output_slice = workaround_transmute_mut(out_chunk);
                            self.perform_fft_f64(DoubleBuf {
                                input: input_slice,
                                output: output_slice,
                            });
                        }
                    },
                );

                // iter_chunks_zipped returns Err when the buffer length is not a
                // multiple of the FFT length; report that as a length error too.
                if result.is_err() {
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                }
            }
            fn process_with_scratch(&self, buffer: &mut [Complex<T>], _scratch: &mut [Complex<T>]) {
                // Report an error and bail if the buffer is undersized.
                if buffer.len() < self.len() {
                    fft_error_inplace(self.len(), buffer.len(), 0, 0);
                    return;
                }

                let result = array_utils::iter_chunks(buffer, self.len(), |chunk| {
                    unsafe {
                        // Safety(review): same Complex<T> -> Complex<f64>
                        // reinterpret as above — relies on T == f64 in practice.
                        self.perform_fft_f64(workaround_transmute_mut::<_, Complex<f64>>(chunk));
                    }
                });

                if result.is_err() {
                    fft_error_inplace(self.len(), buffer.len(), 0, 0);
                }
            }
            // These butterflies report zero scratch for both modes.
            #[inline(always)]
            fn get_inplace_scratch_len(&self) -> usize {
                0
            }
            #[inline(always)]
            fn get_outofplace_scratch_len(&self) -> usize {
                0
            }
        }
        impl<T> Length for $struct_name<T> {
            #[inline(always)]
            fn len(&self) -> usize {
                $len
            }
        }
        impl<T> Direction for $struct_name<T> {
            #[inline(always)]
            fn fft_direction(&self) -> FftDirection {
                self.direction
            }
        }
    };
}
116
/// Generates the safe public API for an AVX butterfly that needs one
/// FFT-length scratch buffer: feature detection, in-place and out-of-place
/// drivers, plus `Fft`, `Length`, and `Direction` impls.
macro_rules! boilerplate_fft_simd_butterfly_with_scratch {
    ($struct_name:ident, $len:expr) => {
        impl $struct_name<f64> {
            /// Returns true if the CPU has the "avx" and "fma" instruction
            /// sets this implementation requires.
            #[inline]
            pub fn is_supported_by_cpu() -> bool {
                is_x86_feature_detected!("avx") && is_x86_feature_detected!("fma")
            }
            /// Preferred constructor: `Ok(Self)` when the CPU is supported,
            /// `Err(())` otherwise.
            #[inline]
            pub fn new(direction: FftDirection) -> Result<Self, ()> {
                if Self::is_supported_by_cpu() {
                    // Safety: we just verified that AVX is available.
                    Ok(unsafe { Self::new_with_avx(direction) })
                } else {
                    Err(())
                }
            }
        }
        impl<T> $struct_name<T> {
            /// In-place driver: column butterflies + transpose into `scratch`,
            /// then row butterflies reading scratch and writing `buffer`.
            #[inline]
            fn perform_fft_inplace(
                &self,
                buffer: &mut [Complex<f64>],
                scratch: &mut [Complex<f64>],
            ) {
                // Perform the column FFTs and transpose the result into scratch.
                // Safety(review): requires AVX+FMA — guaranteed since instances
                // are only created after feature detection; confirm.
                unsafe { self.column_butterflies_and_transpose(buffer, scratch) };

                // Process the rows, reading from scratch and writing back to buffer.
                unsafe {
                    self.row_butterflies(DoubleBuf {
                        input: scratch,
                        output: buffer,
                    })
                };
            }

            /// Out-of-place driver: the output buffer doubles as the scratch space.
            #[inline]
            fn perform_fft_out_of_place(
                &self,
                input: &mut [Complex<f64>],
                output: &mut [Complex<f64>],
            ) {
                // Perform the column FFTs and transpose the result into output.
                unsafe { self.column_butterflies_and_transpose(input, output) };

                // Process the rows in-place in the output buffer.
                unsafe { self.row_butterflies(output) };
            }
        }
        impl<T: FftNum> Fft<T> for $struct_name<f64> {
            fn process_outofplace_with_scratch(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
                _scratch: &mut [Complex<T>],
            ) {
                // Report an error and bail if the buffers are undersized or mismatched.
                if input.len() < self.len() || output.len() != input.len() {
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                    return;
                }

                // Safety(review): Complex<T> -> Complex<f64> reinterpret;
                // relies on T == f64 in practice — confirm at the planner level.
                let transmuted_input: &mut [Complex<f64>] =
                    unsafe { array_utils::workaround_transmute_mut(input) };
                let transmuted_output: &mut [Complex<f64>] =
                    unsafe { array_utils::workaround_transmute_mut(output) };
                let result = array_utils::iter_chunks_zipped(
                    transmuted_input,
                    transmuted_output,
                    self.len(),
                    |in_chunk, out_chunk| self.perform_fft_out_of_place(in_chunk, out_chunk),
                );

                // Err means the buffer length wasn't a multiple of the FFT length.
                if result.is_err() {
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                }
            }
            fn process_with_scratch(&self, buffer: &mut [Complex<T>], scratch: &mut [Complex<T>]) {
                // This variant genuinely needs self.len() elements of scratch.
                let required_scratch = self.len();
                if scratch.len() < required_scratch || buffer.len() < self.len() {
                    fft_error_inplace(self.len(), buffer.len(), self.len(), scratch.len());
                    return;
                }

                let scratch = &mut scratch[..required_scratch];

                // Safety(review): same Complex<T> -> Complex<f64> reinterpret as above.
                let transmuted_buffer: &mut [Complex<f64>] =
                    unsafe { array_utils::workaround_transmute_mut(buffer) };
                let transmuted_scratch: &mut [Complex<f64>] =
                    unsafe { array_utils::workaround_transmute_mut(scratch) };
                let result = array_utils::iter_chunks(transmuted_buffer, self.len(), |chunk| {
                    self.perform_fft_inplace(chunk, transmuted_scratch)
                });

                if result.is_err() {
                    fft_error_inplace(self.len(), buffer.len(), self.len(), scratch.len());
                }
            }
            #[inline(always)]
            fn get_inplace_scratch_len(&self) -> usize {
                $len
            }
            #[inline(always)]
            fn get_outofplace_scratch_len(&self) -> usize {
                0
            }
        }
        impl<T> Length for $struct_name<T> {
            #[inline(always)]
            fn len(&self) -> usize {
                $len
            }
        }
        impl<T> Direction for $struct_name<T> {
            #[inline(always)]
            fn fft_direction(&self) -> FftDirection {
                self.direction
            }
        }
    };
}
252
/// Generates the mixed-radix twiddle table for a $num_rows x $num_cols
/// butterfly, with the vectorized column pairs laid out row-major
/// ("interleaved" across columns within each row).
macro_rules! gen_butterfly_twiddles_interleaved_columns {
    ($num_rows:expr, $num_cols:expr, $skip_cols:expr, $direction: expr) => {{
        const FFT_LEN: usize = $num_rows * $num_cols;
        // Row 0 is skipped (its twiddles are trivially 1), as are the first
        // $skip_cols columns, which the caller handles separately.
        const TWIDDLE_ROWS: usize = $num_rows - 1;
        const TWIDDLE_COLS: usize = $num_cols - $skip_cols;
        // Each __m256d covers 2 complex f64 columns.
        const TWIDDLE_VECTOR_COLS: usize = TWIDDLE_COLS / 2;
        const TWIDDLE_VECTOR_COUNT: usize = TWIDDLE_VECTOR_COLS * TWIDDLE_ROWS;
        let mut twiddles = [AvxVector::zero(); TWIDDLE_VECTOR_COUNT];
        for index in 0..TWIDDLE_VECTOR_COUNT {
            // Row-major mapping: consecutive indices walk the column pairs of
            // one row before advancing to the next row.
            let y = (index / TWIDDLE_VECTOR_COLS) + 1;
            let x = (index % TWIDDLE_VECTOR_COLS) * 2 + $skip_cols;

            twiddles[index] = AvxVector::make_mixedradix_twiddle_chunk(x, y, FFT_LEN, $direction);
        }
        twiddles
    }};
}
270
/// Same twiddle table as `gen_butterfly_twiddles_interleaved_columns`, but
/// laid out column-major: all rows of one vectorized column pair are stored
/// together ("separated" columns).
macro_rules! gen_butterfly_twiddles_separated_columns {
    ($num_rows:expr, $num_cols:expr, $skip_cols:expr, $direction: expr) => {{
        const FFT_LEN: usize = $num_rows * $num_cols;
        // Row 0 is skipped (its twiddles are trivially 1), as are the first
        // $skip_cols columns, which the caller handles separately.
        const TWIDDLE_ROWS: usize = $num_rows - 1;
        const TWIDDLE_COLS: usize = $num_cols - $skip_cols;
        // Each __m256d covers 2 complex f64 columns.
        const TWIDDLE_VECTOR_COLS: usize = TWIDDLE_COLS / 2;
        const TWIDDLE_VECTOR_COUNT: usize = TWIDDLE_VECTOR_COLS * TWIDDLE_ROWS;
        let mut twiddles = [AvxVector::zero(); TWIDDLE_VECTOR_COUNT];
        for index in 0..TWIDDLE_VECTOR_COUNT {
            // Column-major mapping: consecutive indices walk the rows of one
            // column pair before advancing to the next column pair.
            let y = (index % TWIDDLE_ROWS) + 1;
            let x = (index / TWIDDLE_ROWS) * 2 + $skip_cols;

            twiddles[index] = AvxVector::make_mixedradix_twiddle_chunk(x, y, FFT_LEN, $direction);
        }
        twiddles
    }};
}
288
/// AVX FFT butterfly of size 5 for f64 (two complex doubles per __m256d).
pub struct Butterfly5Avx64<T> {
    // Twiddles 1, 2, and the conjugate packing of 1, each stored as
    // [re, re, im, im] in memory order (see new_with_avx).
    twiddles: [__m256d; 3],
    direction: FftDirection,
    // Ties the public type parameter T to this otherwise f64-only struct.
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly5Avx64, 5);
295impl Butterfly5Avx64<f64> {
296 #[target_feature(enable = "avx")]
297 unsafe fn new_with_avx(direction: FftDirection) -> Self {
298 let twiddle1 = twiddles::compute_twiddle(1, 5, direction);
299 let twiddle2 = twiddles::compute_twiddle(2, 5, direction);
300 Self {
301 twiddles: [
302 _mm256_set_pd(twiddle1.im, twiddle1.im, twiddle1.re, twiddle1.re),
303 _mm256_set_pd(twiddle2.im, twiddle2.im, twiddle2.re, twiddle2.re),
304 _mm256_set_pd(-twiddle1.im, -twiddle1.im, twiddle1.re, twiddle1.re),
305 ],
306 direction,
307 _phantom_t: PhantomData,
308 }
309 }
310}
impl<T> Butterfly5Avx64<T> {
    /// Size-5 FFT on the first 5 complex elements of `buffer`.
    ///
    /// # Safety
    /// Requires the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // Broadcast element 0 into both 128-bit lanes of one vector.
        let input0 = _mm256_loadu2_m128d(
            buffer.input_ptr() as *const f64,
            buffer.input_ptr() as *const f64,
        );
        let input12 = buffer.load_complex(1);
        let input34 = buffer.load_complex(3);

        // Mirror (3,4) to (4,3) so lanes pair element k with element 5-k.
        let input43 = AvxVector::reverse_complex_elements(input34);

        // Sums and differences of the mirrored pairs.
        let [sum12, diff43] = AvxVector::column_butterfly2([input12, input43]);

        // Rotate the differences by 90 degrees. Always Inverse here; the FFT
        // direction is carried by the twiddles computed in new_with_avx.
        let rotation = AvxVector::make_rotation90(FftDirection::Inverse);
        let rotated43 = AvxVector::rotate90(diff43, rotation);

        let [mid14, mid23] = avx64_utils::transpose_2x2_f64([sum12, rotated43]);

        // Output 0 is the sum of all inputs: mid14.lo = in1+in4, mid23.lo = in2+in3.
        let sum1234 = AvxVector::add(mid14.lo(), mid23.lo());
        let output0 = AvxVector::add(input0.lo(), sum1234);

        // Apply the packed [re,re,im,im] twiddles with a mul + fmadd per output pair.
        let twiddled_outer14 = AvxVector::mul(mid14, self.twiddles[0]);
        let twiddled_inner14 = AvxVector::mul(mid14, self.twiddles[1]);
        let twiddled14 = AvxVector::fmadd(mid23, self.twiddles[1], twiddled_outer14);
        let twiddled23 = AvxVector::fmadd(mid23, self.twiddles[2], twiddled_inner14);

        // Un-transpose, recombine, and add element 0 back in.
        let [twiddled12, twiddled43] = avx64_utils::transpose_2x2_f64([twiddled14, twiddled23]);
        let [output12, output43] = AvxVector::column_butterfly2([twiddled12, twiddled43]);

        let final12 = AvxVector::add(input0, output12);
        let output34 = AvxVector::reverse_complex_elements(output43);
        let final34 = AvxVector::add(input0, output34);

        buffer.store_partial1_complex(output0, 0);
        buffer.store_complex(final12, 1);
        buffer.store_complex(final34, 3);
    }
}
356
/// AVX FFT butterfly of size 7 for f64.
pub struct Butterfly7Avx64<T> {
    // Twiddles 1-3 plus the conjugate packings of 3 and 1, each stored as
    // [re, re, im, im] in memory order (see new_with_avx).
    twiddles: [__m256d; 5],
    direction: FftDirection,
    // Ties the public type parameter T to this otherwise f64-only struct.
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly7Avx64, 7);
363impl Butterfly7Avx64<f64> {
364 #[target_feature(enable = "avx")]
365 unsafe fn new_with_avx(direction: FftDirection) -> Self {
366 let twiddle1 = twiddles::compute_twiddle(1, 7, direction);
367 let twiddle2 = twiddles::compute_twiddle(2, 7, direction);
368 let twiddle3 = twiddles::compute_twiddle(3, 7, direction);
369 Self {
370 twiddles: [
371 _mm256_set_pd(twiddle1.im, twiddle1.im, twiddle1.re, twiddle1.re),
372 _mm256_set_pd(twiddle2.im, twiddle2.im, twiddle2.re, twiddle2.re),
373 _mm256_set_pd(twiddle3.im, twiddle3.im, twiddle3.re, twiddle3.re),
374 _mm256_set_pd(-twiddle3.im, -twiddle3.im, twiddle3.re, twiddle3.re),
375 _mm256_set_pd(-twiddle1.im, -twiddle1.im, twiddle1.re, twiddle1.re),
376 ],
377 direction,
378 _phantom_t: PhantomData,
379 }
380 }
381}
impl<T> Butterfly7Avx64<T> {
    /// Size-7 FFT on the first 7 complex elements of `buffer`.
    ///
    /// # Safety
    /// Requires the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // Broadcast element 0 into both 128-bit lanes of one vector.
        let input0 = _mm256_loadu2_m128d(
            buffer.input_ptr() as *const f64,
            buffer.input_ptr() as *const f64,
        );
        let input12 = buffer.load_complex(1);
        let input3 = buffer.load_partial1_complex(3);
        let input4 = buffer.load_partial1_complex(4);
        let input56 = buffer.load_complex(5);

        // Mirror (5,6) to (6,5) so lanes pair element k with element 7-k.
        let input65 = AvxVector::reverse_complex_elements(input56);

        // Sums and differences of the mirrored pairs; (3,4) is handled scalar.
        let [sum12, diff65] = AvxVector::column_butterfly2([input12, input65]);
        let [sum3, diff4] = AvxVector::column_butterfly2([input3, input4]);

        // Rotate the differences by 90 degrees. Always Inverse here; the FFT
        // direction is carried by the twiddles computed in new_with_avx.
        let rotation = AvxVector::make_rotation90(FftDirection::Inverse);
        let rotated65 = AvxVector::rotate90(diff65, rotation);
        let rotated4 = AvxVector::rotate90(diff4, rotation.lo());

        let [mid16, mid25] = avx64_utils::transpose_2x2_f64([sum12, rotated65]);
        let mid34 = AvxVector128::merge(sum3, rotated4);

        // Output 0 is the sum of all inputs.
        let output0_left = AvxVector::add(mid16.lo(), mid25.lo());
        let output0_right = AvxVector::add(input0.lo(), mid34.lo());
        let output0 = AvxVector::add(output0_left, output0_right);
        buffer.store_partial1_complex(output0, 0);

        // Apply the packed [re,re,im,im] twiddles: one mul then two fmadds per
        // output pair, accumulating contributions from all three mid values.
        let twiddled16_intermediate1 = AvxVector::mul(mid16, self.twiddles[0]);
        let twiddled25_intermediate1 = AvxVector::mul(mid16, self.twiddles[1]);
        let twiddled34_intermediate1 = AvxVector::mul(mid16, self.twiddles[2]);

        let twiddled16_intermediate2 =
            AvxVector::fmadd(mid25, self.twiddles[1], twiddled16_intermediate1);
        let twiddled25_intermediate2 =
            AvxVector::fmadd(mid25, self.twiddles[3], twiddled25_intermediate1);
        let twiddled34_intermediate2 =
            AvxVector::fmadd(mid25, self.twiddles[4], twiddled34_intermediate1);

        let twiddled16 = AvxVector::fmadd(mid34, self.twiddles[2], twiddled16_intermediate2);
        let twiddled25 = AvxVector::fmadd(mid34, self.twiddles[4], twiddled25_intermediate2);
        let twiddled34 = AvxVector::fmadd(mid34, self.twiddles[1], twiddled34_intermediate2);

        // Un-transpose, recombine, and add element 0 back in.
        let [twiddled12, twiddled65] = avx64_utils::transpose_2x2_f64([twiddled16, twiddled25]);

        let twiddled03 = AvxVector::add(twiddled34.lo(), input0.lo());

        let [output12, output65] = AvxVector::column_butterfly2([twiddled12, twiddled65]);
        let final12 = AvxVector::add(output12, input0);
        let output56 = AvxVector::reverse_complex_elements(output65);
        let final56 = AvxVector::add(output56, input0);

        let [final3, final4] = AvxVector::column_butterfly2([twiddled03, twiddled34.hi()]);

        buffer.store_complex(final12, 1);
        buffer.store_partial1_complex(final3, 3);
        buffer.store_partial1_complex(final4, 4);
        buffer.store_complex(final56, 5);
    }
}
449
/// AVX FFT butterfly of size 11 for f64.
pub struct Butterfly11Avx64<T> {
    // Twiddles 1-5 followed by the conjugate packings of 5 down to 1, each
    // stored as [re, re, im, im] in memory order (see new_with_avx).
    twiddles: [__m256d; 10],
    direction: FftDirection,
    // Ties the public type parameter T to this otherwise f64-only struct.
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly11Avx64, 11);
impl Butterfly11Avx64<f64> {
    /// Builds the size-11 butterfly, precomputing packed twiddle factors.
    ///
    /// # Safety
    /// The caller must ensure the "avx" instruction set is available.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        let twiddle1 = twiddles::compute_twiddle(1, 11, direction);
        let twiddle2 = twiddles::compute_twiddle(2, 11, direction);
        let twiddle3 = twiddles::compute_twiddle(3, 11, direction);
        let twiddle4 = twiddles::compute_twiddle(4, 11, direction);
        let twiddle5 = twiddles::compute_twiddle(5, 11, direction);

        // Each vector packs one twiddle as [re, re, im, im] in memory order;
        // entries 5..10 are the conjugate packings in reverse order.
        let twiddles = [
            _mm256_set_pd(twiddle1.im, twiddle1.im, twiddle1.re, twiddle1.re),
            _mm256_set_pd(twiddle2.im, twiddle2.im, twiddle2.re, twiddle2.re),
            _mm256_set_pd(twiddle3.im, twiddle3.im, twiddle3.re, twiddle3.re),
            _mm256_set_pd(twiddle4.im, twiddle4.im, twiddle4.re, twiddle4.re),
            _mm256_set_pd(twiddle5.im, twiddle5.im, twiddle5.re, twiddle5.re),
            _mm256_set_pd(-twiddle5.im, -twiddle5.im, twiddle5.re, twiddle5.re),
            _mm256_set_pd(-twiddle4.im, -twiddle4.im, twiddle4.re, twiddle4.re),
            _mm256_set_pd(-twiddle3.im, -twiddle3.im, twiddle3.re, twiddle3.re),
            _mm256_set_pd(-twiddle2.im, -twiddle2.im, twiddle2.re, twiddle2.re),
            _mm256_set_pd(-twiddle1.im, -twiddle1.im, twiddle1.re, twiddle1.re),
        ];

        Self {
            twiddles,
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly11Avx64<T> {
    /// Size-11 FFT on the first 11 complex elements of `buffer`.
    ///
    /// # Safety
    /// Requires the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        let input0 = buffer.load_partial1_complex(0);
        let input12 = buffer.load_complex(1);
        let input34 = buffer.load_complex(3);
        let input56 = buffer.load_complex(5);
        let input78 = buffer.load_complex(7);
        let input910 = buffer.load_complex(9);

        // Duplicate elements 5 and 6 into their own vectors, and mirror the
        // upper-half vectors so lanes pair element k with element 11-k.
        let [input55, input66] = AvxVector::unpack_complex([input56, input56]);
        let input87 = AvxVector::reverse_complex_elements(input78);
        let input109 = AvxVector::reverse_complex_elements(input910);

        // Sums and differences of the mirrored pairs.
        let [sum12, diff109] = AvxVector::column_butterfly2([input12, input109]);
        let [sum34, diff87] = AvxVector::column_butterfly2([input34, input87]);
        let [sum55, diff66] = AvxVector::column_butterfly2([input55, input66]);

        // Rotate the differences by 90 degrees. Always Inverse here; the FFT
        // direction is carried by the twiddles computed in new_with_avx.
        let rotation = AvxVector::make_rotation90(FftDirection::Inverse);
        let rotated109 = AvxVector::rotate90(diff109, rotation);
        let rotated87 = AvxVector::rotate90(diff87, rotation);
        let rotated66 = AvxVector::rotate90(diff66, rotation);

        // Re-pair the sums and rotated differences lane-wise.
        let [mid110, mid29] = AvxVector::unpack_complex([sum12, rotated109]);
        let [mid38, mid47] = AvxVector::unpack_complex([sum34, rotated87]);
        let mid56 = AvxVector::unpacklo_complex([sum55, rotated66]);

        // Output 0 is the sum of all inputs.
        let mid12910 = AvxVector::add(mid110.lo(), mid29.lo());
        let mid3478 = AvxVector::add(mid38.lo(), mid47.lo());
        let output0_left = AvxVector::add(input0, mid56.lo());
        let output0_right = AvxVector::add(mid12910, mid3478);
        let output0 = AvxVector::add(output0_left, output0_right);
        buffer.store_partial1_complex(output0, 0);

        // Widen input0 (upper lane zeroed) so it can seed the fmadd chains.
        let zero = _mm_setzero_pd();
        let input0 = AvxVector256::merge(input0, zero);

        // Five fmadd accumulator chains, one per output pair; each folds in
        // all five mid values against the packed [re,re,im,im] twiddles.
        let twiddled110 = AvxVector::fmadd(mid110, self.twiddles[0], input0);
        let twiddled38 = AvxVector::fmadd(mid110, self.twiddles[2], input0);
        let twiddled29 = AvxVector::fmadd(mid110, self.twiddles[1], input0);
        let twiddled47 = AvxVector::fmadd(mid110, self.twiddles[3], input0);
        let twiddled56 = AvxVector::fmadd(mid110, self.twiddles[4], input0);

        let twiddled110 = AvxVector::fmadd(mid29, self.twiddles[1], twiddled110);
        let twiddled38 = AvxVector::fmadd(mid29, self.twiddles[5], twiddled38);
        let twiddled29 = AvxVector::fmadd(mid29, self.twiddles[3], twiddled29);
        let twiddled47 = AvxVector::fmadd(mid29, self.twiddles[7], twiddled47);
        let twiddled56 = AvxVector::fmadd(mid29, self.twiddles[9], twiddled56);

        let twiddled110 = AvxVector::fmadd(mid38, self.twiddles[2], twiddled110);
        let twiddled38 = AvxVector::fmadd(mid38, self.twiddles[8], twiddled38);
        let twiddled29 = AvxVector::fmadd(mid38, self.twiddles[5], twiddled29);
        let twiddled47 = AvxVector::fmadd(mid38, self.twiddles[0], twiddled47);
        let twiddled56 = AvxVector::fmadd(mid38, self.twiddles[3], twiddled56);

        let twiddled110 = AvxVector::fmadd(mid47, self.twiddles[3], twiddled110);
        let twiddled38 = AvxVector::fmadd(mid47, self.twiddles[0], twiddled38);
        let twiddled29 = AvxVector::fmadd(mid47, self.twiddles[7], twiddled29);
        let twiddled47 = AvxVector::fmadd(mid47, self.twiddles[4], twiddled47);
        let twiddled56 = AvxVector::fmadd(mid47, self.twiddles[8], twiddled56);

        let twiddled110 = AvxVector::fmadd(mid56, self.twiddles[4], twiddled110);
        let twiddled38 = AvxVector::fmadd(mid56, self.twiddles[3], twiddled38);
        let twiddled29 = AvxVector::fmadd(mid56, self.twiddles[9], twiddled29);
        let twiddled47 = AvxVector::fmadd(mid56, self.twiddles[8], twiddled47);
        let twiddled56 = AvxVector::fmadd(mid56, self.twiddles[2], twiddled56);

        // Un-pair, recombine, and restore the natural element order.
        let [twiddled12, twiddled109] = AvxVector::unpack_complex([twiddled110, twiddled29]);
        let [twiddled34, twiddled87] = AvxVector::unpack_complex([twiddled38, twiddled47]);
        let [twiddled55, twiddled66] = AvxVector::unpack_complex([twiddled56, twiddled56]);

        let [output12, output109] = AvxVector::column_butterfly2([twiddled12, twiddled109]);
        let [output34, output87] = AvxVector::column_butterfly2([twiddled34, twiddled87]);
        let [output55, output66] = AvxVector::column_butterfly2([twiddled55, twiddled66]);
        let output78 = AvxVector::reverse_complex_elements(output87);
        let output910 = AvxVector::reverse_complex_elements(output109);

        buffer.store_complex(output12, 1);
        buffer.store_complex(output34, 3);
        buffer.store_partial1_complex(output55.lo(), 5);
        buffer.store_partial1_complex(output66.lo(), 6);
        buffer.store_complex(output78, 7);
        buffer.store_complex(output910, 9);
    }
}
577
/// AVX FFT butterfly of size 8 for f64, computed as a 2x4 mixed radix.
pub struct Butterfly8Avx64<T> {
    // Mixed-radix twiddles for the non-trivial row (see new_with_avx).
    twiddles: [__m256d; 2],
    // Rotation constants for the inner size-4 butterflies.
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // Ties the public type parameter T to this otherwise f64-only struct.
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly8Avx64, 8);
585impl Butterfly8Avx64<f64> {
586 #[target_feature(enable = "avx")]
587 unsafe fn new_with_avx(direction: FftDirection) -> Self {
588 Self {
589 twiddles: gen_butterfly_twiddles_interleaved_columns!(2, 4, 0, direction),
590 twiddles_butterfly4: AvxVector::make_rotation90(direction),
591 direction,
592 _phantom_t: PhantomData,
593 }
594 }
595}
impl<T> Butterfly8Avx64<T> {
    /// Size-8 FFT as a 2x4 mixed radix: size-2 column FFTs, twiddles,
    /// a 4x2 -> 2x4 transpose, then size-4 row FFTs.
    ///
    /// # Safety
    /// Requires the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        let row0 = buffer.load_complex(0);
        let row1 = buffer.load_complex(2);
        let row2 = buffer.load_complex(4);
        let row3 = buffer.load_complex(6);

        // Size-2 FFT down each column of the 2x4 array.
        let [mid0, mid2] = AvxVector::column_butterfly2([row0, row2]);
        let [mid1, mid3] = AvxVector::column_butterfly2([row1, row3]);

        // Apply twiddles to the second row (the first row needs none).
        let mid2_twiddled = AvxVector::mul_complex(mid2, self.twiddles[0]);
        let mid3_twiddled = AvxVector::mul_complex(mid3, self.twiddles[1]);

        let transposed =
            avx64_utils::transpose_4x2_to_2x4_f64([mid0, mid2_twiddled], [mid1, mid3_twiddled]);

        // Size-4 FFT on each transposed row.
        let output_rows = AvxVector::column_butterfly4(transposed, self.twiddles_butterfly4);

        buffer.store_complex(output_rows[0], 0);
        buffer.store_complex(output_rows[1], 2);
        buffer.store_complex(output_rows[2], 4);
        buffer.store_complex(output_rows[3], 6);
    }
}
624
/// AVX FFT butterfly of size 9 for f64, computed as a 3x3 mixed radix.
pub struct Butterfly9Avx64<T> {
    // Mixed-radix twiddles for the non-trivial rows (see new_with_avx).
    twiddles: [__m256d; 2],
    // Broadcast twiddle for the inner size-3 butterflies.
    twiddles_butterfly3: __m256d,
    direction: FftDirection,
    // Ties the public type parameter T to this otherwise f64-only struct.
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly9Avx64, 9);
632impl Butterfly9Avx64<f64> {
633 #[target_feature(enable = "avx")]
634 unsafe fn new_with_avx(direction: FftDirection) -> Self {
635 Self {
636 twiddles: gen_butterfly_twiddles_interleaved_columns!(3, 3, 1, direction),
637 twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
638 direction,
639 _phantom_t: PhantomData,
640 }
641 }
642}
impl<T> Butterfly9Avx64<T> {
    /// Size-9 FFT as a 3x3 mixed radix. Column 0 is processed with scalar
    /// (partial) loads in the low lane; columns 1-2 share full vectors.
    ///
    /// # Safety
    /// Requires the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        let mut rows0 = [AvxVector::zero(); 3];
        let mut rows1 = [AvxVector::zero(); 3];

        for r in 0..3 {
            rows0[r] = buffer.load_partial1_complex(3 * r);
            rows1[r] = buffer.load_complex(3 * r + 1);
        }

        // Size-3 FFT down the columns.
        let mid0 = AvxVector::column_butterfly3(rows0, self.twiddles_butterfly3.lo());
        let mut mid1 = AvxVector::column_butterfly3(rows1, self.twiddles_butterfly3);

        // Apply twiddles (row 0 needs none; column 0 was skipped in the table).
        for n in 1..3 {
            mid1[n] = AvxVector::mul_complex(mid1[n], self.twiddles[n - 1]);
        }

        let (transposed0, transposed1) = avx64_utils::transpose_3x3_f64(mid0, mid1);

        // Size-3 FFT on the rows.
        let output0 = AvxVector::column_butterfly3(transposed0, self.twiddles_butterfly3.lo());
        let output1 = AvxVector::column_butterfly3(transposed1, self.twiddles_butterfly3);

        for r in 0..3 {
            buffer.store_partial1_complex(output0[r], 3 * r);
            buffer.store_complex(output1[r], 3 * r + 1);
        }
    }
}
678
/// AVX FFT butterfly of size 12 for f64, computed as a 4x3 mixed radix.
pub struct Butterfly12Avx64<T> {
    // Mixed-radix twiddles for the non-trivial rows (see new_with_avx).
    twiddles: [__m256d; 3],
    // Broadcast twiddle for the inner size-3 butterflies.
    twiddles_butterfly3: __m256d,
    // Rotation constants for the inner size-4 butterflies.
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // Ties the public type parameter T to this otherwise f64-only struct.
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly12Avx64, 12);
impl Butterfly12Avx64<f64> {
    /// Builds the size-12 butterfly (4x3 mixed radix), precomputing twiddles.
    /// Column 0 of the twiddle table is skipped; it is handled with
    /// scalar-lane loads in perform_fft_f64.
    ///
    /// # Safety
    /// The caller must ensure the "avx" instruction set is available.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(4, 3, 1, direction),
            twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
            twiddles_butterfly4: AvxVector::make_rotation90(direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly12Avx64<T> {
    /// Size-12 FFT as a 4x3 mixed radix: size-4 column FFTs, twiddles,
    /// a 3x4 -> 4x3 transpose, then size-3 row FFTs.
    ///
    /// # Safety
    /// Requires the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        let mut rows0 = [AvxVector::zero(); 4];
        let mut rows1 = [AvxVector::zero(); 4];

        for n in 0..4 {
            rows0[n] = buffer.load_partial1_complex(n * 3);
            rows1[n] = buffer.load_complex(n * 3 + 1);
        }

        // Size-4 FFT down the columns (column 0 in the scalar lo lane).
        let mid0 = AvxVector::column_butterfly4(rows0, self.twiddles_butterfly4.lo());
        let mut mid1 = AvxVector::column_butterfly4(rows1, self.twiddles_butterfly4);

        // Apply twiddles (row 0 needs none; column 0 was skipped in the table).
        for n in 1..4 {
            mid1[n] = AvxVector::mul_complex(mid1[n], self.twiddles[n - 1]);
        }

        let (transposed0, transposed1) = avx64_utils::transpose_3x4_to_4x3_f64(mid0, mid1);

        // Size-3 FFT on the rows.
        let output0 = AvxVector::column_butterfly3(transposed0, self.twiddles_butterfly3);
        let output1 = AvxVector::column_butterfly3(transposed1, self.twiddles_butterfly3);

        for r in 0..3 {
            buffer.store_complex(output0[r], 4 * r);
            buffer.store_complex(output1[r], 4 * r + 2);
        }
    }
}
734
/// AVX FFT butterfly of size 16 for f64, computed as a 4x4 mixed radix.
pub struct Butterfly16Avx64<T> {
    // Mixed-radix twiddles for the non-trivial rows (see new_with_avx).
    twiddles: [__m256d; 6],
    // Rotation constants for the inner size-4 butterflies.
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // Ties the public type parameter T to this otherwise f64-only struct.
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly16Avx64, 16);
impl Butterfly16Avx64<f64> {
    /// Builds the size-16 butterfly (4x4 mixed radix), precomputing twiddles.
    ///
    /// # Safety
    /// The caller must ensure the "avx" instruction set is available.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(4, 4, 0, direction),
            twiddles_butterfly4: AvxVector::make_rotation90(direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly16Avx64<T> {
    /// Size-16 FFT as a 4x4 mixed radix: size-4 column FFTs, twiddles,
    /// a 4x4 transpose, then size-4 row FFTs.
    ///
    /// # Safety
    /// Requires the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        let mut rows0 = [AvxVector::zero(); 4];
        let mut rows1 = [AvxVector::zero(); 4];
        for r in 0..4 {
            rows0[r] = buffer.load_complex(4 * r);
            rows1[r] = buffer.load_complex(4 * r + 2);
        }

        // Size-4 FFT down the columns, two vector-columns at a time.
        let mut mid0 = AvxVector::column_butterfly4(rows0, self.twiddles_butterfly4);
        let mut mid1 = AvxVector::column_butterfly4(rows1, self.twiddles_butterfly4);

        // Apply twiddles (row 0 needs none); the two column sets' twiddles
        // are interleaved in self.twiddles.
        for r in 1..4 {
            mid0[r] = AvxVector::mul_complex(mid0[r], self.twiddles[2 * r - 2]);
            mid1[r] = AvxVector::mul_complex(mid1[r], self.twiddles[2 * r - 1]);
        }

        let (transposed0, transposed1) = avx64_utils::transpose_4x4_f64(mid0, mid1);

        // Size-4 FFT on the rows.
        let output0 = AvxVector::column_butterfly4(transposed0, self.twiddles_butterfly4);
        let output1 = AvxVector::column_butterfly4(transposed1, self.twiddles_butterfly4);

        for r in 0..4 {
            buffer.store_complex(output0[r], 4 * r);
            buffer.store_complex(output1[r], 4 * r + 2);
        }
    }
}
786
/// AVX FFT butterfly of size 18 for f64, computed as a 6x3 mixed radix.
pub struct Butterfly18Avx64<T> {
    // Mixed-radix twiddles for the non-trivial rows (see new_with_avx).
    twiddles: [__m256d; 5],
    // Broadcast twiddle for the inner size-3/size-6 butterflies.
    twiddles_butterfly3: __m256d,
    direction: FftDirection,
    // Ties the public type parameter T to this otherwise f64-only struct.
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly18Avx64, 18);
impl Butterfly18Avx64<f64> {
    /// Builds the size-18 butterfly (6x3 mixed radix), precomputing twiddles.
    /// Column 0 of the twiddle table is skipped; it is handled with
    /// scalar-lane loads in perform_fft_f64.
    ///
    /// # Safety
    /// The caller must ensure the "avx" instruction set is available.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(6, 3, 1, direction),
            twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly18Avx64<T> {
    /// Size-18 FFT as a 6x3 mixed radix: size-6 column FFTs, twiddles,
    /// a 3x6 -> 6x3 transpose, then size-3 row FFTs.
    ///
    /// # Safety
    /// Requires the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        let mut rows0 = [AvxVector::zero(); 6];
        let mut rows1 = [AvxVector::zero(); 6];
        for n in 0..6 {
            rows0[n] = buffer.load_partial1_complex(n * 3);
            rows1[n] = buffer.load_complex(n * 3 + 1);
        }

        // Size-6 FFT down the columns (column 0 in the scalar lo lane).
        let mid0 = AvxVector128::column_butterfly6(rows0, self.twiddles_butterfly3);
        let mut mid1 = AvxVector256::column_butterfly6(rows1, self.twiddles_butterfly3);

        // Apply twiddles (row 0 needs none; column 0 was skipped in the table).
        for n in 1..6 {
            mid1[n] = AvxVector::mul_complex(mid1[n], self.twiddles[n - 1]);
        }

        let (transposed0, transposed1, transposed2) =
            avx64_utils::transpose_3x6_to_6x3_f64(mid0, mid1);

        // Size-3 FFT on the rows.
        let output0 = AvxVector::column_butterfly3(transposed0, self.twiddles_butterfly3);
        let output1 = AvxVector::column_butterfly3(transposed1, self.twiddles_butterfly3);
        let output2 = AvxVector::column_butterfly3(transposed2, self.twiddles_butterfly3);

        for r in 0..3 {
            buffer.store_complex(output0[r], 6 * r);
            buffer.store_complex(output1[r], 6 * r + 2);
            buffer.store_complex(output2[r], 6 * r + 4);
        }
    }
}
842
/// AVX FFT butterfly of size 24 for f64, computed as a 4x6 mixed radix.
pub struct Butterfly24Avx64<T> {
    // Mixed-radix twiddles for the non-trivial rows (see new_with_avx).
    twiddles: [__m256d; 9],
    // Broadcast twiddle for the inner size-6 butterflies.
    twiddles_butterfly3: __m256d,
    // Rotation constants for the inner size-4 butterflies.
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // Ties the public type parameter T to this otherwise f64-only struct.
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly24Avx64, 24);
impl Butterfly24Avx64<f64> {
    /// Builds the size-24 butterfly (4x6 mixed radix), precomputing twiddles.
    ///
    /// # Safety
    /// The caller must ensure the "avx" instruction set is available.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(4, 6, 0, direction),
            twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
            twiddles_butterfly4: AvxVector::make_rotation90(direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly24Avx64<T> {
    /// Size-24 FFT as a 4x6 mixed radix: size-4 column FFTs, twiddles,
    /// a 6x4 -> 4x6 transpose, then size-6 row FFTs.
    ///
    /// # Safety
    /// Requires the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        let mut rows0 = [AvxVector::zero(); 4];
        let mut rows1 = [AvxVector::zero(); 4];
        let mut rows2 = [AvxVector::zero(); 4];
        for r in 0..4 {
            rows0[r] = buffer.load_complex(6 * r);
            rows1[r] = buffer.load_complex(6 * r + 2);
            rows2[r] = buffer.load_complex(6 * r + 4);
        }

        // Size-4 FFT down the columns, three vector-columns at a time.
        let mut mid0 = AvxVector::column_butterfly4(rows0, self.twiddles_butterfly4);
        let mut mid1 = AvxVector::column_butterfly4(rows1, self.twiddles_butterfly4);
        let mut mid2 = AvxVector::column_butterfly4(rows2, self.twiddles_butterfly4);

        // Apply twiddles (row 0 needs none); the three column sets' twiddles
        // are interleaved in self.twiddles.
        for r in 1..4 {
            mid0[r] = AvxVector::mul_complex(mid0[r], self.twiddles[3 * r - 3]);
            mid1[r] = AvxVector::mul_complex(mid1[r], self.twiddles[3 * r - 2]);
            mid2[r] = AvxVector::mul_complex(mid2[r], self.twiddles[3 * r - 1]);
        }

        let (transposed0, transposed1) = avx64_utils::transpose_6x4_to_4x6_f64(mid0, mid1, mid2);

        // Size-6 FFT on the rows.
        let output0 = AvxVector256::column_butterfly6(transposed0, self.twiddles_butterfly3);
        let output1 = AvxVector256::column_butterfly6(transposed1, self.twiddles_butterfly3);

        for r in 0..6 {
            buffer.store_complex(output0[r], 4 * r);
            buffer.store_complex(output1[r], 4 * r + 2);
        }
    }
}
900
/// AVX FFT butterfly of size 27 for f64, computed as a 3x9 mixed radix.
pub struct Butterfly27Avx64<T> {
    // Mixed-radix twiddles for the non-trivial rows (see new_with_avx).
    twiddles: [__m256d; 8],
    // Broadcast twiddles for the full-vector inner size-9 butterflies.
    twiddles_butterfly9: [__m256d; 3],
    // Paired 128-bit twiddles for the scalar-lane size-9 path.
    twiddles_butterfly9_lo: [__m256d; 2],
    // Broadcast twiddle for the inner size-3 butterflies.
    twiddles_butterfly3: __m256d,
    direction: FftDirection,
    // Ties the public type parameter T to this otherwise f64-only struct.
    _phantom_t: std::marker::PhantomData<T>,
}
boilerplate_fft_simd_butterfly!(Butterfly27Avx64, 27);
impl Butterfly27Avx64<f64> {
    /// Builds the size-27 butterfly (3x9 mixed radix), precomputing the
    /// mixed-radix twiddles plus the inner size-9 and size-3 twiddles.
    ///
    /// # Safety
    /// The caller must ensure the "avx" instruction set is available.
    #[target_feature(enable = "avx")]
    unsafe fn new_with_avx(direction: FftDirection) -> Self {
        // 128-bit (single-lane) copies of the size-9 twiddles, merged in
        // pairs for the scalar-lane column-0 path in perform_fft_f64.
        let twiddle1 = __m128d::broadcast_twiddle(1, 9, direction);
        let twiddle2 = __m128d::broadcast_twiddle(2, 9, direction);
        let twiddle4 = __m128d::broadcast_twiddle(4, 9, direction);

        Self {
            twiddles: gen_butterfly_twiddles_interleaved_columns!(3, 9, 1, direction),
            twiddles_butterfly9: [
                AvxVector::broadcast_twiddle(1, 9, direction),
                AvxVector::broadcast_twiddle(2, 9, direction),
                AvxVector::broadcast_twiddle(4, 9, direction),
            ],
            twiddles_butterfly9_lo: [
                AvxVector256::merge(twiddle1, twiddle2),
                AvxVector256::merge(twiddle2, twiddle4),
            ],
            twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
            direction,
            _phantom_t: PhantomData,
        }
    }
}
impl<T> Butterfly27Avx64<T> {
    /// Size-27 FFT as a 3x9 mixed radix: size-3 column FFTs over 9 columns
    /// (processed in batches), twiddles, a 9x3 -> 3x9 transpose, then size-9
    /// row FFTs.
    ///
    /// # Safety
    /// Requires the "avx" and "fma" instruction sets.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // Column 0: scalar (partial) loads; no twiddles needed for it.
        let mut rows0 = [AvxVector::zero(); 3];
        for n in 0..3 {
            rows0[n] = buffer.load_partial1_complex(n * 9);
        }
        let mid0 = AvxVector::column_butterfly3(rows0, self.twiddles_butterfly3.lo());

        // Columns 1-4, two vector-columns at a time (NOTE(review): batching
        // presumably limits register pressure — confirm).
        let mut rows1 = [AvxVector::zero(); 3];
        let mut rows2 = [AvxVector::zero(); 3];
        for n in 0..3 {
            rows1[n] = buffer.load_complex(n * 9 + 1);
            rows2[n] = buffer.load_complex(n * 9 + 3);
        }
        let mut mid1 = AvxVector::column_butterfly3(rows1, self.twiddles_butterfly3);
        let mut mid2 = AvxVector::column_butterfly3(rows2, self.twiddles_butterfly3);
        for r in 1..3 {
            mid1[r] = AvxVector::mul_complex(mid1[r], self.twiddles[4 * r - 4]);
            mid2[r] = AvxVector::mul_complex(mid2[r], self.twiddles[4 * r - 3]);
        }

        // Columns 5-8, same treatment.
        let mut rows3 = [AvxVector::zero(); 3];
        let mut rows4 = [AvxVector::zero(); 3];
        for n in 0..3 {
            rows3[n] = buffer.load_complex(n * 9 + 5);
            rows4[n] = buffer.load_complex(n * 9 + 7);
        }
        let mut mid3 = AvxVector::column_butterfly3(rows3, self.twiddles_butterfly3);
        let mut mid4 = AvxVector::column_butterfly3(rows4, self.twiddles_butterfly3);
        for r in 1..3 {
            mid3[r] = AvxVector::mul_complex(mid3[r], self.twiddles[4 * r - 2]);
            mid4[r] = AvxVector::mul_complex(mid4[r], self.twiddles[4 * r - 1]);
        }

        let (transposed0, transposed1) =
            avx64_utils::transpose_9x3_to_3x9_f64(mid0, mid1, mid2, mid3, mid4);

        // Size-9 FFT on the scalar-lane rows, stored back element by element.
        let output0 = AvxVector128::column_butterfly9(
            transposed0,
            self.twiddles_butterfly9_lo,
            self.twiddles_butterfly3,
        );
        for r in 0..3 {
            buffer.store_partial1_complex(output0[r * 3], 9 * r);
            buffer.store_partial1_complex(output0[r * 3 + 1], 9 * r + 3);
            buffer.store_partial1_complex(output0[r * 3 + 2], 9 * r + 6);
        }

        // Size-9 FFT on the full-vector rows.
        let output1 = AvxVector256::column_butterfly9(
            transposed1,
            self.twiddles_butterfly9,
            self.twiddles_butterfly3,
        );
        for r in 0..3 {
            buffer.store_complex(output1[r * 3], 9 * r + 1);
            buffer.store_complex(output1[r * 3 + 1], 9 * r + 4);
            buffer.store_complex(output1[r * 3 + 2], 9 * r + 7);
        }
    }
}
1003
/// AVX+FMA accelerated 32-point FFT for f64, computed as a 4x8 mixed-radix decomposition.
pub struct Butterfly32Avx64<T> {
    // Inter-stage twiddles applied between the size-4 column FFTs and the size-8 row FFTs
    twiddles: [__m256d; 12],
    // 90-degree rotation constants for the size-4 butterflies
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // T only parameterizes the Fft<T> interface; all arithmetic is f64
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` (with AVX/FMA runtime detection) and the `Fft` trait impl
boilerplate_fft_simd_butterfly!(Butterfly32Avx64, 32);
1011impl Butterfly32Avx64<f64> {
1012 #[target_feature(enable = "avx")]
1013 unsafe fn new_with_avx(direction: FftDirection) -> Self {
1014 Self {
1015 twiddles: gen_butterfly_twiddles_interleaved_columns!(4, 8, 0, direction),
1016 twiddles_butterfly4: AvxVector::make_rotation90(direction),
1017 direction,
1018 _phantom_t: PhantomData,
1019 }
1020 }
1021}
impl<T> Butterfly32Avx64<T> {
    // In-place 32-point FFT, decomposed as 4x8 mixed-radix: size-4 FFTs down each
    // column, twiddle multiplies, an 8x4 -> 4x8 transpose, then size-8 FFTs.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // Columns 0-3, two columns per 256-bit vector
        let mut rows0 = [AvxVector::zero(); 4];
        let mut rows1 = [AvxVector::zero(); 4];
        for r in 0..4 {
            rows0[r] = buffer.load_complex(8 * r);
            rows1[r] = buffer.load_complex(8 * r + 2);
        }
        let mut mid0 = AvxVector::column_butterfly4(rows0, self.twiddles_butterfly4);
        let mut mid1 = AvxVector::column_butterfly4(rows1, self.twiddles_butterfly4);
        // Row 0 is skipped: self.twiddles holds 4 interleaved entries per remaining row
        for r in 1..4 {
            mid0[r] = AvxVector::mul_complex(mid0[r], self.twiddles[4 * r - 4]);
            mid1[r] = AvxVector::mul_complex(mid1[r], self.twiddles[4 * r - 3]);
        }

        // Columns 4-7
        let mut rows2 = [AvxVector::zero(); 4];
        let mut rows3 = [AvxVector::zero(); 4];
        for r in 0..4 {
            rows2[r] = buffer.load_complex(8 * r + 4);
            rows3[r] = buffer.load_complex(8 * r + 6);
        }
        let mut mid2 = AvxVector::column_butterfly4(rows2, self.twiddles_butterfly4);
        let mut mid3 = AvxVector::column_butterfly4(rows3, self.twiddles_butterfly4);
        for r in 1..4 {
            mid2[r] = AvxVector::mul_complex(mid2[r], self.twiddles[4 * r - 2]);
            mid3[r] = AvxVector::mul_complex(mid3[r], self.twiddles[4 * r - 1]);
        }

        let (transposed0, transposed1) =
            avx64_utils::transpose_8x4_to_4x8_f64(mid0, mid1, mid2, mid3);

        // Size-8 FFTs on the transposed rows, stored back in interleaved column order
        let output0 = AvxVector::column_butterfly8(transposed0, self.twiddles_butterfly4);
        for r in 0..8 {
            buffer.store_complex(output0[r], 4 * r);
        }
        let output1 = AvxVector::column_butterfly8(transposed1, self.twiddles_butterfly4);
        for r in 0..8 {
            buffer.store_complex(output1[r], 4 * r + 2);
        }
    }
}
1071
/// AVX+FMA accelerated 36-point FFT for f64, computed as a 6x6 mixed-radix decomposition.
pub struct Butterfly36Avx64<T> {
    // Inter-stage twiddles applied between the column FFTs and the row FFTs
    twiddles: [__m256d; 15],
    // Twiddle constant for the size-3 sub-butterflies inside the size-6 butterflies
    twiddles_butterfly3: __m256d,
    direction: FftDirection,
    // T only parameterizes the Fft<T> interface; all arithmetic is f64
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` (with AVX/FMA runtime detection) and the `Fft` trait impl
boilerplate_fft_simd_butterfly!(Butterfly36Avx64, 36);
1079impl Butterfly36Avx64<f64> {
1080 #[target_feature(enable = "avx")]
1081 unsafe fn new_with_avx(direction: FftDirection) -> Self {
1082 Self {
1083 twiddles: gen_butterfly_twiddles_separated_columns!(6, 6, 0, direction),
1084 twiddles_butterfly3: AvxVector::broadcast_twiddle(1, 3, direction),
1085 direction,
1086 _phantom_t: PhantomData,
1087 }
1088 }
1089}
impl<T> Butterfly36Avx64<T> {
    // In-place 36-point FFT, decomposed as 6x6 mixed-radix: size-6 FFTs down each
    // column, twiddle multiplies, a 6x6 transpose, then size-6 FFTs on the rows.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn perform_fft_f64(&self, mut buffer: impl AvxArrayMut<f64>) {
        // Columns 0-1 (one 256-bit vector holds 2 complex f64s)
        let mut rows0 = [AvxVector::zero(); 6];
        for n in 0..6 {
            rows0[n] = buffer.load_complex(n * 6);
        }
        let mut mid0 = AvxVector256::column_butterfly6(rows0, self.twiddles_butterfly3);
        // Row 0 is skipped: self.twiddles holds 5 entries per column pair
        for r in 1..6 {
            mid0[r] = AvxVector::mul_complex(mid0[r], self.twiddles[r - 1]);
        }

        // Columns 2-3
        let mut rows1 = [AvxVector::zero(); 6];
        for n in 0..6 {
            rows1[n] = buffer.load_complex(n * 6 + 2);
        }
        let mut mid1 = AvxVector256::column_butterfly6(rows1, self.twiddles_butterfly3);
        for r in 1..6 {
            mid1[r] = AvxVector::mul_complex(mid1[r], self.twiddles[r + 4]);
        }

        // Columns 4-5
        let mut rows2 = [AvxVector::zero(); 6];
        for n in 0..6 {
            rows2[n] = buffer.load_complex(n * 6 + 4);
        }
        let mut mid2 = AvxVector256::column_butterfly6(rows2, self.twiddles_butterfly3);
        for r in 1..6 {
            mid2[r] = AvxVector::mul_complex(mid2[r], self.twiddles[r + 9]);
        }

        let (transposed0, transposed1, transposed2) =
            avx64_utils::transpose_6x6_f64(mid0, mid1, mid2);

        // Size-6 FFTs on the transposed data, stored back in interleaved column order
        let output0 = AvxVector256::column_butterfly6(transposed0, self.twiddles_butterfly3);
        for r in 0..3 {
            buffer.store_complex(output0[r * 2], 12 * r);
            buffer.store_complex(output0[r * 2 + 1], 12 * r + 6);
        }

        let output1 = AvxVector256::column_butterfly6(transposed1, self.twiddles_butterfly3);
        for r in 0..3 {
            buffer.store_complex(output1[r * 2], 12 * r + 2);
            buffer.store_complex(output1[r * 2 + 1], 12 * r + 8);
        }

        let output2 = AvxVector256::column_butterfly6(transposed2, self.twiddles_butterfly3);
        for r in 0..3 {
            buffer.store_complex(output2[r * 2], 12 * r + 4);
            buffer.store_complex(output2[r * 2 + 1], 12 * r + 10);
        }
    }
}
1149
/// AVX+FMA accelerated 64-point FFT for f64, computed as an 8x8 mixed-radix
/// decomposition split into a column pass and a row pass.
pub struct Butterfly64Avx64<T> {
    // Inter-stage twiddles: 7 per 2-column set (row 0 needs none), 4 sets = 28
    twiddles: [__m256d; 28],
    // 90-degree rotation constants for the size-4 sub-butterflies
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // T only parameterizes the Fft<T> interface; all arithmetic is f64
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` and an `Fft` impl that drives the column/row passes below
boilerplate_fft_simd_butterfly_with_scratch!(Butterfly64Avx64, 64);
1157impl Butterfly64Avx64<f64> {
1158 #[target_feature(enable = "avx")]
1159 unsafe fn new_with_avx(direction: FftDirection) -> Self {
1160 Self {
1161 twiddles: gen_butterfly_twiddles_separated_columns!(8, 8, 0, direction),
1162 twiddles_butterfly4: AvxVector::make_rotation90(direction),
1163 direction,
1164 _phantom_t: PhantomData,
1165 }
1166 }
1167}
impl<T> Butterfly64Avx64<T> {
    // First pass of the 8x8 mixed-radix 64-point FFT: size-8 FFTs down each column,
    // twiddle multiplies, and a packed transpose written out-of-place to `output`.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn column_butterflies_and_transpose(
        &self,
        input: &[Complex<f64>],
        mut output: &mut [Complex<f64>],
    ) {
        // Each columnset covers 2 of the 8 columns (one __m256d holds 2 complex f64s)
        for columnset in 0..4 {
            let mut rows = [AvxVector::zero(); 8];
            for r in 0..8 {
                rows[r] = input.load_complex(columnset * 2 + 8 * r);
            }
            let mut mid = AvxVector::column_butterfly8(rows, self.twiddles_butterfly4);

            // Row 0 is skipped: self.twiddles stores 7 entries per columnset
            for r in 1..8 {
                mid[r] = AvxVector::mul_complex(mid[r], self.twiddles[r - 1 + 7 * columnset]);
            }

            let transposed = AvxVector::transpose8_packed(mid);

            for i in 0..4 {
                output.store_complex(transposed[i * 2], columnset * 16 + i * 4);
                output.store_complex(transposed[i * 2 + 1], columnset * 16 + i * 4 + 2);
            }
        }
    }

    // Second pass: in-place size-8 FFTs over the transposed layout, two columns at a time.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn row_butterflies(&self, mut buffer: impl AvxArrayMut<f64>) {
        for columnset in 0usize..4 {
            let mut rows = [AvxVector::zero(); 8];
            for r in 0..8 {
                rows[r] = buffer.load_complex(columnset * 2 + 8 * r);
            }
            let mid = AvxVector::column_butterfly8(rows, self.twiddles_butterfly4);
            for r in 0..8 {
                buffer.store_complex(mid[r], columnset * 2 + 8 * r);
            }
        }
    }
}
1221
/// AVX+FMA accelerated 128-point FFT for f64, computed as an 8x16 mixed-radix
/// decomposition split into a column pass and a row pass.
pub struct Butterfly128Avx64<T> {
    // Inter-stage twiddles: 7 per 2-column set (row 0 needs none), 8 sets = 56
    twiddles: [__m256d; 56],
    // Size-16 butterfly twiddles (indices 1 and 3)
    twiddles_butterfly16: [__m256d; 2],
    // 90-degree rotation constants for the size-4 sub-butterflies
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // T only parameterizes the Fft<T> interface; all arithmetic is f64
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` and an `Fft` impl that drives the column/row passes below
boilerplate_fft_simd_butterfly_with_scratch!(Butterfly128Avx64, 128);
1230impl Butterfly128Avx64<f64> {
1231 #[target_feature(enable = "avx")]
1232 unsafe fn new_with_avx(direction: FftDirection) -> Self {
1233 Self {
1234 twiddles: gen_butterfly_twiddles_separated_columns!(8, 16, 0, direction),
1235 twiddles_butterfly16: [
1236 AvxVector::broadcast_twiddle(1, 16, direction),
1237 AvxVector::broadcast_twiddle(3, 16, direction),
1238 ],
1239 twiddles_butterfly4: AvxVector::make_rotation90(direction),
1240 direction,
1241 _phantom_t: PhantomData,
1242 }
1243 }
1244}
impl<T> Butterfly128Avx64<T> {
    // First pass of the 8x16 mixed-radix 128-point FFT: size-8 FFTs down each column,
    // twiddle multiplies, and a packed transpose written out-of-place to `output`.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn column_butterflies_and_transpose(
        &self,
        input: &[Complex<f64>],
        mut output: &mut [Complex<f64>],
    ) {
        // Each columnset covers 2 of the 16 columns (one __m256d holds 2 complex f64s)
        for columnset in 0..8 {
            let mut rows = [AvxVector::zero(); 8];
            for r in 0..8 {
                rows[r] = input.load_complex(columnset * 2 + 16 * r);
            }
            let mut mid = AvxVector::column_butterfly8(rows, self.twiddles_butterfly4);

            // Row 0 is skipped: self.twiddles stores 7 entries per columnset
            for r in 1..8 {
                mid[r] = AvxVector::mul_complex(mid[r], self.twiddles[r - 1 + 7 * columnset]);
            }

            let transposed = AvxVector::transpose8_packed(mid);

            for i in 0..4 {
                output.store_complex(transposed[i * 2], columnset * 16 + i * 4);
                output.store_complex(transposed[i * 2 + 1], columnset * 16 + i * 4 + 2);
            }
        }
    }

    // Second pass: in-place size-16 FFTs over the transposed layout, two columns at a
    // time, loading and storing through the macro's callbacks.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn row_butterflies(&self, mut buffer: impl AvxArrayMut<f64>) {
        for columnset in 0usize..4 {
            column_butterfly16_loadfn!(
                |index: usize| buffer.load_complex(columnset * 2 + index * 8),
                |data, index| buffer.store_complex(data, columnset * 2 + index * 8),
                self.twiddles_butterfly16,
                self.twiddles_butterfly4
            );
        }
    }
}
1297
/// AVX+FMA accelerated 256-point FFT for f64, computed as an 8x32 mixed-radix
/// decomposition split into a column pass and a row pass.
pub struct Butterfly256Avx64<T> {
    // Inter-stage twiddles: 7 per 2-column set (row 0 needs none), 16 sets = 112
    twiddles: [__m256d; 112],
    // Size-32 butterfly twiddles (indices 1, 2, 3, 5, 6, 7 — index 4 is not stored)
    twiddles_butterfly32: [__m256d; 6],
    // 90-degree rotation constants for the size-4 sub-butterflies
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // T only parameterizes the Fft<T> interface; all arithmetic is f64
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` and an `Fft` impl that drives the column/row passes below
boilerplate_fft_simd_butterfly_with_scratch!(Butterfly256Avx64, 256);
1306impl Butterfly256Avx64<f64> {
1307 #[target_feature(enable = "avx")]
1308 unsafe fn new_with_avx(direction: FftDirection) -> Self {
1309 Self {
1310 twiddles: gen_butterfly_twiddles_separated_columns!(8, 32, 0, direction),
1311 twiddles_butterfly32: [
1312 AvxVector::broadcast_twiddle(1, 32, direction),
1313 AvxVector::broadcast_twiddle(2, 32, direction),
1314 AvxVector::broadcast_twiddle(3, 32, direction),
1315 AvxVector::broadcast_twiddle(5, 32, direction),
1316 AvxVector::broadcast_twiddle(6, 32, direction),
1317 AvxVector::broadcast_twiddle(7, 32, direction),
1318 ],
1319 twiddles_butterfly4: AvxVector::make_rotation90(direction),
1320 direction,
1321 _phantom_t: PhantomData,
1322 }
1323 }
1324}
impl<T> Butterfly256Avx64<T> {
    // First pass of the 8x32 mixed-radix 256-point FFT: size-8 FFTs down each column,
    // twiddle multiplies, and a packed transpose written out-of-place to `output`.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn column_butterflies_and_transpose(
        &self,
        input: &[Complex<f64>],
        mut output: &mut [Complex<f64>],
    ) {
        // Each columnset covers 2 of the 32 columns (one __m256d holds 2 complex f64s)
        for columnset in 0..16 {
            let mut rows = [AvxVector::zero(); 8];
            for r in 0..8 {
                rows[r] = input.load_complex(columnset * 2 + 32 * r);
            }
            let mut mid = AvxVector::column_butterfly8(rows, self.twiddles_butterfly4);

            // Row 0 is skipped: self.twiddles stores 7 entries per columnset
            for r in 1..8 {
                mid[r] = AvxVector::mul_complex(mid[r], self.twiddles[r - 1 + 7 * columnset]);
            }

            let transposed = AvxVector::transpose8_packed(mid);

            for i in 0..4 {
                output.store_complex(transposed[i * 2], columnset * 16 + i * 4);
                output.store_complex(transposed[i * 2 + 1], columnset * 16 + i * 4 + 2);
            }
        }
    }

    // Second pass: in-place size-32 FFTs over the transposed layout, two columns at a
    // time, loading and storing through the macro's callbacks.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn row_butterflies(&self, mut buffer: impl AvxArrayMut<f64>) {
        for columnset in 0usize..4 {
            column_butterfly32_loadfn!(
                |index: usize| buffer.load_complex(columnset * 2 + index * 8),
                |data, index| buffer.store_complex(data, columnset * 2 + index * 8),
                self.twiddles_butterfly32,
                self.twiddles_butterfly4
            );
        }
    }
}
1377
/// AVX+FMA accelerated 512-point FFT for f64, computed as a 16x32 mixed-radix
/// decomposition split into a column pass and a row pass.
pub struct Butterfly512Avx64<T> {
    // Inter-stage twiddles: 15 per 2-column set (row 0 needs none), 16 sets = 240
    twiddles: [__m256d; 240],
    // Size-32 butterfly twiddles (indices 1, 2, 3, 5, 6, 7 — index 4 is not stored)
    twiddles_butterfly32: [__m256d; 6],
    // Size-16 butterfly twiddles (indices 1 and 3)
    twiddles_butterfly16: [__m256d; 2],
    // 90-degree rotation constants for the size-4 sub-butterflies
    twiddles_butterfly4: Rotation90<__m256d>,
    direction: FftDirection,
    // T only parameterizes the Fft<T> interface; all arithmetic is f64
    _phantom_t: std::marker::PhantomData<T>,
}
// Generates `new()` and an `Fft` impl that drives the column/row passes below
boilerplate_fft_simd_butterfly_with_scratch!(Butterfly512Avx64, 512);
1387impl Butterfly512Avx64<f64> {
1388 #[target_feature(enable = "avx")]
1389 unsafe fn new_with_avx(direction: FftDirection) -> Self {
1390 Self {
1391 twiddles: gen_butterfly_twiddles_separated_columns!(16, 32, 0, direction),
1392 twiddles_butterfly32: [
1393 AvxVector::broadcast_twiddle(1, 32, direction),
1394 AvxVector::broadcast_twiddle(2, 32, direction),
1395 AvxVector::broadcast_twiddle(3, 32, direction),
1396 AvxVector::broadcast_twiddle(5, 32, direction),
1397 AvxVector::broadcast_twiddle(6, 32, direction),
1398 AvxVector::broadcast_twiddle(7, 32, direction),
1399 ],
1400 twiddles_butterfly16: [
1401 AvxVector::broadcast_twiddle(1, 16, direction),
1402 AvxVector::broadcast_twiddle(3, 16, direction),
1403 ],
1404 twiddles_butterfly4: AvxVector::make_rotation90(direction),
1405 direction,
1406 _phantom_t: PhantomData,
1407 }
1408 }
1409}
impl<T> Butterfly512Avx64<T> {
    // First pass of the 16x32 mixed-radix 512-point FFT: size-16 FFTs down each column,
    // twiddle multiplies, and a packed transpose written out-of-place to `output`.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn column_butterflies_and_transpose(
        &self,
        input: &[Complex<f64>],
        mut output: &mut [Complex<f64>],
    ) {
        // 16 FFT rows minus the untwiddled row 0 = 15 twiddles per 2-column set
        const TWIDDLES_PER_COLUMN: usize = 15;
        for (columnset, twiddle_chunk) in
            self.twiddles.chunks_exact(TWIDDLES_PER_COLUMN).enumerate()
        {
            // The size-16 butterfly macro emits its results through a store callback,
            // so they're collected into a MaybeUninit array rather than named locals
            let mut mid_uninit: [MaybeUninit<__m256d>; 16] = [MaybeUninit::<__m256d>::uninit(); 16];

            column_butterfly16_loadfn!(
                |index: usize| input.load_complex(columnset * 2 + 32 * index),
                |data, index: usize| {
                    mid_uninit[index].as_mut_ptr().write(data);
                },
                self.twiddles_butterfly16,
                self.twiddles_butterfly4
            );

            // Twiddle and transpose 4 rows at a time.
            // SAFETY: the assume_init calls rely on column_butterfly16_loadfn! invoking
            // the store callback above for every index 0..16.
            for chunk in 0..4 {
                let twiddled = [
                    // Row 0 of the column FFT has no twiddle, so chunk 0 starts untwiddled
                    if chunk > 0 {
                        AvxVector::mul_complex(
                            mid_uninit[4 * chunk].assume_init(),
                            twiddle_chunk[4 * chunk - 1],
                        )
                    } else {
                        mid_uninit[4 * chunk].assume_init()
                    },
                    AvxVector::mul_complex(
                        mid_uninit[4 * chunk + 1].assume_init(),
                        twiddle_chunk[4 * chunk],
                    ),
                    AvxVector::mul_complex(
                        mid_uninit[4 * chunk + 2].assume_init(),
                        twiddle_chunk[4 * chunk + 1],
                    ),
                    AvxVector::mul_complex(
                        mid_uninit[4 * chunk + 3].assume_init(),
                        twiddle_chunk[4 * chunk + 2],
                    ),
                ];

                let transposed = AvxVector::transpose4_packed(twiddled);

                output.store_complex(transposed[0], columnset * 32 + 4 * chunk);
                output.store_complex(transposed[1], columnset * 32 + 4 * chunk + 2);
                output.store_complex(transposed[2], columnset * 32 + 4 * chunk + 16);
                output.store_complex(transposed[3], columnset * 32 + 4 * chunk + 18);
            }
        }
    }

    // Second pass: in-place size-32 FFTs over the transposed layout, two columns at a
    // time, loading and storing through the macro's callbacks.
    #[target_feature(enable = "avx", enable = "fma")]
    unsafe fn row_butterflies(&self, mut buffer: impl AvxArrayMut<f64>) {
        for columnset in 0usize..8 {
            column_butterfly32_loadfn!(
                |index: usize| buffer.load_complex(columnset * 2 + index * 16),
                |data, index| buffer.store_complex(data, columnset * 2 + index * 16),
                self.twiddles_butterfly32,
                self.twiddles_butterfly4
            );
        }
    }
}
1492
#[cfg(test)]
mod unit_tests {
    use super::*;
    use crate::test_utils::check_fft_algorithm;

    // Verifies one AVX butterfly against the reference implementation, in both
    // the forward and inverse directions.
    macro_rules! test_avx_butterfly {
        ($test_name:ident, $struct_name:ident, $size:expr) => {
            #[test]
            fn $test_name() {
                let fwd = $struct_name::new(FftDirection::Forward).expect("Can't run test because this machine doesn't have the required instruction sets");
                check_fft_algorithm(&fwd as &dyn Fft<f64>, $size, FftDirection::Forward);

                let inv = $struct_name::new(FftDirection::Inverse).expect("Can't run test because this machine doesn't have the required instruction sets");
                check_fft_algorithm(&inv as &dyn Fft<f64>, $size, FftDirection::Inverse);
            }
        };
    }

    test_avx_butterfly!(test_avx_butterfly5_f64, Butterfly5Avx64, 5);
    test_avx_butterfly!(test_avx_butterfly7_f64, Butterfly7Avx64, 7);
    test_avx_butterfly!(test_avx_mixedradix8_f64, Butterfly8Avx64, 8);
    test_avx_butterfly!(test_avx_mixedradix9_f64, Butterfly9Avx64, 9);
    test_avx_butterfly!(test_avx_mixedradix11_f64, Butterfly11Avx64, 11);
    test_avx_butterfly!(test_avx_mixedradix12_f64, Butterfly12Avx64, 12);
    test_avx_butterfly!(test_avx_mixedradix16_f64, Butterfly16Avx64, 16);
    test_avx_butterfly!(test_avx_mixedradix18_f64, Butterfly18Avx64, 18);
    test_avx_butterfly!(test_avx_mixedradix24_f64, Butterfly24Avx64, 24);
    test_avx_butterfly!(test_avx_mixedradix27_f64, Butterfly27Avx64, 27);
    test_avx_butterfly!(test_avx_mixedradix32_f64, Butterfly32Avx64, 32);
    test_avx_butterfly!(test_avx_mixedradix36_f64, Butterfly36Avx64, 36);
    test_avx_butterfly!(test_avx_mixedradix64_f64, Butterfly64Avx64, 64);
    test_avx_butterfly!(test_avx_mixedradix128_f64, Butterfly128Avx64, 128);
    test_avx_butterfly!(test_avx_mixedradix256_f64, Butterfly256Avx64, 256);
    test_avx_butterfly!(test_avx_mixedradix512_f64, Butterfly512Avx64, 512);
}