1use core::arch::x86_64::*;
2use num_complex::Complex;
3
4use crate::{common::FftNum, FftDirection};
5
6use crate::array_utils;
7use crate::array_utils::workaround_transmute_mut;
8use crate::array_utils::DoubleBuf;
9use crate::common::{fft_error_inplace, fft_error_outofplace};
10use crate::twiddles;
11use crate::{Direction, Fft, Length};
12
13use super::sse_common::{assert_f32, assert_f64};
14use super::sse_utils::*;
15use super::sse_vector::{SseArrayMut, SseVector};
16
17#[inline(always)]
18unsafe fn pack_32(a: Complex<f32>, b: Complex<f32>) -> __m128 {
19 _mm_set_ps(b.im, b.re, a.im, a.re)
20}
21#[inline(always)]
22unsafe fn pack_64(a: Complex<f64>) -> __m128d {
23 _mm_set_pd(a.im, a.re)
24}
25
// Generates the f32 per-length entry points for SSE butterflies that provide
// both a solo kernel and a 2-wide (parallel) kernel processing two FFTs at once.
#[allow(unused)]
macro_rules! boilerplate_fft_sse_f32_butterfly {
    ($struct_name:ident) => {
        impl<T: FftNum> $struct_name<T> {
            // Runs one FFT in place.
            // Safety: caller must ensure SSE4.1 is available and that `buffer`
            // holds at least `self.len()` elements. The transmute to f32 is
            // sound because the constructor asserts T == f32.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly(&self, buffer: &mut [Complex<T>]) {
                self.perform_fft_contiguous(workaround_transmute_mut::<_, Complex<f32>>(buffer));
            }

            // Runs two interleaved FFTs in place; `buffer` must hold
            // 2 * self.len() elements.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_parallel_fft_butterfly(&self, buffer: &mut [Complex<T>]) {
                self.perform_parallel_fft_contiguous(workaround_transmute_mut::<_, Complex<f32>>(
                    buffer,
                ));
            }

            // In-place FFT over a buffer containing multiple FFTs: processes
            // pairs of FFTs with the parallel kernel, then finishes a trailing
            // odd FFT (if any) with the solo kernel.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly_multi(
                &self,
                buffer: &mut [Complex<T>],
            ) -> Result<(), ()> {
                let len = buffer.len();
                let alldone = array_utils::iter_chunks(buffer, 2 * self.len(), |chunk| {
                    self.perform_parallel_fft_butterfly(chunk)
                });
                // iter_chunks reports Err when a partial chunk remains; that
                // remainder is one more FFT, handled by the solo kernel.
                // NOTE(review): assumes buffer.len() is a multiple of
                // self.len() — a ragged remainder is silently left behind.
                if alldone.is_err() && buffer.len() >= self.len() {
                    self.perform_fft_butterfly(&mut buffer[len - self.len()..]);
                }
                Ok(())
            }

            // Out-of-place variant: transforms `input` chunks into the matching
            // `output` chunks, two FFTs at a time, with a solo-kernel fallback
            // for a trailing single FFT.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_oop_fft_butterfly_multi(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
            ) -> Result<(), ()> {
                let len = input.len();
                let alldone = array_utils::iter_chunks_zipped(
                    input,
                    output,
                    2 * self.len(),
                    |in_chunk, out_chunk| {
                        let input_slice = workaround_transmute_mut(in_chunk);
                        let output_slice = workaround_transmute_mut(out_chunk);
                        self.perform_parallel_fft_contiguous(DoubleBuf {
                            input: input_slice,
                            output: output_slice,
                        })
                    },
                );
                if alldone.is_err() && input.len() >= self.len() {
                    let input_slice = workaround_transmute_mut(input);
                    let output_slice = workaround_transmute_mut(output);
                    self.perform_fft_contiguous(DoubleBuf {
                        input: &mut input_slice[len - self.len()..],
                        output: &mut output_slice[len - self.len()..],
                    })
                }
                Ok(())
            }
        }
    };
}
94
// Generates the f32 entry points for butterflies that only have a solo kernel
// (no 2-wide parallel variant); the multi methods walk one FFT at a time.
macro_rules! boilerplate_fft_sse_f32_butterfly_noparallel {
    ($struct_name:ident) => {
        impl<T: FftNum> $struct_name<T> {
            // Runs one FFT in place.
            // Safety: caller must ensure SSE4.1 support and
            // buffer.len() >= self.len(); the transmute to f32 is sound because
            // the constructor asserts T == f32.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly(&self, buffer: &mut [Complex<T>]) {
                self.perform_fft_contiguous(workaround_transmute_mut::<_, Complex<f32>>(buffer));
            }

            // In-place FFT over each self.len()-sized chunk of `buffer`;
            // returns Err if the length is not an exact multiple.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly_multi(
                &self,
                buffer: &mut [Complex<T>],
            ) -> Result<(), ()> {
                array_utils::iter_chunks(buffer, self.len(), |chunk| {
                    self.perform_fft_butterfly(chunk)
                })
            }

            // Out-of-place variant: transforms each chunk of `input` into the
            // matching chunk of `output`.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_oop_fft_butterfly_multi(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
            ) -> Result<(), ()> {
                array_utils::iter_chunks_zipped(input, output, self.len(), |in_chunk, out_chunk| {
                    let input_slice = workaround_transmute_mut(in_chunk);
                    let output_slice = workaround_transmute_mut(out_chunk);
                    self.perform_fft_contiguous(DoubleBuf {
                        input: input_slice,
                        output: output_slice,
                    })
                })
            }
        }
    };
}
134
// Generates the f64 per-length entry points. f64 butterflies have no parallel
// variant (one complex f64 fills a whole __m128d), so the multi methods walk
// one FFT at a time.
macro_rules! boilerplate_fft_sse_f64_butterfly {
    ($struct_name:ident) => {
        impl<T: FftNum> $struct_name<T> {
            // Runs one FFT in place.
            // Safety: caller must ensure SSE4.1 support and
            // buffer.len() >= self.len(); the transmute to f64 is sound because
            // the constructor asserts T == f64.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly(&self, buffer: &mut [Complex<T>]) {
                self.perform_fft_contiguous(workaround_transmute_mut::<_, Complex<f64>>(buffer));
            }

            // In-place FFT over each self.len()-sized chunk of `buffer`;
            // returns Err if the length is not an exact multiple.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_fft_butterfly_multi(
                &self,
                buffer: &mut [Complex<T>],
            ) -> Result<(), ()> {
                array_utils::iter_chunks(buffer, self.len(), |chunk| {
                    self.perform_fft_butterfly(chunk)
                })
            }

            // Out-of-place variant: transforms each chunk of `input` into the
            // matching chunk of `output`.
            #[target_feature(enable = "sse4.1")]
            pub(crate) unsafe fn perform_oop_fft_butterfly_multi(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
            ) -> Result<(), ()> {
                array_utils::iter_chunks_zipped(input, output, self.len(), |in_chunk, out_chunk| {
                    let input_slice = workaround_transmute_mut(in_chunk);
                    let output_slice = workaround_transmute_mut(out_chunk);
                    self.perform_fft_contiguous(DoubleBuf {
                        input: input_slice,
                        output: output_slice,
                    })
                })
            }
        }
    };
}
174
// Generates the public `Fft`, `Length` and `Direction` trait impls shared by
// every SSE butterfly. `$len` is the compile-time FFT size; `$direction_fn`
// extracts the direction from the struct instance.
#[allow(unused)]
macro_rules! boilerplate_fft_sse_common_butterfly {
    ($struct_name:ident, $len:expr, $direction_fn:expr) => {
        impl<T: FftNum> Fft<T> for $struct_name<T> {
            fn process_outofplace_with_scratch(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
                _scratch: &mut [Complex<T>],
            ) {
                // Butterflies need no scratch; only the buffer lengths are validated.
                if input.len() < self.len() || output.len() != input.len() {
                    // Length mismatch: report and bail without touching the data.
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                    return; }
                let result = unsafe { self.perform_oop_fft_butterfly_multi(input, output) };

                if result.is_err() {
                    // The multi kernel failed (buffer length not an exact
                    // multiple of self.len() for the chunked path).
                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
                }
            }
            fn process_with_scratch(&self, buffer: &mut [Complex<T>], _scratch: &mut [Complex<T>]) {
                // In-place path; same validation scheme as above.
                if buffer.len() < self.len() {
                    fft_error_inplace(self.len(), buffer.len(), 0, 0);
                    return; }

                let result = unsafe { self.perform_fft_butterfly_multi(buffer) };

                if result.is_err() {
                    fft_error_inplace(self.len(), buffer.len(), 0, 0);
                }
            }
            // Butterflies never use scratch space.
            #[inline(always)]
            fn get_inplace_scratch_len(&self) -> usize {
                0
            }
            #[inline(always)]
            fn get_outofplace_scratch_len(&self) -> usize {
                0
            }
        }
        impl<T> Length for $struct_name<T> {
            #[inline(always)]
            fn len(&self) -> usize {
                $len
            }
        }
        impl<T> Direction for $struct_name<T> {
            #[inline(always)]
            fn fft_direction(&self) -> FftDirection {
                $direction_fn(self)
            }
        }
    };
}
236
/// Length-1 FFT butterfly for f32 (the transform is the identity).
pub struct SseF32Butterfly1<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
}

boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly1);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly1, 1, |this: &SseF32Butterfly1<_>| this
    .direction);
252impl<T: FftNum> SseF32Butterfly1<T> {
253 #[inline(always)]
254 pub fn new(direction: FftDirection) -> Self {
255 assert_f32::<T>();
256 Self {
257 direction,
258 _phantom: std::marker::PhantomData,
259 }
260 }
261 #[inline(always)]
262 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
263 let value = buffer.load_partial_lo_complex(0);
264 buffer.store_partial_lo_complex(value, 0);
265 }
266
267 #[inline(always)]
268 pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
269 let value = buffer.load_complex(0);
270 buffer.store_complex(value, 0);
271 }
272}
273
/// Length-1 FFT butterfly for f64 (the transform is the identity).
pub struct SseF64Butterfly1<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
}

boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly1);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly1, 1, |this: &SseF64Butterfly1<_>| this
    .direction);
289impl<T: FftNum> SseF64Butterfly1<T> {
290 #[inline(always)]
291 pub fn new(direction: FftDirection) -> Self {
292 assert_f64::<T>();
293 Self {
294 direction,
295 _phantom: std::marker::PhantomData,
296 }
297 }
298 #[inline(always)]
299 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
300 let value = buffer.load_complex(0);
301 buffer.store_complex(value, 0);
302 }
303}
304
/// Size-2 FFT butterfly for f32, processing one FFT (solo) or two at once.
pub struct SseF32Butterfly2<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
}

boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly2);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly2, 2, |this: &SseF32Butterfly2<_>| this
    .direction);
320impl<T: FftNum> SseF32Butterfly2<T> {
321 #[inline(always)]
322 pub fn new(direction: FftDirection) -> Self {
323 assert_f32::<T>();
324 Self {
325 direction,
326 _phantom: std::marker::PhantomData,
327 }
328 }
329 #[inline(always)]
330 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
331 let values = buffer.load_complex(0);
332
333 let temp = self.perform_fft_direct(values);
334
335 buffer.store_complex(temp, 0);
336 }
337
338 #[inline(always)]
339 pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
340 let values_a = buffer.load_complex(0);
341 let values_b = buffer.load_complex(2);
342
343 let out = self.perform_parallel_fft_direct(values_a, values_b);
344
345 let [out02, out13] = transpose_complex_2x2_f32(out[0], out[1]);
346
347 buffer.store_complex(out02, 0);
348 buffer.store_complex(out13, 2);
349 }
350
351 #[inline(always)]
354 pub(crate) unsafe fn perform_fft_direct(&self, values: __m128) -> __m128 {
355 solo_fft2_f32(values)
356 }
357
358 #[inline(always)]
361 pub(crate) unsafe fn perform_parallel_fft_direct(
362 &self,
363 values_x: __m128,
364 values_y: __m128,
365 ) -> [__m128; 2] {
366 parallel_fft2_contiguous_f32(values_x, values_y)
367 }
368}
369
#[inline(always)]
pub(crate) unsafe fn parallel_fft2_interleaved_f32(val02: __m128, val13: __m128) -> [__m128; 2] {
    // Radix-2 butterfly on two interleaved pairs: element 0 is the sum,
    // element 1 the difference, lane-for-lane.
    [_mm_add_ps(val02, val13), _mm_sub_ps(val02, val13)]
}
378
379#[inline(always)]
382unsafe fn parallel_fft2_contiguous_f32(left: __m128, right: __m128) -> [__m128; 2] {
383 let [temp02, temp13] = transpose_complex_2x2_f32(left, right);
384 parallel_fft2_interleaved_f32(temp02, temp13)
385}
386
387#[inline(always)]
390unsafe fn solo_fft2_f32(values: __m128) -> __m128 {
391 let temp = reverse_complex_elements_f32(values);
392 let temp2 = negate_hi_f32(values);
393 _mm_add_ps(temp2, temp)
394}
395
/// Size-2 FFT butterfly for f64.
pub struct SseF64Butterfly2<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
}

boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly2);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly2, 2, |this: &SseF64Butterfly2<_>| this
    .direction);
411impl<T: FftNum> SseF64Butterfly2<T> {
412 #[inline(always)]
413 pub fn new(direction: FftDirection) -> Self {
414 assert_f64::<T>();
415 Self {
416 direction,
417 _phantom: std::marker::PhantomData,
418 }
419 }
420
421 #[inline(always)]
422 pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
423 let value0 = buffer.load_complex(0);
424 let value1 = buffer.load_complex(1);
425
426 let out = self.perform_fft_direct(value0, value1);
427
428 buffer.store_complex(out[0], 0);
429 buffer.store_complex(out[1], 1);
430 }
431
432 #[inline(always)]
433 pub(crate) unsafe fn perform_fft_direct(
434 &self,
435 value0: __m128d,
436 value1: __m128d,
437 ) -> [__m128d; 2] {
438 solo_fft2_f64(value0, value1)
439 }
440}
441
#[inline(always)]
pub(crate) unsafe fn solo_fft2_f64(left: __m128d, right: __m128d) -> [__m128d; 2] {
    // Radix-2 butterfly: output 0 is the sum, output 1 the difference.
    [_mm_add_pd(left, right), _mm_sub_pd(left, right)]
}
448
/// Size-3 FFT butterfly for f32, processing one FFT (solo) or two at once.
pub struct SseF32Butterfly3<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F32,
    // Mixed twiddle used by the solo kernel: [re, re] low, [-im, -im] high.
    twiddle: __m128,
    // Twiddle real/imag parts broadcast to all lanes for the parallel kernel.
    twiddle1re: __m128,
    twiddle1im: __m128,
}

boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly3);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly3, 3, |this: &SseF32Butterfly3<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly3<T> {
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        // Rotation is fixed "positive"; the transform direction is carried by
        // the twiddle factors computed below.
        let rotate = Rotate90F32::new(true);
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 3, direction);
        // Solo-kernel twiddle: low lanes = tw1.re, high lanes = -tw1.im.
        let twiddle = unsafe { _mm_set_ps(-tw1.im, -tw1.im, tw1.re, tw1.re) };
        // Parallel-kernel twiddles, broadcast across all four lanes.
        let twiddle1re = unsafe { _mm_set_ps(tw1.re, tw1.re, tw1.re, tw1.re) };
        let twiddle1im = unsafe { _mm_set_ps(tw1.im, tw1.im, tw1.im, tw1.im) };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle,
            twiddle1re,
            twiddle1im,
        }
    }
    // Single size-3 FFT in place: element 0 sits in the low half of one
    // vector, elements 1 and 2 are packed in a second vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let value0x = buffer.load_partial_lo_complex(0);
        let value12 = buffer.load_complex(1);

        let out = self.perform_fft_direct(value0x, value12);

        buffer.store_partial_lo_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
    }

    // Two size-3 FFTs at once: load all six elements, deinterleave so that
    // value_k holds element k of both FFTs, transform, then re-interleave.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let valuea0a1 = buffer.load_complex(0);
        let valuea2b0 = buffer.load_complex(2);
        let valueb1b2 = buffer.load_complex(4);

        let value0 = extract_lo_hi_f32(valuea0a1, valuea2b0);
        let value1 = extract_hi_lo_f32(valuea0a1, valueb1b2);
        let value2 = extract_lo_hi_f32(valuea2b0, valueb1b2);

        let out = self.perform_parallel_fft_direct(value0, value1, value2);

        let out0 = extract_lo_lo_f32(out[0], out[1]);
        let out1 = extract_lo_hi_f32(out[2], out[0]);
        let out2 = extract_hi_hi_f32(out[1], out[2]);

        buffer.store_complex(out0, 0);
        buffer.store_complex(out1, 2);
        buffer.store_complex(out2, 4);
    }

    // Solo size-3 kernel on packed vectors [x0, _] and [x1, x2].
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value0x: __m128,
        value12: __m128,
    ) -> [__m128; 2] {
        // Build [x1 + x2, i*(x1 - x2)]-style sum/rotated-difference pairs in a
        // single vector, then combine with x0 via the mixed twiddle.
        let rev12 = negate_hi_f32(reverse_complex_elements_f32(value12));
        let temp12pn = self.rotate.rotate_hi(_mm_add_ps(value12, rev12));
        let twiddled = _mm_mul_ps(temp12pn, self.twiddle);
        let temp = _mm_add_ps(value0x, twiddled);
        let out12 = solo_fft2_f32(temp);
        // Low lanes of temp12pn hold x1 + x2, so out0x's low half is the DC
        // output x0 + x1 + x2 (only that half is stored by the caller).
        let out0x = _mm_add_ps(value0x, temp12pn);
        [out0x, out12]
    }

    // Parallel size-3 kernel; value_k holds element k of two independent FFTs.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(
        &self,
        value0: __m128,
        value1: __m128,
        value2: __m128,
    ) -> [__m128; 3] {
        // Standard radix-3: symmetric sum/difference of elements 1 and 2.
        let x12p = _mm_add_ps(value1, value2);
        let x12n = _mm_sub_ps(value1, value2);
        let sum = _mm_add_ps(value0, x12p);

        let temp_a = _mm_mul_ps(self.twiddle1re, x12p);
        let temp_a = _mm_add_ps(temp_a, value0);

        // The difference is multiplied by i (90-degree rotation) before the
        // imaginary twiddle part is applied.
        let n_rot = self.rotate.rotate_both(x12n);
        let temp_b = _mm_mul_ps(self.twiddle1im, n_rot);

        let x1 = _mm_add_ps(temp_a, temp_b);
        let x2 = _mm_sub_ps(temp_a, temp_b);
        [sum, x1, x2]
    }
}
562
/// Size-3 FFT butterfly for f64.
pub struct SseF64Butterfly3<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F64,
    // Twiddle real/imag parts broadcast to both lanes.
    twiddle1re: __m128d,
    twiddle1im: __m128d,
}

boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly3);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly3, 3, |this: &SseF64Butterfly3<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly3<T> {
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        // Rotation is fixed "positive"; the direction is carried by the twiddles.
        let rotate = Rotate90F64::new(true);
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 3, direction);
        let twiddle1re = unsafe { _mm_set_pd(tw1.re, tw1.re) };
        let twiddle1im = unsafe { _mm_set_pd(tw1.im, tw1.im) };

        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle1re,
            twiddle1im,
        }
    }

    // Single size-3 FFT in place; one complex f64 per vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let value0 = buffer.load_complex(0);
        let value1 = buffer.load_complex(1);
        let value2 = buffer.load_complex(2);

        let out = self.perform_fft_direct(value0, value1, value2);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
        buffer.store_complex(out[2], 2);
    }

    // Size-3 kernel, same structure as the f32 parallel kernel but one FFT wide.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value0: __m128d,
        value1: __m128d,
        value2: __m128d,
    ) -> [__m128d; 3] {
        // Standard radix-3: symmetric sum/difference of elements 1 and 2.
        let x12p = _mm_add_pd(value1, value2);
        let x12n = _mm_sub_pd(value1, value2);
        let sum = _mm_add_pd(value0, x12p);

        let temp_a = _mm_mul_pd(self.twiddle1re, x12p);
        let temp_a = _mm_add_pd(temp_a, value0);

        // Multiply the difference by i before applying the imaginary part.
        let n_rot = self.rotate.rotate(x12n);
        let temp_b = _mm_mul_pd(self.twiddle1im, n_rot);

        let x1 = _mm_add_pd(temp_a, temp_b);
        let x2 = _mm_sub_pd(temp_a, temp_b);
        [sum, x1, x2]
    }
}
637
/// Size-4 FFT butterfly for f32: two radix-2 stages plus a 90-degree rotation,
/// so no twiddle multiplies are needed.
pub struct SseF32Butterfly4<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F32,
}

boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly4);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly4, 4, |this: &SseF32Butterfly4<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly4<T> {
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        // Size-4 only needs multiplication by ±i; the rotation sign encodes
        // the transform direction.
        let rotate = if direction == FftDirection::Inverse {
            Rotate90F32::new(true)
        } else {
            Rotate90F32::new(false)
        };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
        }
    }
    // Single size-4 FFT in place, two complex elements per vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let value01 = buffer.load_complex(0);
        let value23 = buffer.load_complex(2);

        let out = self.perform_fft_direct(value01, value23);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 2);
    }

    // Two size-4 FFTs at once: transpose so each vector holds the same element
    // index from both FFTs, transform, transpose back.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let value01a = buffer.load_complex(0);
        let value23a = buffer.load_complex(2);
        let value01b = buffer.load_complex(4);
        let value23b = buffer.load_complex(6);

        let [value0ab, value1ab] = transpose_complex_2x2_f32(value01a, value01b);
        let [value2ab, value3ab] = transpose_complex_2x2_f32(value23a, value23b);

        let out = self.perform_parallel_fft_direct([value0ab, value1ab, value2ab, value3ab]);

        let [out0, out1] = transpose_complex_2x2_f32(out[0], out[1]);
        let [out2, out3] = transpose_complex_2x2_f32(out[2], out[3]);

        // Stores interleave the two FFTs back into their original slots.
        buffer.store_complex(out0, 0);
        buffer.store_complex(out1, 4);
        buffer.store_complex(out2, 2);
        buffer.store_complex(out3, 6);
    }

    // Solo size-4 kernel: radix-2 over [x0,x1]/[x2,x3], rotate the odd output
    // of the second stage by ±i, then a final radix-2 pass.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value01: __m128,
        value23: __m128,
    ) -> [__m128; 2] {
        let mut temp = parallel_fft2_interleaved_f32(value01, value23);

        // Multiply the last element by ±i (the only "twiddle" a size-4 needs).
        temp[1] = self.rotate.rotate_hi(temp[1]);

        parallel_fft2_contiguous_f32(temp[0], temp[1])
    }

    // Parallel size-4 kernel on element-indexed vectors; output order is the
    // usual decimation-in-time bit-reversal fixup done in the return array.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 4]) -> [__m128; 4] {
        let temp0 = parallel_fft2_interleaved_f32(values[0], values[2]);
        let mut temp1 = parallel_fft2_interleaved_f32(values[1], values[3]);

        temp1[1] = self.rotate.rotate_both(temp1[1]);

        let out0 = parallel_fft2_interleaved_f32(temp0[0], temp1[0]);
        let out2 = parallel_fft2_interleaved_f32(temp0[1], temp1[1]);

        [out0[0], out2[0], out0[1], out2[1]]
    }
}
752
/// Size-4 FFT butterfly for f64: two radix-2 stages plus a 90-degree rotation.
pub struct SseF64Butterfly4<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F64,
}

boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly4);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly4, 4, |this: &SseF64Butterfly4<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly4<T> {
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        // The rotation sign (±i) encodes the transform direction.
        let rotate = if direction == FftDirection::Inverse {
            Rotate90F64::new(true)
        } else {
            Rotate90F64::new(false)
        };

        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
        }
    }

    // Single size-4 FFT in place; one complex f64 per vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let value0 = buffer.load_complex(0);
        let value1 = buffer.load_complex(1);
        let value2 = buffer.load_complex(2);
        let value3 = buffer.load_complex(3);

        let out = self.perform_fft_direct([value0, value1, value2, value3]);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
        buffer.store_complex(out[2], 2);
        buffer.store_complex(out[3], 3);
    }

    // Size-4 kernel: radix-2 over (0,2) and (1,3), rotate one intermediate by
    // ±i, final radix-2 pass, then reorder outputs.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 4]) -> [__m128d; 4] {
        let temp0 = solo_fft2_f64(values[0], values[2]);
        let mut temp1 = solo_fft2_f64(values[1], values[3]);

        temp1[1] = self.rotate.rotate(temp1[1]);

        let out0 = solo_fft2_f64(temp0[0], temp1[0]);
        let out2 = solo_fft2_f64(temp0[1], temp1[1]);

        [out0[0], out2[0], out0[1], out2[1]]
    }
}
825
/// Size-5 FFT butterfly for f32, processing one FFT (solo) or two at once.
pub struct SseF32Butterfly5<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F32,
    // Mixed twiddles for the solo kernel: each vector pairs components of
    // tw1 (low lanes) and tw2 (high lanes), in the orders the kernel needs.
    twiddle12re: __m128,
    twiddle21re: __m128,
    twiddle12im: __m128,
    twiddle21im: __m128,
    // Broadcast twiddle components for the parallel kernel.
    twiddle1re: __m128,
    twiddle1im: __m128,
    twiddle2re: __m128,
    twiddle2im: __m128,
}

boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly5);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly5, 5, |this: &SseF32Butterfly5<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly5<T> {
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        // Rotation is fixed "positive"; the direction is carried by the twiddles.
        let rotate = Rotate90F32::new(true);
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 5, direction);
        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 5, direction);
        // Solo-kernel twiddles: low lanes for the (1,4) pair, high lanes for
        // (2,3). Note twiddle21im negates tw1.im in its high lanes.
        let twiddle12re = unsafe { _mm_set_ps(tw2.re, tw2.re, tw1.re, tw1.re) };
        let twiddle21re = unsafe { _mm_set_ps(tw1.re, tw1.re, tw2.re, tw2.re) };
        let twiddle12im = unsafe { _mm_set_ps(tw2.im, tw2.im, tw1.im, tw1.im) };
        let twiddle21im = unsafe { _mm_set_ps(-tw1.im, -tw1.im, tw2.im, tw2.im) };
        // Parallel-kernel twiddles, broadcast across all four lanes.
        let twiddle1re = unsafe { _mm_set_ps(tw1.re, tw1.re, tw1.re, tw1.re) };
        let twiddle1im = unsafe { _mm_set_ps(tw1.im, tw1.im, tw1.im, tw1.im) };
        let twiddle2re = unsafe { _mm_set_ps(tw2.re, tw2.re, tw2.re, tw2.re) };
        let twiddle2im = unsafe { _mm_set_ps(tw2.im, tw2.im, tw2.im, tw2.im) };

        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle12re,
            twiddle21re,
            twiddle12im,
            twiddle21im,
            twiddle1re,
            twiddle1im,
            twiddle2re,
            twiddle2im,
        }
    }
    // Single size-5 FFT in place: element 0 (load1 duplicates it across the
    // vector — see SseArray), then elements 1,2 and 3,4 packed pairwise.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let value00 = buffer.load1_complex(0);
        let value12 = buffer.load_complex(1);
        let value34 = buffer.load_complex(3);

        let out = self.perform_fft_direct(value00, value12, value34);

        buffer.store_partial_lo_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
        buffer.store_complex(out[2], 3);
    }

    // Two size-5 FFTs at once: deinterleave so value_k holds element k of both
    // FFTs, transform, re-pack, and store with stride 2.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4 ,6, 8});

        let value0 = extract_lo_hi_f32(input_packed[0], input_packed[2]);
        let value1 = extract_hi_lo_f32(input_packed[0], input_packed[3]);
        let value2 = extract_lo_hi_f32(input_packed[1], input_packed[3]);
        let value3 = extract_hi_lo_f32(input_packed[1], input_packed[4]);
        let value4 = extract_lo_hi_f32(input_packed[2], input_packed[4]);

        let out = self.perform_parallel_fft_direct(value0, value1, value2, value3, value4);

        let out_packed = [
            extract_lo_lo_f32(out[0], out[1]),
            extract_lo_lo_f32(out[2], out[3]),
            extract_lo_hi_f32(out[4], out[0]),
            extract_hi_hi_f32(out[1], out[2]),
            extract_hi_hi_f32(out[3], out[4]),
        ];

        write_complex_to_array_strided!(out_packed, buffer, 2, {0, 1, 2, 3, 4});
    }

    // Solo size-5 kernel on packed vectors [x0,x0], [x1,x2], [x3,x4].
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value00: __m128,
        value12: __m128,
        value34: __m128,
    ) -> [__m128; 3] {
        // Pair elements symmetrically around the middle: (1,4) and (2,3).
        let temp43 = reverse_complex_elements_f32(value34);
        let x1423p = _mm_add_ps(value12, temp43);
        let x1423n = _mm_sub_ps(value12, temp43);

        let x1414p = duplicate_lo_f32(x1423p);
        let x2323p = duplicate_hi_f32(x1423p);
        let x1414n = duplicate_lo_f32(x1423n);
        let x2323n = duplicate_hi_f32(x1423n);

        // Real parts of the twiddles act on the sums, imaginary parts on the
        // differences (which then get a 90-degree rotation below).
        let temp_a1 = _mm_mul_ps(self.twiddle12re, x1414p);
        let temp_a2 = _mm_mul_ps(self.twiddle21re, x2323p);

        let temp_b1 = _mm_mul_ps(self.twiddle12im, x1414n);
        let temp_b2 = _mm_mul_ps(self.twiddle21im, x2323n);

        let temp_a = _mm_add_ps(temp_a1, temp_a2);
        let temp_a = _mm_add_ps(value00, temp_a);

        let temp_b = _mm_add_ps(temp_b1, temp_b2);

        let b_rot = self.rotate.rotate_both(temp_b);

        // DC output: x0 plus every other input.
        let x00 = _mm_add_ps(value00, _mm_add_ps(x1414p, x2323p));

        let x12 = _mm_add_ps(temp_a, b_rot);
        // Outputs 3 and 4 are the mirrored differences, reversed back into
        // storage order.
        let x34 = reverse_complex_elements_f32(_mm_sub_ps(temp_a, b_rot));
        [x00, x12, x34]
    }

    // Parallel size-5 kernel; value_k holds element k of two independent FFTs.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(
        &self,
        value0: __m128,
        value1: __m128,
        value2: __m128,
        value3: __m128,
        value4: __m128,
    ) -> [__m128; 5] {
        // Standard radix-5: symmetric sums/differences of (1,4) and (2,3).
        let x14p = _mm_add_ps(value1, value4);
        let x14n = _mm_sub_ps(value1, value4);
        let x23p = _mm_add_ps(value2, value3);
        let x23n = _mm_sub_ps(value2, value3);

        let temp_a1_1 = _mm_mul_ps(self.twiddle1re, x14p);
        let temp_a1_2 = _mm_mul_ps(self.twiddle2re, x23p);
        let temp_b1_1 = _mm_mul_ps(self.twiddle1im, x14n);
        let temp_b1_2 = _mm_mul_ps(self.twiddle2im, x23n);
        let temp_a2_1 = _mm_mul_ps(self.twiddle1re, x23p);
        let temp_a2_2 = _mm_mul_ps(self.twiddle2re, x14p);
        let temp_b2_1 = _mm_mul_ps(self.twiddle2im, x14n);
        let temp_b2_2 = _mm_mul_ps(self.twiddle1im, x23n);

        let temp_a1 = _mm_add_ps(value0, _mm_add_ps(temp_a1_1, temp_a1_2));
        let temp_b1 = _mm_add_ps(temp_b1_1, temp_b1_2);
        let temp_a2 = _mm_add_ps(value0, _mm_add_ps(temp_a2_1, temp_a2_2));
        let temp_b2 = _mm_sub_ps(temp_b2_1, temp_b2_2);

        // Outputs come in conjugate-symmetric pairs: (1,4) and (2,3).
        [
            _mm_add_ps(value0, _mm_add_ps(x14p, x23p)),
            _mm_add_ps(temp_a1, self.rotate.rotate_both(temp_b1)),
            _mm_add_ps(temp_a2, self.rotate.rotate_both(temp_b2)),
            _mm_sub_ps(temp_a2, self.rotate.rotate_both(temp_b2)),
            _mm_sub_ps(temp_a1, self.rotate.rotate_both(temp_b1)),
        ]
    }
}
996
/// Size-5 FFT butterfly for f64.
pub struct SseF64Butterfly5<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    rotate: Rotate90F64,
    // Twiddle components broadcast to both lanes.
    twiddle1re: __m128d,
    twiddle1im: __m128d,
    twiddle2re: __m128d,
    twiddle2im: __m128d,
}

boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly5);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly5, 5, |this: &SseF64Butterfly5<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly5<T> {
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        // Rotation is fixed "positive"; the direction is carried by the twiddles.
        let rotate = Rotate90F64::new(true);
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 5, direction);
        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 5, direction);
        let twiddle1re = unsafe { _mm_set_pd(tw1.re, tw1.re) };
        let twiddle1im = unsafe { _mm_set_pd(tw1.im, tw1.im) };
        let twiddle2re = unsafe { _mm_set_pd(tw2.re, tw2.re) };
        let twiddle2im = unsafe { _mm_set_pd(tw2.im, tw2.im) };

        Self {
            direction,
            _phantom: std::marker::PhantomData,
            rotate,
            twiddle1re,
            twiddle1im,
            twiddle2re,
            twiddle2im,
        }
    }

    // Single size-5 FFT in place; one complex f64 per vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let value0 = buffer.load_complex(0);
        let value1 = buffer.load_complex(1);
        let value2 = buffer.load_complex(2);
        let value3 = buffer.load_complex(3);
        let value4 = buffer.load_complex(4);

        let out = self.perform_fft_direct(value0, value1, value2, value3, value4);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
        buffer.store_complex(out[2], 2);
        buffer.store_complex(out[3], 3);
        buffer.store_complex(out[4], 4);
    }

    // Size-5 kernel, same structure as the f32 parallel kernel but one FFT wide.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value0: __m128d,
        value1: __m128d,
        value2: __m128d,
        value3: __m128d,
        value4: __m128d,
    ) -> [__m128d; 5] {
        // Standard radix-5: symmetric sums/differences of (1,4) and (2,3).
        let x14p = _mm_add_pd(value1, value4);
        let x14n = _mm_sub_pd(value1, value4);
        let x23p = _mm_add_pd(value2, value3);
        let x23n = _mm_sub_pd(value2, value3);

        let temp_a1_1 = _mm_mul_pd(self.twiddle1re, x14p);
        let temp_a1_2 = _mm_mul_pd(self.twiddle2re, x23p);
        let temp_a2_1 = _mm_mul_pd(self.twiddle2re, x14p);
        let temp_a2_2 = _mm_mul_pd(self.twiddle1re, x23p);

        let temp_b1_1 = _mm_mul_pd(self.twiddle1im, x14n);
        let temp_b1_2 = _mm_mul_pd(self.twiddle2im, x23n);
        let temp_b2_1 = _mm_mul_pd(self.twiddle2im, x14n);
        let temp_b2_2 = _mm_mul_pd(self.twiddle1im, x23n);

        let temp_a1 = _mm_add_pd(value0, _mm_add_pd(temp_a1_1, temp_a1_2));
        let temp_a2 = _mm_add_pd(value0, _mm_add_pd(temp_a2_1, temp_a2_2));

        let temp_b1 = _mm_add_pd(temp_b1_1, temp_b1_2);
        let temp_b2 = _mm_sub_pd(temp_b2_1, temp_b2_2);

        // Differences are rotated by 90 degrees (multiplied by i) before the
        // symmetric add/subtract that forms the output pairs (1,4) and (2,3).
        let temp_b1_rot = self.rotate.rotate(temp_b1);
        let temp_b2_rot = self.rotate.rotate(temp_b2);
        [
            _mm_add_pd(value0, _mm_add_pd(x14p, x23p)),
            _mm_add_pd(temp_a1, temp_b1_rot),
            _mm_add_pd(temp_a2, temp_b2_rot),
            _mm_sub_pd(temp_a2, temp_b2_rot),
            _mm_sub_pd(temp_a1, temp_b1_rot),
        ]
    }
}
1101
/// Size-6 FFT butterfly for f32, built from size-3 butterflies (6 = 2 * 3).
pub struct SseF32Butterfly6<T> {
    direction: FftDirection,
    _phantom: std::marker::PhantomData<T>,
    bf3: SseF32Butterfly3<T>,
}

boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly6);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly6, 6, |this: &SseF32Butterfly6<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly6<T> {
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        // Inner size-3 butterfly; the size-2 stage needs no twiddles.
        let bf3 = SseF32Butterfly3::new(direction);

        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
        }
    }

    // Single size-6 FFT in place, two complex elements per vector.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let value01 = buffer.load_complex(0);
        let value23 = buffer.load_complex(2);
        let value45 = buffer.load_complex(4);

        let out = self.perform_fft_direct(value01, value23, value45);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 2);
        buffer.store_complex(out[2], 4);
    }

    // Two size-6 FFTs at once: interleave inputs, run the parallel kernel,
    // then separate and store with stride 2.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10});

        let values = interleave_complex_f32!(input_packed, 3, {0, 1, 2});

        let out = self.perform_parallel_fft_direct(
            values[0], values[1], values[2], values[3], values[4], values[5],
        );

        let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4});
        write_complex_to_array_strided!(out_sorted, buffer, 2, {0, 1, 2, 3, 4, 5});
    }

    // Solo size-6 kernel: 6 = 2 * 3 mixed radix. The extracts build the two
    // interleaved 3-point inputs (even/odd index groups), the final size-2
    // stage combines them, and the last extracts restore natural order.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(
        &self,
        value01: __m128,
        value23: __m128,
        value45: __m128,
    ) -> [__m128; 3] {
        let reord0 = extract_lo_hi_f32(value01, value23);
        let reord1 = extract_lo_hi_f32(value23, value45);
        let reord2 = extract_lo_hi_f32(value45, value01);

        let mid = self.bf3.perform_parallel_fft_direct(reord0, reord1, reord2);

        let [output0, output1] = parallel_fft2_contiguous_f32(mid[0], mid[1]);
        let output2 = solo_fft2_f32(mid[2]);

        [
            extract_lo_hi_f32(output0, output1),
            extract_lo_lo_f32(output2, output1),
            extract_hi_hi_f32(output0, output2),
        ]
    }

    // Parallel size-6 kernel: two 3-point FFTs over the index groups
    // (0,2,4) and (3,5,1), combined by size-2 butterflies; the output array
    // applies the Good-Thomas-style reordering.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(
        &self,
        value0: __m128,
        value1: __m128,
        value2: __m128,
        value3: __m128,
        value4: __m128,
        value5: __m128,
    ) -> [__m128; 6] {
        let mid0 = self.bf3.perform_parallel_fft_direct(value0, value2, value4);
        let mid1 = self.bf3.perform_parallel_fft_direct(value3, value5, value1);

        let [output0, output1] = parallel_fft2_interleaved_f32(mid0[0], mid1[0]);
        let [output2, output3] = parallel_fft2_interleaved_f32(mid0[1], mid1[1]);
        let [output4, output5] = parallel_fft2_interleaved_f32(mid0[2], mid1[2]);

        [output0, output3, output4, output1, output2, output5]
    }
}
1215
/// Length-6 FFT butterfly over f64, decomposed as 3 * 2: two 3-point sub-FFTs
/// combined with 2-point butterflies. Each __m128d holds one complex f64.
pub struct SseF64Butterfly6<T> {
    direction: FftDirection,
    // Ties the generic scalar type T to the struct; T only appears at the API boundary.
    _phantom: std::marker::PhantomData<T>,
    // Inner length-3 butterfly used by the 3x2 decomposition.
    bf3: SseF64Butterfly3<T>,
}

// Generate the f64-specific entry points plus the common Fft/Length/Direction impls.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly6);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly6, 6, |this: &SseF64Butterfly6<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly6<T> {
    /// Creates a length-6 f64 butterfly for the given FFT direction.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let bf3 = SseF64Butterfly3::new(direction);

        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
        }
    }

    /// Computes one 6-point FFT in place; one complex f64 per register.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let value0 = buffer.load_complex(0);
        let value1 = buffer.load_complex(1);
        let value2 = buffer.load_complex(2);
        let value3 = buffer.load_complex(3);
        let value4 = buffer.load_complex(4);
        let value5 = buffer.load_complex(5);

        let out = self.perform_fft_direct([value0, value1, value2, value3, value4, value5]);

        buffer.store_complex(out[0], 0);
        buffer.store_complex(out[1], 1);
        buffer.store_complex(out[2], 2);
        buffer.store_complex(out[3], 3);
        buffer.store_complex(out[4], 4);
        buffer.store_complex(out[5], 5);
    }

    /// 6-point FFT on registers: two 3-point sub-FFTs over the permuted input
    /// groups (0,2,4) and (3,5,1), cross-combined with 2-point butterflies,
    /// then permuted back to natural output order.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 6]) -> [__m128d; 6] {
        let mid0 = self.bf3.perform_fft_direct(values[0], values[2], values[4]);
        let mid1 = self.bf3.perform_fft_direct(values[3], values[5], values[1]);

        let [output0, output1] = solo_fft2_f64(mid0[0], mid1[0]);
        let [output2, output3] = solo_fft2_f64(mid0[1], mid1[1]);
        let [output4, output5] = solo_fft2_f64(mid0[2], mid1[2]);

        [output0, output3, output4, output1, output2, output5]
    }
}
1283
/// Length-8 FFT butterfly over f32, decomposed as 4 * 2: two 4-point sub-FFTs
/// whose odd-index outputs are twiddled via 90-degree rotations and a 1/sqrt(2)
/// scale, then combined with 2-point butterflies.
pub struct SseF32Butterfly8<T> {
    // [1.0, 1.0, 1/sqrt(2), 1/sqrt(2)]: scales only the high complex lane.
    root2: __m128,
    // 1/sqrt(2) broadcast to all lanes, for the dual-FFT path.
    root2_dual: __m128,
    direction: FftDirection,
    // Inner length-4 butterfly used by the 4x2 decomposition.
    bf4: SseF32Butterfly4<T>,
    // +/-90-degree rotation helper; sign depends on FFT direction.
    rotate90: Rotate90F32,
}

// Generate the f32-specific entry points plus the common Fft/Length/Direction impls.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly8);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly8, 8, |this: &SseF32Butterfly8<_>| this
    .direction);
1302impl<T: FftNum> SseF32Butterfly8<T> {
1303 #[inline(always)]
1304 pub fn new(direction: FftDirection) -> Self {
1305 assert_f32::<T>();
1306 let bf4 = SseF32Butterfly4::new(direction);
1307 let root2 = unsafe { _mm_set_ps(0.5f32.sqrt(), 0.5f32.sqrt(), 1.0, 1.0) };
1308 let root2_dual = unsafe { _mm_load1_ps(&0.5f32.sqrt()) };
1309 let rotate90 = if direction == FftDirection::Inverse {
1310 Rotate90F32::new(true)
1311 } else {
1312 Rotate90F32::new(false)
1313 };
1314 Self {
1315 root2,
1316 root2_dual,
1317 direction,
1318 bf4,
1319 rotate90,
1320 }
1321 }
1322
1323 #[inline(always)]
1324 unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
1325 let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6});
1326
1327 let out = self.perform_fft_direct(input_packed);
1328
1329 write_complex_to_array_strided!(out, buffer, 2, {0,1,2,3});
1330 }
1331
1332 #[inline(always)]
1333 pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
1334 let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14});
1335
1336 let values = interleave_complex_f32!(input_packed, 4, {0, 1, 2, 3});
1337
1338 let out = self.perform_parallel_fft_direct(values);
1339
1340 let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6});
1341
1342 write_complex_to_array_strided!(out_sorted, buffer, 2, {0,1,2,3,4,5,6,7});
1343 }
1344
1345 #[inline(always)]
1346 unsafe fn perform_fft_direct(&self, values: [__m128; 4]) -> [__m128; 4] {
1347 let [in02, in13] = transpose_complex_2x2_f32(values[0], values[1]);
1350 let [in46, in57] = transpose_complex_2x2_f32(values[2], values[3]);
1351
1352 let val0 = self.bf4.perform_fft_direct(in02, in46);
1354 let mut val2 = self.bf4.perform_fft_direct(in13, in57);
1355
1356 let val2b = self.rotate90.rotate_hi(val2[0]);
1358 let val2c = _mm_add_ps(val2b, val2[0]);
1359 let val2d = _mm_mul_ps(val2c, self.root2);
1360 val2[0] = extract_lo_hi_f32(val2[0], val2d);
1361
1362 let val3b = self.rotate90.rotate_both(val2[1]);
1363 let val3c = _mm_sub_ps(val3b, val2[1]);
1364 let val3d = _mm_mul_ps(val3c, self.root2);
1365 val2[1] = extract_lo_hi_f32(val3b, val3d);
1366
1367 let out0 = parallel_fft2_interleaved_f32(val0[0], val2[0]);
1371 let out1 = parallel_fft2_interleaved_f32(val0[1], val2[1]);
1372
1373 [out0[0], out1[0], out0[1], out1[1]]
1375 }
1376
1377 #[inline(always)]
1378 unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 8]) -> [__m128; 8] {
1379 let val03 = self
1384 .bf4
1385 .perform_parallel_fft_direct([values[0], values[2], values[4], values[6]]);
1386 let mut val47 = self
1387 .bf4
1388 .perform_parallel_fft_direct([values[1], values[3], values[5], values[7]]);
1389
1390 let val5b = self.rotate90.rotate_both(val47[1]);
1392 let val5c = _mm_add_ps(val5b, val47[1]);
1393 val47[1] = _mm_mul_ps(val5c, self.root2_dual);
1394 val47[2] = self.rotate90.rotate_both(val47[2]);
1395 let val7b = self.rotate90.rotate_both(val47[3]);
1396 let val7c = _mm_sub_ps(val7b, val47[3]);
1397 val47[3] = _mm_mul_ps(val7c, self.root2_dual);
1398
1399 let out0 = parallel_fft2_interleaved_f32(val03[0], val47[0]);
1403 let out1 = parallel_fft2_interleaved_f32(val03[1], val47[1]);
1404 let out2 = parallel_fft2_interleaved_f32(val03[2], val47[2]);
1405 let out3 = parallel_fft2_interleaved_f32(val03[3], val47[3]);
1406
1407 [
1409 out0[0], out1[0], out2[0], out3[0], out0[1], out1[1], out2[1], out3[1],
1410 ]
1411 }
1412}
1413
/// Length-8 FFT butterfly over f64, decomposed as 4 * 2. Direction and the
/// rotation helpers are borrowed from the inner length-4 butterfly.
pub struct SseF64Butterfly8<T> {
    // Inner length-4 butterfly; also supplies `rotate` for the twiddle steps.
    bf4: SseF64Butterfly4<T>,
}

// Generate the f64-specific entry points plus the common Fft/Length/Direction impls;
// the direction is read through the inner bf4.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly8);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly8, 8, |this: &SseF64Butterfly8<_>| this
    .bf4
    .direction);
impl<T: FftNum> SseF64Butterfly8<T> {
    /// Creates a length-8 f64 butterfly for the given FFT direction.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        Self {
            bf4: SseF64Butterfly4::new(direction),
        }
    }

    /// Computes one 8-point FFT in place; one complex f64 per register.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7});

        let out = self.perform_fft_direct(values);

        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7});
    }

    /// 8-point FFT on registers: 4-point sub-FFTs over even and odd indices,
    /// twiddle the odd outputs by rotations of 45/90/135 degrees (the length-8
    /// twiddle factors), then combine with 2-point butterflies.
    #[inline(always)]
    unsafe fn perform_fft_direct(&self, values: [__m128d; 8]) -> [__m128d; 8] {
        let val03 = self
            .bf4
            .perform_fft_direct([values[0], values[2], values[4], values[6]]);
        let mut val47 = self
            .bf4
            .perform_fft_direct([values[1], values[3], values[5], values[7]]);

        // Twiddles tw^1, tw^2, tw^3 applied as direction-aware rotations.
        val47[1] = self.bf4.rotate.rotate_45(val47[1]);
        val47[2] = self.bf4.rotate.rotate(val47[2]);
        val47[3] = self.bf4.rotate.rotate_135(val47[3]);

        // Final radix-2 combine of the even and twiddled odd halves.
        let out0 = solo_fft2_f64(val03[0], val47[0]);
        let out1 = solo_fft2_f64(val03[1], val47[1]);
        let out2 = solo_fft2_f64(val03[2], val47[2]);
        let out3 = solo_fft2_f64(val03[3], val47[3]);

        [
            out0[0], out1[0], out2[0], out3[0], out0[1], out1[1], out2[1], out3[1],
        ]
    }
}
1479
/// Length-9 FFT butterfly over f32, decomposed as 3 * 3 mixed-radix:
/// row FFTs of size 3, twiddle multiplies, then column FFTs of size 3.
pub struct SseF32Butterfly9<T> {
    direction: FftDirection,
    // Ties the generic scalar type T to the struct; T only appears at the API boundary.
    _phantom: std::marker::PhantomData<T>,
    // Inner length-3 butterfly used for both passes of the 3x3 decomposition.
    bf3: SseF32Butterfly3<T>,
    // Twiddles e^(+/-2*pi*i*k/9) for k = 1, 2, 4, duplicated into both complex lanes.
    twiddle1: __m128,
    twiddle2: __m128,
    twiddle4: __m128,
}

// Generate the f32-specific entry points plus the common Fft/Length/Direction impls.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly9);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly9, 9, |this: &SseF32Butterfly9<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly9<T> {
    /// Creates a length-9 f32 butterfly, precomputing the k = 1, 2, 4 twiddles
    /// (each duplicated across both complex lanes for the dual-FFT kernel).
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let bf3 = SseF32Butterfly3::new(direction);
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 9, direction);
        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 9, direction);
        let tw4: Complex<f32> = twiddles::compute_twiddle(4, 9, direction);
        let twiddle1 = unsafe { _mm_set_ps(tw1.im, tw1.re, tw1.im, tw1.re) };
        let twiddle2 = unsafe { _mm_set_ps(tw2.im, tw2.re, tw2.im, tw2.re) };
        let twiddle4 = unsafe { _mm_set_ps(tw4.im, tw4.re, tw4.im, tw4.re) };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
            twiddle1,
            twiddle2,
            twiddle4,
        }
    }

    /// Computes one 9-point FFT in place. The single FFT is fed through the
    /// dual-lane kernel using only the low complex lane of each register.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let values = read_partial1_complex_to_array!(buffer, {0,1,2,3,4,5,6,7,8});

        let out = self.perform_parallel_fft_direct(values);

        for n in 0..9 {
            buffer.store_partial_lo_complex(out[n], n);
        }
    }

    /// Computes two independent 9-point FFTs in place. The inputs are regrouped
    /// so each register holds element n of the first FFT (low lane) and element
    /// n of the second FFT (high lane), then repacked after the kernel.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16});

        let values = [
            extract_lo_hi_f32(input_packed[0], input_packed[4]),
            extract_hi_lo_f32(input_packed[0], input_packed[5]),
            extract_lo_hi_f32(input_packed[1], input_packed[5]),
            extract_hi_lo_f32(input_packed[1], input_packed[6]),
            extract_lo_hi_f32(input_packed[2], input_packed[6]),
            extract_hi_lo_f32(input_packed[2], input_packed[7]),
            extract_lo_hi_f32(input_packed[3], input_packed[7]),
            extract_hi_lo_f32(input_packed[3], input_packed[8]),
            extract_lo_hi_f32(input_packed[4], input_packed[8]),
        ];

        let out = self.perform_parallel_fft_direct(values);

        let out_packed = [
            extract_lo_lo_f32(out[0], out[1]),
            extract_lo_lo_f32(out[2], out[3]),
            extract_lo_lo_f32(out[4], out[5]),
            extract_lo_lo_f32(out[6], out[7]),
            extract_lo_hi_f32(out[8], out[0]),
            extract_hi_hi_f32(out[1], out[2]),
            extract_hi_hi_f32(out[3], out[4]),
            extract_hi_hi_f32(out[5], out[6]),
            extract_hi_hi_f32(out[7], out[8]),
        ];

        write_complex_to_array_strided!(out_packed, buffer, 2, {0,1,2,3,4,5,6,7,8});
    }

    /// Two 9-point FFTs at once via 3x3 mixed-radix: three size-3 "row" FFTs
    /// over strided inputs, twiddle multiplies, then three size-3 "column" FFTs.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 9]) -> [__m128; 9] {
        // First pass: size-3 FFTs over columns (stride 3).
        let mid0 = self
            .bf3
            .perform_parallel_fft_direct(values[0], values[3], values[6]);
        let mut mid1 = self
            .bf3
            .perform_parallel_fft_direct(values[1], values[4], values[7]);
        let mut mid2 = self
            .bf3
            .perform_parallel_fft_direct(values[2], values[5], values[8]);

        // Twiddle factors tw^(row*col); the first row and column need none.
        mid1[1] = SseVector::mul_complex(self.twiddle1, mid1[1]);
        mid1[2] = SseVector::mul_complex(self.twiddle2, mid1[2]);
        mid2[1] = SseVector::mul_complex(self.twiddle2, mid2[1]);
        mid2[2] = SseVector::mul_complex(self.twiddle4, mid2[2]);

        // Second pass: size-3 FFTs across the twiddled rows.
        let [output0, output1, output2] = self
            .bf3
            .perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self
            .bf3
            .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self
            .bf3
            .perform_parallel_fft_direct(mid0[2], mid1[2], mid2[2]);

        // Transpose the 3x3 result back to natural output order.
        [
            output0, output3, output6, output1, output4, output7, output2, output5, output8,
        ]
    }
}
1600
/// Length-9 FFT butterfly over f64, decomposed as 3 * 3 mixed-radix:
/// row FFTs of size 3, twiddle multiplies, then column FFTs of size 3.
pub struct SseF64Butterfly9<T> {
    direction: FftDirection,
    // Ties the generic scalar type T to the struct; T only appears at the API boundary.
    _phantom: std::marker::PhantomData<T>,
    // Inner length-3 butterfly used for both passes of the 3x3 decomposition.
    bf3: SseF64Butterfly3<T>,
    // Twiddles e^(+/-2*pi*i*k/9) for k = 1, 2, 4.
    twiddle1: __m128d,
    twiddle2: __m128d,
    twiddle4: __m128d,
}

// Generate the f64-specific entry points plus the common Fft/Length/Direction impls.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly9);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly9, 9, |this: &SseF64Butterfly9<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly9<T> {
    /// Creates a length-9 f64 butterfly, precomputing the k = 1, 2, 4 twiddles.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let bf3 = SseF64Butterfly3::new(direction);
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 9, direction);
        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 9, direction);
        let tw4: Complex<f64> = twiddles::compute_twiddle(4, 9, direction);
        let twiddle1 = unsafe { _mm_set_pd(tw1.im, tw1.re) };
        let twiddle2 = unsafe { _mm_set_pd(tw2.im, tw2.re) };
        let twiddle4 = unsafe { _mm_set_pd(tw4.im, tw4.re) };
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
            twiddle1,
            twiddle2,
            twiddle4,
        }
    }

    /// Computes one 9-point FFT in place; one complex f64 per register.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8});

        let out = self.perform_fft_direct(values);

        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8});
    }

    /// 9-point FFT via 3x3 mixed-radix: size-3 FFTs over stride-3 columns,
    /// twiddle multiplies (first row/column need none), size-3 FFTs across
    /// rows, then a transpose back to natural order.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 9]) -> [__m128d; 9] {
        let mid0 = self.bf3.perform_fft_direct(values[0], values[3], values[6]);
        let mut mid1 = self.bf3.perform_fft_direct(values[1], values[4], values[7]);
        let mut mid2 = self.bf3.perform_fft_direct(values[2], values[5], values[8]);

        // Twiddle factors tw^(row*col).
        mid1[1] = SseVector::mul_complex(self.twiddle1, mid1[1]);
        mid1[2] = SseVector::mul_complex(self.twiddle2, mid1[2]);
        mid2[1] = SseVector::mul_complex(self.twiddle2, mid2[1]);
        mid2[2] = SseVector::mul_complex(self.twiddle4, mid2[2]);

        let [output0, output1, output2] = self.bf3.perform_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self.bf3.perform_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self.bf3.perform_fft_direct(mid0[2], mid1[2], mid2[2]);

        [
            output0, output3, output6, output1, output4, output7, output2, output5, output8,
        ]
    }
}
1674
/// Length-10 FFT butterfly over f32, decomposed as 5 * 2: two 5-point sub-FFTs
/// combined with 2-point butterflies (no extra twiddles needed).
pub struct SseF32Butterfly10<T> {
    direction: FftDirection,
    // Ties the generic scalar type T to the struct; T only appears at the API boundary.
    _phantom: std::marker::PhantomData<T>,
    // Inner length-5 butterfly used by the 5x2 decomposition.
    bf5: SseF32Butterfly5<T>,
}

// Generate the f32-specific entry points plus the common Fft/Length/Direction impls.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly10);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly10, 10, |this: &SseF32Butterfly10<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly10<T> {
    /// Creates a length-10 f32 butterfly for the given FFT direction.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let bf5 = SseF32Butterfly5::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf5,
        }
    }

    /// Computes one 10-point FFT in place; two complex f32 per register.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8});

        let out = self.perform_fft_direct(input_packed);

        write_complex_to_array_strided!(out, buffer, 2, {0,1,2,3,4});
    }

    /// Computes two independent 10-point FFTs in place via lane interleaving.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18});

        let values = interleave_complex_f32!(input_packed, 5, {0, 1, 2, 3, 4});

        let out = self.perform_parallel_fft_direct(values);

        let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6, 8});

        write_complex_to_array_strided!(out_sorted, buffer, 2, {0,1,2,3,4,5,6,7,8,9});
    }

    /// Single 10-point FFT on packed registers.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128; 5]) -> [__m128; 5] {
        // Regroup so low lanes carry the even indices (0,2,4,6,8) and high
        // lanes the odd indices (5,7,9,1,3): the two 5-point sub-FFT inputs.
        let reord0 = extract_lo_hi_f32(values[0], values[2]);
        let reord1 = extract_lo_hi_f32(values[1], values[3]);
        let reord2 = extract_lo_hi_f32(values[2], values[4]);
        let reord3 = extract_lo_hi_f32(values[3], values[0]);
        let reord4 = extract_lo_hi_f32(values[4], values[1]);

        // Both 5-point sub-FFTs run side by side in one dual-lane bf5.
        let mids = self
            .bf5
            .perform_parallel_fft_direct(reord0, reord1, reord2, reord3, reord4);

        // Combine matching sub-FFT outputs with 2-point butterflies.
        let [temp01, temp23] = parallel_fft2_contiguous_f32(mids[0], mids[1]);
        let [temp45, temp67] = parallel_fft2_contiguous_f32(mids[2], mids[3]);
        let temp89 = solo_fft2_f32(mids[4]);

        // Reassemble the ten results into natural output order.
        let out01 = extract_lo_hi_f32(temp01, temp23);
        let out23 = extract_lo_hi_f32(temp45, temp67);
        let out45 = extract_lo_lo_f32(temp89, temp23);
        let out67 = extract_hi_lo_f32(temp01, temp67);
        let out89 = extract_hi_hi_f32(temp45, temp89);

        [out01, out23, out45, out67, out89]
    }

    /// Two 10-point FFTs at once; each register carries one element of each FFT.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 10]) -> [__m128; 10] {
        // 5-point sub-FFTs over the permuted even (0,2,4,6,8) and odd (5,7,9,1,3) groups.
        let mid0 = self
            .bf5
            .perform_parallel_fft_direct(values[0], values[2], values[4], values[6], values[8]);
        let mid1 = self
            .bf5
            .perform_parallel_fft_direct(values[5], values[7], values[9], values[1], values[3]);

        // Cross-combine with 2-point butterflies.
        let [output0, output1] = parallel_fft2_interleaved_f32(mid0[0], mid1[0]);
        let [output2, output3] = parallel_fft2_interleaved_f32(mid0[1], mid1[1]);
        let [output4, output5] = parallel_fft2_interleaved_f32(mid0[2], mid1[2]);
        let [output6, output7] = parallel_fft2_interleaved_f32(mid0[3], mid1[3]);
        let [output8, output9] = parallel_fft2_interleaved_f32(mid0[4], mid1[4]);

        // Permute combined results back into natural output order.
        [
            output0, output3, output4, output7, output8, output1, output2, output5, output6,
            output9,
        ]
    }
}
1785
/// Length-10 FFT butterfly over f64, decomposed as 5 * 2: two 5-point sub-FFTs
/// combined with explicit 2-point butterflies.
pub struct SseF64Butterfly10<T> {
    direction: FftDirection,
    // Ties the generic scalar type T to the struct; T only appears at the API boundary.
    _phantom: std::marker::PhantomData<T>,
    // Inner length-2 butterfly used for the combine step.
    bf2: SseF64Butterfly2<T>,
    // Inner length-5 butterfly used by the 5x2 decomposition.
    bf5: SseF64Butterfly5<T>,
}

// Generate the f64-specific entry points plus the common Fft/Length/Direction impls.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly10);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly10, 10, |this: &SseF64Butterfly10<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly10<T> {
    /// Creates a length-10 f64 butterfly for the given FFT direction.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let bf2 = SseF64Butterfly2::new(direction);
        let bf5 = SseF64Butterfly5::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf2,
            bf5,
        }
    }

    /// Computes one 10-point FFT in place; one complex f64 per register.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9});

        let out = self.perform_fft_direct(values);

        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
    }

    /// 10-point FFT on registers: 5-point sub-FFTs over the permuted even
    /// (0,2,4,6,8) and odd (5,7,9,1,3) index groups, cross-combined with
    /// 2-point butterflies, then permuted back to natural output order.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 10]) -> [__m128d; 10] {
        let mid0 = self
            .bf5
            .perform_fft_direct(values[0], values[2], values[4], values[6], values[8]);
        let mid1 = self
            .bf5
            .perform_fft_direct(values[5], values[7], values[9], values[1], values[3]);

        let [output0, output1] = self.bf2.perform_fft_direct(mid0[0], mid1[0]);
        let [output2, output3] = self.bf2.perform_fft_direct(mid0[1], mid1[1]);
        let [output4, output5] = self.bf2.perform_fft_direct(mid0[2], mid1[2]);
        let [output6, output7] = self.bf2.perform_fft_direct(mid0[3], mid1[3]);
        let [output8, output9] = self.bf2.perform_fft_direct(mid0[4], mid1[4]);

        [
            output0, output3, output4, output7, output8, output1, output2, output5, output6,
            output9,
        ]
    }
}
1854
/// Length-12 FFT butterfly over f32, decomposed as 4 * 3: three 4-point
/// sub-FFTs over permuted index groups, combined with 3-point butterflies.
pub struct SseF32Butterfly12<T> {
    direction: FftDirection,
    // Ties the generic scalar type T to the struct; T only appears at the API boundary.
    _phantom: std::marker::PhantomData<T>,
    // Inner length-3 butterfly used for the combine pass.
    bf3: SseF32Butterfly3<T>,
    // Inner length-4 butterfly used for the first pass.
    bf4: SseF32Butterfly4<T>,
}

// Generate the f32-specific entry points plus the common Fft/Length/Direction impls.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly12);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly12, 12, |this: &SseF32Butterfly12<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly12<T> {
    /// Creates a length-12 f32 butterfly for the given FFT direction.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let bf3 = SseF32Butterfly3::new(direction);
        let bf4 = SseF32Butterfly4::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
            bf4,
        }
    }

    /// Computes one 12-point FFT in place; two complex f32 per register.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed = read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10 });

        let out = self.perform_fft_direct(input_packed);

        write_complex_to_array_strided!(out, buffer, 2, {0,1,2,3,4,5});
    }

    /// Computes two independent 12-point FFTs in place via lane interleaving.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed =
            read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22});

        let values = interleave_complex_f32!(input_packed, 6, {0, 1, 2, 3, 4, 5});

        let out = self.perform_parallel_fft_direct(values);

        let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6, 8, 10});

        write_complex_to_array_strided!(out_sorted, buffer, 2, {0,1,2,3,4,5,6,7,8,9, 10, 11});
    }

    /// Single 12-point FFT on packed registers. Each `packedXY` name records
    /// which two of the 12 input elements its low/high lanes carry.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128; 6]) -> [__m128; 6] {
        // Pack the three stride-3 index groups (0,3,6,9), (4,7,10,1), (8,11,2,5)
        // two-at-a-time so each bf4 call handles two groups in parallel lanes.
        let packed03 = extract_lo_hi_f32(values[0], values[1]);
        let packed47 = extract_lo_hi_f32(values[2], values[3]);
        let packed69 = extract_lo_hi_f32(values[3], values[4]);
        let packed101 = extract_lo_hi_f32(values[5], values[0]);
        let packed811 = extract_lo_hi_f32(values[4], values[5]);
        let packed25 = extract_lo_hi_f32(values[1], values[2]);

        // First pass: 4-point sub-FFTs.
        let mid0 = self.bf4.perform_fft_direct(packed03, packed69);
        let mid1 = self.bf4.perform_fft_direct(packed47, packed101);
        let mid2 = self.bf4.perform_fft_direct(packed811, packed25);

        // Second pass: 3-point combines across the sub-FFT outputs.
        let [temp03, temp14, temp25] = self
            .bf3
            .perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [temp69, temp710, temp811] = self
            .bf3
            .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]);

        // Reassemble the twelve results into natural output order.
        [
            extract_lo_hi_f32(temp03, temp14),
            extract_lo_hi_f32(temp811, temp69),
            extract_lo_hi_f32(temp14, temp25),
            extract_lo_hi_f32(temp69, temp710),
            extract_lo_hi_f32(temp25, temp03),
            extract_lo_hi_f32(temp710, temp811),
        ]
    }

    /// Two 12-point FFTs at once; each register carries one element of each FFT.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 12]) -> [__m128; 12] {
        // First pass: 4-point sub-FFTs over the permuted stride-3 groups.
        let mid0 = self
            .bf4
            .perform_parallel_fft_direct([values[0], values[3], values[6], values[9]]);
        let mid1 = self
            .bf4
            .perform_parallel_fft_direct([values[4], values[7], values[10], values[1]]);
        let mid2 = self
            .bf4
            .perform_parallel_fft_direct([values[8], values[11], values[2], values[5]]);

        // Second pass: 3-point combines.
        let [output0, output1, output2] = self
            .bf3
            .perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self
            .bf3
            .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self
            .bf3
            .perform_parallel_fft_direct(mid0[2], mid1[2], mid2[2]);
        let [output9, output10, output11] = self
            .bf3
            .perform_parallel_fft_direct(mid0[3], mid1[3], mid2[3]);

        // Permute combined results back into natural output order.
        [
            output0, output4, output8, output9, output1, output5, output6, output10, output2,
            output3, output7, output11,
        ]
    }
}
1985
/// Length-12 FFT butterfly over f64, decomposed as 4 * 3: three 4-point
/// sub-FFTs over permuted index groups, combined with 3-point butterflies.
pub struct SseF64Butterfly12<T> {
    direction: FftDirection,
    // Ties the generic scalar type T to the struct; T only appears at the API boundary.
    _phantom: std::marker::PhantomData<T>,
    // Inner length-3 butterfly used for the combine pass.
    bf3: SseF64Butterfly3<T>,
    // Inner length-4 butterfly used for the first pass.
    bf4: SseF64Butterfly4<T>,
}

// Generate the f64-specific entry points plus the common Fft/Length/Direction impls.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly12);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly12, 12, |this: &SseF64Butterfly12<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly12<T> {
    /// Creates a length-12 f64 butterfly for the given FFT direction.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let bf3 = SseF64Butterfly3::new(direction);
        let bf4 = SseF64Butterfly4::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
            bf4,
        }
    }

    /// Computes one 12-point FFT in place; one complex f64 per register.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let values = read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});

        let out = self.perform_fft_direct(values);

        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
    }

    /// 12-point FFT on registers: 4-point sub-FFTs over the permuted stride-3
    /// groups (0,3,6,9), (4,7,10,1), (8,11,2,5), combined with 3-point
    /// butterflies, then permuted back to natural output order.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 12]) -> [__m128d; 12] {
        let mid0 = self
            .bf4
            .perform_fft_direct([values[0], values[3], values[6], values[9]]);
        let mid1 = self
            .bf4
            .perform_fft_direct([values[4], values[7], values[10], values[1]]);
        let mid2 = self
            .bf4
            .perform_fft_direct([values[8], values[11], values[2], values[5]]);

        let [output0, output1, output2] = self.bf3.perform_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self.bf3.perform_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self.bf3.perform_fft_direct(mid0[2], mid1[2], mid2[2]);
        let [output9, output10, output11] = self.bf3.perform_fft_direct(mid0[3], mid1[3], mid2[3]);

        [
            output0, output4, output8, output9, output1, output5, output6, output10, output2,
            output3, output7, output11,
        ]
    }
}
2055
/// Length-15 FFT butterfly over f32, decomposed as 5 * 3: three 5-point
/// sub-FFTs over permuted index groups, combined with 3-point butterflies.
pub struct SseF32Butterfly15<T> {
    direction: FftDirection,
    // Ties the generic scalar type T to the struct; T only appears at the API boundary.
    _phantom: std::marker::PhantomData<T>,
    // Inner length-3 butterfly used for the combine pass.
    bf3: SseF32Butterfly3<T>,
    // Inner length-5 butterfly used for the first pass.
    bf5: SseF32Butterfly5<T>,
}

// Generate the f32-specific entry points plus the common Fft/Length/Direction impls.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly15);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly15, 15, |this: &SseF32Butterfly15<_>| this
    .direction);
impl<T: FftNum> SseF32Butterfly15<T> {
    /// Creates a length-15 f32 butterfly for the given FFT direction.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        let bf3 = SseF32Butterfly3::new(direction);
        let bf5 = SseF32Butterfly5::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
            bf5,
        }
    }

    /// Computes one 15-point FFT in place. The single FFT is fed through the
    /// dual-lane kernel using only the low complex lane of each register.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let values = read_partial1_complex_to_array!(buffer, {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14});

        let out = self.perform_parallel_fft_direct(values);

        for n in 0..15 {
            buffer.store_partial_lo_complex(out[n], n);
        }
    }

    /// Computes two independent 15-point FFTs in place. Inputs are regrouped so
    /// each register holds element n of the first FFT (low lane) and of the
    /// second FFT (high lane); outputs are repacked afterwards.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        let input_packed =
            read_complex_to_array!(buffer, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28});

        let values = [
            extract_lo_hi_f32(input_packed[0], input_packed[7]),
            extract_hi_lo_f32(input_packed[0], input_packed[8]),
            extract_lo_hi_f32(input_packed[1], input_packed[8]),
            extract_hi_lo_f32(input_packed[1], input_packed[9]),
            extract_lo_hi_f32(input_packed[2], input_packed[9]),
            extract_hi_lo_f32(input_packed[2], input_packed[10]),
            extract_lo_hi_f32(input_packed[3], input_packed[10]),
            extract_hi_lo_f32(input_packed[3], input_packed[11]),
            extract_lo_hi_f32(input_packed[4], input_packed[11]),
            extract_hi_lo_f32(input_packed[4], input_packed[12]),
            extract_lo_hi_f32(input_packed[5], input_packed[12]),
            extract_hi_lo_f32(input_packed[5], input_packed[13]),
            extract_lo_hi_f32(input_packed[6], input_packed[13]),
            extract_hi_lo_f32(input_packed[6], input_packed[14]),
            extract_lo_hi_f32(input_packed[7], input_packed[14]),
        ];

        let out = self.perform_parallel_fft_direct(values);

        let out_packed = [
            extract_lo_lo_f32(out[0], out[1]),
            extract_lo_lo_f32(out[2], out[3]),
            extract_lo_lo_f32(out[4], out[5]),
            extract_lo_lo_f32(out[6], out[7]),
            extract_lo_lo_f32(out[8], out[9]),
            extract_lo_lo_f32(out[10], out[11]),
            extract_lo_lo_f32(out[12], out[13]),
            extract_lo_hi_f32(out[14], out[0]),
            extract_hi_hi_f32(out[1], out[2]),
            extract_hi_hi_f32(out[3], out[4]),
            extract_hi_hi_f32(out[5], out[6]),
            extract_hi_hi_f32(out[7], out[8]),
            extract_hi_hi_f32(out[9], out[10]),
            extract_hi_hi_f32(out[11], out[12]),
            extract_hi_hi_f32(out[13], out[14]),
        ];

        write_complex_to_array_strided!(out_packed, buffer, 2, {0,1,2,3,4,5,6,7,8,9, 10, 11, 12, 13, 14});
    }

    /// Two 15-point FFTs at once: 5-point sub-FFTs over the permuted stride-3
    /// groups (0,3,6,9,12), (5,8,11,14,2), (10,13,1,4,7), combined with
    /// 3-point butterflies, then permuted back to natural output order.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 15]) -> [__m128; 15] {
        let mid0 = self
            .bf5
            .perform_parallel_fft_direct(values[0], values[3], values[6], values[9], values[12]);
        let mid1 = self
            .bf5
            .perform_parallel_fft_direct(values[5], values[8], values[11], values[14], values[2]);
        let mid2 = self
            .bf5
            .perform_parallel_fft_direct(values[10], values[13], values[1], values[4], values[7]);

        let [output0, output1, output2] = self
            .bf3
            .perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self
            .bf3
            .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self
            .bf3
            .perform_parallel_fft_direct(mid0[2], mid1[2], mid2[2]);
        let [output9, output10, output11] = self
            .bf3
            .perform_parallel_fft_direct(mid0[3], mid1[3], mid2[3]);
        let [output12, output13, output14] = self
            .bf3
            .perform_parallel_fft_direct(mid0[4], mid1[4], mid2[4]);

        [
            output0, output4, output8, output9, output13, output2, output3, output7, output11,
            output12, output1, output5, output6, output10, output14,
        ]
    }
}
2184
/// Length-15 FFT butterfly over f64, decomposed as 5 * 3: three 5-point
/// sub-FFTs over permuted index groups, combined with 3-point butterflies.
pub struct SseF64Butterfly15<T> {
    direction: FftDirection,
    // Ties the generic scalar type T to the struct; T only appears at the API boundary.
    _phantom: std::marker::PhantomData<T>,
    // Inner length-3 butterfly used for the combine pass.
    bf3: SseF64Butterfly3<T>,
    // Inner length-5 butterfly used for the first pass.
    bf5: SseF64Butterfly5<T>,
}

// Generate the f64-specific entry points plus the common Fft/Length/Direction impls.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly15);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly15, 15, |this: &SseF64Butterfly15<_>| this
    .direction);
impl<T: FftNum> SseF64Butterfly15<T> {
    /// Creates a length-15 f64 butterfly for the given FFT direction.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let bf3 = SseF64Butterfly3::new(direction);
        let bf5 = SseF64Butterfly5::new(direction);
        Self {
            direction,
            _phantom: std::marker::PhantomData,
            bf3,
            bf5,
        }
    }

    /// Computes one 15-point FFT in place; one complex f64 per register.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        let values =
            read_complex_to_array!(buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});

        let out = self.perform_fft_direct(values);

        write_complex_to_array!(out, buffer, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
    }

    /// 15-point FFT on registers: 5-point sub-FFTs over the permuted stride-3
    /// groups (0,3,6,9,12), (5,8,11,14,2), (10,13,1,4,7), combined with
    /// 3-point butterflies, then permuted back to natural output order.
    #[inline(always)]
    pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 15]) -> [__m128d; 15] {
        let mid0 = self
            .bf5
            .perform_fft_direct(values[0], values[3], values[6], values[9], values[12]);
        let mid1 = self
            .bf5
            .perform_fft_direct(values[5], values[8], values[11], values[14], values[2]);
        let mid2 = self
            .bf5
            .perform_fft_direct(values[10], values[13], values[1], values[4], values[7]);

        let [output0, output1, output2] = self.bf3.perform_fft_direct(mid0[0], mid1[0], mid2[0]);
        let [output3, output4, output5] = self.bf3.perform_fft_direct(mid0[1], mid1[1], mid2[1]);
        let [output6, output7, output8] = self.bf3.perform_fft_direct(mid0[2], mid1[2], mid2[2]);
        let [output9, output10, output11] = self.bf3.perform_fft_direct(mid0[3], mid1[3], mid2[3]);
        let [output12, output13, output14] = self.bf3.perform_fft_direct(mid0[4], mid1[4], mid2[4]);

        [
            output0, output4, output8, output9, output13, output2, output3, output7, output11,
            output12, output1, output5, output6, output10, output14,
        ]
    }
}
2256
/// SSE-accelerated 16-point FFT butterfly for f32 data.
/// Each __m128 vector holds two complex f32 values.
/// Built from size-4 butterflies (16 = 4 * 4).
pub struct SseF32Butterfly16<T> {
    bf4: SseF32Butterfly4<T>,
    // Twiddle pairs packed two-per-vector, used by the single-FFT contiguous path
    // where one vector spans two adjacent columns.
    twiddles_packed: [__m128; 6],
    // Single twiddles duplicated into both lanes, used by the parallel
    // (two-FFTs-at-once) path.
    twiddle1: __m128,
    twiddle3: __m128,
    twiddle9: __m128,
}
2271
// Generate wrapper methods (no parallel variant for this size) and the
// Fft/Length/Direction trait impls; direction is read from the inner bf4.
boilerplate_fft_sse_f32_butterfly_noparallel!(SseF32Butterfly16);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly16, 16, |this: &SseF32Butterfly16<_>| this
    .bf4
    .direction);
impl<T: FftNum> SseF32Butterfly16<T> {
    /// Creates a new 16-point butterfly; precomputes twiddle factors for n = 16.
    /// Panics (via assert_f32) if T is not f32.
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        // tw0 is the identity twiddle (1 + 0i), used as the first lane of packed pairs.
        let tw0: Complex<f32> = Complex { re: 1.0, im: 0.0 };
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 16, direction);
        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 16, direction);
        let tw3: Complex<f32> = twiddles::compute_twiddle(3, 16, direction);
        let tw4: Complex<f32> = twiddles::compute_twiddle(4, 16, direction);
        let tw6: Complex<f32> = twiddles::compute_twiddle(6, 16, direction);
        let tw9: Complex<f32> = twiddles::compute_twiddle(9, 16, direction);

        unsafe {
            Self {
                bf4: SseF32Butterfly4::new(direction),
                twiddles_packed: [
                    pack_32(tw0, tw1),
                    pack_32(tw0, tw2),
                    pack_32(tw0, tw3),
                    pack_32(tw2, tw3),
                    pack_32(tw4, tw6),
                    pack_32(tw6, tw9),
                ],
                twiddle1: pack_32(tw1, tw1),
                twiddle3: pack_32(tw3, tw3),
                twiddle9: pack_32(tw9, tw9),
            }
        }
    }

    /// Performs a single in-place 16-point FFT: a 4x4 mixed-radix decomposition
    /// with packed twiddles, processing two columns per vector.
    /// Safety: caller must ensure SSE4.1 is available and buffer has at least 16 elements.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // Loads columns i and i+1 (one vector = two adjacent complex values),
        // strided by 4 rows.
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 4),
                buffer.load_complex(i + 8),
                buffer.load_complex(i + 12),
            ]
        };

        // First stage: size-4 FFTs down columns 0..=1, then apply packed twiddles.
        let mut tmp0 = self.bf4.perform_parallel_fft_direct(load(0));
        tmp0[1] = SseVector::mul_complex(tmp0[1], self.twiddles_packed[0]);
        tmp0[2] = SseVector::mul_complex(tmp0[2], self.twiddles_packed[1]);
        tmp0[3] = SseVector::mul_complex(tmp0[3], self.twiddles_packed[2]);
        // Transpose from column-pairs to row layout for the second stage.
        let [mid0, mid1] = transpose_complex_2x2_f32(tmp0[0], tmp0[1]);
        let [mid4, mid5] = transpose_complex_2x2_f32(tmp0[2], tmp0[3]);

        // Same for columns 2..=3.
        let mut tmp1 = self.bf4.perform_parallel_fft_direct(load(2));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddles_packed[3]);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddles_packed[4]);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddles_packed[5]);
        let [mid2, mid3] = transpose_complex_2x2_f32(tmp1[0], tmp1[1]);
        let [mid6, mid7] = transpose_complex_2x2_f32(tmp1[2], tmp1[3]);

        // Stores one vector to each of the four output rows.
        let mut store = |i: usize, vectors: [__m128; 4]| {
            buffer.store_complex(vectors[0], i + 0);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
        };
        // Second stage: size-4 FFTs across the rows.
        let out0 = self
            .bf4
            .perform_parallel_fft_direct([mid0, mid1, mid2, mid3]);
        store(0, out0);

        let out1 = self
            .bf4
            .perform_parallel_fft_direct([mid4, mid5, mid6, mid7]);
        store(2, out1);
    }

    /// Performs two interleaved 16-point FFTs in place (buffer holds 32 values:
    /// FFT A at indices 0..16, FFT B at 16..32), one lane per FFT.
    #[allow(unused)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // Loads column i of both FFTs and transposes so each vector holds the
        // same index from FFT A (lane 0) and FFT B (lane 1).
        let load = |i: usize| {
            let [a0, a1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 0), buffer.load_complex(i + 16));
            let [b0, b1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 4), buffer.load_complex(i + 20));
            let [c0, c1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 8), buffer.load_complex(i + 24));
            let [d0, d1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 12), buffer.load_complex(i + 28));
            [[a0, b0, c0, d0], [a1, b1, c1, d1]]
        };

        // Columns 2 and 3: size-4 FFTs, then twiddles. Twiddles that are pure
        // 45/90/135-degree rotations are applied via cheap rotate helpers
        // instead of full complex multiplies.
        let [in2, in3] = load(2);
        let mut tmp2 = self.bf4.perform_parallel_fft_direct(in2);
        let mut tmp3 = self.bf4.perform_parallel_fft_direct(in3);
        tmp2[1] = self.bf4.rotate.rotate_both_45(tmp2[1]);
        tmp2[2] = self.bf4.rotate.rotate_both(tmp2[2]);
        tmp2[3] = self.bf4.rotate.rotate_both_135(tmp2[3]);
        tmp3[1] = SseVector::mul_complex(tmp3[1], self.twiddle3);
        tmp3[2] = self.bf4.rotate.rotate_both_135(tmp3[2]);
        tmp3[3] = SseVector::mul_complex(tmp3[3], self.twiddle9);

        // Columns 0 and 1 (column 0 needs no twiddles).
        let [in0, in1] = load(0);
        let tmp0 = self.bf4.perform_parallel_fft_direct(in0);
        let mut tmp1 = self.bf4.perform_parallel_fft_direct(in1);
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = self.bf4.rotate.rotate_both_45(tmp1[2]);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddle3);

        // Transposes back to per-FFT layout and writes both FFTs' outputs.
        let mut store = |i, values_a: [__m128; 4], values_b: [__m128; 4]| {
            for n in 0..4 {
                let [a, b] = transpose_complex_2x2_f32(values_a[n], values_b[n]);
                buffer.store_complex(a, i + n * 4);
                buffer.store_complex(b, i + n * 4 + 16);
            }
        };
        // Second stage: size-4 FFTs across the rows.
        let out0 = self
            .bf4
            .perform_parallel_fft_direct([tmp0[0], tmp1[0], tmp2[0], tmp3[0]]);
        let out1 = self
            .bf4
            .perform_parallel_fft_direct([tmp0[1], tmp1[1], tmp2[1], tmp3[1]]);
        store(0, out0, out1);

        let out2 = self
            .bf4
            .perform_parallel_fft_direct([tmp0[2], tmp1[2], tmp2[2], tmp3[2]]);
        let out3 = self
            .bf4
            .perform_parallel_fft_direct([tmp0[3], tmp1[3], tmp2[3], tmp3[3]]);
        store(2, out2, out3);
    }
}
/// SSE-accelerated 16-point FFT butterfly for f64 data.
/// One complex f64 value per __m128d vector; built from size-4 butterflies.
pub struct SseF64Butterfly16<T> {
    bf4: SseF64Butterfly4<T>,
    // Precomputed twiddles for n = 16. The remaining twiddles (indices 2, 4, 6)
    // are applied as rotations via bf4.rotate instead of stored multiplies.
    twiddle1: __m128d,
    twiddle3: __m128d,
    twiddle9: __m128d,
}
2437
// Generate the shared wrapper methods and the Fft/Length/Direction trait impls;
// direction is read from the inner bf4.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly16);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly16, 16, |this: &SseF64Butterfly16<_>| this
    .bf4
    .direction);
impl<T: FftNum> SseF64Butterfly16<T> {
    /// Creates a new 16-point butterfly; precomputes the twiddles that are not
    /// plain rotations. Panics (via assert_f64) if T is not f64.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 16, direction);
        let tw3: Complex<f64> = twiddles::compute_twiddle(3, 16, direction);
        let tw9: Complex<f64> = twiddles::compute_twiddle(9, 16, direction);

        unsafe {
            Self {
                bf4: SseF64Butterfly4::new(direction),
                twiddle1: pack_64(tw1),
                twiddle3: pack_64(tw3),
                twiddle9: pack_64(tw9),
            }
        }
    }

    /// Performs a single in-place 16-point FFT as a 4x4 mixed-radix decomposition:
    /// four column FFTs with twiddles, then four row FFTs.
    /// Safety: caller must ensure SSE4.1 is available and buffer has at least 16 elements.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        // Loads column i: four values strided by 4.
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 4),
                buffer.load_complex(i + 8),
                buffer.load_complex(i + 12),
            ]
        };

        // Column 1: FFT then twiddles tw1, tw2 (as 45-degree rotation), tw3.
        let mut tmp1 = self.bf4.perform_fft_direct(load(1));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = self.bf4.rotate.rotate_45(tmp1[2]);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddle3);

        // Column 3: twiddles tw3, tw6 (135-degree rotation), tw9.
        let mut tmp3 = self.bf4.perform_fft_direct(load(3));
        tmp3[1] = SseVector::mul_complex(tmp3[1], self.twiddle3);
        tmp3[2] = self.bf4.rotate.rotate_135(tmp3[2]);
        tmp3[3] = SseVector::mul_complex(tmp3[3], self.twiddle9);

        // Column 2: all twiddles are pure rotations (45, 90, 135 degrees).
        let mut tmp2 = self.bf4.perform_fft_direct(load(2));
        tmp2[1] = self.bf4.rotate.rotate_45(tmp2[1]);
        tmp2[2] = self.bf4.rotate.rotate(tmp2[2]);
        tmp2[3] = self.bf4.rotate.rotate_135(tmp2[3]);

        // Column 0 needs no twiddles.
        let tmp0 = self.bf4.perform_fft_direct(load(0));

        // Stores row i's four outputs, strided by 4.
        let mut store = |i: usize, vectors: [__m128d; 4]| {
            buffer.store_complex(vectors[0], i + 0);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
        };

        // Second stage: size-4 FFTs across the rows.
        let out0 = self
            .bf4
            .perform_fft_direct([tmp0[0], tmp1[0], tmp2[0], tmp3[0]]);
        store(0, out0);

        let out1 = self
            .bf4
            .perform_fft_direct([tmp0[1], tmp1[1], tmp2[1], tmp3[1]]);
        store(1, out1);

        let out2 = self
            .bf4
            .perform_fft_direct([tmp0[2], tmp1[2], tmp2[2], tmp3[2]]);
        store(2, out2);

        let out3 = self
            .bf4
            .perform_fft_direct([tmp0[3], tmp1[3], tmp2[3], tmp3[3]]);
        store(3, out3);
    }
}
2527
/// SSE-accelerated 24-point FFT butterfly for f32 data.
/// Each __m128 vector holds two complex f32 values.
/// Decomposed as size-4 butterflies followed by size-6 butterflies (24 = 4 * 6).
pub struct SseF32Butterfly24<T> {
    bf4: SseF32Butterfly4<T>,
    bf6: SseF32Butterfly6<T>,
    // Twiddle pairs packed two-per-vector for the single-FFT contiguous path.
    twiddles_packed: [__m128; 9],
    // Single twiddles duplicated into both lanes for the parallel path.
    twiddle1: __m128,
    twiddle2: __m128,
    twiddle4: __m128,
    twiddle5: __m128,
    twiddle8: __m128,
    twiddle10: __m128,
}
2546
// Generate the shared wrapper methods and the Fft/Length/Direction trait impls;
// direction is read from the inner bf4.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly24);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly24, 24, |this: &SseF32Butterfly24<_>| this
    .bf4
    .direction);
impl<T: FftNum> SseF32Butterfly24<T> {
    /// Creates a new 24-point butterfly; precomputes twiddle factors for n = 24.
    /// Panics (via assert_f32) if T is not f32.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        // tw0 is the identity twiddle (1 + 0i), used as the first lane of packed pairs.
        let tw0: Complex<f32> = Complex { re: 1.0, im: 0.0 };
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 24, direction);
        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 24, direction);
        let tw3: Complex<f32> = twiddles::compute_twiddle(3, 24, direction);
        let tw4: Complex<f32> = twiddles::compute_twiddle(4, 24, direction);
        let tw5: Complex<f32> = twiddles::compute_twiddle(5, 24, direction);
        let tw6: Complex<f32> = twiddles::compute_twiddle(6, 24, direction);
        let tw8: Complex<f32> = twiddles::compute_twiddle(8, 24, direction);
        let tw9: Complex<f32> = twiddles::compute_twiddle(9, 24, direction);
        let tw10: Complex<f32> = twiddles::compute_twiddle(10, 24, direction);
        let tw12: Complex<f32> = twiddles::compute_twiddle(12, 24, direction);
        let tw15: Complex<f32> = twiddles::compute_twiddle(15, 24, direction);
        unsafe {
            Self {
                bf4: SseF32Butterfly4::new(direction),
                bf6: SseF32Butterfly6::new(direction),
                twiddles_packed: [
                    pack_32(tw0, tw1),
                    pack_32(tw0, tw2),
                    pack_32(tw0, tw3),
                    pack_32(tw2, tw3),
                    pack_32(tw4, tw6),
                    pack_32(tw6, tw9),
                    pack_32(tw4, tw5),
                    pack_32(tw8, tw10),
                    pack_32(tw12, tw15),
                ],
                twiddle1: pack_32(tw1, tw1),
                twiddle2: pack_32(tw2, tw2),
                twiddle4: pack_32(tw4, tw4),
                twiddle5: pack_32(tw5, tw5),
                twiddle8: pack_32(tw8, tw8),
                twiddle10: pack_32(tw10, tw10),
            }
        }
    }

    /// Performs a single in-place 24-point FFT: a 6x4 mixed-radix decomposition
    /// with packed twiddles, processing two columns per vector.
    /// Safety: caller must ensure SSE4.1 is available and buffer has at least 24 elements.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // Loads columns i and i+1 (one vector = two adjacent complex values),
        // strided by 6 rows.
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 6),
                buffer.load_complex(i + 12),
                buffer.load_complex(i + 18),
            ]
        };

        // First stage: size-4 FFTs down column pairs, then packed twiddles,
        // then 2x2 transposes into row layout for the size-6 stage.
        let mut tmp1 = self.bf4.perform_parallel_fft_direct(load(2));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddles_packed[3]);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddles_packed[4]);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddles_packed[5]);
        let [mid2, mid3] = transpose_complex_2x2_f32(tmp1[0], tmp1[1]);
        let [mid8, mid9] = transpose_complex_2x2_f32(tmp1[2], tmp1[3]);

        let mut tmp2 = self.bf4.perform_parallel_fft_direct(load(4));
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddles_packed[6]);
        tmp2[2] = SseVector::mul_complex(tmp2[2], self.twiddles_packed[7]);
        tmp2[3] = SseVector::mul_complex(tmp2[3], self.twiddles_packed[8]);
        let [mid4, mid5] = transpose_complex_2x2_f32(tmp2[0], tmp2[1]);
        let [mid10, mid11] = transpose_complex_2x2_f32(tmp2[2], tmp2[3]);

        let mut tmp0 = self.bf4.perform_parallel_fft_direct(load(0));
        tmp0[1] = SseVector::mul_complex(tmp0[1], self.twiddles_packed[0]);
        tmp0[2] = SseVector::mul_complex(tmp0[2], self.twiddles_packed[1]);
        tmp0[3] = SseVector::mul_complex(tmp0[3], self.twiddles_packed[2]);
        let [mid0, mid1] = transpose_complex_2x2_f32(tmp0[0], tmp0[1]);
        let [mid6, mid7] = transpose_complex_2x2_f32(tmp0[2], tmp0[3]);

        // Stores six output vectors strided by 4.
        let mut store = |i, vectors: [__m128; 6]| {
            buffer.store_complex(vectors[0], i);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
            buffer.store_complex(vectors[4], i + 16);
            buffer.store_complex(vectors[5], i + 20);
        };

        // Second stage: size-6 FFTs across the rows.
        let out0 = self
            .bf6
            .perform_parallel_fft_direct(mid0, mid1, mid2, mid3, mid4, mid5);
        store(0, out0);

        let out1 = self
            .bf6
            .perform_parallel_fft_direct(mid6, mid7, mid8, mid9, mid10, mid11);
        store(2, out1);
    }

    /// Performs two interleaved 24-point FFTs in place (FFT A at indices 0..24,
    /// FFT B at 24..48), one lane per FFT.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // Loads column i of both FFTs and transposes so each vector holds the
        // same index from FFT A (lane 0) and FFT B (lane 1).
        let load = |i: usize| {
            let [a0, a1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 0), buffer.load_complex(i + 24));
            let [b0, b1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 6), buffer.load_complex(i + 30));
            let [c0, c1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 12), buffer.load_complex(i + 36));
            let [d0, d1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 18), buffer.load_complex(i + 42));
            [[a0, b0, c0, d0], [a1, b1, c1, d1]]
        };

        // First stage: size-4 FFTs per column, with twiddles. Twiddles that are
        // pure rotations (45/90/135/225 degrees) or a plain negation use the
        // cheap rotate/neg helpers instead of full complex multiplies.
        let [in0, in1] = load(0);
        let tmp0 = self.bf4.perform_parallel_fft_direct(in0);
        let mut tmp1 = self.bf4.perform_parallel_fft_direct(in1);
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddle2);
        tmp1[3] = self.bf4.rotate.rotate_both_45(tmp1[3]);

        let [in2, in3] = load(2);
        let mut tmp2 = self.bf4.perform_parallel_fft_direct(in2);
        let mut tmp3 = self.bf4.perform_parallel_fft_direct(in3);
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddle2);
        tmp2[2] = SseVector::mul_complex(tmp2[2], self.twiddle4);
        tmp2[3] = self.bf4.rotate.rotate_both(tmp2[3]);
        tmp3[1] = self.bf4.rotate.rotate_both_45(tmp3[1]);
        tmp3[2] = self.bf4.rotate.rotate_both(tmp3[2]);
        tmp3[3] = self.bf4.rotate.rotate_both_135(tmp3[3]);

        let [in4, in5] = load(4);
        let mut tmp4 = self.bf4.perform_parallel_fft_direct(in4);
        let mut tmp5 = self.bf4.perform_parallel_fft_direct(in5);
        tmp4[1] = SseVector::mul_complex(tmp4[1], self.twiddle4);
        tmp4[2] = SseVector::mul_complex(tmp4[2], self.twiddle8);
        // Twiddle at this position is -1, so a negation suffices.
        tmp4[3] = SseVector::neg(tmp4[3]);
        tmp5[1] = SseVector::mul_complex(tmp5[1], self.twiddle5);
        tmp5[2] = SseVector::mul_complex(tmp5[2], self.twiddle10);
        tmp5[3] = self.bf4.rotate.rotate_both_225(tmp5[3]);

        // Transposes back to per-FFT layout and writes both FFTs' outputs.
        let mut store = |i, vectors_a: [__m128; 6], vectors_b: [__m128; 6]| {
            for n in 0..6 {
                let [a, b] = transpose_complex_2x2_f32(vectors_a[n], vectors_b[n]);
                buffer.store_complex(a, i + n * 4);
                buffer.store_complex(b, i + n * 4 + 24);
            }
        };

        // Second stage: size-6 FFTs across the rows.
        let out0 = self
            .bf6
            .perform_parallel_fft_direct(tmp0[0], tmp1[0], tmp2[0], tmp3[0], tmp4[0], tmp5[0]);
        let out1 = self
            .bf6
            .perform_parallel_fft_direct(tmp0[1], tmp1[1], tmp2[1], tmp3[1], tmp4[1], tmp5[1]);
        store(0, out0, out1);

        let out2 = self
            .bf6
            .perform_parallel_fft_direct(tmp0[2], tmp1[2], tmp2[2], tmp3[2], tmp4[2], tmp5[2]);
        let out3 = self
            .bf6
            .perform_parallel_fft_direct(tmp0[3], tmp1[3], tmp2[3], tmp3[3], tmp4[3], tmp5[3]);
        store(2, out2, out3);
    }
}
2730
/// SSE-accelerated 24-point FFT butterfly for f64 data.
/// One complex f64 value per __m128d vector; decomposed as size-4 then size-6
/// butterflies (24 = 4 * 6).
pub struct SseF64Butterfly24<T> {
    bf4: SseF64Butterfly4<T>,
    bf6: SseF64Butterfly6<T>,
    // Precomputed twiddles for n = 24; the remaining twiddles are applied as
    // rotations or negation.
    twiddle1: __m128d,
    twiddle2: __m128d,
    twiddle4: __m128d,
    twiddle5: __m128d,
    twiddle8: __m128d,
    twiddle10: __m128d,
}
2748
// Generate the shared wrapper methods and the Fft/Length/Direction trait impls;
// direction is read from the inner bf4.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly24);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly24, 24, |this: &SseF64Butterfly24<_>| this
    .bf4
    .direction);
impl<T: FftNum> SseF64Butterfly24<T> {
    /// Creates a new 24-point butterfly; precomputes the twiddles that are not
    /// plain rotations. Panics (via assert_f64) if T is not f64.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 24, direction);
        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 24, direction);
        let tw4: Complex<f64> = twiddles::compute_twiddle(4, 24, direction);
        let tw5: Complex<f64> = twiddles::compute_twiddle(5, 24, direction);
        let tw8: Complex<f64> = twiddles::compute_twiddle(8, 24, direction);
        let tw10: Complex<f64> = twiddles::compute_twiddle(10, 24, direction);

        unsafe {
            Self {
                bf4: SseF64Butterfly4::new(direction),
                bf6: SseF64Butterfly6::new(direction),
                twiddle1: pack_64(tw1),
                twiddle2: pack_64(tw2),
                twiddle4: pack_64(tw4),
                twiddle5: pack_64(tw5),
                twiddle8: pack_64(tw8),
                twiddle10: pack_64(tw10),
            }
        }
    }

    /// Performs a single in-place 24-point FFT as a 6x4 mixed-radix
    /// decomposition: six column FFTs of size 4 with twiddles, then four row
    /// FFTs of size 6.
    /// Safety: caller must ensure SSE4.1 is available and buffer has at least 24 elements.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        // Loads column i: four values strided by 6.
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 6),
                buffer.load_complex(i + 12),
                buffer.load_complex(i + 18),
            ]
        };

        // First stage: size-4 FFT per column, then twiddles. Twiddles that are
        // pure rotations (45/90/135/225 degrees) or -1 use rotate/neg helpers.
        let mut tmp1 = self.bf4.perform_fft_direct(load(1));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddle2);
        tmp1[3] = self.bf4.rotate.rotate_45(tmp1[3]);

        let mut tmp2 = self.bf4.perform_fft_direct(load(2));
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddle2);
        tmp2[2] = SseVector::mul_complex(tmp2[2], self.twiddle4);
        tmp2[3] = self.bf4.rotate.rotate(tmp2[3]);

        let mut tmp4 = self.bf4.perform_fft_direct(load(4));
        tmp4[1] = SseVector::mul_complex(tmp4[1], self.twiddle4);
        tmp4[2] = SseVector::mul_complex(tmp4[2], self.twiddle8);
        // Twiddle at this position is -1, so a negation suffices.
        tmp4[3] = SseVector::neg(tmp4[3]);

        let mut tmp5 = self.bf4.perform_fft_direct(load(5));
        tmp5[1] = SseVector::mul_complex(tmp5[1], self.twiddle5);
        tmp5[2] = SseVector::mul_complex(tmp5[2], self.twiddle10);
        tmp5[3] = self.bf4.rotate.rotate_225(tmp5[3]);

        // Column 3: all twiddles are pure rotations.
        let mut tmp3 = self.bf4.perform_fft_direct(load(3));
        tmp3[1] = self.bf4.rotate.rotate_45(tmp3[1]);
        tmp3[2] = self.bf4.rotate.rotate(tmp3[2]);
        tmp3[3] = self.bf4.rotate.rotate_135(tmp3[3]);

        // Column 0 needs no twiddles.
        let tmp0 = self.bf4.perform_fft_direct(load(0));

        // Stores row i's six outputs, strided by 4.
        let mut store = |i, vectors: [__m128d; 6]| {
            buffer.store_complex(vectors[0], i);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
            buffer.store_complex(vectors[4], i + 16);
            buffer.store_complex(vectors[5], i + 20);
        };

        // Second stage: size-6 FFTs across the rows.
        let out0 = self
            .bf6
            .perform_fft_direct([tmp0[0], tmp1[0], tmp2[0], tmp3[0], tmp4[0], tmp5[0]]);
        store(0, out0);

        let out1 = self
            .bf6
            .perform_fft_direct([tmp0[1], tmp1[1], tmp2[1], tmp3[1], tmp4[1], tmp5[1]]);
        store(1, out1);

        let out2 = self
            .bf6
            .perform_fft_direct([tmp0[2], tmp1[2], tmp2[2], tmp3[2], tmp4[2], tmp5[2]]);
        store(2, out2);

        let out3 = self
            .bf6
            .perform_fft_direct([tmp0[3], tmp1[3], tmp2[3], tmp3[3], tmp4[3], tmp5[3]]);
        store(3, out3);
    }
}
2857
/// SSE-accelerated 32-point FFT butterfly for f32 data.
/// Each __m128 vector holds two complex f32 values.
/// Decomposed as size-4 butterflies followed by size-8 butterflies (32 = 4 * 8).
pub struct SseF32Butterfly32<T> {
    bf8: SseF32Butterfly8<T>,
    // Twiddle pairs packed two-per-vector for the single-FFT contiguous path.
    twiddles_packed: [__m128; 12],
    // Single twiddles duplicated into both lanes for the parallel path.
    twiddle1: __m128,
    twiddle2: __m128,
    twiddle3: __m128,
    twiddle5: __m128,
    twiddle6: __m128,
    twiddle7: __m128,
    twiddle9: __m128,
    twiddle10: __m128,
    twiddle14: __m128,
    twiddle15: __m128,
    twiddle18: __m128,
    twiddle21: __m128,
}
2881
// Generate the shared wrapper methods and the Fft/Length/Direction trait impls;
// direction is read from the bf4 nested inside bf8.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly32);
boilerplate_fft_sse_common_butterfly!(SseF32Butterfly32, 32, |this: &SseF32Butterfly32<_>| this
    .bf8
    .bf4
    .direction);
impl<T: FftNum> SseF32Butterfly32<T> {
    /// Creates a new 32-point butterfly; precomputes twiddle factors for n = 32.
    /// Panics (via assert_f32) if T is not f32.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f32::<T>();
        // tw0 is the identity twiddle (1 + 0i), used as the first lane of packed pairs.
        let tw0: Complex<f32> = Complex { re: 1.0, im: 0.0 };
        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 32, direction);
        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 32, direction);
        let tw3: Complex<f32> = twiddles::compute_twiddle(3, 32, direction);
        let tw4: Complex<f32> = twiddles::compute_twiddle(4, 32, direction);
        let tw5: Complex<f32> = twiddles::compute_twiddle(5, 32, direction);
        let tw6: Complex<f32> = twiddles::compute_twiddle(6, 32, direction);
        let tw7: Complex<f32> = twiddles::compute_twiddle(7, 32, direction);
        let tw8: Complex<f32> = twiddles::compute_twiddle(8, 32, direction);
        let tw9: Complex<f32> = twiddles::compute_twiddle(9, 32, direction);
        let tw10: Complex<f32> = twiddles::compute_twiddle(10, 32, direction);
        let tw12: Complex<f32> = twiddles::compute_twiddle(12, 32, direction);
        let tw14: Complex<f32> = twiddles::compute_twiddle(14, 32, direction);
        let tw15: Complex<f32> = twiddles::compute_twiddle(15, 32, direction);
        let tw18: Complex<f32> = twiddles::compute_twiddle(18, 32, direction);
        let tw21: Complex<f32> = twiddles::compute_twiddle(21, 32, direction);
        unsafe {
            Self {
                bf8: SseF32Butterfly8::new(direction),
                twiddles_packed: [
                    pack_32(tw0, tw1),
                    pack_32(tw0, tw2),
                    pack_32(tw0, tw3),
                    pack_32(tw2, tw3),
                    pack_32(tw4, tw6),
                    pack_32(tw6, tw9),
                    pack_32(tw4, tw5),
                    pack_32(tw8, tw10),
                    pack_32(tw12, tw15),
                    pack_32(tw6, tw7),
                    pack_32(tw12, tw14),
                    pack_32(tw18, tw21),
                ],
                twiddle1: pack_32(tw1, tw1),
                twiddle2: pack_32(tw2, tw2),
                twiddle3: pack_32(tw3, tw3),
                twiddle5: pack_32(tw5, tw5),
                twiddle6: pack_32(tw6, tw6),
                twiddle7: pack_32(tw7, tw7),
                twiddle9: pack_32(tw9, tw9),
                twiddle10: pack_32(tw10, tw10),
                twiddle14: pack_32(tw14, tw14),
                twiddle15: pack_32(tw15, tw15),
                twiddle18: pack_32(tw18, tw18),
                twiddle21: pack_32(tw21, tw21),
            }
        }
    }

    /// Performs a single in-place 32-point FFT: an 8x4 mixed-radix
    /// decomposition with packed twiddles, processing two columns per vector.
    /// Safety: caller must ensure SSE4.1 is available and buffer has at least 32 elements.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // Loads columns i and i+1 (one vector = two adjacent complex values),
        // strided by 8 rows.
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 8),
                buffer.load_complex(i + 16),
                buffer.load_complex(i + 24),
            ]
        };

        // First stage: size-4 FFTs down column pairs, then packed twiddles,
        // then 2x2 transposes into row layout for the size-8 stage.
        let mut tmp0 = self.bf8.bf4.perform_parallel_fft_direct(load(0));
        tmp0[1] = SseVector::mul_complex(tmp0[1], self.twiddles_packed[0]);
        tmp0[2] = SseVector::mul_complex(tmp0[2], self.twiddles_packed[1]);
        tmp0[3] = SseVector::mul_complex(tmp0[3], self.twiddles_packed[2]);
        let [mid0, mid1] = transpose_complex_2x2_f32(tmp0[0], tmp0[1]);
        let [mid8, mid9] = transpose_complex_2x2_f32(tmp0[2], tmp0[3]);

        let mut tmp1 = self.bf8.bf4.perform_parallel_fft_direct(load(2));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddles_packed[3]);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddles_packed[4]);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddles_packed[5]);
        let [mid2, mid3] = transpose_complex_2x2_f32(tmp1[0], tmp1[1]);
        let [mid10, mid11] = transpose_complex_2x2_f32(tmp1[2], tmp1[3]);

        let mut tmp2 = self.bf8.bf4.perform_parallel_fft_direct(load(4));
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddles_packed[6]);
        tmp2[2] = SseVector::mul_complex(tmp2[2], self.twiddles_packed[7]);
        tmp2[3] = SseVector::mul_complex(tmp2[3], self.twiddles_packed[8]);
        let [mid4, mid5] = transpose_complex_2x2_f32(tmp2[0], tmp2[1]);
        let [mid12, mid13] = transpose_complex_2x2_f32(tmp2[2], tmp2[3]);

        let mut tmp3 = self.bf8.bf4.perform_parallel_fft_direct(load(6));
        tmp3[1] = SseVector::mul_complex(tmp3[1], self.twiddles_packed[9]);
        tmp3[2] = SseVector::mul_complex(tmp3[2], self.twiddles_packed[10]);
        tmp3[3] = SseVector::mul_complex(tmp3[3], self.twiddles_packed[11]);
        let [mid6, mid7] = transpose_complex_2x2_f32(tmp3[0], tmp3[1]);
        let [mid14, mid15] = transpose_complex_2x2_f32(tmp3[2], tmp3[3]);

        // Stores eight output vectors strided by 4.
        let mut store = |i, vectors: [__m128; 8]| {
            buffer.store_complex(vectors[0], i);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
            buffer.store_complex(vectors[4], i + 16);
            buffer.store_complex(vectors[5], i + 20);
            buffer.store_complex(vectors[6], i + 24);
            buffer.store_complex(vectors[7], i + 28);
        };

        // Second stage: size-8 FFTs across the rows.
        let out0 = self
            .bf8
            .perform_parallel_fft_direct([mid0, mid1, mid2, mid3, mid4, mid5, mid6, mid7]);
        store(0, out0);

        let out1 = self
            .bf8
            .perform_parallel_fft_direct([mid8, mid9, mid10, mid11, mid12, mid13, mid14, mid15]);
        store(2, out1);
    }

    /// Performs two interleaved 32-point FFTs in place (FFT A at indices 0..32,
    /// FFT B at 32..64), one lane per FFT.
    #[inline(always)]
    pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
        // Loads column i of both FFTs and transposes so each vector holds the
        // same index from FFT A (lane 0) and FFT B (lane 1).
        let load = |i: usize| {
            let [a0, a1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 0), buffer.load_complex(i + 32));
            let [b0, b1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 8), buffer.load_complex(i + 40));
            let [c0, c1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 16), buffer.load_complex(i + 48));
            let [d0, d1] =
                transpose_complex_2x2_f32(buffer.load_complex(i + 24), buffer.load_complex(i + 56));
            [[a0, b0, c0, d0], [a1, b1, c1, d1]]
        };

        // First stage: size-4 FFTs per column, with twiddles. Twiddles that are
        // pure 45/90/135-degree rotations use the rotate helpers.
        let [in0, in1] = load(0);
        let tmp0 = self.bf8.bf4.perform_parallel_fft_direct(in0);
        let mut tmp1 = self.bf8.bf4.perform_parallel_fft_direct(in1);
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddle2);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddle3);

        let [in2, in3] = load(2);
        let mut tmp2 = self.bf8.bf4.perform_parallel_fft_direct(in2);
        let mut tmp3 = self.bf8.bf4.perform_parallel_fft_direct(in3);
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddle2);
        tmp2[2] = self.bf8.bf4.rotate.rotate_both_45(tmp2[2]);
        tmp2[3] = SseVector::mul_complex(tmp2[3], self.twiddle6);
        tmp3[1] = SseVector::mul_complex(tmp3[1], self.twiddle3);
        tmp3[2] = SseVector::mul_complex(tmp3[2], self.twiddle6);
        tmp3[3] = SseVector::mul_complex(tmp3[3], self.twiddle9);

        let [in4, in5] = load(4);
        let mut tmp4 = self.bf8.bf4.perform_parallel_fft_direct(in4);
        let mut tmp5 = self.bf8.bf4.perform_parallel_fft_direct(in5);
        tmp4[1] = self.bf8.bf4.rotate.rotate_both_45(tmp4[1]);
        tmp4[2] = self.bf8.bf4.rotate.rotate_both(tmp4[2]);
        tmp4[3] = self.bf8.bf4.rotate.rotate_both_135(tmp4[3]);
        tmp5[1] = SseVector::mul_complex(tmp5[1], self.twiddle5);
        tmp5[2] = SseVector::mul_complex(tmp5[2], self.twiddle10);
        tmp5[3] = SseVector::mul_complex(tmp5[3], self.twiddle15);

        let [in6, in7] = load(6);
        let mut tmp6 = self.bf8.bf4.perform_parallel_fft_direct(in6);
        let mut tmp7 = self.bf8.bf4.perform_parallel_fft_direct(in7);
        tmp6[1] = SseVector::mul_complex(tmp6[1], self.twiddle6);
        tmp6[2] = self.bf8.bf4.rotate.rotate_both_135(tmp6[2]);
        tmp6[3] = SseVector::mul_complex(tmp6[3], self.twiddle18);
        tmp7[1] = SseVector::mul_complex(tmp7[1], self.twiddle7);
        tmp7[2] = SseVector::mul_complex(tmp7[2], self.twiddle14);
        tmp7[3] = SseVector::mul_complex(tmp7[3], self.twiddle21);

        // Transposes back to per-FFT layout and writes both FFTs' outputs.
        let mut store = |i, vectors_a: [__m128; 8], vectors_b: [__m128; 8]| {
            for n in 0..8 {
                let [a, b] = transpose_complex_2x2_f32(vectors_a[n], vectors_b[n]);
                buffer.store_complex(a, i + n * 4);
                buffer.store_complex(b, i + n * 4 + 32);
            }
        };

        // Second stage: size-8 FFTs across the rows.
        let out0 = self.bf8.perform_parallel_fft_direct([
            tmp0[0], tmp1[0], tmp2[0], tmp3[0], tmp4[0], tmp5[0], tmp6[0], tmp7[0],
        ]);
        let out1 = self.bf8.perform_parallel_fft_direct([
            tmp0[1], tmp1[1], tmp2[1], tmp3[1], tmp4[1], tmp5[1], tmp6[1], tmp7[1],
        ]);
        store(0, out0, out1);

        let out2 = self.bf8.perform_parallel_fft_direct([
            tmp0[2], tmp1[2], tmp2[2], tmp3[2], tmp4[2], tmp5[2], tmp6[2], tmp7[2],
        ]);
        let out3 = self.bf8.perform_parallel_fft_direct([
            tmp0[3], tmp1[3], tmp2[3], tmp3[3], tmp4[3], tmp5[3], tmp6[3], tmp7[3],
        ]);
        store(2, out2, out3);
    }
}
3097
/// SSE-accelerated 32-point FFT butterfly for f64 data.
/// One complex f64 value per __m128d vector; decomposed as size-4 then size-8
/// butterflies (32 = 4 * 8).
pub struct SseF64Butterfly32<T> {
    bf8: SseF64Butterfly8<T>,
    // Precomputed twiddles for n = 32; the remaining twiddles are applied as
    // rotations via the nested bf4.rotate helper.
    twiddle1: __m128d,
    twiddle2: __m128d,
    twiddle3: __m128d,
    twiddle5: __m128d,
    twiddle6: __m128d,
    twiddle7: __m128d,
    twiddle9: __m128d,
    twiddle10: __m128d,
    twiddle14: __m128d,
    twiddle15: __m128d,
    twiddle18: __m128d,
    twiddle21: __m128d,
}
3120
// Generate the shared wrapper methods and the Fft/Length/Direction trait impls;
// direction is read from the bf4 nested inside bf8.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly32);
boilerplate_fft_sse_common_butterfly!(SseF64Butterfly32, 32, |this: &SseF64Butterfly32<_>| this
    .bf8
    .bf4
    .direction);
impl<T: FftNum> SseF64Butterfly32<T> {
    /// Creates a new 32-point butterfly; precomputes the twiddles that are not
    /// plain rotations. Panics (via assert_f64) if T is not f64.
    #[inline(always)]
    pub fn new(direction: FftDirection) -> Self {
        assert_f64::<T>();
        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 32, direction);
        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 32, direction);
        let tw3: Complex<f64> = twiddles::compute_twiddle(3, 32, direction);
        let tw5: Complex<f64> = twiddles::compute_twiddle(5, 32, direction);
        let tw6: Complex<f64> = twiddles::compute_twiddle(6, 32, direction);
        let tw7: Complex<f64> = twiddles::compute_twiddle(7, 32, direction);
        let tw9: Complex<f64> = twiddles::compute_twiddle(9, 32, direction);
        let tw10: Complex<f64> = twiddles::compute_twiddle(10, 32, direction);
        let tw14: Complex<f64> = twiddles::compute_twiddle(14, 32, direction);
        let tw15: Complex<f64> = twiddles::compute_twiddle(15, 32, direction);
        let tw18: Complex<f64> = twiddles::compute_twiddle(18, 32, direction);
        let tw21: Complex<f64> = twiddles::compute_twiddle(21, 32, direction);

        unsafe {
            Self {
                bf8: SseF64Butterfly8::new(direction),
                twiddle1: pack_64(tw1),
                twiddle2: pack_64(tw2),
                twiddle3: pack_64(tw3),
                twiddle5: pack_64(tw5),
                twiddle6: pack_64(tw6),
                twiddle7: pack_64(tw7),
                twiddle9: pack_64(tw9),
                twiddle10: pack_64(tw10),
                twiddle14: pack_64(tw14),
                twiddle15: pack_64(tw15),
                twiddle18: pack_64(tw18),
                twiddle21: pack_64(tw21),
            }
        }
    }

    /// Performs a single in-place 32-point FFT as an 8x4 mixed-radix
    /// decomposition: eight column FFTs of size 4 with twiddles, then four row
    /// FFTs of size 8.
    /// Safety: caller must ensure SSE4.1 is available and buffer has at least 32 elements.
    #[inline(always)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
        // Loads column i: four values strided by 8.
        let load = |i| {
            [
                buffer.load_complex(i),
                buffer.load_complex(i + 8),
                buffer.load_complex(i + 16),
                buffer.load_complex(i + 24),
            ]
        };

        // First stage: size-4 FFT per column, then twiddles. Twiddles that are
        // pure 45/90/135-degree rotations use the rotate helpers.
        let mut tmp1 = self.bf8.bf4.perform_fft_direct(load(1));
        tmp1[1] = SseVector::mul_complex(tmp1[1], self.twiddle1);
        tmp1[2] = SseVector::mul_complex(tmp1[2], self.twiddle2);
        tmp1[3] = SseVector::mul_complex(tmp1[3], self.twiddle3);

        let mut tmp2 = self.bf8.bf4.perform_fft_direct(load(2));
        tmp2[1] = SseVector::mul_complex(tmp2[1], self.twiddle2);
        tmp2[2] = self.bf8.bf4.rotate.rotate_45(tmp2[2]);
        tmp2[3] = SseVector::mul_complex(tmp2[3], self.twiddle6);

        let mut tmp3 = self.bf8.bf4.perform_fft_direct(load(3));
        tmp3[1] = SseVector::mul_complex(tmp3[1], self.twiddle3);
        tmp3[2] = SseVector::mul_complex(tmp3[2], self.twiddle6);
        tmp3[3] = SseVector::mul_complex(tmp3[3], self.twiddle9);

        let mut tmp5 = self.bf8.bf4.perform_fft_direct(load(5));
        tmp5[1] = SseVector::mul_complex(tmp5[1], self.twiddle5);
        tmp5[2] = SseVector::mul_complex(tmp5[2], self.twiddle10);
        tmp5[3] = SseVector::mul_complex(tmp5[3], self.twiddle15);

        let mut tmp6 = self.bf8.bf4.perform_fft_direct(load(6));
        tmp6[1] = SseVector::mul_complex(tmp6[1], self.twiddle6);
        tmp6[2] = self.bf8.bf4.rotate.rotate_135(tmp6[2]);
        tmp6[3] = SseVector::mul_complex(tmp6[3], self.twiddle18);

        let mut tmp7 = self.bf8.bf4.perform_fft_direct(load(7));
        tmp7[1] = SseVector::mul_complex(tmp7[1], self.twiddle7);
        tmp7[2] = SseVector::mul_complex(tmp7[2], self.twiddle14);
        tmp7[3] = SseVector::mul_complex(tmp7[3], self.twiddle21);

        // Column 4: all twiddles are pure rotations.
        let mut tmp4 = self.bf8.bf4.perform_fft_direct(load(4));
        tmp4[1] = self.bf8.bf4.rotate.rotate_45(tmp4[1]);
        tmp4[2] = self.bf8.bf4.rotate.rotate(tmp4[2]);
        tmp4[3] = self.bf8.bf4.rotate.rotate_135(tmp4[3]);

        // Column 0 needs no twiddles.
        let tmp0 = self.bf8.bf4.perform_fft_direct(load(0));

        // Stores row i's eight outputs, strided by 4.
        let mut store = |i, vectors: [__m128d; 8]| {
            buffer.store_complex(vectors[0], i);
            buffer.store_complex(vectors[1], i + 4);
            buffer.store_complex(vectors[2], i + 8);
            buffer.store_complex(vectors[3], i + 12);
            buffer.store_complex(vectors[4], i + 16);
            buffer.store_complex(vectors[5], i + 20);
            buffer.store_complex(vectors[6], i + 24);
            buffer.store_complex(vectors[7], i + 28);
        };

        // Second stage: size-8 FFTs across the rows.
        let out0 = self.bf8.perform_fft_direct([
            tmp0[0], tmp1[0], tmp2[0], tmp3[0], tmp4[0], tmp5[0], tmp6[0], tmp7[0],
        ]);
        store(0, out0);

        let out1 = self.bf8.perform_fft_direct([
            tmp0[1], tmp1[1], tmp2[1], tmp3[1], tmp4[1], tmp5[1], tmp6[1], tmp7[1],
        ]);
        store(1, out1);

        let out2 = self.bf8.perform_fft_direct([
            tmp0[2], tmp1[2], tmp2[2], tmp3[2], tmp4[2], tmp5[2], tmp6[2], tmp7[2],
        ]);
        store(2, out2);

        let out3 = self.bf8.perform_fft_direct([
            tmp0[3], tmp1[3], tmp2[3], tmp3[3], tmp4[3], tmp5[3], tmp6[3], tmp7[3],
        ]);
        store(3, out3);
    }
}
3253
#[cfg(test)]
mod unit_tests {
    use super::*;
    use crate::{
        algorithm::Dft,
        test_utils::{check_fft_algorithm, compare_vectors},
    };

    // Check an f32 SSE butterfly against the generic algorithm checker,
    // once per transform direction.
    macro_rules! test_butterfly_32_func {
        ($test_name:ident, $struct_name:ident, $size:expr) => {
            #[test]
            fn $test_name() {
                let fft_forward = $struct_name::new(FftDirection::Forward);
                check_fft_algorithm::<f32>(&fft_forward, $size, FftDirection::Forward);

                let fft_inverse = $struct_name::new(FftDirection::Inverse);
                check_fft_algorithm::<f32>(&fft_inverse, $size, FftDirection::Inverse);
            }
        };
    }
    test_butterfly_32_func!(test_ssef32_butterfly1, SseF32Butterfly1, 1);
    test_butterfly_32_func!(test_ssef32_butterfly2, SseF32Butterfly2, 2);
    test_butterfly_32_func!(test_ssef32_butterfly3, SseF32Butterfly3, 3);
    test_butterfly_32_func!(test_ssef32_butterfly4, SseF32Butterfly4, 4);
    test_butterfly_32_func!(test_ssef32_butterfly5, SseF32Butterfly5, 5);
    test_butterfly_32_func!(test_ssef32_butterfly6, SseF32Butterfly6, 6);
    test_butterfly_32_func!(test_ssef32_butterfly8, SseF32Butterfly8, 8);
    test_butterfly_32_func!(test_ssef32_butterfly9, SseF32Butterfly9, 9);
    test_butterfly_32_func!(test_ssef32_butterfly10, SseF32Butterfly10, 10);
    test_butterfly_32_func!(test_ssef32_butterfly12, SseF32Butterfly12, 12);
    test_butterfly_32_func!(test_ssef32_butterfly15, SseF32Butterfly15, 15);
    test_butterfly_32_func!(test_ssef32_butterfly16, SseF32Butterfly16, 16);
    test_butterfly_32_func!(test_ssef32_butterfly24, SseF32Butterfly24, 24);
    test_butterfly_32_func!(test_ssef32_butterfly32, SseF32Butterfly32, 32);

    // Same as above, for the f64 SSE butterflies.
    macro_rules! test_butterfly_64_func {
        ($test_name:ident, $struct_name:ident, $size:expr) => {
            #[test]
            fn $test_name() {
                let fft_forward = $struct_name::new(FftDirection::Forward);
                check_fft_algorithm::<f64>(&fft_forward, $size, FftDirection::Forward);

                let fft_inverse = $struct_name::new(FftDirection::Inverse);
                check_fft_algorithm::<f64>(&fft_inverse, $size, FftDirection::Inverse);
            }
        };
    }
    test_butterfly_64_func!(test_ssef64_butterfly1, SseF64Butterfly1, 1);
    test_butterfly_64_func!(test_ssef64_butterfly2, SseF64Butterfly2, 2);
    test_butterfly_64_func!(test_ssef64_butterfly3, SseF64Butterfly3, 3);
    test_butterfly_64_func!(test_ssef64_butterfly4, SseF64Butterfly4, 4);
    test_butterfly_64_func!(test_ssef64_butterfly5, SseF64Butterfly5, 5);
    test_butterfly_64_func!(test_ssef64_butterfly6, SseF64Butterfly6, 6);
    test_butterfly_64_func!(test_ssef64_butterfly8, SseF64Butterfly8, 8);
    test_butterfly_64_func!(test_ssef64_butterfly9, SseF64Butterfly9, 9);
    test_butterfly_64_func!(test_ssef64_butterfly10, SseF64Butterfly10, 10);
    test_butterfly_64_func!(test_ssef64_butterfly12, SseF64Butterfly12, 12);
    test_butterfly_64_func!(test_ssef64_butterfly15, SseF64Butterfly15, 15);
    test_butterfly_64_func!(test_ssef64_butterfly16, SseF64Butterfly16, 16);
    test_butterfly_64_func!(test_ssef64_butterfly24, SseF64Butterfly24, 24);
    test_butterfly_64_func!(test_ssef64_butterfly32, SseF64Butterfly32, 32);

    // Two interleaved length-4 signals through the parallel f32 butterfly
    // must match the scalar Dft applied to each signal separately.
    #[test]
    fn test_parallel_fft4_32() {
        unsafe {
            let vals_a = [
                Complex::<f32>::new(1.0, 2.5),
                Complex::<f32>::new(3.2, 4.2),
                Complex::<f32>::new(5.6, 6.2),
                Complex::<f32>::new(7.4, 8.3),
            ];
            let vals_b = [
                Complex::<f32>::new(6.0, 24.5),
                Complex::<f32>::new(4.2, 34.2),
                Complex::<f32>::new(9.6, 61.2),
                Complex::<f32>::new(17.4, 81.3),
            ];

            // Each SSE vector carries one complex value from each signal:
            // signal A in the low half, signal B in the high half.
            let packed = [
                pack_32(vals_a[0], vals_b[0]),
                pack_32(vals_a[1], vals_b[1]),
                pack_32(vals_a[2], vals_b[2]),
                pack_32(vals_a[3], vals_b[3]),
            ];

            // Reference results from the scalar Dft.
            let mut expected_a = vals_a.to_vec();
            let mut expected_b = vals_b.to_vec();
            let dft = Dft::new(4, FftDirection::Forward);
            dft.process(&mut expected_a);
            dft.process(&mut expected_b);

            let bf4 = SseF32Butterfly4::<f32>::new(FftDirection::Forward);
            let res_both = bf4.perform_parallel_fft_direct(packed);

            // De-interleave: even slots are signal A, odd slots are signal B.
            let flat = std::mem::transmute::<[__m128; 4], [Complex<f32>; 8]>(res_both);
            let sse_res_a = [flat[0], flat[2], flat[4], flat[6]];
            let sse_res_b = [flat[1], flat[3], flat[5], flat[7]];
            assert!(compare_vectors(&expected_a, &sse_res_a));
            assert!(compare_vectors(&expected_b, &sse_res_b));
        }
    }

    // The lo/lo and hi/hi extraction helpers must pick the expected complex
    // halves out of a pair of f32 vectors.
    #[test]
    fn test_pack() {
        unsafe {
            let lower = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
            let upper = _mm_set_ps(8.0, 7.0, 6.0, 5.0);
            let first =
                std::mem::transmute::<__m128, [Complex<f32>; 2]>(extract_lo_lo_f32(lower, upper));
            let second =
                std::mem::transmute::<__m128, [Complex<f32>; 2]>(extract_hi_hi_f32(lower, upper));
            assert_eq!(first, [Complex::new(1.0, 2.0), Complex::new(5.0, 6.0)]);
            assert_eq!(second, [Complex::new(3.0, 4.0), Complex::new(7.0, 8.0)]);
        }
    }
}