// rustfft/avx/avx_planner.rs — AVX-accelerated FFT planner.

1use std::sync::Arc;
2use std::{any::TypeId, cmp::min};
3
4use primal_check::miller_rabin;
5
6use crate::algorithm::*;
7use crate::common::FftNum;
8use crate::math_utils::PartialFactors;
9use crate::Fft;
10use crate::{algorithm::butterflies::*, fft_cache::FftCache};
11
12use super::*;
13
14fn wrap_fft<T: FftNum>(butterfly: impl Fft<T> + 'static) -> Arc<dyn Fft<T>> {
15    Arc::new(butterfly) as Arc<dyn Fft<T>>
16}
17
/// Which algorithm a mixed-radix plan bottoms out in. Everything above the
/// base is a stack of MixedRadix*xn steps (see `MixedRadixPlan`).
#[derive(Debug)]
enum MixedRadixBase {
    // The base will be a butterfly algorithm
    ButterflyBase(usize),

    // The base will be an instance of Rader's Algorithm. That will require its own plan for the internal FFT, which we'll handle separately
    RadersBase(usize),

    // The base will be an instance of Bluestein's Algorithm. That will require its own plan for the internal FFT, which we'll handle separately.
    // First usize is the base length, second usize is the inner FFT length
    BluesteinsBase(usize, usize),

    // The "base" is an FFT instance we already have cached
    CacheBase(usize),
}
33impl MixedRadixBase {
34    fn base_len(&self) -> usize {
35        match self {
36            Self::ButterflyBase(len) => *len,
37            Self::RadersBase(len) => *len,
38            Self::BluesteinsBase(len, _) => *len,
39            Self::CacheBase(len) => *len,
40        }
41    }
42}
43
/// Represents an FFT plan, stored as a base FFT and a stack of MixedRadix*xn steps on top of it.
#[derive(Debug)]
pub struct MixedRadixPlan {
    len: usize,       // product of base and radixes
    radixes: Vec<u8>, // stored from innermost to outermost
    base: MixedRadixBase,
}
51impl MixedRadixPlan {
52    fn new(base: MixedRadixBase, radixes: Vec<u8>) -> Self {
53        Self {
54            len: base.base_len() * radixes.iter().map(|r| *r as usize).product::<usize>(),
55            base,
56            radixes,
57        }
58    }
59    fn cached(cached_len: usize) -> Self {
60        Self {
61            len: cached_len,
62            base: MixedRadixBase::CacheBase(cached_len),
63            radixes: Vec::new(),
64        }
65    }
66    fn butterfly(butterfly_len: usize, radixes: Vec<u8>) -> Self {
67        Self::new(MixedRadixBase::ButterflyBase(butterfly_len), radixes)
68    }
69    fn push_radix(&mut self, radix: u8) {
70        self.radixes.push(radix);
71        self.len *= radix as usize;
72    }
73    fn push_radix_power(&mut self, radix: u8, power: u32) {
74        self.radixes
75            .extend(std::iter::repeat(radix).take(power as usize));
76        self.len *= (radix as usize).pow(power);
77    }
78}
79
/// The AVX FFT planner creates new FFT algorithm instances which take advantage of the AVX instruction set.
///
/// Creating an instance of `FftPlannerAvx` requires the `avx` and `fma` instructions to be available on the current machine, and it requires RustFFT's
///  `avx` feature flag to be set. A few algorithms will use `avx2` if it's available, but it isn't required.
///
/// For the time being, AVX acceleration is black box, and AVX accelerated algorithms are not available without a planner. This may change in the future.
///
/// ~~~
/// // Perform a forward Fft of size 1234, accelerated by AVX
/// use std::sync::Arc;
/// use rustfft::{FftPlannerAvx, num_complex::Complex};
///
/// // If FftPlannerAvx::new() returns Ok(), we'll know AVX algorithms are available
/// // on this machine, and that RustFFT was compiled with the `avx` feature flag
/// if let Ok(mut planner) = FftPlannerAvx::new() {
///     let fft = planner.plan_fft_forward(1234);
///
///     let mut buffer = vec![Complex{ re: 0.0f32, im: 0.0f32 }; 1234];
///     fft.process(&mut buffer);
///
///     // The FFT instance returned by the planner has the type `Arc<dyn Fft<T>>`,
///     // where T is the numeric type, ie f32 or f64, so it's cheap to clone
///     let fft_clone = Arc::clone(&fft);
/// }
/// ~~~
///
/// If you plan on creating multiple FFT instances, it is recommended to re-use the same planner for all of them. This
/// is because the planner re-uses internal data across FFT instances wherever possible, saving memory and reducing
/// setup time. (FFT instances created with one planner will never re-use data and buffers with FFT instances created
/// by a different planner)
///
/// Each FFT instance owns [`Arc`s](std::sync::Arc) to its internal data, rather than borrowing it from the planner, so it's perfectly
/// safe to drop the planner after creating Fft instances.
pub struct FftPlannerAvx<T: FftNum> {
    // Type-erased internals: either the f32 or the f64 specialization, chosen in `new()`.
    internal_planner: Box<dyn AvxPlannerInternalAPI<T>>,
}
impl<T: FftNum> FftPlannerAvx<T> {
    /// Constructs a new `FftPlannerAvx` instance.
    ///
    /// Returns `Ok(planner_instance)` if we're compiling for X86_64, AVX support was enabled in feature flags, and the current CPU supports the `avx` and `fma` CPU features.
    /// Returns `Err(())` if AVX support is not available.
    pub fn new() -> Result<Self, ()> {
        // Eventually we might make AVX algorithms that don't also require FMA.
        // If that happens, we could check for just AVX here — seems like a pretty low-priority addition.
        let has_avx = is_x86_feature_detected!("avx");
        let has_fma = is_x86_feature_detected!("fma");
        if has_avx && has_fma {
            // Ideally, we would implement the planner with specialization.
            // Specialization won't be on stable rust for a long time though, so in the meantime, we can hack around it.
            //
            // The first step of the hack is to use TypeId to determine if T is f32, f64, or neither. If neither, we don't want to do any AVX acceleration
            // If it's f32 or f64, then construct an internal type that has two generic parameters, one bounded on AvxNum, the other bounded on FftNum
            //
            // - A is bounded on the AvxNum trait, and is the type we use for any AVX computations. It has associated types for AVX vectors,
            //      associated constants for the number of elements per vector, etc.
            // - T is bounded on the FftNum trait, and thus is the type that every FFT algorithm will receive its input/output buffers in.
            //
            // An important snag relevant to the planner is that we have to box and type-erase the AvxNum bound,
            // since the only other option is making the AvxNum bound a part of this struct's external API
            //
            // Another annoying snag with this setup is that we frequently have to transmute buffers from &mut [Complex<T>] to &mut [Complex<A>] or vice versa.
            // We know this is safe because we assert everywhere that Type(A)==Type(T), so it's just a matter of "doing it right" every time.
            // These transmutes are required because the FFT algorithm's input will come through the FFT trait, which may only be bounded by FftNum.
            // So the buffers will have the type &mut [Complex<T>]. The problem comes in that all of our AVX computation tools are on the AvxNum trait.
            //
            // If we had specialization, we could easily convince the compiler that AvxNum and FftNum were different bounds on the same underlying type (IE f32 or f64)
            // but without it, the compiler is convinced that they are different. So we use the transmute as a last-resort way to overcome this limitation.
            //
            // We keep both the A and T types around in all of our AVX-related structs so that we can cast between A and T whenever necessary.
            let id_f32 = TypeId::of::<f32>();
            let id_f64 = TypeId::of::<f64>();
            let id_t = TypeId::of::<T>();

            if id_t == id_f32 {
                return Ok(Self {
                    internal_planner: Box::new(AvxPlannerInternal::<f32, T>::new()),
                });
            } else if id_t == id_f64 {
                return Ok(Self {
                    internal_planner: Box::new(AvxPlannerInternal::<f64, T>::new()),
                });
            }
        }
        Err(())
    }

    /// Returns a `Fft` instance which uses AVX instructions to compute FFTs of size `len`.
    ///
    /// If the provided `direction` is `FftDirection::Forward`, the returned instance will compute forward FFTs. If it's `FftDirection::Inverse`, it will compute inverse FFTs.
    ///
    /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time.
    pub fn plan_fft(&mut self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
        self.internal_planner.plan_and_construct_fft(len, direction)
    }
    /// Returns a `Fft` instance which uses AVX instructions to compute forward FFTs of size `len`.
    ///
    /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time.
    pub fn plan_fft_forward(&mut self, len: usize) -> Arc<dyn Fft<T>> {
        self.plan_fft(len, FftDirection::Forward)
    }
    /// Returns a `Fft` instance which uses AVX instructions to compute inverse FFTs of size `len`.
    ///
    /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time.
    pub fn plan_fft_inverse(&mut self, len: usize) -> Arc<dyn Fft<T>> {
        self.plan_fft(len, FftDirection::Inverse)
    }

    /// Returns a FFT plan without constructing it
    #[allow(unused)]
    pub(crate) fn debug_plan_fft(&self, len: usize, direction: FftDirection) -> MixedRadixPlan {
        self.internal_planner.debug_plan_fft(len, direction)
    }
}
193
/// Type-erased internal planner API, so `FftPlannerAvx<T>` can hold either the
/// f32- or the f64-specialized implementation behind a single `Box`.
trait AvxPlannerInternalAPI<T: FftNum>: Send + Sync {
    /// Plans and constructs an FFT instance for `len`/`direction`, re-using cached instances where possible.
    fn plan_and_construct_fft(&mut self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>>;
    /// Returns the plan that would be used for `len`/`direction` without constructing it (debug/testing aid).
    fn debug_plan_fft(&self, len: usize, direction: FftDirection) -> MixedRadixPlan;
}
198
/// Planner internals, generic over the AVX computation type `A` and the externally
/// visible sample type `T`. The two are always the same underlying type (f32 or f64);
/// they are only kept separate to work around the lack of specialization.
struct AvxPlannerInternal<A: AvxNum, T: FftNum> {
    // Cache of already-constructed FFT instances, shared across plan_and_construct_fft calls.
    cache: FftCache<T>,
    // `A` appears in no field, so tie it to the struct here.
    _phantom: std::marker::PhantomData<A>,
}
203
204impl<T: FftNum> AvxPlannerInternalAPI<T> for AvxPlannerInternal<f32, T> {
205    fn plan_and_construct_fft(&mut self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
206        // Step 1: Create a plan for this FFT length.
207        let plan = self.plan_fft(len, direction, Self::plan_mixed_radix_base);
208
209        // Step 2: Construct the plan. If the base is rader's algorithm or bluestein's algorithm, this may call self.plan_and_construct_fft recursively!
210        self.construct_plan(
211            plan,
212            direction,
213            Self::construct_butterfly,
214            Self::plan_and_construct_fft,
215        )
216    }
217    fn debug_plan_fft(&self, len: usize, direction: FftDirection) -> MixedRadixPlan {
218        self.plan_fft(len, direction, Self::plan_mixed_radix_base)
219    }
220}
221impl<T: FftNum> AvxPlannerInternalAPI<T> for AvxPlannerInternal<f64, T> {
222    fn plan_and_construct_fft(&mut self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
223        // Step 1: Create a plan for this FFT length.
224        let plan = self.plan_fft(len, direction, Self::plan_mixed_radix_base);
225
226        // Step 2: Construct the plan. If the base is rader's algorithm or bluestein's algorithm, this may call self.plan_and_construct_fft recursively!
227        self.construct_plan(
228            plan,
229            direction,
230            Self::construct_butterfly,
231            Self::plan_and_construct_fft,
232        )
233    }
234    fn debug_plan_fft(&self, len: usize, direction: FftDirection) -> MixedRadixPlan {
235        self.plan_fft(len, direction, Self::plan_mixed_radix_base)
236    }
237}
238
//-------------------------------------------------------------------
// f32-specific planning stuff
//-------------------------------------------------------------------
impl<T: FftNum> AvxPlannerInternal<f32, T> {
    /// Creates the f32-specialized planner internals with an empty FFT cache.
    ///
    /// # Panics
    /// Panics if `T` is not `f32`.
    pub fn new() -> Self {
        // Internal sanity check: Make sure that T == f32.
        // This struct has two generic parameters A and T, but they must always be the same, and are only kept separate to help work around the lack of specialization.
        // It would be cool if we could do this as a static_assert instead
        let id_f32 = TypeId::of::<f32>();
        let id_t = TypeId::of::<T>();
        assert_eq!(id_f32, id_t);

        Self {
            cache: FftCache::new(),
            _phantom: std::marker::PhantomData,
        }
    }

    /// Chooses the base FFT for a plan of size `len`, given its partial factorization.
    /// The caller stacks mixed-radix steps on top of whatever base this returns.
    /// The specific butterfly sizes below are benchmark-derived heuristics for f32.
    fn plan_mixed_radix_base(&self, len: usize, factors: &PartialFactors) -> MixedRadixPlan {
        // if we have non-fast-path factors, use them as our base FFT length, and we will have to use either rader's algorithm or bluestein's algorithm as our base
        if factors.get_other_factors() > 1 {
            let other_factors = factors.get_other_factors();

            // First, if the "other factors" are a butterfly, use that as the butterfly
            if self.is_butterfly(other_factors) {
                return MixedRadixPlan::butterfly(other_factors, vec![]);
            }

            // We can only use rader's if `other_factors` is prime
            if miller_rabin(other_factors as u64) {
                // len is prime, so we can use Rader's Algorithm as a base. Whether or not that's a good idea is a different story
                // Rader's Algorithm is only faster in a few narrow cases.
                // as a heuristic, only use rader's algorithm if its inner FFT can be computed entirely without bluestein's or rader's
                // We're intentionally being too conservative here. Otherwise we'd be recursively applying a heuristic, and repeated heuristic failures could stack to make a rader's chain significantly slower.
                // If we were writing a measuring planner, expanding this heuristic and measuring its effectiveness would be an opportunity for up to 2x performance gains.
                let inner_factors = PartialFactors::compute(other_factors - 1);
                if self.is_butterfly(inner_factors.get_other_factors()) {
                    // We only have factors of 2,3,5,7, and 11. If we don't have AVX2, we also have to exclude factors of 5, 7, and 11 — avx2 gives us enough headroom for the overhead of those to not be a problem
                    if is_x86_feature_detected!("avx2")
                        || (inner_factors.product_power2power3() == len - 1)
                    {
                        return MixedRadixPlan::new(
                            MixedRadixBase::RadersBase(other_factors),
                            vec![],
                        );
                    }
                }
            }

            // At this point, we know we're using bluestein's algorithm for the base. Next step is to plan the inner size we'll use for bluestein's algorithm.
            let inner_bluesteins_len =
                self.plan_bluesteins(other_factors, |(_len, factor2, factor3)| {
                    if *factor2 > 16 && *factor3 < 3 {
                        // surprisingly, pure powers of 2 have a pretty steep dropoff in speed after 65536.
                        // the algorithm is designed to generate candidates larger than baseline_candidate, so if we hit a large power of 2, there should be more after it that we can skip to
                        return false;
                    }
                    true
                });
            return MixedRadixPlan::new(
                MixedRadixBase::BluesteinsBase(other_factors, inner_bluesteins_len),
                vec![],
            );
        }

        // If this FFT size is a butterfly, use that
        if self.is_butterfly(len) {
            return MixedRadixPlan::butterfly(len, vec![]);
        }

        // If the power2 * power3 component of this FFT is a butterfly and not too small, return that
        let power2power3 = factors.product_power2power3();
        if power2power3 > 4 && self.is_butterfly(power2power3) {
            return MixedRadixPlan::butterfly(power2power3, vec![]);
        }

        // most of this code is heuristics assuming FFTs of a minimum size. if the FFT is below that minimum size, the heuristics break down.
        // so the first thing we're going to do is hardcode the plan for some specific sizes where we know the heuristics won't be enough
        let hardcoded_base = match power2power3 {
            // 3 * 2^n special cases
            96 => Some(MixedRadixPlan::butterfly(32, vec![3])), // 2^5 * 3
            192 => Some(MixedRadixPlan::butterfly(48, vec![4])), // 2^6 * 3
            1536 => Some(MixedRadixPlan::butterfly(48, vec![8, 4])), // 2^8 * 3

            // 9 * 2^n special cases
            18 => Some(MixedRadixPlan::butterfly(3, vec![6])), // 2 * 3^2
            144 => Some(MixedRadixPlan::butterfly(36, vec![4])), // 2^4 * 3^2

            _ => None,
        };
        if let Some(hardcoded) = hardcoded_base {
            return hardcoded;
        }

        if factors.get_power2() >= 5 {
            match factors.get_power3() {
                // if this FFT is a power of 2, our strategy here is to tweak the butterfly to free us up to do an 8xn chain
                0 => match factors.get_power2() % 3 {
                    0 => MixedRadixPlan::butterfly(512, vec![]),
                    1 => MixedRadixPlan::butterfly(256, vec![]),
                    2 => MixedRadixPlan::butterfly(256, vec![]),
                    _ => unreachable!(),
                },
                // if this FFT is 3 times a power of 2, our strategy here is to tweak butterflies to make it easier to set up a 8xn chain
                1 => match factors.get_power2() % 3 {
                    0 => MixedRadixPlan::butterfly(64, vec![12, 16]),
                    1 => MixedRadixPlan::butterfly(48, vec![]),
                    2 => MixedRadixPlan::butterfly(64, vec![]),
                    _ => unreachable!(),
                },
                // if this FFT is 9 or greater times a power of 2, just use 72. As you might expect, in this vast field of options, what is optimal becomes a lot more muddy and situational
                // but across all the benchmarking i've done, 72 seems like the best default that will get us the best plan in 95% of the cases
                // 64, 54, and 48 are occasionally faster, although i haven't been able to discern a pattern.
                _ => MixedRadixPlan::butterfly(72, vec![]),
            }
        } else if factors.get_power3() >= 3 {
            // Our FFT is a power of 3 times a low power of 2. A high level summary of our strategy is that we want to pick a base that will
            // A: consume all factors of 2, and B: leave us with an even power of 3, so that we can do a 9xn chain.
            match factors.get_power2() {
                0 => MixedRadixPlan::butterfly(27, vec![]),
                1 => MixedRadixPlan::butterfly(54, vec![]),
                2 => match factors.get_power3() % 2 {
                    0 => MixedRadixPlan::butterfly(36, vec![]),
                    1 => MixedRadixPlan::butterfly(if len < 1000 { 36 } else { 12 }, vec![]),
                    _ => unreachable!(),
                },
                3 => match factors.get_power3() % 2 {
                    0 => MixedRadixPlan::butterfly(72, vec![]),
                    1 => MixedRadixPlan::butterfly(
                        if factors.get_power3() > 7 { 24 } else { 72 },
                        vec![],
                    ),
                    _ => unreachable!(),
                },
                4 => match factors.get_power3() % 2 {
                    0 => MixedRadixPlan::butterfly(
                        if factors.get_power3() > 6 { 16 } else { 72 },
                        vec![],
                    ),
                    1 => MixedRadixPlan::butterfly(
                        if factors.get_power3() > 9 { 48 } else { 72 },
                        vec![],
                    ),
                    _ => unreachable!(),
                },
                // if this FFT is 32 or greater times a power of 3, just use 72. As you might expect, in this vast field of options, what is optimal becomes a lot more muddy and situational
                // but across all the benchmarking i've done, 72 seems like the best default that will get us the best plan in 95% of the cases
                // 64, 54, and 48 are occasionally faster, although i haven't been able to discern a pattern.
                _ => MixedRadixPlan::butterfly(72, vec![]),
            }
        }
        // If this FFT has powers of 11, 7, or 5, use that
        else if factors.get_power11() > 0 {
            MixedRadixPlan::butterfly(11, vec![])
        } else if factors.get_power7() > 0 {
            MixedRadixPlan::butterfly(7, vec![])
        } else if factors.get_power5() > 0 {
            MixedRadixPlan::butterfly(5, vec![])
        } else {
            // Unreachable in practice: every len > 1 either has "other factors" or a factor of 2/3/5/7/11 handled above.
            panic!(
                "Couldn't find a base for FFT size {}, factors={:?}",
                len, factors
            )
        }
    }

    /// Returns true if we have a hardcoded (mostly AVX-accelerated) butterfly for this length.
    fn is_butterfly(&self, len: usize) -> bool {
        [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 19, 23, 24, 27, 29, 31, 32, 36, 48,
            54, 64, 72, 128, 256, 512,
        ]
        .contains(&len)
    }

    /// Constructs the butterfly instance for `len`; panics for any length not in `is_butterfly`.
    // NOTE(review): the `.unwrap()`s look safe because the `*Avx` constructors presumably only
    // fail when AVX is unavailable, which the planner verified at construction — confirm against
    // the butterfly implementations.
    fn construct_butterfly(&self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
        match len {
            0 | 1 => wrap_fft(Dft::new(len, direction)),
            2 => wrap_fft(Butterfly2::new(direction)),
            3 => wrap_fft(Butterfly3::new(direction)),
            4 => wrap_fft(Butterfly4::new(direction)),
            5 => wrap_fft(Butterfly5Avx::new(direction).unwrap()),
            6 => wrap_fft(Butterfly6::new(direction)),
            7 => wrap_fft(Butterfly7Avx::new(direction).unwrap()),
            8 => wrap_fft(Butterfly8Avx::new(direction).unwrap()),
            9 => wrap_fft(Butterfly9Avx::new(direction).unwrap()),
            11 => wrap_fft(Butterfly11Avx::new(direction).unwrap()),
            12 => wrap_fft(Butterfly12Avx::new(direction).unwrap()),
            13 => wrap_fft(Butterfly13::new(direction)),
            16 => wrap_fft(Butterfly16Avx::new(direction).unwrap()),
            17 => wrap_fft(Butterfly17::new(direction)),
            19 => wrap_fft(Butterfly19::new(direction)),
            23 => wrap_fft(Butterfly23::new(direction)),
            24 => wrap_fft(Butterfly24Avx::new(direction).unwrap()),
            27 => wrap_fft(Butterfly27Avx::new(direction).unwrap()),
            29 => wrap_fft(Butterfly29::new(direction)),
            31 => wrap_fft(Butterfly31::new(direction)),
            32 => wrap_fft(Butterfly32Avx::new(direction).unwrap()),
            36 => wrap_fft(Butterfly36Avx::new(direction).unwrap()),
            48 => wrap_fft(Butterfly48Avx::new(direction).unwrap()),
            54 => wrap_fft(Butterfly54Avx::new(direction).unwrap()),
            64 => wrap_fft(Butterfly64Avx::new(direction).unwrap()),
            72 => wrap_fft(Butterfly72Avx::new(direction).unwrap()),
            128 => wrap_fft(Butterfly128Avx::new(direction).unwrap()),
            256 => wrap_fft(Butterfly256Avx::new(direction).unwrap()),
            512 => wrap_fft(Butterfly512Avx::new(direction).unwrap()),
            _ => panic!("Invalid butterfly len: {}", len),
        }
    }
}
448
//-------------------------------------------------------------------
// f64-specific planning stuff
//-------------------------------------------------------------------
452impl<T: FftNum> AvxPlannerInternal<f64, T> {
453    pub fn new() -> Self {
454        // Internal sanity check: Make sure that T == f64.
455        // This struct has two generic parameters A and T, but they must always be the same, and are only kept separate to help work around the lack of specialization.
456        // It would be cool if we could do this as a static_assert instead
457        let id_f64 = TypeId::of::<f64>();
458        let id_t = TypeId::of::<T>();
459        assert_eq!(id_f64, id_t);
460
461        Self {
462            cache: FftCache::new(),
463            _phantom: std::marker::PhantomData,
464        }
465    }
466
467    fn plan_mixed_radix_base(&self, len: usize, factors: &PartialFactors) -> MixedRadixPlan {
468        // if we have a factor that can't be computed with 2xn 3xn etc, we'll have to compute it with bluestein's or rader's, so use that as the base
469        if factors.get_other_factors() > 1 {
470            let other_factors = factors.get_other_factors();
471
472            // First, if the "other factors" are a butterfly, use that as the butterfly
473            if self.is_butterfly(other_factors) {
474                return MixedRadixPlan::butterfly(other_factors, vec![]);
475            }
476
477            // We can only use rader's if `other_factors` is prime
478            if miller_rabin(other_factors as u64) {
479                // len is prime, so we can use Rader's Algorithm as a base. Whether or not that's a good idea is a different story
480                // Rader's Algorithm is only faster in a few narrow cases.
481                // as a heuristic, only use rader's algorithm if its inner FFT can be computed entirely without bluestein's or rader's
482                // We're intentionally being too conservative here. Otherwise we'd be recursively applying a heuristic, and repeated heuristic failures could stack to make a rader's chain significantly slower.
483                // If we were writing a measuring planner, expanding this heuristic and measuring its effectiveness would be an opportunity for up to 2x performance gains.
484                let inner_factors = PartialFactors::compute(other_factors - 1);
485                if self.is_butterfly(inner_factors.get_other_factors()) {
486                    // We only have factors of 2,3,5,7, and 11. If we don't have AVX2, we also have to exclude factors of 5 and 7 and 11, because avx2 gives us enough headroom for the overhead of those to not be a problem
487                    if is_x86_feature_detected!("avx2")
488                        || (inner_factors.product_power2power3() == len - 1)
489                    {
490                        return MixedRadixPlan::new(
491                            MixedRadixBase::RadersBase(other_factors),
492                            vec![],
493                        );
494                    }
495                }
496            }
497
498            // At this point, we know we're using bluestein's algorithm for the base. Next step is to plan the inner size we'll use for bluestein's algorithm.
499            let inner_bluesteins_len =
500                self.plan_bluesteins(other_factors, |(_len, factor2, factor3)| {
501                    if *factor3 < 1 && *factor2 > 13 {
502                        return false;
503                    }
504                    if *factor3 < 4 && *factor2 > 14 {
505                        return false;
506                    }
507                    true
508                });
509            return MixedRadixPlan::new(
510                MixedRadixBase::BluesteinsBase(other_factors, inner_bluesteins_len),
511                vec![],
512            );
513        }
514
515        // If this FFT size is a butterfly, use that
516        if self.is_butterfly(len) {
517            return MixedRadixPlan::butterfly(len, vec![]);
518        }
519
520        // If the power2 * power3 component of this FFT is a butterfly and not too small, return that
521        let power2power3 = factors.product_power2power3();
522        if power2power3 > 4 && self.is_butterfly(power2power3) {
523            return MixedRadixPlan::butterfly(power2power3, vec![]);
524        }
525
526        // most of this code is heuristics assuming FFTs of a minimum size. if the FFT is below that minimum size, the heuristics break down.
527        // so the first thing we're going to do is hardcode the plan for osme specific sizes where we know the heuristics won't be enough
528        let hardcoded_base = match power2power3 {
529            // 2^n special cases
530            64 => Some(MixedRadixPlan::butterfly(16, vec![4])), // 2^6
531
532            // 3 * 2^n special cases
533            48 => Some(MixedRadixPlan::butterfly(12, vec![4])), // 3 * 2^4
534            96 => Some(MixedRadixPlan::butterfly(12, vec![8])), // 3 * 2^5
535            768 => Some(MixedRadixPlan::butterfly(12, vec![8, 8])), // 3 * 2^8
536
537            // 9 * 2^n special cases
538            72 => Some(MixedRadixPlan::butterfly(24, vec![3])), // 2^3 * 3^2
539            288 => Some(MixedRadixPlan::butterfly(32, vec![9])), // 2^5 * 3^2
540
541            // 4 * 3^n special cases
542            108 => Some(MixedRadixPlan::butterfly(18, vec![6])), // 2^4 * 3^2
543            _ => None,
544        };
545        if let Some(hardcoded) = hardcoded_base {
546            return hardcoded;
547        }
548
549        if factors.get_power2() >= 4 {
550            match factors.get_power3() {
551                // if this FFT is a power of 2, our strategy here is to tweak the butterfly to free us up to do an 8xn chain
552                0 => match factors.get_power2() % 3 {
553                    0 => MixedRadixPlan::butterfly(512, vec![]),
554                    1 => MixedRadixPlan::butterfly(128, vec![]),
555                    2 => MixedRadixPlan::butterfly(256, vec![]),
556                    _ => unreachable!(),
557                },
558                // if this FFT is 3 times a power of 2, our strategy here is to tweak butterflies to make it easier to set up a 8xn chain
559                1 => match factors.get_power2() % 3 {
560                    0 => MixedRadixPlan::butterfly(24, vec![]),
561                    1 => MixedRadixPlan::butterfly(32, vec![12]),
562                    2 => MixedRadixPlan::butterfly(32, vec![12, 16]),
563                    _ => unreachable!(),
564                },
565                // if this FFT is 9 times a power of 2, our strategy here is to tweak butterflies to make it easier to set up a 8xn chain
566                2 => match factors.get_power2() % 3 {
567                    0 => MixedRadixPlan::butterfly(36, vec![16]),
568                    1 => MixedRadixPlan::butterfly(36, vec![]),
569                    2 => MixedRadixPlan::butterfly(18, vec![]),
570                    _ => unreachable!(),
571                },
572                // this FFT is 27 or greater times a power of two. As you might expect, in this vast field of options, what is optimal becomes a lot more muddy and situational
573                // but across all the benchmarking i've done, 36 seems like the best default that will get us the best plan in 95% of the cases
574                // 32 is rarely faster, although i haven't been able to discern a pattern.
575                _ => MixedRadixPlan::butterfly(36, vec![]),
576            }
577        } else if factors.get_power3() >= 3 {
578            // Our FFT is a power of 3 times a low power of 2
579            match factors.get_power2() {
580                0 => match factors.get_power3() % 2 {
581                    0 => MixedRadixPlan::butterfly(
582                        if factors.get_power3() > 10 { 9 } else { 27 },
583                        vec![],
584                    ),
585                    1 => MixedRadixPlan::butterfly(27, vec![]),
586                    _ => unreachable!(),
587                },
588                1 => MixedRadixPlan::butterfly(18, vec![]),
589                2 => match factors.get_power3() % 2 {
590                    0 => MixedRadixPlan::butterfly(36, vec![]),
591                    1 => MixedRadixPlan::butterfly(
592                        if factors.get_power3() > 10 { 36 } else { 18 },
593                        vec![],
594                    ),
595                    _ => unreachable!(),
596                },
597                3 => MixedRadixPlan::butterfly(18, vec![]),
598                // this FFT is 16 or greater times a power of three. As you might expect, in this vast field of options, what is optimal becomes a lot more muddy and situational
599                // but across all the benchmarking i've done, 36 seems like the best default that will get us the best plan in 95% of the cases
600                // 32 is rarely faster, although i haven't been able to discern a pattern.
601                _ => MixedRadixPlan::butterfly(36, vec![]),
602            }
603        }
604        // If this FFT has powers of 11, 7, or 5, use that
605        else if factors.get_power11() > 0 {
606            MixedRadixPlan::butterfly(11, vec![])
607        } else if factors.get_power7() > 0 {
608            MixedRadixPlan::butterfly(7, vec![])
609        } else if factors.get_power5() > 0 {
610            MixedRadixPlan::butterfly(5, vec![])
611        } else {
612            panic!(
613                "Couldn't find a base for FFT size {}, factors={:?}",
614                len, factors
615            )
616        }
617    }
618
619    fn is_butterfly(&self, len: usize) -> bool {
620        [
621            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 18, 19, 23, 24, 27, 29, 31, 32, 36,
622            64, 128, 256, 512,
623        ]
624        .contains(&len)
625    }
626
    // Constructs a single butterfly FFT instance for the given `len` and `direction`.
    //
    // Panics if `len` is not one of the sizes accepted by `is_butterfly`.
    //
    // NOTE(review): the `.unwrap()` calls on the `*Avx64` constructors assume AVX
    // availability was already verified when this planner was created — TODO
    // confirm against the planner's construction path, which is outside this view.
    fn construct_butterfly(&self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
        match len {
            // Lengths 0 and 1 are trivial, so a naive DFT is used with no setup cost
            0 | 1 => wrap_fft(Dft::new(len, direction)),
            // Sizes with no AVX implementation fall through to the scalar butterflies
            2 => wrap_fft(Butterfly2::new(direction)),
            3 => wrap_fft(Butterfly3::new(direction)),
            4 => wrap_fft(Butterfly4::new(direction)),
            5 => wrap_fft(Butterfly5Avx64::new(direction).unwrap()),
            6 => wrap_fft(Butterfly6::new(direction)),
            7 => wrap_fft(Butterfly7Avx64::new(direction).unwrap()),
            8 => wrap_fft(Butterfly8Avx64::new(direction).unwrap()),
            9 => wrap_fft(Butterfly9Avx64::new(direction).unwrap()),
            11 => wrap_fft(Butterfly11Avx64::new(direction).unwrap()),
            12 => wrap_fft(Butterfly12Avx64::new(direction).unwrap()),
            13 => wrap_fft(Butterfly13::new(direction)),
            16 => wrap_fft(Butterfly16Avx64::new(direction).unwrap()),
            17 => wrap_fft(Butterfly17::new(direction)),
            18 => wrap_fft(Butterfly18Avx64::new(direction).unwrap()),
            19 => wrap_fft(Butterfly19::new(direction)),
            23 => wrap_fft(Butterfly23::new(direction)),
            24 => wrap_fft(Butterfly24Avx64::new(direction).unwrap()),
            27 => wrap_fft(Butterfly27Avx64::new(direction).unwrap()),
            29 => wrap_fft(Butterfly29::new(direction)),
            31 => wrap_fft(Butterfly31::new(direction)),
            32 => wrap_fft(Butterfly32Avx64::new(direction).unwrap()),
            36 => wrap_fft(Butterfly36Avx64::new(direction).unwrap()),
            64 => wrap_fft(Butterfly64Avx64::new(direction).unwrap()),
            128 => wrap_fft(Butterfly128Avx64::new(direction).unwrap()),
            256 => wrap_fft(Butterfly256Avx64::new(direction).unwrap()),
            512 => wrap_fft(Butterfly512Avx64::new(direction).unwrap()),
            // Callers are expected to gate on `is_butterfly` first
            _ => panic!("Invalid butterfly len: {}", len),
        }
    }
659}
660
661//-------------------------------------------------------------------
662// type-agnostic planning stuff
663//-------------------------------------------------------------------
impl<A: AvxNum, T: FftNum> AvxPlannerInternal<A, T> {
    // Given a length, return a plan for how this FFT should be computed.
    //
    // `base_fn` is a caller-provided strategy that picks the base of the plan
    // (butterfly, Rader's, Bluestein's, ...) from the length and its partial
    // factorization; this method wraps it with cache lookups and mixed-radix
    // planning for the remaining factors.
    fn plan_fft(
        &self,
        len: usize,
        direction: FftDirection,
        base_fn: impl FnOnce(&Self, usize, &PartialFactors) -> MixedRadixPlan,
    ) -> MixedRadixPlan {
        // First step: If this size is already cached, return it directly
        if self.cache.contains_fft(len, direction) {
            return MixedRadixPlan::cached(len);
        }

        // We have butterflies for everything below 10, so if it's below 10, just skip the factorization etc
        // Notably, this step is *required* if the len is 0, since we can't compute a prime factorization for zero
        if len < 10 {
            return MixedRadixPlan::butterfly(len, Vec::new());
        }

        // This length is not cached, so we have to come up with a new plan. The first step is to find a suitable base.
        let factors = PartialFactors::compute(len);
        let base = base_fn(self, len, &factors);

        // it's possible that the base planner plans out the whole FFT. it's guaranteed if `len` is a prime number, or if it's a butterfly, for example
        let uncached_plan = if base.len == len {
            base
        } else {
            // We have some mixed radix steps to compute! Compute the factors that need to be computed by mixed radix steps,
            // panicking if the base wasn't actually a divisor of `len`
            let radix_factors = factors
                .divide_by(&PartialFactors::compute(base.len))
                .unwrap_or_else(|| {
                    panic!(
                        "Invalid base for FFT length={}, base={:?}, base radixes={:?}",
                        len, base.base, base.radixes
                    )
                });
            self.plan_mixed_radix(radix_factors, base)
        };

        // Last step: We have a full FFT plan, but some of the steps of that plan may have been cached. If they have, use the largest cached step as the base.
        self.replan_with_cache(uncached_plan, direction)
    }

    // Takes a plan and an algorithm cache, and replaces steps of the plan with cached steps, if possible
    fn replan_with_cache(&self, plan: MixedRadixPlan, direction: FftDirection) -> MixedRadixPlan {
        enum CacheLocation {
            None,
            Base,
            Radix(usize, usize), // First value is the length of the cached FFT, and second value is the index in the radix array
        }

        let mut largest_cached_len = CacheLocation::None;
        let base_len = plan.base.base_len();
        let mut current_len = base_len;

        // Check if the cache contains the base
        if self.cache.contains_fft(current_len, direction) {
            largest_cached_len = CacheLocation::Base;
        }

        // Walk up the radix chain, checking if the cache contains each step.
        // Later hits overwrite earlier ones, so we end up with the largest cached sub-FFT.
        for (i, radix) in plan.radixes.iter().enumerate() {
            current_len *= *radix as usize;

            if self.cache.contains_fft(current_len, direction) {
                largest_cached_len = CacheLocation::Radix(current_len, i);
            }
        }

        // If we found a cached length within the plan, update the plan to account for the cache
        match largest_cached_len {
            CacheLocation::None => plan,
            CacheLocation::Base => {
                MixedRadixPlan::new(MixedRadixBase::CacheBase(base_len), plan.radixes)
            }
            CacheLocation::Radix(cached_len, cached_index) => {
                // We know that `plan.radixes[cached_index]` is the largest cached step, and `cached_len` will be our new base length.
                // Drop every element of `plan.radixes` up to and including cached_index; the cached FFT replaces those steps.
                let mut chain = plan.radixes;
                chain.drain(0..=cached_index);
                MixedRadixPlan::new(MixedRadixBase::CacheBase(cached_len), chain)
            }
        }
    }
    // given a set of factors, compute how many iterations of 12xn and 6xn we should plan for. Returns (k, j) for 12^k and 6^j
    fn plan_power12_power6(radix_factors: &PartialFactors) -> (u32, u32) {
        // it's helpful to think of this process as rewriting the FFT length as powers of our radixes
        // the fastest FFT we could possibly compute is 8^n, because the 8xn algorithm is blazing fast. 9xn and 12xn are also in the top tier for speed, so those 3 algorithms are what we will aim for
        // Specifically, we want to find a combination of 8, 9, and 12, that will "consume" all factors of 2 and 3, without having any leftovers

        // Unfortunately, most FFTs don't come in the form 8^n * 9^m * 12^k
        // Thankfully, 6xn is also reasonably fast, so we can use 6xn to strip away factors.
        // This function's job will be to divide radix_factors into 8^n * 9^m * 12^k * 6^j, which minimizes j, then maximizes k

        // we're going to hypothetically add as many 12's to our plan as possible, keeping track of how many 6's were required to balance things out
        // we can also compute this analytically with modular arithmetic, but that technique only works when the FFT is above a minimum size, but this loop+array technique always works
        let max_twelves = min(radix_factors.get_power2() / 2, radix_factors.get_power3());
        let mut required_sixes = [None; 4]; // only track 6^0 through 6^3. 6^4 can be converted into 12^2 * 9, and 6^5 can be converted into 12 * 8 * 9 * 9
        for hypothetical_twelve_power in 0..(max_twelves + 1) {
            // factors of 2 and 3 left over after applying `hypothetical_twelve_power` twelves (each 12 consumes 2^2 * 3)
            let hypothetical_twos = radix_factors.get_power2() - hypothetical_twelve_power * 2;
            let hypothetical_threes = radix_factors.get_power3() - hypothetical_twelve_power;

            // figure out how many sixes we would need to leave our FFT at 8^n * 9^m via modular arithmetic, and write to that index of our twelves_per_sixes array
            let sixes = match (hypothetical_twos % 3, hypothetical_threes % 2) {
                (0, 0) => Some(0),
                (1, 1) => Some(1),
                (2, 0) => Some(2),
                (0, 1) => Some(3),
                (1, 0) => None, // it would take 4 sixes, which can be replaced by 2 twelves, so we'll hit it in a later loop (if we have that many factors)
                (2, 1) => None, // it would take 5 sixes, but note that 12 is literally 2^2 * 3^1, so instead of applying 5 sixes, we can apply a single 12
                (_, _) => unreachable!(),
            };

            // if we can bring the FFT into range for the fast path with sixes, record so in the required_sixes array
            // but make sure the number of sixes we're going to apply actually fits into our available factors
            if let Some(sixes) = sixes {
                if sixes <= hypothetical_twos && sixes <= hypothetical_threes {
                    required_sixes[sixes as usize] = Some(hypothetical_twelve_power)
                }
            }
        }

        // required_sixes[i] now contains the largest power of twelve that we can apply, given that we also apply 6^i
        // we want to apply as many of 12 as possible, so take the array element with the largest non-None element
        // note that it's possible (and very likely) that either power_twelve or power_six is zero, or both of them are zero! this will happen for a pure power of 2 or power of 3 FFT, for example
        let (power_twelve, mut power_six) = required_sixes
            .iter()
            .enumerate()
            .filter_map(|(i, maybe_twelve)| maybe_twelve.map(|twelve| (twelve, i as u32)))
            .fold(
                (0, 0),
                |best, current| if current.0 >= best.0 { current } else { best },
            );

        // special case: if we have exactly one factor of 2 and at least one factor of 3, unconditionally apply a factor of 6 to get rid of the 2
        if radix_factors.get_power2() == 1 && radix_factors.get_power3() > 0 {
            power_six = 1;
        }
        // special case: if we have a single factor of 3 and more than one factor of 2 (and we don't have any twelves), unconditionally apply a factor of 6 to get rid of the 3
        if radix_factors.get_power2() > 1 && radix_factors.get_power3() == 1 && power_twelve == 0 {
            power_six = 1;
        }

        (power_twelve, power_six)
    }

    // Given the factors left over after the base was chosen, push the mixed-radix
    // steps (innermost first) onto `plan` that will consume all of those factors.
    fn plan_mixed_radix(
        &self,
        mut radix_factors: PartialFactors,
        mut plan: MixedRadixPlan,
    ) -> MixedRadixPlan {
        // if we can complete the FFT with a single radix, do it
        if [2, 3, 4, 5, 6, 7, 8, 9, 12, 16].contains(&radix_factors.product()) {
            plan.push_radix(radix_factors.product() as u8)
        } else {
            // Compute how many powers of 12 and powers of 6 we want to strip away
            let (power_twelve, power_six) = Self::plan_power12_power6(&radix_factors);

            // divide our powers of 12 and 6 out of our radix factors
            radix_factors = radix_factors
                .divide_by(&PartialFactors::compute(
                    6usize.pow(power_six) * 12usize.pow(power_twelve),
                ))
                .unwrap();

            // now that we know the 12 and 6 factors, the plan array can be computed in descending radix size
            if radix_factors.get_power2() % 3 == 1 && radix_factors.get_power2() > 1 {
                // our factors of 2 might not quite be a power of 8 -- our plan_power12_power6 function tried its best, but if there are very few factors of 3, it can't help.
                // if we're 2 * 8^N, benchmarking shows that applying a 16 before our chain of 8s is very fast.
                plan.push_radix(16);
                radix_factors = radix_factors
                    .divide_by(&PartialFactors::compute(16))
                    .unwrap();
            }
            plan.push_radix_power(12, power_twelve);
            plan.push_radix_power(11, radix_factors.get_power11());
            plan.push_radix_power(9, radix_factors.get_power3() / 2);
            plan.push_radix_power(8, radix_factors.get_power2() / 3);
            plan.push_radix_power(7, radix_factors.get_power7());
            plan.push_radix_power(6, power_six);
            plan.push_radix_power(5, radix_factors.get_power5());
            if radix_factors.get_power2() % 3 == 2 {
                // our factors of 2 might not quite be a power of 8 -- our plan_power12_power6 function tried its best, but if we are a power of 2, it can't help.
                // if we're 4 * 8^N, benchmarking shows that applying a 4 to the end our chain of 8s is very fast.
                plan.push_radix(4);
            }
            if radix_factors.get_power3() % 2 == 1 {
                // our factors of 3 might not quite be a power of 9 -- our plan_power12_power6 function tried its best, but if we are a power of 3, it can't help.
                // if we're 3 * 9^N, our only choice is to add a 3xn step
                plan.push_radix(3);
            }
            if radix_factors.get_power2() % 3 == 1 {
                // our factors of 2 might not quite be a power of 8. We tried to correct this with a 16 radix and 4 radix, but as a last resort, apply a 2. 2 is very slow, but it's better than not computing the FFT
                plan.push_radix(2);
            }

            // measurement opportunity: is it faster to let the plan_power12_power6 function put a 4 on the end instead of relying on all 8's?
            // measurement opportunity: is it faster to slap a 16 on top of the stack?
            // measurement opportunity: if our plan_power12_power6 function adds both 12s and sixes, is it faster to drop combinations of 12+6 down to 8+9?
        };
        plan
    }

    // Constructs and returns a FFT instance from a FFT plan.
    // If the base is a butterfly, it will call the provided `construct_butterfly_fn` to do so.
    // If constructing the base requires constructing an inner FFT (IE bluetein's or rader's algorithm), it will call the provided `inner_fft_fn` to construct it.
    // Every constructed step (base and each radix layer) is inserted into the cache for future calls to `plan_fft`.
    fn construct_plan(
        &mut self,
        plan: MixedRadixPlan,
        direction: FftDirection,
        construct_butterfly_fn: impl FnOnce(&Self, usize, FftDirection) -> Arc<dyn Fft<T>>,
        inner_fft_fn: impl FnOnce(&mut Self, usize, FftDirection) -> Arc<dyn Fft<T>>,
    ) -> Arc<dyn Fft<T>> {
        let mut fft = match plan.base {
            MixedRadixBase::CacheBase(len) => self.cache.get(len, direction).unwrap(),
            MixedRadixBase::ButterflyBase(len) => {
                let butterfly_instance = construct_butterfly_fn(self, len, direction);

                // Cache this FFT instance for future calls to `plan_fft`
                self.cache.insert(&butterfly_instance);

                butterfly_instance
            }
            MixedRadixBase::RadersBase(len) => {
                // Rader's Algorithm requires an inner FFT of size len - 1
                let inner_fft = inner_fft_fn(self, len - 1, direction);

                // try to construct our AVX2 rader's algorithm. If that fails (probably because the machine we're running on doesn't have AVX2), fall back to scalar
                let raders_instance =
                    if let Ok(raders_avx) = RadersAvx2::<A, T>::new(Arc::clone(&inner_fft)) {
                        wrap_fft(raders_avx)
                    } else {
                        wrap_fft(RadersAlgorithm::new(inner_fft))
                    };

                // Cache this FFT instance for future calls to `plan_fft`
                self.cache.insert(&raders_instance);

                raders_instance
            }
            MixedRadixBase::BluesteinsBase(len, inner_fft_len) => {
                // Bluestein's has an inner FFT of arbitrary size. But we've already planned it, so just use what we planned
                let inner_fft = inner_fft_fn(self, inner_fft_len, direction);

                // construct our AVX bluestein's algorithm with the pre-planned inner FFT
                let bluesteins_instance =
                    wrap_fft(BluesteinsAvx::<A, T>::new(len, inner_fft).unwrap());

                // Cache this FFT instance for future calls to `plan_fft`
                self.cache.insert(&bluesteins_instance);

                bluesteins_instance
            }
        };

        // We have constructed our base. Now, construct the radix chain.
        for radix in plan.radixes {
            fft = match radix {
                2 => wrap_fft(MixedRadix2xnAvx::<A, T>::new(fft).unwrap()),
                3 => wrap_fft(MixedRadix3xnAvx::<A, T>::new(fft).unwrap()),
                4 => wrap_fft(MixedRadix4xnAvx::<A, T>::new(fft).unwrap()),
                5 => wrap_fft(MixedRadix5xnAvx::<A, T>::new(fft).unwrap()),
                6 => wrap_fft(MixedRadix6xnAvx::<A, T>::new(fft).unwrap()),
                7 => wrap_fft(MixedRadix7xnAvx::<A, T>::new(fft).unwrap()),
                8 => wrap_fft(MixedRadix8xnAvx::<A, T>::new(fft).unwrap()),
                9 => wrap_fft(MixedRadix9xnAvx::<A, T>::new(fft).unwrap()),
                11 => wrap_fft(MixedRadix11xnAvx::<A, T>::new(fft).unwrap()),
                12 => wrap_fft(MixedRadix12xnAvx::<A, T>::new(fft).unwrap()),
                16 => wrap_fft(MixedRadix16xnAvx::<A, T>::new(fft).unwrap()),
                // plan_mixed_radix only emits the radixes listed above
                _ => unreachable!(),
            };

            // Cache this FFT instance for future calls to `plan_fft`
            self.cache.insert(&fft);
        }

        fft
    }

    // Plan and return the inner size to be used with Bluestein's Algorithm
    // Calls `filter_fn` on result candidates, giving the caller the opportunity to reject certain sizes
    fn plan_bluesteins(
        &self,
        len: usize,
        filter_fn: impl FnMut(&&(usize, u32, u32)) -> bool,
    ) -> usize {
        assert!(len > 1); // Internal consistency check: The logic in this method doesn't work for a length of 1

        // Bluestein's computes a FFT of size `len` by reorganizing it as a FFT of ANY size greater than or equal to len * 2 - 1
        // an obvious choice is the next power of two larger than len * 2 - 1, but if we can find a smaller FFT that will go faster, we can save a lot of time.
        // We can very efficiently compute almost any 2^n * 3^m, so we're going to search for all numbers of the form 2^n * 3^m that lie between len * 2 - 1 and the next power of two.
        let min_len = len * 2 - 1;
        let baseline_candidate = min_len.checked_next_power_of_two().unwrap();

        // our algorithm here is to start with our next power of 2, and repeatedly divide by 2 and multiply by 3, trying to keep our value in range
        let mut bluesteins_candidates = Vec::new();
        let mut candidate = baseline_candidate;
        let mut factor2 = candidate.trailing_zeros();
        let mut factor3 = 0;

        let min_factor2 = 2; // benchmarking shows that while 3^n and 2 * 3^n are fast, they're typically slower than the next-higher candidate, so don't bother generating them
        while factor2 >= min_factor2 {
            // if this candidate length isn't too small, add it to our candidates list
            if candidate >= min_len {
                bluesteins_candidates.push((candidate, factor2, factor3));
            }
            // if the candidate is too large, divide it by 2. if it's too small, multiply it by 3
            if candidate >= baseline_candidate {
                candidate >>= 1;
                factor2 -= 1;
            } else {
                candidate *= 3;
                factor3 += 1;
            }
        }
        // sort ascending, so the `find` below returns the smallest acceptable candidate
        bluesteins_candidates.sort();

        // we now have a list of candidates to choose from. some 2^n * 3^m FFTs are faster than others, so apply a filter, which will let us skip sizes that benchmarking has shown to be slow
        let (chosen_size, _, _) =
            bluesteins_candidates
                .iter()
                .find(filter_fn)
                .unwrap_or_else(|| {
                    panic!(
                        "Failed to find a bluestein's candidate for len={}, candidates: {:?}",
                        len, bluesteins_candidates
                    )
                });

        *chosen_size
    }
}
996
#[cfg(test)]
mod unit_tests {
    use super::*;

    // A length-zero FFT has nothing to compute, but planning and processing one
    // must not panic or otherwise explode, for either precision.
    #[test]
    fn test_plan_zero_avx() {
        let mut planner_f32 = FftPlannerAvx::<f32>::new().unwrap();
        let zero_fft_f32 = planner_f32.plan_fft_forward(0);
        zero_fft_f32.process(&mut []);

        let mut planner_f64 = FftPlannerAvx::<f64>::new().unwrap();
        let zero_fft_f64 = planner_f64.plan_fft_forward(0);
        zero_fft_f64.process(&mut []);
    }
}
1012}