rustfft/avx/avx_planner.rs
1use std::sync::Arc;
2use std::{any::TypeId, cmp::min};
3
4use primal_check::miller_rabin;
5
6use crate::algorithm::*;
7use crate::common::FftNum;
8use crate::math_utils::PartialFactors;
9use crate::Fft;
10use crate::{algorithm::butterflies::*, fft_cache::FftCache};
11
12use super::*;
13
14fn wrap_fft<T: FftNum>(butterfly: impl Fft<T> + 'static) -> Arc<dyn Fft<T>> {
15 Arc::new(butterfly) as Arc<dyn Fft<T>>
16}
17
/// Identifies which algorithm family will serve as the innermost ("base") step of a mixed-radix plan.
#[derive(Debug)]
enum MixedRadixBase {
    // The base will be a butterfly algorithm
    ButterflyBase(usize),

    // The base will be an instance of Rader's Algorithm. That will require its own plan for the internal FFT, which we'll handle separately
    RadersBase(usize),

    // The base will be an instance of Bluestein's Algorithm. That will require its own plan for the internal FFT, which we'll handle separately.
    // First usize is the base length, second usize is the inner FFT length
    BluesteinsBase(usize, usize),

    // The "base" is a FFT instance we already have cached
    CacheBase(usize),
}
33impl MixedRadixBase {
34 fn base_len(&self) -> usize {
35 match self {
36 Self::ButterflyBase(len) => *len,
37 Self::RadersBase(len) => *len,
38 Self::BluesteinsBase(len, _) => *len,
39 Self::CacheBase(len) => *len,
40 }
41 }
42}
43
/// Represents a FFT plan, stored as a base FFT and a stack of MixedRadix*xn on top of it.
#[derive(Debug)]
pub struct MixedRadixPlan {
    len: usize,       // product of base and radixes
    radixes: Vec<u8>, // stored from innermost to outermost
    base: MixedRadixBase,
}
51impl MixedRadixPlan {
52 fn new(base: MixedRadixBase, radixes: Vec<u8>) -> Self {
53 Self {
54 len: base.base_len() * radixes.iter().map(|r| *r as usize).product::<usize>(),
55 base,
56 radixes,
57 }
58 }
59 fn cached(cached_len: usize) -> Self {
60 Self {
61 len: cached_len,
62 base: MixedRadixBase::CacheBase(cached_len),
63 radixes: Vec::new(),
64 }
65 }
66 fn butterfly(butterfly_len: usize, radixes: Vec<u8>) -> Self {
67 Self::new(MixedRadixBase::ButterflyBase(butterfly_len), radixes)
68 }
69 fn push_radix(&mut self, radix: u8) {
70 self.radixes.push(radix);
71 self.len *= radix as usize;
72 }
73 fn push_radix_power(&mut self, radix: u8, power: u32) {
74 self.radixes
75 .extend(std::iter::repeat(radix).take(power as usize));
76 self.len *= (radix as usize).pow(power);
77 }
78}
79
/// The AVX FFT planner creates new FFT algorithm instances which take advantage of the AVX instruction set.
///
/// Creating an instance of `FftPlannerAvx` requires the `avx` and `fma` instructions to be available on the current machine, and it requires RustFFT's
/// `avx` feature flag to be set. A few algorithms will use `avx2` if it's available, but it isn't required.
///
/// For the time being, AVX acceleration is black box, and AVX accelerated algorithms are not available without a planner. This may change in the future.
///
/// ~~~
/// // Perform a forward Fft of size 1234, accelerated by AVX
/// use std::sync::Arc;
/// use rustfft::{FftPlannerAvx, num_complex::Complex};
///
/// // If FftPlannerAvx::new() returns Ok(), we'll know AVX algorithms are available
/// // on this machine, and that RustFFT was compiled with the `avx` feature flag
/// if let Ok(mut planner) = FftPlannerAvx::new() {
///     let fft = planner.plan_fft_forward(1234);
///
///     let mut buffer = vec![Complex{ re: 0.0f32, im: 0.0f32 }; 1234];
///     fft.process(&mut buffer);
///
///     // The FFT instance returned by the planner has the type `Arc<dyn Fft<T>>`,
///     // where T is the numeric type, ie f32 or f64, so it's cheap to clone
///     let fft_clone = Arc::clone(&fft);
/// }
/// ~~~
///
/// If you plan on creating multiple FFT instances, it is recommended to re-use the same planner for all of them. This
/// is because the planner re-uses internal data across FFT instances wherever possible, saving memory and reducing
/// setup time. (FFT instances created with one planner will never re-use data and buffers with FFT instances created
/// by a different planner)
///
/// Each FFT instance owns [`Arc`s](std::sync::Arc) to its internal data, rather than borrowing it from the planner, so it's perfectly
/// safe to drop the planner after creating Fft instances.
pub struct FftPlannerAvx<T: FftNum> {
    // Boxed, type-erased planner internals: either AvxPlannerInternal<f32, T> or
    // AvxPlannerInternal<f64, T>, chosen at construction time based on TypeId (see `new`).
    internal_planner: Box<dyn AvxPlannerInternalAPI<T>>,
}
impl<T: FftNum> FftPlannerAvx<T> {
    /// Constructs a new `FftPlannerAvx` instance.
    ///
    /// Returns `Ok(planner_instance)` if we're compiling for X86_64, AVX support was enabled in feature flags, and the current CPU supports the `avx` and `fma` CPU features.
    /// Returns `Err(())` if AVX support is not available.
    pub fn new() -> Result<Self, ()> {
        // Eventually we might make AVX algorithms that don't also require FMA.
        // If that happens, we can only check for AVX here? seems like a pretty low-priority addition
        let has_avx = is_x86_feature_detected!("avx");
        let has_fma = is_x86_feature_detected!("fma");
        if has_avx && has_fma {
            // Ideally, we would implement the planner with specialization.
            // Specialization won't be on stable rust for a long time though, so in the meantime, we can hack around it.
            //
            // The first step of the hack is to use TypeId to determine if T is f32, f64, or neither. If neither, we don't want to do any AVX acceleration
            // If it's f32 or f64, then construct an internal type that has two generic parameters, one bounded on AvxNum, the other bounded on FftNum
            //
            // - A is bounded on the AvxNum trait, and is the type we use for any AVX computations. It has associated types for AVX vectors,
            //   associated constants for the number of elements per vector, etc.
            // - T is bounded on the FftNum trait, and thus is the type that every FFT algorithm will receive its input/output buffers in.
            //
            // An important snag relevant to the planner is that we have to box and type-erase the AvxNum bound,
            // since the only other option is making the AvxNum bound a part of this struct's external API
            //
            // Another annoying snag with this setup is that we frequently have to transmute buffers from &mut [Complex<T>] to &mut [Complex<A>] or vice versa.
            // We know this is safe because we assert everywhere that Type(A)==Type(T), so it's just a matter of "doing it right" every time.
            // These transmutes are required because the FFT algorithm's input will come through the FFT trait, which may only be bounded by FftNum.
            // So the buffers will have the type &mut [Complex<T>]. The problem comes in that all of our AVX computation tools are on the AvxNum trait.
            //
            // If we had specialization, we could easily convince the compiler that AvxNum and FftNum were different bounds on the same underlying type (IE f32 or f64)
            // but without it, the compiler is convinced that they are different. So we use the transmute as a last-resort way to overcome this limitation.
            //
            // We keep both the A and T types around in all of our AVX-related structs so that we can cast between A and T whenever necessary.
            let id_f32 = TypeId::of::<f32>();
            let id_f64 = TypeId::of::<f64>();
            let id_t = TypeId::of::<T>();

            if id_t == id_f32 {
                return Ok(Self {
                    internal_planner: Box::new(AvxPlannerInternal::<f32, T>::new()),
                });
            } else if id_t == id_f64 {
                return Ok(Self {
                    internal_planner: Box::new(AvxPlannerInternal::<f64, T>::new()),
                });
            }
        }
        // T is neither f32 nor f64, or the CPU lacks avx/fma
        Err(())
    }

    /// Returns a `Fft` instance which uses AVX instructions to compute FFTs of size `len`.
    ///
    /// If the provided `direction` is `FftDirection::Forward`, the returned instance will compute forward FFTs. If it's `FftDirection::Inverse`, it will compute inverse FFTs.
    ///
    /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time.
    pub fn plan_fft(&mut self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
        self.internal_planner.plan_and_construct_fft(len, direction)
    }
    /// Returns a `Fft` instance which uses AVX instructions to compute forward FFTs of size `len`.
    ///
    /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time.
    pub fn plan_fft_forward(&mut self, len: usize) -> Arc<dyn Fft<T>> {
        self.plan_fft(len, FftDirection::Forward)
    }
    /// Returns a `Fft` instance which uses AVX instructions to compute inverse FFTs of size `len`.
    ///
    /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time.
    pub fn plan_fft_inverse(&mut self, len: usize) -> Arc<dyn Fft<T>> {
        self.plan_fft(len, FftDirection::Inverse)
    }

    /// Returns a FFT plan without constructing it
    #[allow(unused)]
    pub(crate) fn debug_plan_fft(&self, len: usize, direction: FftDirection) -> MixedRadixPlan {
        self.internal_planner.debug_plan_fft(len, direction)
    }
}
193
// Object-safe facade over AvxPlannerInternal<A, T>, letting FftPlannerAvx box the planner
// without exposing the AvxNum type parameter A in its public API.
trait AvxPlannerInternalAPI<T: FftNum>: Send + Sync {
    // Plans and constructs a FFT of size `len` in the given direction
    fn plan_and_construct_fft(&mut self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>>;
    // Returns the plan that would be used for `len`, without constructing anything
    fn debug_plan_fft(&self, len: usize, direction: FftDirection) -> MixedRadixPlan;
}
198
// Planner internals. A is the AVX computation type and T is the public FftNum type;
// the two must name the same underlying type (f32 or f64), asserted in each `new()`.
struct AvxPlannerInternal<A: AvxNum, T: FftNum> {
    // Cache of already-constructed FFT instances, re-used across planning calls
    cache: FftCache<T>,
    // Zero-sized marker so the otherwise-unused A type parameter is accepted by the compiler
    _phantom: std::marker::PhantomData<A>,
}
203
204impl<T: FftNum> AvxPlannerInternalAPI<T> for AvxPlannerInternal<f32, T> {
205 fn plan_and_construct_fft(&mut self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
206 // Step 1: Create a plan for this FFT length.
207 let plan = self.plan_fft(len, direction, Self::plan_mixed_radix_base);
208
209 // Step 2: Construct the plan. If the base is rader's algorithm or bluestein's algorithm, this may call self.plan_and_construct_fft recursively!
210 self.construct_plan(
211 plan,
212 direction,
213 Self::construct_butterfly,
214 Self::plan_and_construct_fft,
215 )
216 }
217 fn debug_plan_fft(&self, len: usize, direction: FftDirection) -> MixedRadixPlan {
218 self.plan_fft(len, direction, Self::plan_mixed_radix_base)
219 }
220}
221impl<T: FftNum> AvxPlannerInternalAPI<T> for AvxPlannerInternal<f64, T> {
222 fn plan_and_construct_fft(&mut self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
223 // Step 1: Create a plan for this FFT length.
224 let plan = self.plan_fft(len, direction, Self::plan_mixed_radix_base);
225
226 // Step 2: Construct the plan. If the base is rader's algorithm or bluestein's algorithm, this may call self.plan_and_construct_fft recursively!
227 self.construct_plan(
228 plan,
229 direction,
230 Self::construct_butterfly,
231 Self::plan_and_construct_fft,
232 )
233 }
234 fn debug_plan_fft(&self, len: usize, direction: FftDirection) -> MixedRadixPlan {
235 self.plan_fft(len, direction, Self::plan_mixed_radix_base)
236 }
237}
238
239//-------------------------------------------------------------------
240// f32-specific planning stuff
241//-------------------------------------------------------------------
impl<T: FftNum> AvxPlannerInternal<f32, T> {
    /// Constructs the f32-specialized planner internals with an empty algorithm cache.
    pub fn new() -> Self {
        // Internal sanity check: Make sure that T == f32.
        // This struct has two generic parameters A and T, but they must always be the same, and are only kept separate to help work around the lack of specialization.
        // It would be cool if we could do this as a static_assert instead
        let id_f32 = TypeId::of::<f32>();
        let id_t = TypeId::of::<T>();
        assert_eq!(id_f32, id_t);

        Self {
            cache: FftCache::new(),
            _phantom: std::marker::PhantomData,
        }
    }

    /// Chooses the innermost "base" step for a mixed-radix plan of size `len`, whose partial
    /// factorization is `factors`. The returned plan may already cover all of `len` (butterflies,
    /// primes, hardcoded sizes), in which case the caller adds no further radix steps.
    fn plan_mixed_radix_base(&self, len: usize, factors: &PartialFactors) -> MixedRadixPlan {
        // if we have non-fast-path factors, use them as our base FFT length, and we will have to use either rader's algorithm or bluestein's algorithm as our base
        if factors.get_other_factors() > 1 {
            let other_factors = factors.get_other_factors();

            // First, if the "other factors" are a butterfly, use that as the butterfly
            if self.is_butterfly(other_factors) {
                return MixedRadixPlan::butterfly(other_factors, vec![]);
            }

            // We can only use rader's if `other_factors` is prime
            if miller_rabin(other_factors as u64) {
                // len is prime, so we can use Rader's Algorithm as a base. Whether or not that's a good idea is a different story
                // Rader's Algorithm is only faster in a few narrow cases.
                // as a heuristic, only use rader's algorithm if its inner FFT can be computed entirely without bluestein's or rader's
                // We're intentionally being too conservative here. Otherwise we'd be recursively applying a heuristic, and repeated heuristic failures could stack to make a rader's chain significantly slower.
                // If we were writing a measuring planner, expanding this heuristic and measuring its effectiveness would be an opportunity for up to 2x performance gains.
                let inner_factors = PartialFactors::compute(other_factors - 1);
                if self.is_butterfly(inner_factors.get_other_factors()) {
                    // The inner FFT only has factors of 2, 3, 5, 7, and 11. If we don't have AVX2, we additionally
                    // require the inner FFT to be a pure power of 2 times a power of 3 -- AVX2 gives us enough
                    // headroom that the overhead of factors of 5/7/11 isn't a problem.
                    // NOTE(review): the fallback compares against `len - 1` rather than `other_factors - 1`,
                    // so without AVX2 this only fires when `len` itself is prime -- presumably intentional
                    // conservatism, but worth confirming.
                    if is_x86_feature_detected!("avx2")
                        || (inner_factors.product_power2power3() == len - 1)
                    {
                        return MixedRadixPlan::new(
                            MixedRadixBase::RadersBase(other_factors),
                            vec![],
                        );
                    }
                }
            }

            // At this point, we know we're using bluestein's algorithm for the base. Next step is to plan the inner size we'll use for bluestein's algorithm.
            let inner_bluesteins_len =
                self.plan_bluesteins(other_factors, |(_len, factor2, factor3)| {
                    if *factor2 > 16 && *factor3 < 3 {
                        // surprisingly, pure powers of 2 have a pretty steep dropoff in speed after 65536.
                        // the algorithm is designed to generate candidates larger than baseline_candidate, so if we hit a large power of 2, there should be more after it that we can skip to
                        return false;
                    }
                    true
                });
            return MixedRadixPlan::new(
                MixedRadixBase::BluesteinsBase(other_factors, inner_bluesteins_len),
                vec![],
            );
        }

        // If this FFT size is a butterfly, use that
        if self.is_butterfly(len) {
            return MixedRadixPlan::butterfly(len, vec![]);
        }

        // If the power2 * power3 component of this FFT is a butterfly and not too small, return that
        let power2power3 = factors.product_power2power3();
        if power2power3 > 4 && self.is_butterfly(power2power3) {
            return MixedRadixPlan::butterfly(power2power3, vec![]);
        }

        // most of this code is heuristics assuming FFTs of a minimum size. if the FFT is below that minimum size, the heuristics break down.
        // so the first thing we're going to do is hardcode the plan for some specific sizes where we know the heuristics won't be enough
        let hardcoded_base = match power2power3 {
            // 3 * 2^n special cases
            96 => Some(MixedRadixPlan::butterfly(32, vec![3])), // 2^5 * 3
            192 => Some(MixedRadixPlan::butterfly(48, vec![4])), // 2^6 * 3
            1536 => Some(MixedRadixPlan::butterfly(48, vec![8, 4])), // 2^9 * 3

            // 9 * 2^n special cases
            18 => Some(MixedRadixPlan::butterfly(3, vec![6])), // 2 * 3^2
            144 => Some(MixedRadixPlan::butterfly(36, vec![4])), // 2^4 * 3^2

            _ => None,
        };
        if let Some(hardcoded) = hardcoded_base {
            return hardcoded;
        }

        if factors.get_power2() >= 5 {
            match factors.get_power3() {
                // if this FFT is a power of 2, our strategy here is to tweak the butterfly to free us up to do an 8xn chain
                0 => match factors.get_power2() % 3 {
                    0 => MixedRadixPlan::butterfly(512, vec![]),
                    1 => MixedRadixPlan::butterfly(256, vec![]),
                    2 => MixedRadixPlan::butterfly(256, vec![]),
                    _ => unreachable!(),
                },
                // if this FFT is 3 times a power of 2, our strategy here is to tweak butterflies to make it easier to set up a 8xn chain
                1 => match factors.get_power2() % 3 {
                    0 => MixedRadixPlan::butterfly(64, vec![12, 16]),
                    1 => MixedRadixPlan::butterfly(48, vec![]),
                    2 => MixedRadixPlan::butterfly(64, vec![]),
                    _ => unreachable!(),
                },
                // if this FFT is 9 or greater times a power of 2, just use 72. As you might expect, in this vast field of options, what is optimal becomes a lot more muddy and situational
                // but across all the benchmarking i've done, 72 seems like the best default that will get us the best plan in 95% of the cases
                // 64, 54, and 48 are occasionally faster, although i haven't been able to discern a pattern.
                _ => MixedRadixPlan::butterfly(72, vec![]),
            }
        } else if factors.get_power3() >= 3 {
            // Our FFT is a power of 3 times a low power of 2. A high level summary of our strategy is that we want to pick a base that will
            // A: consume all factors of 2, and B: leave us with an even power of 3, so that we can do a 9xn chain.
            match factors.get_power2() {
                0 => MixedRadixPlan::butterfly(27, vec![]),
                1 => MixedRadixPlan::butterfly(54, vec![]),
                2 => match factors.get_power3() % 2 {
                    0 => MixedRadixPlan::butterfly(36, vec![]),
                    1 => MixedRadixPlan::butterfly(if len < 1000 { 36 } else { 12 }, vec![]),
                    _ => unreachable!(),
                },
                3 => match factors.get_power3() % 2 {
                    0 => MixedRadixPlan::butterfly(72, vec![]),
                    1 => MixedRadixPlan::butterfly(
                        if factors.get_power3() > 7 { 24 } else { 72 },
                        vec![],
                    ),
                    _ => unreachable!(),
                },
                4 => match factors.get_power3() % 2 {
                    0 => MixedRadixPlan::butterfly(
                        if factors.get_power3() > 6 { 16 } else { 72 },
                        vec![],
                    ),
                    1 => MixedRadixPlan::butterfly(
                        if factors.get_power3() > 9 { 48 } else { 72 },
                        vec![],
                    ),
                    _ => unreachable!(),
                },
                // if this FFT is 32 or greater times a power of 3, just use 72. As you might expect, in this vast field of options, what is optimal becomes a lot more muddy and situational
                // but across all the benchmarking i've done, 72 seems like the best default that will get us the best plan in 95% of the cases
                // 64, 54, and 48 are occasionally faster, although i haven't been able to discern a pattern.
                _ => MixedRadixPlan::butterfly(72, vec![]),
            }
        }
        // If this FFT has powers of 11, 7, or 5, use that
        else if factors.get_power11() > 0 {
            MixedRadixPlan::butterfly(11, vec![])
        } else if factors.get_power7() > 0 {
            MixedRadixPlan::butterfly(7, vec![])
        } else if factors.get_power5() > 0 {
            MixedRadixPlan::butterfly(5, vec![])
        } else {
            // factors covers 2/3/5/7/11 plus "other"; every case above should be exhaustive
            panic!(
                "Couldn't find a base for FFT size {}, factors={:?}",
                len, factors
            )
        }
    }

    /// Returns true if we have a specialized butterfly implementation for exactly this length (f32 set).
    fn is_butterfly(&self, len: usize) -> bool {
        [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 19, 23, 24, 27, 29, 31, 32, 36, 48,
            54, 64, 72, 128, 256, 512,
        ]
        .contains(&len)
    }

    /// Instantiates the butterfly algorithm for `len`. Panics if `len` is not in the
    /// `is_butterfly` set. The `unwrap()`s on the Avx constructors rely on AVX being
    /// available, which `FftPlannerAvx::new()` verifies before this planner is built.
    fn construct_butterfly(&self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
        match len {
            0 | 1 => wrap_fft(Dft::new(len, direction)),
            2 => wrap_fft(Butterfly2::new(direction)),
            3 => wrap_fft(Butterfly3::new(direction)),
            4 => wrap_fft(Butterfly4::new(direction)),
            5 => wrap_fft(Butterfly5Avx::new(direction).unwrap()),
            6 => wrap_fft(Butterfly6::new(direction)),
            7 => wrap_fft(Butterfly7Avx::new(direction).unwrap()),
            8 => wrap_fft(Butterfly8Avx::new(direction).unwrap()),
            9 => wrap_fft(Butterfly9Avx::new(direction).unwrap()),
            11 => wrap_fft(Butterfly11Avx::new(direction).unwrap()),
            12 => wrap_fft(Butterfly12Avx::new(direction).unwrap()),
            13 => wrap_fft(Butterfly13::new(direction)),
            16 => wrap_fft(Butterfly16Avx::new(direction).unwrap()),
            17 => wrap_fft(Butterfly17::new(direction)),
            19 => wrap_fft(Butterfly19::new(direction)),
            23 => wrap_fft(Butterfly23::new(direction)),
            24 => wrap_fft(Butterfly24Avx::new(direction).unwrap()),
            27 => wrap_fft(Butterfly27Avx::new(direction).unwrap()),
            29 => wrap_fft(Butterfly29::new(direction)),
            31 => wrap_fft(Butterfly31::new(direction)),
            32 => wrap_fft(Butterfly32Avx::new(direction).unwrap()),
            36 => wrap_fft(Butterfly36Avx::new(direction).unwrap()),
            48 => wrap_fft(Butterfly48Avx::new(direction).unwrap()),
            54 => wrap_fft(Butterfly54Avx::new(direction).unwrap()),
            64 => wrap_fft(Butterfly64Avx::new(direction).unwrap()),
            72 => wrap_fft(Butterfly72Avx::new(direction).unwrap()),
            128 => wrap_fft(Butterfly128Avx::new(direction).unwrap()),
            256 => wrap_fft(Butterfly256Avx::new(direction).unwrap()),
            512 => wrap_fft(Butterfly512Avx::new(direction).unwrap()),
            _ => panic!("Invalid butterfly len: {}", len),
        }
    }
}
448
449//-------------------------------------------------------------------
450// f64-specific planning stuff
451//-------------------------------------------------------------------
impl<T: FftNum> AvxPlannerInternal<f64, T> {
    /// Constructs the f64-specialized planner internals with an empty algorithm cache.
    pub fn new() -> Self {
        // Internal sanity check: Make sure that T == f64.
        // This struct has two generic parameters A and T, but they must always be the same, and are only kept separate to help work around the lack of specialization.
        // It would be cool if we could do this as a static_assert instead
        let id_f64 = TypeId::of::<f64>();
        let id_t = TypeId::of::<T>();
        assert_eq!(id_f64, id_t);

        Self {
            cache: FftCache::new(),
            _phantom: std::marker::PhantomData,
        }
    }

    /// Chooses the innermost "base" step for a mixed-radix plan of size `len`, whose partial
    /// factorization is `factors`. The returned plan may already cover all of `len` (butterflies,
    /// primes, hardcoded sizes), in which case the caller adds no further radix steps.
    fn plan_mixed_radix_base(&self, len: usize, factors: &PartialFactors) -> MixedRadixPlan {
        // if we have a factor that can't be computed with 2xn 3xn etc, we'll have to compute it with bluestein's or rader's, so use that as the base
        if factors.get_other_factors() > 1 {
            let other_factors = factors.get_other_factors();

            // First, if the "other factors" are a butterfly, use that as the butterfly
            if self.is_butterfly(other_factors) {
                return MixedRadixPlan::butterfly(other_factors, vec![]);
            }

            // We can only use rader's if `other_factors` is prime
            if miller_rabin(other_factors as u64) {
                // len is prime, so we can use Rader's Algorithm as a base. Whether or not that's a good idea is a different story
                // Rader's Algorithm is only faster in a few narrow cases.
                // as a heuristic, only use rader's algorithm if its inner FFT can be computed entirely without bluestein's or rader's
                // We're intentionally being too conservative here. Otherwise we'd be recursively applying a heuristic, and repeated heuristic failures could stack to make a rader's chain significantly slower.
                // If we were writing a measuring planner, expanding this heuristic and measuring its effectiveness would be an opportunity for up to 2x performance gains.
                let inner_factors = PartialFactors::compute(other_factors - 1);
                if self.is_butterfly(inner_factors.get_other_factors()) {
                    // The inner FFT only has factors of 2, 3, 5, 7, and 11. If we don't have AVX2, we additionally
                    // require the inner FFT to be a pure power of 2 times a power of 3 -- AVX2 gives us enough
                    // headroom that the overhead of factors of 5/7/11 isn't a problem.
                    // NOTE(review): the fallback compares against `len - 1` rather than `other_factors - 1`,
                    // so without AVX2 this only fires when `len` itself is prime -- presumably intentional
                    // conservatism, but worth confirming.
                    if is_x86_feature_detected!("avx2")
                        || (inner_factors.product_power2power3() == len - 1)
                    {
                        return MixedRadixPlan::new(
                            MixedRadixBase::RadersBase(other_factors),
                            vec![],
                        );
                    }
                }
            }

            // At this point, we know we're using bluestein's algorithm for the base. Next step is to plan the inner size we'll use for bluestein's algorithm.
            let inner_bluesteins_len =
                self.plan_bluesteins(other_factors, |(_len, factor2, factor3)| {
                    // Presumably benchmark-derived cutoffs: skip candidates whose power-of-2 component is
                    // large relative to their power-of-3 component (see the f32 planner's note about the
                    // speed dropoff for large powers of 2).
                    if *factor3 < 1 && *factor2 > 13 {
                        return false;
                    }
                    if *factor3 < 4 && *factor2 > 14 {
                        return false;
                    }
                    true
                });
            return MixedRadixPlan::new(
                MixedRadixBase::BluesteinsBase(other_factors, inner_bluesteins_len),
                vec![],
            );
        }

        // If this FFT size is a butterfly, use that
        if self.is_butterfly(len) {
            return MixedRadixPlan::butterfly(len, vec![]);
        }

        // If the power2 * power3 component of this FFT is a butterfly and not too small, return that
        let power2power3 = factors.product_power2power3();
        if power2power3 > 4 && self.is_butterfly(power2power3) {
            return MixedRadixPlan::butterfly(power2power3, vec![]);
        }

        // most of this code is heuristics assuming FFTs of a minimum size. if the FFT is below that minimum size, the heuristics break down.
        // so the first thing we're going to do is hardcode the plan for some specific sizes where we know the heuristics won't be enough
        let hardcoded_base = match power2power3 {
            // 2^n special cases
            64 => Some(MixedRadixPlan::butterfly(16, vec![4])), // 2^6

            // 3 * 2^n special cases
            48 => Some(MixedRadixPlan::butterfly(12, vec![4])), // 3 * 2^4
            96 => Some(MixedRadixPlan::butterfly(12, vec![8])), // 3 * 2^5
            768 => Some(MixedRadixPlan::butterfly(12, vec![8, 8])), // 3 * 2^8

            // 9 * 2^n special cases
            72 => Some(MixedRadixPlan::butterfly(24, vec![3])), // 2^3 * 3^2
            288 => Some(MixedRadixPlan::butterfly(32, vec![9])), // 2^5 * 3^2

            // 4 * 3^n special cases
            108 => Some(MixedRadixPlan::butterfly(18, vec![6])), // 2^2 * 3^3
            _ => None,
        };
        if let Some(hardcoded) = hardcoded_base {
            return hardcoded;
        }

        if factors.get_power2() >= 4 {
            match factors.get_power3() {
                // if this FFT is a power of 2, our strategy here is to tweak the butterfly to free us up to do an 8xn chain
                0 => match factors.get_power2() % 3 {
                    0 => MixedRadixPlan::butterfly(512, vec![]),
                    1 => MixedRadixPlan::butterfly(128, vec![]),
                    2 => MixedRadixPlan::butterfly(256, vec![]),
                    _ => unreachable!(),
                },
                // if this FFT is 3 times a power of 2, our strategy here is to tweak butterflies to make it easier to set up a 8xn chain
                1 => match factors.get_power2() % 3 {
                    0 => MixedRadixPlan::butterfly(24, vec![]),
                    1 => MixedRadixPlan::butterfly(32, vec![12]),
                    2 => MixedRadixPlan::butterfly(32, vec![12, 16]),
                    _ => unreachable!(),
                },
                // if this FFT is 9 times a power of 2, our strategy here is to tweak butterflies to make it easier to set up a 8xn chain
                2 => match factors.get_power2() % 3 {
                    0 => MixedRadixPlan::butterfly(36, vec![16]),
                    1 => MixedRadixPlan::butterfly(36, vec![]),
                    2 => MixedRadixPlan::butterfly(18, vec![]),
                    _ => unreachable!(),
                },
                // this FFT is 27 or greater times a power of two. As you might expect, in this vast field of options, what is optimal becomes a lot more muddy and situational
                // but across all the benchmarking i've done, 36 seems like the best default that will get us the best plan in 95% of the cases
                // 32 is rarely faster, although i haven't been able to discern a pattern.
                _ => MixedRadixPlan::butterfly(36, vec![]),
            }
        } else if factors.get_power3() >= 3 {
            // Our FFT is a power of 3 times a low power of 2
            match factors.get_power2() {
                0 => match factors.get_power3() % 2 {
                    0 => MixedRadixPlan::butterfly(
                        if factors.get_power3() > 10 { 9 } else { 27 },
                        vec![],
                    ),
                    1 => MixedRadixPlan::butterfly(27, vec![]),
                    _ => unreachable!(),
                },
                1 => MixedRadixPlan::butterfly(18, vec![]),
                2 => match factors.get_power3() % 2 {
                    0 => MixedRadixPlan::butterfly(36, vec![]),
                    1 => MixedRadixPlan::butterfly(
                        if factors.get_power3() > 10 { 36 } else { 18 },
                        vec![],
                    ),
                    _ => unreachable!(),
                },
                3 => MixedRadixPlan::butterfly(18, vec![]),
                // this FFT is 16 or greater times a power of three. As you might expect, in this vast field of options, what is optimal becomes a lot more muddy and situational
                // but across all the benchmarking i've done, 36 seems like the best default that will get us the best plan in 95% of the cases
                // 32 is rarely faster, although i haven't been able to discern a pattern.
                _ => MixedRadixPlan::butterfly(36, vec![]),
            }
        }
        // If this FFT has powers of 11, 7, or 5, use that
        else if factors.get_power11() > 0 {
            MixedRadixPlan::butterfly(11, vec![])
        } else if factors.get_power7() > 0 {
            MixedRadixPlan::butterfly(7, vec![])
        } else if factors.get_power5() > 0 {
            MixedRadixPlan::butterfly(5, vec![])
        } else {
            // factors covers 2/3/5/7/11 plus "other"; every case above should be exhaustive
            panic!(
                "Couldn't find a base for FFT size {}, factors={:?}",
                len, factors
            )
        }
    }

    /// Returns true if we have a specialized butterfly implementation for exactly this length (f64 set).
    fn is_butterfly(&self, len: usize) -> bool {
        [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 18, 19, 23, 24, 27, 29, 31, 32, 36,
            64, 128, 256, 512,
        ]
        .contains(&len)
    }

    /// Instantiates the butterfly algorithm for `len`. Panics if `len` is not in the
    /// `is_butterfly` set. The `unwrap()`s on the Avx constructors rely on AVX being
    /// available, which `FftPlannerAvx::new()` verifies before this planner is built.
    fn construct_butterfly(&self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
        match len {
            0 | 1 => wrap_fft(Dft::new(len, direction)),
            2 => wrap_fft(Butterfly2::new(direction)),
            3 => wrap_fft(Butterfly3::new(direction)),
            4 => wrap_fft(Butterfly4::new(direction)),
            5 => wrap_fft(Butterfly5Avx64::new(direction).unwrap()),
            6 => wrap_fft(Butterfly6::new(direction)),
            7 => wrap_fft(Butterfly7Avx64::new(direction).unwrap()),
            8 => wrap_fft(Butterfly8Avx64::new(direction).unwrap()),
            9 => wrap_fft(Butterfly9Avx64::new(direction).unwrap()),
            11 => wrap_fft(Butterfly11Avx64::new(direction).unwrap()),
            12 => wrap_fft(Butterfly12Avx64::new(direction).unwrap()),
            13 => wrap_fft(Butterfly13::new(direction)),
            16 => wrap_fft(Butterfly16Avx64::new(direction).unwrap()),
            17 => wrap_fft(Butterfly17::new(direction)),
            18 => wrap_fft(Butterfly18Avx64::new(direction).unwrap()),
            19 => wrap_fft(Butterfly19::new(direction)),
            23 => wrap_fft(Butterfly23::new(direction)),
            24 => wrap_fft(Butterfly24Avx64::new(direction).unwrap()),
            27 => wrap_fft(Butterfly27Avx64::new(direction).unwrap()),
            29 => wrap_fft(Butterfly29::new(direction)),
            31 => wrap_fft(Butterfly31::new(direction)),
            32 => wrap_fft(Butterfly32Avx64::new(direction).unwrap()),
            36 => wrap_fft(Butterfly36Avx64::new(direction).unwrap()),
            64 => wrap_fft(Butterfly64Avx64::new(direction).unwrap()),
            128 => wrap_fft(Butterfly128Avx64::new(direction).unwrap()),
            256 => wrap_fft(Butterfly256Avx64::new(direction).unwrap()),
            512 => wrap_fft(Butterfly512Avx64::new(direction).unwrap()),
            _ => panic!("Invalid butterfly len: {}", len),
        }
    }
}
660
661//-------------------------------------------------------------------
662// type-agnostic planning stuff
663//-------------------------------------------------------------------
664impl<A: AvxNum, T: FftNum> AvxPlannerInternal<A, T> {
    // Given a length, return a plan for how this FFT should be computed.
    // `base_fn` supplies the type-specific (f32 vs f64) heuristic for choosing the plan's base step.
    fn plan_fft(
        &self,
        len: usize,
        direction: FftDirection,
        base_fn: impl FnOnce(&Self, usize, &PartialFactors) -> MixedRadixPlan,
    ) -> MixedRadixPlan {
        // First step: If this size is already cached, return it directly
        if self.cache.contains_fft(len, direction) {
            return MixedRadixPlan::cached(len);
        }

        // We have butterflies for everything below 10, so if it's below 10, just skip the factorization etc
        // Notably, this step is *required* if the len is 0, since we can't compute a prime factorization for zero
        if len < 10 {
            return MixedRadixPlan::butterfly(len, Vec::new());
        }

        // This length is not cached, so we have to come up with a new plan. The first step is to find a suitable base.
        let factors = PartialFactors::compute(len);
        let base = base_fn(self, len, &factors);

        // it's possible that the base planner plans out the whole FFT. it's guaranteed if `len` is a prime number, or if it's a butterfly, for example
        let uncached_plan = if base.len == len {
            base
        } else {
            // We have some mixed radix steps to compute! Compute the factors that need to be computed by mixed radix steps,
            let radix_factors = factors
                .divide_by(&PartialFactors::compute(base.len))
                .unwrap_or_else(|| {
                    panic!(
                        "Invalid base for FFT length={}, base={:?}, base radixes={:?}",
                        len, base.base, base.radixes
                    )
                });
            self.plan_mixed_radix(radix_factors, base)
        };

        // Last step: We have a full FFT plan, but some of the steps of that plan may have been cached. If they have, use the largest cached step as the base.
        self.replan_with_cache(uncached_plan, direction)
    }
706
707 // Takes a plan and an algorithm cache, and replaces steps of the plan with cached steps, if possible
708 fn replan_with_cache(&self, plan: MixedRadixPlan, direction: FftDirection) -> MixedRadixPlan {
709 enum CacheLocation {
710 None,
711 Base,
712 Radix(usize, usize), // First value is the length of the cached FFT, and second value is the index in the radix array
713 }
714
715 let mut largest_cached_len = CacheLocation::None;
716 let base_len = plan.base.base_len();
717 let mut current_len = base_len;
718
719 // Check if the cache contains the base
720 if self.cache.contains_fft(current_len, direction) {
721 largest_cached_len = CacheLocation::Base;
722 }
723
724 // Walk up the radix chain, checking if rthe cache contains each step
725 for (i, radix) in plan.radixes.iter().enumerate() {
726 current_len *= *radix as usize;
727
728 if self.cache.contains_fft(current_len, direction) {
729 largest_cached_len = CacheLocation::Radix(current_len, i);
730 }
731 }
732
733 // If we found a cached length within the plan, update the plan to account for the cache
734 match largest_cached_len {
735 CacheLocation::None => plan,
736 CacheLocation::Base => {
737 MixedRadixPlan::new(MixedRadixBase::CacheBase(base_len), plan.radixes)
738 }
739 CacheLocation::Radix(cached_len, cached_index) => {
740 // We know that `plan.radixes[cached_index]` is the largest cache value, and `cached_len` will be our new base legth
741 // Drop every element from `plan.radixes` from up to and including cached_index
742 let mut chain = plan.radixes;
743 chain.drain(0..=cached_index);
744 MixedRadixPlan::new(MixedRadixBase::CacheBase(cached_len), chain)
745 }
746 }
747 }
748 // given a set of factors, compute how many iterations of 12xn and 16xn we should plan for. Returns (k, j) for 12^k and 6^j
749 fn plan_power12_power6(radix_factors: &PartialFactors) -> (u32, u32) {
750 // it's helpful to think of this process as rewriting the FFT length as powers of our radixes
751 // the fastest FFT we could possibly compute is 8^n, because the 8xn algorithm is blazing fast. 9xn and 12xn are also in the top tier for speed, so those 3 algorithms are what we will aim for
752 // Specifically, we want to find a combination of 8, 9, and 12, that will "consume" all factors of 2 and 3, without having any leftovers
753
754 // Unfortunately, most FFTs don't come in the form 8^n * 9^m * 12^k
755 // Thankfully, 6xn is also reasonably fast, so we can use 6xn to strip away factors.
756 // This function's job will be to divide radix_factors into 8^n * 9^m * 12^k * 6^j, which minimizes j, then maximizes k
757
758 // we're going to hypothetically add as many 12's to our plan as possible, keeping track of how many 6's were required to balance things out
759 // we can also compute this analytically with modular arithmetic, but that technique only works when the FFT is above a minimum size, but this loop+array technique always works
760 let max_twelves = min(radix_factors.get_power2() / 2, radix_factors.get_power3());
761 let mut required_sixes = [None; 4]; // only track 6^0 through 6^3. 6^4 can be converted into 12^2 * 9, and 6^5 can be converted into 12 * 8 * 9 * 9
762 for hypothetical_twelve_power in 0..(max_twelves + 1) {
763 let hypothetical_twos = radix_factors.get_power2() - hypothetical_twelve_power * 2;
764 let hypothetical_threes = radix_factors.get_power3() - hypothetical_twelve_power;
765
766 // figure out how many sixes we would need to leave our FFT at 8^n * 9^m via modular arithmetic, and write to that index of our twelves_per_sixes array
767 let sixes = match (hypothetical_twos % 3, hypothetical_threes % 2) {
768 (0, 0) => Some(0),
769 (1, 1) => Some(1),
770 (2, 0) => Some(2),
771 (0, 1) => Some(3),
772 (1, 0) => None, // it would take 4 sixes, which can be replaced by 2 twelves, so we'll hit it in a later loop (if we have that many factors)
773 (2, 1) => None, // it would take 5 sixes, but note that 12 is literally 2^2 * 3^1, so instead of applying 5 sixes, we can apply a single 12
774 (_, _) => unreachable!(),
775 };
776
777 // if we can bring the FFT into range for the fast path with sixes, record so in the required_sixes array
778 // but make sure the number of sixes we're going to apply actually fits into our available factors
779 if let Some(sixes) = sixes {
780 if sixes <= hypothetical_twos && sixes <= hypothetical_threes {
781 required_sixes[sixes as usize] = Some(hypothetical_twelve_power)
782 }
783 }
784 }
785
786 // required_sixes[i] now contains the largest power of twelve that we can apply, given that we also apply 6^i
787 // we want to apply as many of 12 as possible, so take the array element with the largest non-None element
788 // note that it's possible (and very likely) that either power_twelve or power_six is zero, or both of them are zero! this will happen for a pure power of 2 or power of 3 FFT, for example
789 let (power_twelve, mut power_six) = required_sixes
790 .iter()
791 .enumerate()
792 .filter_map(|(i, maybe_twelve)| maybe_twelve.map(|twelve| (twelve, i as u32)))
793 .fold(
794 (0, 0),
795 |best, current| if current.0 >= best.0 { current } else { best },
796 );
797
798 // special case: if we have exactly one factor of 2 and at least one factor of 3, unconditionally apply a factor of 6 to get rid of the 2
799 if radix_factors.get_power2() == 1 && radix_factors.get_power3() > 0 {
800 power_six = 1;
801 }
802 // special case: if we have a single factor of 3 and more than one factor of 2 (and we don't have any twelves), unconditionally apply a factor of 6 to get rid of the 3
803 if radix_factors.get_power2() > 1 && radix_factors.get_power3() == 1 && power_twelve == 0 {
804 power_six = 1;
805 }
806
807 (power_twelve, power_six)
808 }
809
810 fn plan_mixed_radix(
811 &self,
812 mut radix_factors: PartialFactors,
813 mut plan: MixedRadixPlan,
814 ) -> MixedRadixPlan {
815 // if we can complete the FFT with a single radix, do it
816 if [2, 3, 4, 5, 6, 7, 8, 9, 12, 16].contains(&radix_factors.product()) {
817 plan.push_radix(radix_factors.product() as u8)
818 } else {
819 // Compute how many powers of 12 and powers of 6 we want to strip away
820 let (power_twelve, power_six) = Self::plan_power12_power6(&radix_factors);
821
822 // divide our powers of 12 and 6 out of our radix factors
823 radix_factors = radix_factors
824 .divide_by(&PartialFactors::compute(
825 6usize.pow(power_six) * 12usize.pow(power_twelve),
826 ))
827 .unwrap();
828
829 // now that we know the 12 and 6 factors, the plan array can be computed in descending radix size
830 if radix_factors.get_power2() % 3 == 1 && radix_factors.get_power2() > 1 {
831 // our factors of 2 might not quite be a power of 8 -- our plan_power12_power6 function tried its best, but if there are very few factors of 3, it can't help.
832 // if we're 2 * 8^N, benchmarking shows that applying a 16 before our chain of 8s is very fast.
833 plan.push_radix(16);
834 radix_factors = radix_factors
835 .divide_by(&PartialFactors::compute(16))
836 .unwrap();
837 }
838 plan.push_radix_power(12, power_twelve);
839 plan.push_radix_power(11, radix_factors.get_power11());
840 plan.push_radix_power(9, radix_factors.get_power3() / 2);
841 plan.push_radix_power(8, radix_factors.get_power2() / 3);
842 plan.push_radix_power(7, radix_factors.get_power7());
843 plan.push_radix_power(6, power_six);
844 plan.push_radix_power(5, radix_factors.get_power5());
845 if radix_factors.get_power2() % 3 == 2 {
846 // our factors of 2 might not quite be a power of 8 -- our plan_power12_power6 function tried its best, but if we are a power of 2, it can't help.
847 // if we're 4 * 8^N, benchmarking shows that applying a 4 to the end our chain of 8s is very fast.
848 plan.push_radix(4);
849 }
850 if radix_factors.get_power3() % 2 == 1 {
851 // our factors of 3 might not quite be a power of 9 -- our plan_power12_power6 function tried its best, but if we are a power of 3, it can't help.
852 // if we're 3 * 9^N, our only choice is to add an 8xn step
853 plan.push_radix(3);
854 }
855 if radix_factors.get_power2() % 3 == 1 {
856 // our factors of 2 might not quite be a power of 8. We tried to correct this with a 16 radix and 4 radix, but as a last resort, apply a 2. 2 is very slow, but it's better than not computing the FFT
857 plan.push_radix(2);
858 }
859
860 // measurement opportunity: is it faster to let the plan_power12_power6 function put a 4 on the end instead of relying on all 8's?
861 // measurement opportunity: is it faster to slap a 16 on top of the stack?
862 // measurement opportunity: if our plan_power12_power6 function adds both 12s and sixes, is it faster to drop combinations of 12+6 down to 8+9?
863 };
864 plan
865 }
866
867 // Constructs and returns a FFT instance from a FFT plan.
868 // If the base is a butterfly, it will call the provided `construct_butterfly_fn` to do so.
869 // If constructing the base requires constructing an inner FFT (IE bluetein's or rader's algorithm), it will call the provided `inner_fft_fn` to construct it
870 fn construct_plan(
871 &mut self,
872 plan: MixedRadixPlan,
873 direction: FftDirection,
874 construct_butterfly_fn: impl FnOnce(&Self, usize, FftDirection) -> Arc<dyn Fft<T>>,
875 inner_fft_fn: impl FnOnce(&mut Self, usize, FftDirection) -> Arc<dyn Fft<T>>,
876 ) -> Arc<dyn Fft<T>> {
877 let mut fft = match plan.base {
878 MixedRadixBase::CacheBase(len) => self.cache.get(len, direction).unwrap(),
879 MixedRadixBase::ButterflyBase(len) => {
880 let butterfly_instance = construct_butterfly_fn(self, len, direction);
881
882 // Cache this FFT instance for future calls to `plan_fft`
883 self.cache.insert(&butterfly_instance);
884
885 butterfly_instance
886 }
887 MixedRadixBase::RadersBase(len) => {
888 // Rader's Algorithm requires an inner FFT of size len - 1
889 let inner_fft = inner_fft_fn(self, len - 1, direction);
890
891 // try to construct our AVX2 rader's algorithm. If that fails (probably because the machine we're running on doesn't have AVX2), fall back to scalar
892 let raders_instance =
893 if let Ok(raders_avx) = RadersAvx2::<A, T>::new(Arc::clone(&inner_fft)) {
894 wrap_fft(raders_avx)
895 } else {
896 wrap_fft(RadersAlgorithm::new(inner_fft))
897 };
898
899 // Cache this FFT instance for future calls to `plan_fft`
900 self.cache.insert(&raders_instance);
901
902 raders_instance
903 }
904 MixedRadixBase::BluesteinsBase(len, inner_fft_len) => {
905 // Bluestein's has an inner FFT of arbitrary size. But we've already planned it, so just use what we planned
906 let inner_fft = inner_fft_fn(self, inner_fft_len, direction);
907
908 // try to construct our AVX2 rader's algorithm. If that fails (probably because the machine we're running on doesn't have AVX2), fall back to scalar
909 let bluesteins_instance =
910 wrap_fft(BluesteinsAvx::<A, T>::new(len, inner_fft).unwrap());
911
912 // Cache this FFT instance for future calls to `plan_fft`
913 self.cache.insert(&bluesteins_instance);
914
915 bluesteins_instance
916 }
917 };
918
919 // We have constructed our base. Now, construct the radix chain.
920 for radix in plan.radixes {
921 fft = match radix {
922 2 => wrap_fft(MixedRadix2xnAvx::<A, T>::new(fft).unwrap()),
923 3 => wrap_fft(MixedRadix3xnAvx::<A, T>::new(fft).unwrap()),
924 4 => wrap_fft(MixedRadix4xnAvx::<A, T>::new(fft).unwrap()),
925 5 => wrap_fft(MixedRadix5xnAvx::<A, T>::new(fft).unwrap()),
926 6 => wrap_fft(MixedRadix6xnAvx::<A, T>::new(fft).unwrap()),
927 7 => wrap_fft(MixedRadix7xnAvx::<A, T>::new(fft).unwrap()),
928 8 => wrap_fft(MixedRadix8xnAvx::<A, T>::new(fft).unwrap()),
929 9 => wrap_fft(MixedRadix9xnAvx::<A, T>::new(fft).unwrap()),
930 11 => wrap_fft(MixedRadix11xnAvx::<A, T>::new(fft).unwrap()),
931 12 => wrap_fft(MixedRadix12xnAvx::<A, T>::new(fft).unwrap()),
932 16 => wrap_fft(MixedRadix16xnAvx::<A, T>::new(fft).unwrap()),
933 _ => unreachable!(),
934 };
935
936 // Cache this FFT instance for future calls to `plan_fft`
937 self.cache.insert(&fft);
938 }
939
940 fft
941 }
942
943 // Plan and return the inner size to be used with Bluestein's Algorithm
944 // Calls `filter_fn` on result candidates, giving the caller the opportunity to reject certain sizes
945 fn plan_bluesteins(
946 &self,
947 len: usize,
948 filter_fn: impl FnMut(&&(usize, u32, u32)) -> bool,
949 ) -> usize {
950 assert!(len > 1); // Internal consistency check: The logic in this method doesn't work for a length of 1
951
952 // Bluestein's computes a FFT of size `len` by reorganizing it as a FFT of ANY size greater than or equal to len * 2 - 1
953 // an obvious choice is the next power of two larger than len * 2 - 1, but if we can find a smaller FFT that will go faster, we can save a lot of time.
954 // We can very efficiently compute almost any 2^n * 3^m, so we're going to search for all numbers of the form 2^n * 3^m that lie between len * 2 - 1 and the next power of two.
955 let min_len = len * 2 - 1;
956 let baseline_candidate = min_len.checked_next_power_of_two().unwrap();
957
958 // our algorithm here is to start with our next power of 2, and repeatedly divide by 2 and multiply by 3, trying to keep our value in range
959 let mut bluesteins_candidates = Vec::new();
960 let mut candidate = baseline_candidate;
961 let mut factor2 = candidate.trailing_zeros();
962 let mut factor3 = 0;
963
964 let min_factor2 = 2; // benchmarking shows that while 3^n and 2 * 3^n are fast, they're typically slower than the next-higher candidate, so don't bother generating them
965 while factor2 >= min_factor2 {
966 // if this candidate length isn't too small, add it to our candidates list
967 if candidate >= min_len {
968 bluesteins_candidates.push((candidate, factor2, factor3));
969 }
970 // if the candidate is too large, divide it by 2. if it's too small, divide it by 3
971 if candidate >= baseline_candidate {
972 candidate >>= 1;
973 factor2 -= 1;
974 } else {
975 candidate *= 3;
976 factor3 += 1;
977 }
978 }
979 bluesteins_candidates.sort();
980
981 // we now have a list of candidates to choosse from. some 2^n * 3^m FFTs are faster than others, so apply a filter, which will let us skip sizes that benchmarking has shown to be slow
982 let (chosen_size, _, _) =
983 bluesteins_candidates
984 .iter()
985 .find(filter_fn)
986 .unwrap_or_else(|| {
987 panic!(
988 "Failed to find a bluestein's candidate for len={}, candidates: {:?}",
989 len, bluesteins_candidates
990 )
991 });
992
993 *chosen_size
994 }
995}
996
#[cfg(test)]
mod unit_tests {
    use super::*;

    // A zero-length FFT has nothing to compute, but planning and running one must not panic.
    // Exercise both the f32 and f64 planners against an empty buffer.
    #[test]
    fn test_plan_zero_avx() {
        let mut single_precision_planner = FftPlannerAvx::<f32>::new().unwrap();
        single_precision_planner.plan_fft_forward(0).process(&mut []);

        let mut double_precision_planner = FftPlannerAvx::<f64>::new().unwrap();
        double_precision_planner.plan_fft_forward(0).process(&mut []);
    }
}
1012}