rustfft/algorithm/
good_thomas_algorithm.rs

1use std::cmp::max;
2use std::sync::Arc;
3
4use num_complex::Complex;
5use num_integer::Integer;
6use strength_reduce::StrengthReducedUsize;
7use transpose;
8
9use crate::array_utils;
10use crate::common::{fft_error_inplace, fft_error_outofplace};
11use crate::{common::FftNum, FftDirection};
12use crate::{Direction, Fft, Length};
13
14/// Implementation of the [Good-Thomas Algorithm (AKA Prime Factor Algorithm)](https://en.wikipedia.org/wiki/Prime-factor_FFT_algorithm)
15///
16/// This algorithm factors a size n FFT into n1 * n2, where GCD(n1, n2) == 1
17///
18/// Conceptually, this algorithm is very similar to the Mixed-Radix, except because GCD(n1, n2) == 1 we can do some
19/// number theory trickery to reduce the number of floating-point multiplications and additions. Additionally, It can
20/// be faster than Mixed-Radix at sizes below 10,000 or so.
21///
22/// ~~~
23/// // Computes a forward FFT of size 1200, using the Good-Thomas Algorithm
24/// use rustfft::algorithm::GoodThomasAlgorithm;
25/// use rustfft::{Fft, FftPlanner};
26/// use rustfft::num_complex::Complex;
27/// use rustfft::num_traits::Zero;
28///
29/// let mut buffer = vec![Complex{ re: 0.0f32, im: 0.0f32 }; 1200];
30///
31/// // we need to find an n1 and n2 such that n1 * n2 == 1200 and GCD(n1, n2) == 1
32/// // n1 = 48 and n2 = 25 satisfies this
33/// let mut planner = FftPlanner::new();
34/// let inner_fft_n1 = planner.plan_fft_forward(48);
35/// let inner_fft_n2 = planner.plan_fft_forward(25);
36///
37/// // the good-thomas FFT length will be inner_fft_n1.len() * inner_fft_n2.len() = 1200
38/// let fft = GoodThomasAlgorithm::new(inner_fft_n1, inner_fft_n2);
39/// fft.process(&mut buffer);
40/// ~~~
41pub struct GoodThomasAlgorithm<T> {
42    width: usize,
43    width_size_fft: Arc<dyn Fft<T>>,
44
45    height: usize,
46    height_size_fft: Arc<dyn Fft<T>>,
47
48    reduced_width: StrengthReducedUsize,
49    reduced_width_plus_one: StrengthReducedUsize,
50
51    inplace_scratch_len: usize,
52    outofplace_scratch_len: usize,
53
54    len: usize,
55    direction: FftDirection,
56}
57
58impl<T: FftNum> GoodThomasAlgorithm<T> {
59    /// Creates a FFT instance which will process inputs/outputs of size `width_fft.len() * height_fft.len()`
60    ///
61    /// `GCD(width_fft.len(), height_fft.len())` must be equal to 1
62    pub fn new(mut width_fft: Arc<dyn Fft<T>>, mut height_fft: Arc<dyn Fft<T>>) -> Self {
63        assert_eq!(
64            width_fft.fft_direction(), height_fft.fft_direction(),
65            "width_fft and height_fft must have the same direction. got width direction={}, height direction={}",
66            width_fft.fft_direction(), height_fft.fft_direction());
67
68        let mut width = width_fft.len();
69        let mut height = height_fft.len();
70        let direction = width_fft.fft_direction();
71
72        // This algorithm doesn't work if width and height aren't coprime
73        let gcd = num_integer::gcd(width as i64, height as i64);
74        assert!(gcd == 1,
75                "Invalid width and height for Good-Thomas Algorithm (width={}, height={}): Inputs must be coprime",
76                width,
77                height);
78
79        // The trick we're using for our index remapping will only work if width < height, so just swap them if it isn't
80        if width > height {
81            std::mem::swap(&mut width, &mut height);
82            std::mem::swap(&mut width_fft, &mut height_fft);
83        }
84
85        let len = width * height;
86
87        // Collect some data about what kind of scratch space our inner FFTs need
88        let width_inplace_scratch = width_fft.get_inplace_scratch_len();
89        let height_inplace_scratch = height_fft.get_inplace_scratch_len();
90        let height_outofplace_scratch = height_fft.get_outofplace_scratch_len();
91
92        // Computing the scratch we'll require is a somewhat confusing process.
93        // When we compute an out-of-place FFT, both of our inner FFTs are in-place
94        // When we compute an inplace FFT, our inner width FFT will be inplace, and our height FFT will be out-of-place
95        // For the out-of-place FFT, one of 2 things can happen regarding scratch:
96        //      - If the required scratch of both FFTs is <= self.len(), then we can use the input or output buffer as scratch, and so we need 0 extra scratch
97        //      - If either of the inner FFTs require more, then we'll have to request an entire scratch buffer for the inner FFTs,
98        //          whose size is the max of the two inner FFTs' required scratch
99        let max_inner_inplace_scratch = max(height_inplace_scratch, width_inplace_scratch);
100        let outofplace_scratch_len = if max_inner_inplace_scratch > len {
101            max_inner_inplace_scratch
102        } else {
103            0
104        };
105
106        // For the in-place FFT, again the best case is that we can just bounce data around between internal buffers, and the only inplace scratch we need is self.len()
107        // If our height fft's OOP FFT requires any scratch, then we can tack that on the end of our own scratch, and use split_at_mut to separate our own from our internal FFT's
108        // Likewise, if our width inplace FFT requires more inplace scracth than self.len(), we can tack that on to the end of our own inplace scratch.
109        // Thus, the total inplace scratch is our own length plus the max of what the two inner FFTs will need
110        let inplace_scratch_len = len
111            + max(
112                if width_inplace_scratch > len {
113                    width_inplace_scratch
114                } else {
115                    0
116                },
117                height_outofplace_scratch,
118            );
119
120        Self {
121            width,
122            width_size_fft: width_fft,
123
124            height,
125            height_size_fft: height_fft,
126
127            reduced_width: StrengthReducedUsize::new(width),
128            reduced_width_plus_one: StrengthReducedUsize::new(width + 1),
129
130            inplace_scratch_len,
131            outofplace_scratch_len,
132
133            len,
134            direction,
135        }
136    }
137
138    fn reindex_input(&self, source: &[Complex<T>], destination: &mut [Complex<T>]) {
139        // A critical part of the good-thomas algorithm is re-indexing the inputs and outputs.
140        // To remap the inputs, we will use the CRT mapping, paired with the normal transpose we'd do for mixed radix.
141        //
142        // The algorithm for the CRT mapping will work like this:
143        // 1: Keep an output index, initialized to 0
144        // 2: The output index will be incremented by width + 1
145        // 3: At the start of the row, compute if we will increment output_index past self.len()
146        //      3a: If we will, then compute exactly how many increments it will take,
147        //      3b: Increment however many times as we scan over the input row, copying each element to the output index
148        //      3c: Subtract self.len() from output_index
149        // 4: Scan over the rest of the row, incrementing output_index, and copying each element to output_index, thne incrementing output_index
150        // 5: The first index of each row will be the final index of the previous row plus one, but because of our incrementing (width+1) inside the loop, we overshot, so at the end of the row, subtract width from output_index
151        //
152        // This ends up producing the same result as computing the multiplicative inverse of width mod height and etc by the CRT mapping, but with only one integer division per row, instead of one per element.
153        let mut destination_index = 0;
154        for mut source_row in source.chunks_exact(self.width) {
155            let increments_until_cycle =
156                1 + (self.len() - destination_index) / self.reduced_width_plus_one;
157
158            // If we will have to rollover output_index on this row, do it in a separate loop
159            if increments_until_cycle < self.width {
160                let (pre_cycle_row, post_cycle_row) = source_row.split_at(increments_until_cycle);
161
162                for input_element in pre_cycle_row {
163                    destination[destination_index] = *input_element;
164                    destination_index += self.reduced_width_plus_one.get();
165                }
166
167                // Store the split slice back to input_row, os that outside the loop, we can finish the job of iterating the row
168                source_row = post_cycle_row;
169                destination_index -= self.len();
170            }
171
172            // Loop over the entire row (if we did not roll over) or what's left of the row (if we did) and keep incrementing output_row
173            for input_element in source_row {
174                destination[destination_index] = *input_element;
175                destination_index += self.reduced_width_plus_one.get();
176            }
177
178            // The first index of the next will be the final index this row, plus one.
179            // But because of our incrementing (width+1) inside the loop above, we overshot, so subtract width, and we'll get (width + 1) - width = 1
180            destination_index -= self.width;
181        }
182    }
183
184    fn reindex_output(&self, source: &[Complex<T>], destination: &mut [Complex<T>]) {
185        // A critical part of the good-thomas algorithm is re-indexing the inputs and outputs.
186        // To remap the outputs, we will use the ruritanian mapping, paired with the normal transpose we'd do for mixed radix.
187        //
188        // The algorithm for the ruritanian mapping will work like this:
189        // 1: At the start of every row, compute the output index = (y * self.height) % self.width
190        // 2: We will increment this output index by self.width for every element
191        // 3: Compute where in the row the output index will wrap around
192        // 4: Instead of starting copying from the beginning of the row, start copying from after the rollover point
193        // 5: When we hit the end of the row, continue from the beginning of the row, continuing to increment the output index by self.width
194        //
195        // This achieves the same result as the modular arithmetic ofthe ruritanian mapping, but with only one integer divison per row, instead of one per element
196        for (y, source_chunk) in source.chunks_exact(self.height).enumerate() {
197            let (quotient, remainder) =
198                StrengthReducedUsize::div_rem(y * self.height, self.reduced_width);
199
200            // Compute our base index and starting point in the row
201            let mut destination_index = remainder;
202            let start_x = self.height - quotient;
203
204            // Process the first part of the row
205            for x in start_x..self.height {
206                destination[destination_index] = source_chunk[x];
207                destination_index += self.width;
208            }
209
210            // Wrap back around to the beginning of the row and keep incrementing
211            for x in 0..start_x {
212                destination[destination_index] = source_chunk[x];
213                destination_index += self.width;
214            }
215        }
216    }
217
218    fn perform_fft_inplace(&self, buffer: &mut [Complex<T>], scratch: &mut [Complex<T>]) {
219        let (scratch, inner_scratch) = scratch.split_at_mut(self.len());
220
221        // Re-index the input, copying from the buffer to the scratch in the process
222        self.reindex_input(buffer, scratch);
223
224        // run FFTs of size `width`
225        let width_scratch = if inner_scratch.len() > buffer.len() {
226            &mut inner_scratch[..]
227        } else {
228            &mut buffer[..]
229        };
230        self.width_size_fft
231            .process_with_scratch(scratch, width_scratch);
232
233        // transpose
234        transpose::transpose(scratch, buffer, self.width, self.height);
235
236        // run FFTs of size 'height'
237        self.height_size_fft
238            .process_outofplace_with_scratch(buffer, scratch, inner_scratch);
239
240        // Re-index the output, copying from the scratch to the buffer in the process
241        self.reindex_output(scratch, buffer);
242    }
243
244    fn perform_fft_out_of_place(
245        &self,
246        input: &mut [Complex<T>],
247        output: &mut [Complex<T>],
248        scratch: &mut [Complex<T>],
249    ) {
250        // Re-index the input, copying from the input to the output in the process
251        self.reindex_input(input, output);
252
253        // run FFTs of size `width`
254        let width_scratch = if scratch.len() > input.len() {
255            &mut scratch[..]
256        } else {
257            &mut input[..]
258        };
259        self.width_size_fft
260            .process_with_scratch(output, width_scratch);
261
262        // transpose
263        transpose::transpose(output, input, self.width, self.height);
264
265        // run FFTs of size 'height'
266        let height_scratch = if scratch.len() > output.len() {
267            &mut scratch[..]
268        } else {
269            &mut output[..]
270        };
271        self.height_size_fft
272            .process_with_scratch(input, height_scratch);
273
274        // Re-index the output, copying from the input to the output in the process
275        self.reindex_output(input, output);
276    }
277}
278boilerplate_fft!(
279    GoodThomasAlgorithm,
280    |this: &GoodThomasAlgorithm<_>| this.len,
281    |this: &GoodThomasAlgorithm<_>| this.inplace_scratch_len,
282    |this: &GoodThomasAlgorithm<_>| this.outofplace_scratch_len
283);
284
285/// Implementation of the Good-Thomas Algorithm, specialized for smaller input sizes
286///
287/// This algorithm factors a size n FFT into n1 * n2, where GCD(n1, n2) == 1
288///
289/// Conceptually, this algorithm is very similar to MixedRadix, except because GCD(n1, n2) == 1 we can do some
290/// number theory trickery to reduce the number of floating point operations. It typically performs
291/// better than MixedRadixSmall, especially at the smallest sizes.
292///
293/// ~~~
294/// // Computes a forward FFT of size 56 using GoodThomasAlgorithmSmall
295/// use std::sync::Arc;
296/// use rustfft::algorithm::GoodThomasAlgorithmSmall;
297/// use rustfft::algorithm::butterflies::{Butterfly7, Butterfly8};
298/// use rustfft::{Fft, FftDirection};
299/// use rustfft::num_complex::Complex;
300///
301/// let mut buffer = vec![Complex{ re: 0.0f32, im: 0.0f32 }; 56];
302///
303/// // we need to find an n1 and n2 such that n1 * n2 == 56 and GCD(n1, n2) == 1
304/// // n1 = 7 and n2 = 8 satisfies this
305/// let inner_fft_n1 = Arc::new(Butterfly7::new(FftDirection::Forward));
306/// let inner_fft_n2 = Arc::new(Butterfly8::new(FftDirection::Forward));
307///
308/// // the good-thomas FFT length will be inner_fft_n1.len() * inner_fft_n2.len() = 56
309/// let fft = GoodThomasAlgorithmSmall::new(inner_fft_n1, inner_fft_n2);
310/// fft.process(&mut buffer);
311/// ~~~
312pub struct GoodThomasAlgorithmSmall<T> {
313    width: usize,
314    width_size_fft: Arc<dyn Fft<T>>,
315
316    height: usize,
317    height_size_fft: Arc<dyn Fft<T>>,
318
319    input_output_map: Box<[usize]>,
320
321    direction: FftDirection,
322}
323
324impl<T: FftNum> GoodThomasAlgorithmSmall<T> {
325    /// Creates a FFT instance which will process inputs/outputs of size `width_fft.len() * height_fft.len()`
326    ///
327    /// `GCD(width_fft.len(), height_fft.len())` must be equal to 1
328    pub fn new(width_fft: Arc<dyn Fft<T>>, height_fft: Arc<dyn Fft<T>>) -> Self {
329        assert_eq!(
330            width_fft.fft_direction(), height_fft.fft_direction(),
331            "n1_fft and height_fft must have the same direction. got width direction={}, height direction={}",
332            width_fft.fft_direction(), height_fft.fft_direction());
333
334        let width = width_fft.len();
335        let height = height_fft.len();
336        let len = width * height;
337
338        assert_eq!(width_fft.get_outofplace_scratch_len(), 0, "GoodThomasAlgorithmSmall should only be used with algorithms that require 0 out-of-place scratch. Width FFT (len={}) requires {}, should require 0", width, width_fft.get_outofplace_scratch_len());
339        assert_eq!(height_fft.get_outofplace_scratch_len(), 0, "GoodThomasAlgorithmSmall should only be used with algorithms that require 0 out-of-place scratch. Height FFT (len={}) requires {}, should require 0", height, height_fft.get_outofplace_scratch_len());
340
341        assert!(width_fft.get_inplace_scratch_len() <= width, "GoodThomasAlgorithmSmall should only be used with algorithms that require little inplace scratch. Width FFT (len={}) requires {}, should require {} or less", width, width_fft.get_inplace_scratch_len(), width);
342        assert!(height_fft.get_inplace_scratch_len() <= height, "GoodThomasAlgorithmSmall should only be used with algorithms that require little inplace scratch. Height FFT (len={}) requires {}, should require {} or less", height, height_fft.get_inplace_scratch_len(), height);
343
344        // compute the multiplicative inverse of width mod height and vice versa. x will be width mod height, and y will be height mod width
345        let gcd_data = i64::extended_gcd(&(width as i64), &(height as i64));
346        assert!(gcd_data.gcd == 1,
347                "Invalid input width and height to Good-Thomas Algorithm: ({},{}): Inputs must be coprime",
348                width,
349                height);
350
351        // width_inverse or height_inverse might be negative, make it positive by wrapping
352        let width_inverse = if gcd_data.x >= 0 {
353            gcd_data.x
354        } else {
355            gcd_data.x + height as i64
356        } as usize;
357        let height_inverse = if gcd_data.y >= 0 {
358            gcd_data.y
359        } else {
360            gcd_data.y + width as i64
361        } as usize;
362
363        // NOTE: we are precomputing the input and output reordering indexes, because benchmarking shows that it's 10-20% faster
364        // If we wanted to optimize for memory use or setup time instead of multiple-FFT speed, we could compute these on the fly in the perform_fft() method
365        let input_iter = (0..len)
366            .map(|i| (i % width, i / width))
367            .map(|(x, y)| (x * height + y * width) % len);
368        let output_iter = (0..len).map(|i| (i % height, i / height)).map(|(y, x)| {
369            (x * height * height_inverse as usize + y * width * width_inverse as usize) % len
370        });
371
372        let input_output_map: Vec<usize> = input_iter.chain(output_iter).collect();
373
374        Self {
375            direction: width_fft.fft_direction(),
376
377            width,
378            width_size_fft: width_fft,
379
380            height,
381            height_size_fft: height_fft,
382
383            input_output_map: input_output_map.into_boxed_slice(),
384        }
385    }
386
387    fn perform_fft_out_of_place(
388        &self,
389        input: &mut [Complex<T>],
390        output: &mut [Complex<T>],
391        _scratch: &mut [Complex<T>],
392    ) {
393        // These asserts are for the unsafe blocks down below. we're relying on the optimizer to get rid of this assert
394        assert_eq!(self.len(), input.len());
395        assert_eq!(self.len(), output.len());
396
397        let (input_map, output_map) = self.input_output_map.split_at(self.len());
398
399        // copy the input using our reordering mapping
400        for (output_element, &input_index) in output.iter_mut().zip(input_map.iter()) {
401            *output_element = input[input_index];
402        }
403
404        // run FFTs of size `width`
405        self.width_size_fft.process_with_scratch(output, input);
406
407        // transpose
408        unsafe { array_utils::transpose_small(self.width, self.height, output, input) };
409
410        // run FFTs of size 'height'
411        self.height_size_fft.process_with_scratch(input, output);
412
413        // copy to the output, using our output redordeing mapping
414        for (input_element, &output_index) in input.iter().zip(output_map.iter()) {
415            output[output_index] = *input_element;
416        }
417    }
418
419    fn perform_fft_inplace(&self, buffer: &mut [Complex<T>], scratch: &mut [Complex<T>]) {
420        // These asserts are for the unsafe blocks down below. we're relying on the optimizer to get rid of this assert
421        assert_eq!(self.len(), buffer.len());
422        assert_eq!(self.len(), scratch.len());
423
424        let (input_map, output_map) = self.input_output_map.split_at(self.len());
425
426        // copy the input using our reordering mapping
427        for (output_element, &input_index) in scratch.iter_mut().zip(input_map.iter()) {
428            *output_element = buffer[input_index];
429        }
430
431        // run FFTs of size `width`
432        self.width_size_fft.process_with_scratch(scratch, buffer);
433
434        // transpose
435        unsafe { array_utils::transpose_small(self.width, self.height, scratch, buffer) };
436
437        // run FFTs of size 'height'
438        self.height_size_fft
439            .process_outofplace_with_scratch(buffer, scratch, &mut []);
440
441        // copy to the output, using our output redordeing mapping
442        for (input_element, &output_index) in scratch.iter().zip(output_map.iter()) {
443            buffer[output_index] = *input_element;
444        }
445    }
446}
447boilerplate_fft!(
448    GoodThomasAlgorithmSmall,
449    |this: &GoodThomasAlgorithmSmall<_>| this.width * this.height,
450    |this: &GoodThomasAlgorithmSmall<_>| this.len(),
451    |_| 0
452);
453
454#[cfg(test)]
455mod unit_tests {
456    use super::*;
457    use crate::test_utils::check_fft_algorithm;
458    use crate::{algorithm::Dft, test_utils::BigScratchAlgorithm};
459    use num_integer::gcd;
460    use num_traits::Zero;
461    use std::sync::Arc;
462
463    #[test]
464    fn test_good_thomas() {
465        for width in 1..12 {
466            for height in 1..12 {
467                if gcd(width, height) == 1 {
468                    test_good_thomas_with_lengths(width, height, FftDirection::Forward);
469                    test_good_thomas_with_lengths(width, height, FftDirection::Inverse);
470                }
471            }
472        }
473    }
474
475    #[test]
476    fn test_good_thomas_small() {
477        let butterfly_sizes = [2, 3, 4, 5, 6, 7, 8, 16];
478        for width in &butterfly_sizes {
479            for height in &butterfly_sizes {
480                if gcd(*width, *height) == 1 {
481                    test_good_thomas_small_with_lengths(*width, *height, FftDirection::Forward);
482                    test_good_thomas_small_with_lengths(*width, *height, FftDirection::Inverse);
483                }
484            }
485        }
486    }
487
488    fn test_good_thomas_with_lengths(width: usize, height: usize, direction: FftDirection) {
489        let width_fft = Arc::new(Dft::new(width, direction)) as Arc<dyn Fft<f32>>;
490        let height_fft = Arc::new(Dft::new(height, direction)) as Arc<dyn Fft<f32>>;
491
492        let fft = GoodThomasAlgorithm::new(width_fft, height_fft);
493
494        check_fft_algorithm(&fft, width * height, direction);
495    }
496
497    fn test_good_thomas_small_with_lengths(width: usize, height: usize, direction: FftDirection) {
498        let width_fft = Arc::new(Dft::new(width, direction)) as Arc<dyn Fft<f32>>;
499        let height_fft = Arc::new(Dft::new(height, direction)) as Arc<dyn Fft<f32>>;
500
501        let fft = GoodThomasAlgorithmSmall::new(width_fft, height_fft);
502
503        check_fft_algorithm(&fft, width * height, direction);
504    }
505
506    #[test]
507    fn test_output_mapping() {
508        let width = 15;
509        for height in 3..width {
510            if gcd(width, height) == 1 {
511                let width_fft =
512                    Arc::new(Dft::new(width, FftDirection::Forward)) as Arc<dyn Fft<f32>>;
513                let height_fft =
514                    Arc::new(Dft::new(height, FftDirection::Forward)) as Arc<dyn Fft<f32>>;
515
516                let fft = GoodThomasAlgorithm::new(width_fft, height_fft);
517
518                let mut buffer = vec![Complex { re: 0.0, im: 0.0 }; fft.len()];
519
520                fft.process(&mut buffer);
521            }
522        }
523    }
524
525    // Verify that the Good-Thomas algorithm correctly provides scratch space to inner FFTs
526    #[test]
527    fn test_good_thomas_inner_scratch() {
528        let scratch_lengths = [1, 5, 24];
529
530        let mut inner_ffts = Vec::new();
531
532        for &len in &scratch_lengths {
533            for &inplace_scratch in &scratch_lengths {
534                for &outofplace_scratch in &scratch_lengths {
535                    inner_ffts.push(Arc::new(BigScratchAlgorithm {
536                        len,
537                        inplace_scratch,
538                        outofplace_scratch,
539                        direction: FftDirection::Forward,
540                    }) as Arc<dyn Fft<f32>>);
541                }
542            }
543        }
544
545        for width_fft in inner_ffts.iter() {
546            for height_fft in inner_ffts.iter() {
547                if width_fft.len() == height_fft.len() {
548                    continue;
549                }
550
551                let fft = GoodThomasAlgorithm::new(Arc::clone(width_fft), Arc::clone(height_fft));
552
553                let mut inplace_buffer = vec![Complex::zero(); fft.len()];
554                let mut inplace_scratch = vec![Complex::zero(); fft.get_inplace_scratch_len()];
555
556                fft.process_with_scratch(&mut inplace_buffer, &mut inplace_scratch);
557
558                let mut outofplace_input = vec![Complex::zero(); fft.len()];
559                let mut outofplace_output = vec![Complex::zero(); fft.len()];
560                let mut outofplace_scratch =
561                    vec![Complex::zero(); fft.get_outofplace_scratch_len()];
562
563                fft.process_outofplace_with_scratch(
564                    &mut outofplace_input,
565                    &mut outofplace_output,
566                    &mut outofplace_scratch,
567                );
568            }
569        }
570    }
571}