// wide/f32x8_.rs

use super::*;

pick! {
  if #[cfg(target_feature="avx")] {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { avx: m256 }
  } else {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { a : f32x4, b : f32x4 }
  }
}

macro_rules! const_f32_as_f32x8 {
  ($i:ident, $f:expr) => {
    #[allow(non_upper_case_globals)]
    pub const $i: f32x8 = f32x8::new([$f; 8]);
  };
}

impl f32x8 {
  const_f32_as_f32x8!(ONE, 1.0);
  const_f32_as_f32x8!(HALF, 0.5);
  const_f32_as_f32x8!(ZERO, 0.0);
  const_f32_as_f32x8!(E, core::f32::consts::E);
  const_f32_as_f32x8!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
  const_f32_as_f32x8!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
  const_f32_as_f32x8!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
  const_f32_as_f32x8!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
  const_f32_as_f32x8!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
  const_f32_as_f32x8!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
  const_f32_as_f32x8!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
  const_f32_as_f32x8!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
  const_f32_as_f32x8!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
  const_f32_as_f32x8!(LN_2, core::f32::consts::LN_2);
  const_f32_as_f32x8!(LN_10, core::f32::consts::LN_10);
  const_f32_as_f32x8!(LOG2_E, core::f32::consts::LOG2_E);
  const_f32_as_f32x8!(LOG10_E, core::f32::consts::LOG10_E);
  const_f32_as_f32x8!(LOG10_2, core::f32::consts::LOG10_2);
  const_f32_as_f32x8!(LOG2_10, core::f32::consts::LOG2_10);
  const_f32_as_f32x8!(PI, core::f32::consts::PI);
  const_f32_as_f32x8!(SQRT_2, core::f32::consts::SQRT_2);
  const_f32_as_f32x8!(TAU, core::f32::consts::TAU);
}

unsafe impl Zeroable for f32x8 {}
unsafe impl Pod for f32x8 {}

impl Add for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: add_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.add(rhs.a),
          b : self.b.add(rhs.b),
        }
      }
    }
  }
}

impl Sub for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sub_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.sub(rhs.a),
          b : self.b.sub(rhs.b),
        }
      }
    }
  }
}

impl Mul for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: mul_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.mul(rhs.a),
          b : self.b.mul(rhs.b),
        }
      }
    }
  }
}

impl Div for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: div_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.div(rhs.a),
          b : self.b.div(rhs.b),
        }
      }
    }
  }
}

impl Add<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Div<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32) -> Self::Output {
    self.div(Self::splat(rhs))
  }
}

impl Add<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).add(rhs)
  }
}

impl Sub<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).sub(rhs)
  }
}

impl Mul<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).mul(rhs)
  }
}

impl Div<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).div(rhs)
  }
}

impl BitAnd for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitand_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitand(rhs.a),
          b : self.b.bitand(rhs.b),
        }
      }
    }
  }
}

impl BitOr for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitor_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitor(rhs.a),
          b : self.b.bitor(rhs.b),
        }
      }
    }
  }
}

impl BitXor for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitxor_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitxor(rhs.a),
          b : self.b.bitxor(rhs.b),
        }
      }
    }
  }
}

impl CmpEq for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_eq(rhs.a),
          b : self.b.cmp_eq(rhs.b),
        }
      }
    }
  }
}

impl CmpGe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ge(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_ge(rhs.a),
          b : self.b.cmp_ge(rhs.b),
        }
      }
    }
  }
}

impl CmpGt for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_gt(rhs.a),
          b : self.b.cmp_gt(rhs.b),
        }
      }
    }
  }
}

impl CmpNe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ne(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_ne(rhs.a),
          b : self.b.cmp_ne(rhs.b),
        }
      }
    }
  }
}

impl CmpLe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_le(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_le(rhs.a),
          b : self.b.cmp_le(rhs.b),
        }
      }
    }
  }
}

impl CmpLt for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_lt(rhs.a),
          b : self.b.cmp_lt(rhs.b),
        }
      }
    }
  }
}

impl f32x8 {
  #[inline]
  #[must_use]
  pub const fn new(array: [f32; 8]) -> Self {
    // use the stable `core::mem::transmute`; the array and the vector have
    // matching layouts thanks to `#[repr(C, align(32))]`.
    unsafe { core::mem::transmute(array) }
  }
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: blend_varying_m256(f.avx, t.avx, self.avx) }
      } else {
        Self {
          a : self.a.blend(t.a, f.a),
          b : self.b.blend(t.b, f.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        let non_sign_bits = f32x8::from(f32::from_bits(i32::MAX as u32));
        self & non_sign_bits
      } else {
        Self {
          a : self.a.abs(),
          b : self.b.abs(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn floor(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: floor_m256(self.avx) }
      } else {
        Self {
          a : self.a.floor(),
          b : self.b.floor(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn ceil(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: ceil_m256(self.avx) }
      } else {
        Self {
          a : self.a.ceil(),
          b : self.b.ceil(),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. This is a faster
  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
  /// involved.
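  ///
  /// A quick sketch with finite lanes (this example assumes the surrounding
  /// crate is the `wide` crate, as the file path suggests):
  /// ```
  /// use wide::f32x8;
  /// let a = f32x8::new([1.0, 5.0, 3.0, 0.0, -1.0, -5.0, -3.0, 0.0]);
  /// let b = f32x8::new([2.0, 4.0, 3.0, 1.0, -2.0, -4.0, -3.0, -1.0]);
  /// assert_eq!(a.fast_max(b).to_array(), [2.0, 5.0, 3.0, 1.0, -1.0, -4.0, -3.0, 0.0]);
  /// ```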
  #[inline]
  #[must_use]
  pub fn fast_max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: max_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_max(rhs.a),
          b : self.b.fast_max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. This doesn't match
  /// IEEE-754 and instead is defined as `self < rhs ? rhs : self`.
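  ///
  /// Sketch with finite lanes (assuming the `wide` crate, per the file path):
  /// ```
  /// use wide::f32x8;
  /// let a = f32x8::new([1.0, 5.0, 3.0, 0.0, 6.0, -5.0, -3.0, 0.5]);
  /// let b = f32x8::splat(2.0);
  /// assert_eq!(a.max(b).to_array(), [2.0, 5.0, 3.0, 2.0, 6.0, 2.0, 2.0, 2.0]);
  /// ```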
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // max_m256 seems to do rhs < self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: max_m256(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.max(rhs.a),
          b : self.b.max(rhs.b),
        }
      }

    }
  }

  /// Calculates the lanewise minimum of both vectors. This is a faster
  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
  /// involved.
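  ///
  /// Finite-lane sketch (again assuming the crate is published as `wide`):
  /// ```
  /// use wide::f32x8;
  /// let a = f32x8::new([1.0, 5.0, 3.0, 0.0, -1.0, -5.0, -3.0, 0.0]);
  /// let b = f32x8::splat(2.0);
  /// assert_eq!(a.fast_min(b).to_array(), [1.0, 2.0, 2.0, 0.0, -1.0, -5.0, -3.0, 0.0]);
  /// ```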
  #[inline]
  #[must_use]
  pub fn fast_min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: min_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_min(rhs.a),
          b : self.b.fast_min(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
  /// the other lane gets chosen. Use `fast_min` for a faster implementation
  /// that doesn't handle NaNs.
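  ///
  /// Sketch of the documented NaN handling (assuming the `wide` crate):
  /// ```
  /// use wide::f32x8;
  /// let a = f32x8::new([1.0, f32::NAN, 3.0, -4.0, 1.0, 1.0, 1.0, 1.0]);
  /// let b = f32x8::splat(2.0);
  /// // the NaN lane takes its value from `b`
  /// assert_eq!(a.min(b).to_array(), [1.0, 2.0, 2.0, -4.0, 1.0, 1.0, 1.0, 1.0]);
  /// ```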
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // min_m256 seems to do rhs > self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: min_m256(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.min(rhs.a),
          b : self.b.min(rhs.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn is_nan(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(Unordered)}>(self.avx, self.avx) }
      } else {
        Self {
          a : self.a.is_nan(),
          b : self.b.is_nan(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn is_finite(self) -> Self {
    let shifted_exp_mask = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
    cast(out)
  }
  #[inline]
  #[must_use]
  pub fn is_inf(self) -> Self {
    let shifted_inf = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = (shift_u).cmp_eq(shifted_inf);
    cast(out)
  }

  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      // NOTE: Is there an SSE2 version of this? f32x4 version probably translates but I've not had time to figure it out
      if #[cfg(target_feature="avx")] {
        Self { avx: round_m256::<{round_op!(Nearest)}>(self.avx) }
      } else {
        Self {
          a : self.a.round(),
          b : self.b.round(),
        }
      }
    }
  }

  /// Rounds each lane into an integer. This is a faster implementation than
  /// `round_int`, but it doesn't handle out of range values or NaNs. For those
  /// values you get implementation defined behavior.
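  ///
  /// In-range sketch (assuming the `wide` crate; rounding is to nearest):
  /// ```
  /// use wide::f32x8;
  /// let v = f32x8::new([1.1, 2.7, -3.2, -4.8, 0.0, 10.0, -10.0, 100.4]);
  /// assert_eq!(v.fast_round_int().to_array(), [1, 3, -3, -5, 0, 10, -10, 100]);
  /// ```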
  #[inline]
  #[must_use]
  pub fn fast_round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        cast(convert_to_i32_m256i_from_m256(self.avx))
      } else {
        cast([
          self.a.fast_round_int(),
          self.b.fast_round_int()])
      }
    }
  }

  /// Rounds each lane into an integer. This saturates out of range values and
  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
  /// doesn't handle out of range values or NaNs.
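  ///
  /// Sketch of the saturating/NaN behavior (assuming the `wide` crate):
  /// ```
  /// use wide::f32x8;
  /// let v = f32x8::new([1.1, 2.7, -3.2, f32::NAN, 3.0e9, -3.0e9, 0.0, -1.6]);
  /// assert_eq!(v.round_int().to_array(), [1, 3, -3, 0, i32::MAX, i32::MIN, 0, -2]);
  /// ```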
  #[inline]
  #[must_use]
  pub fn round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx));
        flip_to_max ^ cast
      } else {
        cast([
          self.a.round_int(),
          self.b.round_int(),
        ])
      }
    }
  }

  /// Truncates each lane into an integer. This is a faster implementation than
  /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those
  /// values you get implementation defined behavior.
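  ///
  /// In-range sketch (assuming the `wide` crate):
  /// ```
  /// use wide::f32x8;
  /// let v = f32x8::new([1.9, -2.9, 0.5, -0.5, 10.0, -10.0, 3.3, -3.3]);
  /// assert_eq!(v.fast_trunc_int().to_array(), [1, -2, 0, 0, 10, -10, 3, -3]);
  /// ```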
  #[inline]
  #[must_use]
  pub fn fast_trunc_int(self) -> i32x8 {
    pick! {
      if #[cfg(all(target_feature="avx"))] {
        cast(convert_truncate_to_i32_m256i_from_m256(self.avx))
      } else {
        cast([
          self.a.fast_trunc_int(),
          self.b.fast_trunc_int(),
        ])
      }
    }
  }

  /// Truncates each lane into an integer. This saturates out of range values
  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
  /// that doesn't handle out of range values or NaNs.
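  ///
  /// Sketch of the saturating/NaN behavior (assuming the `wide` crate):
  /// ```
  /// use wide::f32x8;
  /// let v = f32x8::new([1.9, -2.9, f32::NAN, 3.0e9, -3.0e9, 0.0, 7.7, -7.7]);
  /// assert_eq!(v.trunc_int().to_array(), [1, -2, 0, i32::MAX, i32::MIN, 0, 7, -7]);
  /// ```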
  #[inline]
  #[must_use]
  pub fn trunc_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx));
        flip_to_max ^ cast
      } else {
        cast([
          self.a.trunc_int(),
          self.b.trunc_int(),
        ])
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn mul_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) + a
      } else {
        Self {
          a : self.a.mul_add(m.a, a.a),
          b : self.b.mul_add(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_sub_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) - a
      } else {
        Self {
          a : self.a.mul_sub(m.a, a.a),
          b : self.b.mul_sub(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        a - (self * m)
      } else {
        Self {
          a : self.a.mul_neg_add(m.a, a.a),
          b : self.b.mul_neg_add(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_neg_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_sub_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        -(self * m) - a
      } else {
        Self {
          a : self.a.mul_neg_sub(m.a, a.a),
          b : self.b.mul_neg_sub(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn flip_signs(self, signs: Self) -> Self {
    self ^ (signs & Self::from(-0.0))
  }

  #[inline]
  #[must_use]
  pub fn copysign(self, sign: Self) -> Self {
    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
    (self & magnitude_mask) | (sign & Self::from(-0.0))
  }

  #[inline]
  pub fn asin_acos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    (asin, acos)
  }

  #[inline]
  #[must_use]
  pub fn asin(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    asin
  }

  #[inline]
  #[must_use]
  pub fn acos(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    acos
  }

  #[inline]
  pub fn atan(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let t = self.abs();

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    // big:    z = -1.0 / t;
    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
    let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE);

    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;

    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // get sign bit
    re = (self.sign_bit()).blend(-re, re);

    re
  }

  #[inline]
  pub fn atan2(self, x: Self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let y = self;

    // move in first octant
    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.cmp_gt(x1);
    // swap x and y if y1 > x1
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    // check for special case: x and y are both +/- INF
    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      let minus_one = -Self::ONE;
      x2 = both_infinite.blend(x2 & minus_one, x2);
      y2 = both_infinite.blend(y2 & minus_one, y2);
    }

    // x = y = 0 will produce NAN. No problem, fixed below
    let t = y2 / x2;

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);

    let a = notsmal.blend(t - Self::ONE, t);
    let b = notsmal.blend(t + Self::ONE, Self::ONE);
    let s = notsmal & Self::FRAC_PI_4;
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // move back in place
    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re);
    re = (x.sign_bit()).blend(Self::PI - re, re);

    // get sign bit
    re = (y.sign_bit()).blend(-re, re);

    re
  }

  #[inline]
  #[must_use]
  pub fn sin_cos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h

    const_f32_as_f32x8!(DP1F, 0.78515625_f32 * 2.0);
    const_f32_as_f32x8!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
    const_f32_as_f32x8!(DP3F, 3.77489497744594108E-8_f32 * 2.0);

    const_f32_as_f32x8!(P0sinf, -1.6666654611E-1);
    const_f32_as_f32x8!(P1sinf, 8.3321608736E-3);
    const_f32_as_f32x8!(P2sinf, -1.9515295891E-4);

    const_f32_as_f32x8!(P0cosf, 4.166664568298827E-2);
    const_f32_as_f32x8!(P1cosf, -1.388731625493765E-3);
    const_f32_as_f32x8!(P2cosf, 2.443315711809948E-5);

    const_f32_as_f32x8!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);

    let xa = self.abs();

    // Find quadrant
    let y = (xa * TWO_OVER_PI).round();
    let q: i32x8 = y.round_int();

    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));

    let x2 = x * x;
    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
      + f32x8::from(0.5).mul_neg_add(x2, f32x8::from(1.0));

    let swap = !(q & i32x8::from(1)).cmp_eq(i32x8::from(0));

    let mut overflow: f32x8 = cast(q.cmp_gt(i32x8::from(0x2000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f32x8::from(0.0), s);
    c = overflow.blend(f32x8::from(1.0), c);

    // calc sin
    let mut sin1 = cast::<_, f32x8>(swap).blend(c, s);
    let sign_sin: i32x8 = (q << 30) ^ cast::<_, i32x8>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    // calc cos
    let mut cos1 = cast::<_, f32x8>(swap).blend(s, c);
    let sign_cos: i32x8 = ((q + i32x8::from(1)) & i32x8::from(2)) << 30;
    cos1 ^= cast::<_, f32x8>(sign_cos);

    (sin1, cos1)
  }
  #[inline]
  #[must_use]
  pub fn sin(self) -> Self {
    let (s, _) = self.sin_cos();
    s
  }
  #[inline]
  #[must_use]
  pub fn cos(self) -> Self {
    let (_, c) = self.sin_cos();
    c
  }
  #[inline]
  #[must_use]
  pub fn tan(self) -> Self {
    let (s, c) = self.sin_cos();
    s / c
  }
  #[inline]
  #[must_use]
  pub fn to_degrees(self) -> Self {
    const_f32_as_f32x8!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
    self * RAD_TO_DEG_RATIO
  }
  #[inline]
  #[must_use]
  pub fn to_radians(self) -> Self {
    const_f32_as_f32x8!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
    self * DEG_TO_RAD_RATIO
  }
  #[inline]
  #[must_use]
  pub fn recip(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_m256(self.avx) }
      } else {
        Self {
          a : self.a.recip(),
          b : self.b.recip(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn recip_sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_sqrt_m256(self.avx) }
      } else {
        Self {
          a : self.a.recip_sqrt(),
          b : self.b.recip_sqrt(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sqrt_m256(self.avx) }
      } else {
        Self {
          a : self.a.sqrt(),
          b : self.b.sqrt(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn move_mask(self) -> i32 {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx)
      } else {
        (self.b.move_mask() << 4) | self.a.move_mask()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx) != 0
      } else {
        self.a.any() || self.b.any()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx) == 0b11111111
      } else {
        self.a.all() && self.b.all()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }
  #[inline]
  fn vm_pow2n(self) -> Self {
    // Builds `2^n` directly from bits: adding `bias + 2^23` leaves `n + 127`
    // in the low mantissa bits, and shifting that up by 23 moves it into the
    // exponent field of the result.
    const_f32_as_f32x8!(pow2_23, 8388608.0);
    const_f32_as_f32x8!(bias, 127.0);
    let a = self + (bias + pow2_23);
    let c = cast::<_, i32x8>(a) << 23;
    cast::<_, f32x8>(c)
  }

  /// Calculates `e^x` (the exponential function) for each lane of a packed `f32x8`.
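  ///
  /// Rough accuracy sketch (assuming the `wide` crate; the polynomial
  /// approximation is close to, but not exactly, `f32::exp`):
  /// ```
  /// use wide::f32x8;
  /// let e = f32x8::new([0.0, 1.0, -1.0, 2.0, 0.5, -0.5, 3.0, 4.0]).exp();
  /// assert_eq!(e.to_array()[0], 1.0);
  /// assert!((e.to_array()[1] - core::f32::consts::E).abs() < 1e-5);
  /// ```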
  #[inline]
  #[must_use]
  pub fn exp(self) -> Self {
    const_f32_as_f32x8!(P0, 1.0 / 2.0);
    const_f32_as_f32x8!(P1, 1.0 / 6.0);
    const_f32_as_f32x8!(P2, 1. / 24.);
    const_f32_as_f32x8!(P3, 1. / 120.);
    const_f32_as_f32x8!(P4, 1. / 720.);
    const_f32_as_f32x8!(P5, 1. / 5040.);
    const_f32_as_f32x8!(LN2D_HI, 0.693359375);
    const_f32_as_f32x8!(LN2D_LO, -2.12194440e-4);
    let max_x = f32x8::from(87.3);
    let r = (self * Self::LOG2_E).round();
    let x = r.mul_neg_add(LN2D_HI, self);
    let x = r.mul_neg_add(LN2D_LO, x);
    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
    let x2 = x * x;
    let z = z.mul_add(x2, x);
    let n2 = Self::vm_pow2n(r);
    let z = (z + Self::ONE) * n2;
    // check for overflow
    let in_range = self.abs().cmp_lt(max_x);
    let in_range = in_range & self.is_finite();
    in_range.blend(z, Self::ZERO)
  }

  #[inline]
  fn exponent(self) -> f32x8 {
    const_f32_as_f32x8!(pow2_23, 8388608.0);
    const_f32_as_f32x8!(bias, 127.0);
    let a = cast::<_, u32x8>(self);
    let b = a >> 23;
    let c = b | cast::<_, u32x8>(pow2_23);
    let d = cast::<_, f32x8>(c);
    let e = d - (pow2_23 + bias);
    e
  }

  #[inline]
  fn fraction_2(self) -> Self {
    let t1 = cast::<_, u32x8>(self);
    let t2 = cast::<_, u32x8>(
      (t1 & u32x8::from(0x007FFFFF)) | u32x8::from(0x3F000000),
    );
    cast::<_, f32x8>(t2)
  }
  #[inline]
  fn is_zero_or_subnormal(self) -> Self {
    let t = cast::<_, i32x8>(self);
    let t = t & i32x8::splat(0x7F800000);
    i32x8::round_float(t.cmp_eq(i32x8::splat(0)))
  }
  #[inline]
  fn infinity() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7F800000))
  }
  #[inline]
  fn nan_log() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
  }
  #[inline]
  fn nan_pow() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
  }
  #[inline]
  pub fn sign_bit(self) -> Self {
    let t1 = cast::<_, i32x8>(self);
    let t2 = t1 >> 31;
    !cast::<_, f32x8>(t2).cmp_eq(f32x8::ZERO)
  }

  /// Horizontal add of all the elements of the vector.
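  ///
  /// Small sketch (assuming the `wide` crate):
  /// ```
  /// use wide::f32x8;
  /// let v = f32x8::new([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
  /// assert_eq!(v.reduce_add(), 36.0);
  /// ```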
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> f32 {
    pick! {
      // From https://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally
      if #[cfg(target_feature="avx")]{
        let hi_quad = extract_m128_from_m256::<1>(self.avx);
        let lo_quad = cast_to_m128_from_m256(self.avx);
        let sum_quad = add_m128(lo_quad,hi_quad);
        let lo_dual = sum_quad;
        let hi_dual = move_high_low_m128(sum_quad,sum_quad);
        let sum_dual = add_m128(lo_dual,hi_dual);
        let lo = sum_dual;
        let hi = shuffle_abi_f32_all_m128::<0b_01>(sum_dual, sum_dual);
        let sum = add_m128_s(lo, hi);
        get_f32_from_m128_s(sum)
      } else {
        self.a.reduce_add() + self.b.reduce_add()
      }
    }
  }

  /// Natural log (ln(x))
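  ///
  /// Rough accuracy sketch (assuming the `wide` crate; like `exp`, this is a
  /// polynomial approximation rather than a bit-exact `f32::ln`):
  /// ```
  /// use wide::f32x8;
  /// let x = f32x8::new([1.0, 2.0, core::f32::consts::E, 10.0, 0.5, 4.0, 8.0, 100.0]);
  /// let l = x.ln();
  /// assert_eq!(l.to_array()[0], 0.0);
  /// assert!((l.to_array()[3] - 10.0_f32.ln()).abs() < 1e-5);
  /// ```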
  #[inline]
  #[must_use]
  pub fn ln(self) -> Self {
    const_f32_as_f32x8!(HALF, 0.5);
    const_f32_as_f32x8!(P0, 3.3333331174E-1);
    const_f32_as_f32x8!(P1, -2.4999993993E-1);
    const_f32_as_f32x8!(P2, 2.0000714765E-1);
    const_f32_as_f32x8!(P3, -1.6668057665E-1);
    const_f32_as_f32x8!(P4, 1.4249322787E-1);
    const_f32_as_f32x8!(P5, -1.2420140846E-1);
    const_f32_as_f32x8!(P6, 1.1676998740E-1);
    const_f32_as_f32x8!(P7, -1.1514610310E-1);
    const_f32_as_f32x8!(P8, 7.0376836292E-2);
    const_f32_as_f32x8!(LN2F_HI, 0.693359375);
    const_f32_as_f32x8!(LN2F_LO, -2.12194440e-4);
    const_f32_as_f32x8!(VM_SMALLEST_NORMAL, 1.17549435E-38);

    let x1 = self;
    let x = Self::fraction_2(x1);
    let e = Self::exponent(x1);
    let mask = x.cmp_gt(Self::SQRT_2 * HALF);
    let x = (!mask).blend(x + x, x);
    let fe = mask.blend(e + Self::ONE, e);
    let x = x - Self::ONE;
    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
    let x2 = x * x;
    let res = x2 * x * res;
    let res = fe.mul_add(LN2F_LO, res);
    let res = res + x2.mul_neg_add(HALF, x);
    let res = fe.mul_add(LN2F_HI, res);
    let overflow = !self.is_finite();
    let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL);
    let mask = overflow | underflow;
    if !mask.any() {
      res
    } else {
      let is_zero = self.is_zero_or_subnormal();
      let res = underflow.blend(Self::nan_log(), res);
      let res = is_zero.blend(Self::infinity(), res);
      let res = overflow.blend(self, res);
      res
    }
  }

  #[inline]
  #[must_use]
  pub fn log2(self) -> Self {
    Self::ln(self) * Self::LOG2_E
  }
  #[inline]
  #[must_use]
  pub fn log10(self) -> Self {
    Self::ln(self) * Self::LOG10_E
  }

  #[inline]
  #[must_use]
  pub fn pow_f32x8(self, y: Self) -> Self {
    const_f32_as_f32x8!(ln2f_hi, 0.693359375);
    const_f32_as_f32x8!(ln2f_lo, -2.12194440e-4);
    const_f32_as_f32x8!(P0logf, 3.3333331174E-1);
    const_f32_as_f32x8!(P1logf, -2.4999993993E-1);
    const_f32_as_f32x8!(P2logf, 2.0000714765E-1);
    const_f32_as_f32x8!(P3logf, -1.6668057665E-1);
    const_f32_as_f32x8!(P4logf, 1.4249322787E-1);
    const_f32_as_f32x8!(P5logf, -1.2420140846E-1);
    const_f32_as_f32x8!(P6logf, 1.1676998740E-1);
    const_f32_as_f32x8!(P7logf, -1.1514610310E-1);
    const_f32_as_f32x8!(P8logf, 7.0376836292E-2);

    const_f32_as_f32x8!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
    const_f32_as_f32x8!(p3expf, 1.0 / 6.0);
    const_f32_as_f32x8!(p4expf, 1.0 / 24.0);
    const_f32_as_f32x8!(p5expf, 1.0 / 120.0);
    const_f32_as_f32x8!(p6expf, 1.0 / 720.0);
    const_f32_as_f32x8!(p7expf, 1.0 / 5040.0);

    let x1 = self.abs();
    let x = x1.fraction_2();
    let mask = x.cmp_gt(f32x8::SQRT_2 * f32x8::HALF);
    let x = (!mask).blend(x + x, x);

    let x = x - f32x8::ONE;
    let x2 = x * x;
    let lg1 = polynomial_8!(
      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
    );
    let lg1 = lg1 * x2 * x;

    let ef = x1.exponent();
    let ef = mask.blend(ef + f32x8::ONE, ef);
    let e1 = (ef * y).round();
    let yr = ef.mul_sub(y, e1);

    let lg = f32x8::HALF.mul_neg_add(x2, x) + lg1;
    let x2_err = (f32x8::HALF * x).mul_sub(x, f32x8::HALF * x2);
    let lg_err = f32x8::HALF.mul_add(x2, lg - x) - lg1;

    let e2 = (lg * y * f32x8::LOG2_E).round();
    let v = lg.mul_sub(y, e2 * ln2f_hi);
    let v = e2.mul_neg_add(ln2f_lo, v);
    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x8::LN_2);

    let x = v;
    let e3 = (x * f32x8::LOG2_E).round();
    let x = e3.mul_neg_add(f32x8::LN_2, x);
    let x2 = x * x;
    let z = x2.mul_add(
      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
      x + f32x8::ONE,
    );

    let ee = e1 + e2 + e3;
    let ei = cast::<_, i32x8>(ee.round_int());
    let ej = cast::<_, i32x8>(ei + (cast::<_, i32x8>(z) >> 23));

    let overflow = cast::<_, f32x8>(ej.cmp_gt(i32x8::splat(0x0FF)))
      | (ee.cmp_gt(f32x8::splat(300.0)));
    let underflow = cast::<_, f32x8>(ej.cmp_lt(i32x8::splat(0x000)))
      | (ee.cmp_lt(f32x8::splat(-300.0)));

    // Add exponent by integer addition
    let z = cast::<_, f32x8>(cast::<_, i32x8>(z) + (ei << 23));
    // Check for overflow/underflow
    let z = underflow.blend(f32x8::ZERO, z);
    let z = overflow.blend(Self::infinity(), z);

    // Check for self == 0
    let x_zero = self.is_zero_or_subnormal();
    let z = x_zero.blend(
      y.cmp_lt(f32x8::ZERO).blend(
        Self::infinity(),
        y.cmp_eq(f32x8::ZERO).blend(f32x8::ONE, f32x8::ZERO),
      ),
      z,
    );

    let x_sign = self.sign_bit();
    let z = if x_sign.any() {
      // Is y an integer?
      let yi = y.cmp_eq(y.round());

      // Is y odd?
      let y_odd = cast::<_, i32x8>(y.round_int() << 31).round_float();

      let z1 =
        yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow()));

      x_sign.blend(z1, z)
    } else {
      z
    };

    let x_finite = self.is_finite();
    let y_finite = y.is_finite();
    let e_finite = ee.is_finite();
    if (x_finite & y_finite & (e_finite | x_zero)).all() {
      return z;
    }

    (self.is_nan() | y.is_nan()).blend(self + y, z)
  }
  #[inline]
  pub fn powf(self, y: f32) -> Self {
    Self::pow_f32x8(self, f32x8::splat(y))
  }

  /// Transposes an 8x8 matrix of `f32` values. Currently only accelerated on AVX.
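  ///
  /// Small sketch treating each input lane as `row * 8 + column` (assuming
  /// the `wide` crate):
  /// ```
  /// use wide::f32x8;
  /// let rows: [f32x8; 8] = core::array::from_fn(|i| {
  ///   f32x8::new(core::array::from_fn(|j| (i * 8 + j) as f32))
  /// });
  /// let cols = f32x8::transpose(rows);
  /// // column 1 of the input becomes row 1 of the output
  /// assert_eq!(cols[1].to_array(), [1.0, 9.0, 17.0, 25.0, 33.0, 41.0, 49.0, 57.0]);
  /// ```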
  #[must_use]
  #[inline]
  pub fn transpose(data: [f32x8; 8]) -> [f32x8; 8] {
    pick! {
      if #[cfg(target_feature="avx")] {
        let a0 = unpack_lo_m256(data[0].avx, data[1].avx);
        let a1 = unpack_hi_m256(data[0].avx, data[1].avx);
        let a2 = unpack_lo_m256(data[2].avx, data[3].avx);
        let a3 = unpack_hi_m256(data[2].avx, data[3].avx);
        let a4 = unpack_lo_m256(data[4].avx, data[5].avx);
        let a5 = unpack_hi_m256(data[4].avx, data[5].avx);
        let a6 = unpack_lo_m256(data[6].avx, data[7].avx);
        let a7 = unpack_hi_m256(data[6].avx, data[7].avx);

        pub const fn mm_shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
          (z << 6) | (y << 4) | (x << 2) | w
        }

        const SHUFF_LO : i32 = mm_shuffle(1,0,1,0);
        const SHUFF_HI : i32 = mm_shuffle(3,2,3,2);

        // possible todo: intel performance manual suggests alternative with blend to avoid port 5 pressure
        // (since blend runs on a different port than shuffle)
        let b0 = shuffle_m256::<SHUFF_LO>(a0,a2);
        let b1 = shuffle_m256::<SHUFF_HI>(a0,a2);
        let b2 = shuffle_m256::<SHUFF_LO>(a1,a3);
        let b3 = shuffle_m256::<SHUFF_HI>(a1,a3);
        let b4 = shuffle_m256::<SHUFF_LO>(a4,a6);
        let b5 = shuffle_m256::<SHUFF_HI>(a4,a6);
        let b6 = shuffle_m256::<SHUFF_LO>(a5,a7);
        let b7 = shuffle_m256::<SHUFF_HI>(a5,a7);

        [
          f32x8 { avx: permute2z_m256::<0x20>(b0, b4) },
          f32x8 { avx: permute2z_m256::<0x20>(b1, b5) },
          f32x8 { avx: permute2z_m256::<0x20>(b2, b6) },
          f32x8 { avx: permute2z_m256::<0x20>(b3, b7) },
          f32x8 { avx: permute2z_m256::<0x31>(b0, b4) },
          f32x8 { avx: permute2z_m256::<0x31>(b1, b5) },
          f32x8 { avx: permute2z_m256::<0x31>(b2, b6) },
          f32x8 { avx: permute2z_m256::<0x31>(b3, b7) }
        ]
      } else {
        // possible todo: not sure that 128bit SIMD gives us a lot of speedup here

        #[inline(always)]
        fn transpose_column(data: &[f32x8; 8], index: usize) -> f32x8 {
          f32x8::new([
            data[0].as_array_ref()[index],
            data[1].as_array_ref()[index],
            data[2].as_array_ref()[index],
            data[3].as_array_ref()[index],
            data[4].as_array_ref()[index],
            data[5].as_array_ref()[index],
            data[6].as_array_ref()[index],
            data[7].as_array_ref()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
          transpose_column(&data, 4),
          transpose_column(&data, 5),
          transpose_column(&data, 6),
          transpose_column(&data, 7),
        ]
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [f32; 8] {
    cast(self)
  }

  #[inline]
  pub fn as_array_ref(&self) -> &[f32; 8] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_array_mut(&mut self) -> &mut [f32; 8] {
    cast_mut(self)
  }

  #[inline]
  pub fn from_i32x8(v: i32x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx: convert_to_m256_from_i32_m256i(v.avx2) }
      } else {
        Self::new([
            v.as_array_ref()[0] as f32,
            v.as_array_ref()[1] as f32,
            v.as_array_ref()[2] as f32,
            v.as_array_ref()[3] as f32,
            v.as_array_ref()[4] as f32,
            v.as_array_ref()[5] as f32,
            v.as_array_ref()[6] as f32,
            v.as_array_ref()[7] as f32,
          ])
      }
    }
  }
}

impl Not for f32x8 {
  type Output = Self;
  #[inline]
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: self.avx.not() }
      } else {
        Self {
          a : self.a.not(),
          b : self.b.not(),
        }
      }
    }
  }
}