wide/f64x4_.rs

use super::*;

pick! {
  if #[cfg(target_feature="avx")] {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f64x4 { pub(crate) avx: m256d }
  } else {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f64x4 { pub(crate) a: f64x2, pub(crate) b: f64x2 }
  }
}

macro_rules! const_f64_as_f64x4 {
  ($i:ident, $f:expr) => {
    #[allow(non_upper_case_globals)]
    pub const $i: f64x4 = f64x4::new([$f; 4]);
  };
}

impl f64x4 {
  const_f64_as_f64x4!(ONE, 1.0);
  const_f64_as_f64x4!(ZERO, 0.0);
  const_f64_as_f64x4!(HALF, 0.5);
  const_f64_as_f64x4!(E, core::f64::consts::E);
  const_f64_as_f64x4!(FRAC_1_PI, core::f64::consts::FRAC_1_PI);
  const_f64_as_f64x4!(FRAC_2_PI, core::f64::consts::FRAC_2_PI);
  const_f64_as_f64x4!(FRAC_2_SQRT_PI, core::f64::consts::FRAC_2_SQRT_PI);
  const_f64_as_f64x4!(FRAC_1_SQRT_2, core::f64::consts::FRAC_1_SQRT_2);
  const_f64_as_f64x4!(FRAC_PI_2, core::f64::consts::FRAC_PI_2);
  const_f64_as_f64x4!(FRAC_PI_3, core::f64::consts::FRAC_PI_3);
  const_f64_as_f64x4!(FRAC_PI_4, core::f64::consts::FRAC_PI_4);
  const_f64_as_f64x4!(FRAC_PI_6, core::f64::consts::FRAC_PI_6);
  const_f64_as_f64x4!(FRAC_PI_8, core::f64::consts::FRAC_PI_8);
  const_f64_as_f64x4!(LN_2, core::f64::consts::LN_2);
  const_f64_as_f64x4!(LN_10, core::f64::consts::LN_10);
  const_f64_as_f64x4!(LOG2_E, core::f64::consts::LOG2_E);
  const_f64_as_f64x4!(LOG10_E, core::f64::consts::LOG10_E);
  const_f64_as_f64x4!(LOG10_2, core::f64::consts::LOG10_2);
  const_f64_as_f64x4!(LOG2_10, core::f64::consts::LOG2_10);
  const_f64_as_f64x4!(PI, core::f64::consts::PI);
  const_f64_as_f64x4!(SQRT_2, core::f64::consts::SQRT_2);
  const_f64_as_f64x4!(TAU, core::f64::consts::TAU);
}

unsafe impl Zeroable for f64x4 {}
unsafe impl Pod for f64x4 {}

impl Add for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: add_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.add(rhs.a),
          b : self.b.add(rhs.b),
        }
      }
    }
  }
}

impl Sub for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sub_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.sub(rhs.a),
          b : self.b.sub(rhs.b),
        }
      }
    }
  }
}

impl Mul for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: mul_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.mul(rhs.a),
          b : self.b.mul(rhs.b),
        }
      }
    }
  }
}

impl Div for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: div_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.div(rhs.a),
          b : self.b.div(rhs.b),
        }
      }
    }
  }
}

impl Add<f64> for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: f64) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<f64> for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f64) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<f64> for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f64) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Div<f64> for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: f64) -> Self::Output {
    self.div(Self::splat(rhs))
  }
}

impl Add<f64x4> for f64 {
  type Output = f64x4;
  #[inline]
  #[must_use]
  fn add(self, rhs: f64x4) -> Self::Output {
    f64x4::splat(self).add(rhs)
  }
}

impl Sub<f64x4> for f64 {
  type Output = f64x4;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f64x4) -> Self::Output {
    f64x4::splat(self).sub(rhs)
  }
}

impl Mul<f64x4> for f64 {
  type Output = f64x4;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f64x4) -> Self::Output {
    f64x4::splat(self).mul(rhs)
  }
}

impl Div<f64x4> for f64 {
  type Output = f64x4;
  #[inline]
  #[must_use]
  fn div(self, rhs: f64x4) -> Self::Output {
    f64x4::splat(self).div(rhs)
  }
}

impl BitAnd for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitand_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitand(rhs.a),
          b : self.b.bitand(rhs.b),
        }
      }
    }
  }
}

impl BitOr for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitor_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitor(rhs.a),
          b : self.b.bitor(rhs.b),
        }
      }
    }
  }
}

impl BitXor for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitxor_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitxor(rhs.a),
          b : self.b.bitxor(rhs.b),
        }
      }
    }
  }
}

impl CmpEq for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_eq(rhs.a),
          b : self.b.cmp_eq(rhs.b),
        }
      }
    }
  }
}

impl CmpGe for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ge(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_ge(rhs.a),
          b : self.b.cmp_ge(rhs.b),
        }
      }
    }
  }
}

impl CmpGt for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(GreaterThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_gt(rhs.a),
          b : self.b.cmp_gt(rhs.b),
        }
      }
    }
  }
}

impl CmpNe for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ne(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_ne(rhs.a),
          b : self.b.cmp_ne(rhs.b),
        }
      }
    }
  }
}

impl CmpLe for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_le(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_le(rhs.a),
          b : self.b.cmp_le(rhs.b),
        }
      }
    }
  }
}

impl CmpLt for f64x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_lt(rhs.a),
          b : self.b.cmp_lt(rhs.b),
        }
      }
    }
  }
}

impl f64x4 {
  #[inline]
  #[must_use]
  pub const fn new(array: [f64; 4]) -> Self {
    unsafe { core::mem::transmute(array) }
  }
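  /// Lanewise select: where a lane of `self` (a comparison mask) is all ones,
  /// the matching lane of `t` is taken, otherwise the lane of `f`.
  ///
  /// An illustrative sketch of usage (not a doctest from the original source):
  /// ```
  /// # use wide::f64x4;
  /// let v = f64x4::new([1.0, f64::NAN, 3.0, f64::NAN]);
  /// let cleaned = v.is_nan().blend(f64x4::splat(0.0), v);
  /// assert_eq!(cleaned.to_array(), [1.0, 0.0, 3.0, 0.0]);
  /// ```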
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: blend_varying_m256d(f.avx, t.avx, self.avx) }
      } else {
        Self {
          a : self.a.blend(t.a, f.a),
          b : self.b.blend(t.b, f.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
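        // i64::MAX as u64 is 0x7FFF_FFFF_FFFF_FFFF, i.e. every bit except the
        // sign bit, so the AND below clears each lane's sign bit.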
        let non_sign_bits = f64x4::from(f64::from_bits(i64::MAX as u64));
        self & non_sign_bits
      } else {
        Self {
          a : self.a.abs(),
          b : self.b.abs(),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn floor(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: floor_m256d(self.avx) }
      } else {
        Self {
          a : self.a.floor(),
          b : self.b.floor(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn ceil(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: ceil_m256d(self.avx) }
      } else {
        Self {
          a : self.a.ceil(),
          b : self.b.ceil(),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. This is a faster
  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
  /// involved.
  #[inline]
  #[must_use]
  pub fn fast_max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: max_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_max(rhs.a),
          b : self.b.fast_max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. If either lane is NaN,
  /// the other lane gets chosen. Use `fast_max` for a faster implementation
  /// that doesn't handle NaNs.
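  ///
  /// An illustrative sketch of the NaN handling (not a doctest from the
  /// original source):
  /// ```
  /// # use wide::f64x4;
  /// let a = f64x4::new([1.0, f64::NAN, 3.0, 4.0]);
  /// let b = f64x4::new([2.0, 7.0, f64::NAN, -1.0]);
  /// assert_eq!(a.max(b).to_array(), [2.0, 7.0, 3.0, 4.0]);
  /// ```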
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // max_m256d seems to do rhs < self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: max_m256d(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.max(rhs.a),
          b : self.b.max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. This is a faster
  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
  /// involved.
  #[inline]
  #[must_use]
  pub fn fast_min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: min_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_min(rhs.a),
          b : self.b.fast_min(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
  /// the other lane gets chosen. Use `fast_min` for a faster implementation
  /// that doesn't handle NaNs.
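  ///
  /// An illustrative sketch of the NaN handling (not a doctest from the
  /// original source):
  /// ```
  /// # use wide::f64x4;
  /// let a = f64x4::new([1.0, f64::NAN, 3.0, 4.0]);
  /// let b = f64x4::new([2.0, 7.0, f64::NAN, -1.0]);
  /// assert_eq!(a.min(b).to_array(), [1.0, 7.0, 3.0, -1.0]);
  /// ```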
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // min_m256d seems to do rhs < self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: min_m256d(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.min(rhs.a),
          b : self.b.min(rhs.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn is_nan(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(Unordered)}>(self.avx, self.avx ) }
      } else {
        Self {
          a : self.a.is_nan(),
          b : self.b.is_nan(),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn is_finite(self) -> Self {
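    // Shift out the sign bit, then check whether the 11 exponent bits (now the
    // topmost bits, selected by the mask) are all ones, which marks inf/NaN.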
    let shifted_exp_mask = u64x4::from(0xFFE0000000000000);
    let u: u64x4 = cast(self);
    let shift_u = u << 1_u64;
    let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
    cast(out)
  }

  #[inline]
  #[must_use]
  pub fn is_inf(self) -> Self {
    let shifted_inf = u64x4::from(0xFFE0000000000000);
    let u: u64x4 = cast(self);
    let shift_u = u << 1_u64;
    let out = (shift_u).cmp_eq(shifted_inf);
    cast(out)
  }

  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: round_m256d::<{round_op!(Nearest)}>(self.avx) }
      } else {
        Self {
          a : self.a.round(),
          b : self.b.round(),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn round_int(self) -> i64x4 {
    // NOTE: no dedicated intrinsic is currently available for this, so we
    // delegate to LLVM.
    let rounded: [f64; 4] = cast(self.round());
    cast([
      rounded[0] as i64,
      rounded[1] as i64,
      rounded[2] as i64,
      rounded[3] as i64,
    ])
  }

  #[inline]
  #[must_use]
  pub fn mul_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_add_m256d(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) + a
      } else {
        Self {
          a : self.a.mul_add(m.a, a.a),
          b : self.b.mul_add(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_sub_m256d(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) - a
      } else {
        Self {
          a : self.a.mul_sub(m.a, a.a),
          b : self.b.mul_sub(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_add_m256d(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        a - (self * m)
      } else {
        Self {
          a : self.a.mul_neg_add(m.a, a.a),
          b : self.b.mul_neg_add(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_neg_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_sub_m256d(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        -(self * m) - a
      } else {
        Self {
          a : self.a.mul_neg_sub(m.a, a.a),
          b : self.b.mul_neg_sub(m.b, a.b),
        }
      }
    }
  }

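  /// Flips the sign of each lane of `self` wherever the matching lane of
  /// `signs` has its sign bit set (XOR with the sign bit of `signs`).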
  #[inline]
  #[must_use]
  pub fn flip_signs(self, signs: Self) -> Self {
    self ^ (signs & Self::from(-0.0))
  }

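  /// Returns lanes with the magnitude of `self` and the sign of `sign`.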
  #[inline]
  #[must_use]
  pub fn copysign(self, sign: Self) -> Self {
    let magnitude_mask = Self::from(f64::from_bits(u64::MAX >> 1));
    (self & magnitude_mask) | (sign & Self::from(-0.0))
  }

  #[inline]
  pub fn asin_acos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3);
    const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1);
    const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0);
    const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1);
    const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1);

    const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1);
    const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2);
    const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2);
    const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2);

    const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3);
    const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1);
    const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0);
    const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1);
    const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1);
    const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0);

    const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1);
    const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1);
    const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2);
    const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2);
    const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1);

    let xa = self.abs();

    let big = xa.cmp_ge(f64x4::splat(0.625));

    let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa);

    let x2 = x1 * x1;
    let x3 = x2 * x1;
    let x4 = x2 * x2;
    let x5 = x4 * x1;

    let do_big = big.any();
    let do_small = !big.all();

    let mut rx = f64x4::default();
    let mut sx = f64x4::default();
    let mut px = f64x4::default();
    let mut qx = f64x4::default();

    if do_big {
      rx = x3.mul_add(R3asin, x2 * R2asin)
        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
      sx =
        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
    }

    if do_small {
      px = x3.mul_add(P3asin, P0asin)
        + x4.mul_add(P4asin, x1 * P1asin)
        + x5.mul_add(P5asin, x2 * P2asin);
      qx = x4.mul_add(Q4asin, x5)
        + x3.mul_add(Q3asin, x1 * Q1asin)
        + x2.mul_add(Q2asin, Q0asin);
    };

    let vx = big.blend(rx, px);
    let wx = big.blend(sx, qx);

    let y1 = vx / wx * x1;

    let mut z1 = f64x4::default();
    let mut z2 = f64x4::default();
    if do_big {
      let xb = (x1 + x1).sqrt();
      z1 = xb.mul_add(y1, xb);
    }

    if do_small {
      z2 = xa.mul_add(y1, xa);
    }

    // asin
    let z3 = f64x4::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z2);
    let asin = asin.flip_signs(self);

    // acos
    let z3 = self.cmp_lt(f64x4::ZERO).blend(f64x4::PI - z1, z1);
    let z4 = f64x4::FRAC_PI_2 - z2.flip_signs(self);
    let acos = big.blend(z3, z4);

    (asin, acos)
  }

  #[inline]
  pub fn acos(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3);
    const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1);
    const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0);
    const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1);
    const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1);

    const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1);
    const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2);
    const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2);
    const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2);

    const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3);
    const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1);
    const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0);
    const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1);
    const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1);
    const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0);

    const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1);
    const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1);
    const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2);
    const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2);
    const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1);

    let xa = self.abs();

    let big = xa.cmp_ge(f64x4::splat(0.625));

    let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa);

    let x2 = x1 * x1;
    let x3 = x2 * x1;
    let x4 = x2 * x2;
    let x5 = x4 * x1;

    let do_big = big.any();
    let do_small = !big.all();

    let mut rx = f64x4::default();
    let mut sx = f64x4::default();
    let mut px = f64x4::default();
    let mut qx = f64x4::default();

    if do_big {
      rx = x3.mul_add(R3asin, x2 * R2asin)
        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
      sx =
        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
    }
    if do_small {
      px = x3.mul_add(P3asin, P0asin)
        + x4.mul_add(P4asin, x1 * P1asin)
        + x5.mul_add(P5asin, x2 * P2asin);
      qx = x4.mul_add(Q4asin, x5)
        + x3.mul_add(Q3asin, x1 * Q1asin)
        + x2.mul_add(Q2asin, Q0asin);
    };

    let vx = big.blend(rx, px);
    let wx = big.blend(sx, qx);

    let y1 = vx / wx * x1;

    let mut z1 = f64x4::default();
    let mut z2 = f64x4::default();
    if do_big {
      let xb = (x1 + x1).sqrt();
      z1 = xb.mul_add(y1, xb);
    }

    if do_small {
      z2 = xa.mul_add(y1, xa);
    }

    // acos
    let z3 = self.cmp_lt(f64x4::ZERO).blend(f64x4::PI - z1, z1);
    let z4 = f64x4::FRAC_PI_2 - z2.flip_signs(self);
    let acos = big.blend(z3, z4);

    acos
  }
  #[inline]
  #[must_use]
  pub fn asin(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3);
    const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1);
    const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0);
    const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1);
    const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1);

    const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1);
    const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2);
    const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2);
    const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2);

    const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3);
    const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1);
    const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0);
    const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1);
    const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1);
    const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0);

    const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1);
    const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1);
    const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2);
    const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2);
    const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1);

    let xa = self.abs();

    let big = xa.cmp_ge(f64x4::splat(0.625));

    let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa);

    let x2 = x1 * x1;
    let x3 = x2 * x1;
    let x4 = x2 * x2;
    let x5 = x4 * x1;

    let do_big = big.any();
    let do_small = !big.all();

    let mut rx = f64x4::default();
    let mut sx = f64x4::default();
    let mut px = f64x4::default();
    let mut qx = f64x4::default();

    if do_big {
      rx = x3.mul_add(R3asin, x2 * R2asin)
        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
      sx =
        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
    }
    if do_small {
      px = x3.mul_add(P3asin, P0asin)
        + x4.mul_add(P4asin, x1 * P1asin)
        + x5.mul_add(P5asin, x2 * P2asin);
      qx = x4.mul_add(Q4asin, x5)
        + x3.mul_add(Q3asin, x1 * Q1asin)
        + x2.mul_add(Q2asin, Q0asin);
    };

    let vx = big.blend(rx, px);
    let wx = big.blend(sx, qx);

    let y1 = vx / wx * x1;

    let mut z1 = f64x4::default();
    let mut z2 = f64x4::default();
    if do_big {
      let xb = (x1 + x1).sqrt();
      z1 = xb.mul_add(y1, xb);
    }

    if do_small {
      z2 = xa.mul_add(y1, xa);
    }

    // asin
    let z3 = f64x4::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z2);
    let asin = asin.flip_signs(self);

    asin
  }

  #[inline]
  pub fn atan(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f64_as_f64x4!(MORE_BITS, 6.123233995736765886130E-17);
    const_f64_as_f64x4!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5);
    const_f64_as_f64x4!(T3PO8, core::f64::consts::SQRT_2 + 1.0);

    const_f64_as_f64x4!(P4atan, -8.750608600031904122785E-1);
    const_f64_as_f64x4!(P3atan, -1.615753718733365076637E1);
    const_f64_as_f64x4!(P2atan, -7.500855792314704667340E1);
    const_f64_as_f64x4!(P1atan, -1.228866684490136173410E2);
    const_f64_as_f64x4!(P0atan, -6.485021904942025371773E1);

    const_f64_as_f64x4!(Q4atan, 2.485846490142306297962E1);
    const_f64_as_f64x4!(Q3atan, 1.650270098316988542046E2);
    const_f64_as_f64x4!(Q2atan, 4.328810604912902668951E2);
    const_f64_as_f64x4!(Q1atan, 4.853903996359136964868E2);
    const_f64_as_f64x4!(Q0atan, 1.945506571482613964425E2);

    let t = self.abs();

    // small:  t < 0.66
    // medium: 0.66 <= t <= 2.4142 (1+sqrt(2))
    // big:    t > 2.4142
    let notbig = t.cmp_le(T3PO8);
    let notsmal = t.cmp_ge(Self::splat(0.66));

    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;
    let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS);
    fac = notsmal & fac;

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    // big:    z = -1.0 / t;
    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan);
    let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);

    let mut re = (px / qx).mul_add(z * zz, z);
    re += s + fac;

    // get sign bit
    re = (self.sign_bit()).blend(-re, re);

    re
  }

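  /// Four-quadrant arctangent of `self` (y) over `x`, lanewise.
  ///
  /// An illustrative sketch with a loose tolerance (not a doctest from the
  /// original source):
  /// ```
  /// # use wide::f64x4;
  /// let y = f64x4::new([0.0, 1.0, 1.0, -1.0]);
  /// let x = f64x4::new([1.0, 1.0, -1.0, -1.0]);
  /// let a = y.atan2(x).to_array();
  /// let pi = core::f64::consts::PI;
  /// assert!(a[0].abs() < 1e-12);
  /// assert!((a[1] - pi / 4.0).abs() < 1e-12);
  /// assert!((a[2] - 3.0 * pi / 4.0).abs() < 1e-12);
  /// assert!((a[3] + 3.0 * pi / 4.0).abs() < 1e-12);
  /// ```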
  #[inline]
  pub fn atan2(self, x: Self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f64_as_f64x4!(MORE_BITS, 6.123233995736765886130E-17);
    const_f64_as_f64x4!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5);
    const_f64_as_f64x4!(T3PO8, core::f64::consts::SQRT_2 + 1.0);

    const_f64_as_f64x4!(P4atan, -8.750608600031904122785E-1);
    const_f64_as_f64x4!(P3atan, -1.615753718733365076637E1);
    const_f64_as_f64x4!(P2atan, -7.500855792314704667340E1);
    const_f64_as_f64x4!(P1atan, -1.228866684490136173410E2);
    const_f64_as_f64x4!(P0atan, -6.485021904942025371773E1);

    const_f64_as_f64x4!(Q4atan, 2.485846490142306297962E1);
    const_f64_as_f64x4!(Q3atan, 1.650270098316988542046E2);
    const_f64_as_f64x4!(Q2atan, 4.328810604912902668951E2);
    const_f64_as_f64x4!(Q1atan, 4.853903996359136964868E2);
    const_f64_as_f64x4!(Q0atan, 1.945506571482613964425E2);

    let y = self;

    // move in first octant
    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.cmp_gt(x1);
    // swap x and y if y1 > x1
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    // check for special case: x and y are both +/- INF
    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      let minus_one = -Self::ONE;
      x2 = both_infinite.blend(x2 & minus_one, x2);
      y2 = both_infinite.blend(y2 & minus_one, y2);
    }

    // x = y = 0 gives NAN here
    let t = y2 / x2;

    // small:  t < 0.66
    // medium: 0.66 <= t <= 2.4142 (1+sqrt(2))
    // big:    t > 2.4142
    let notbig = t.cmp_le(T3PO8);
    let notsmal = t.cmp_ge(Self::splat(0.66));

    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;
    let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS);
    fac = notsmal & fac;

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    // big:    z = -1.0 / t;
    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan);
    let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);

    let mut re = (px / qx).mul_add(z * zz, z);
    re += s + fac;

    // move back in place
    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re);
    re = (x.sign_bit()).blend(Self::PI - re, re);

    // get sign bit
    re = (y.sign_bit()).blend(-re, re);

    re
  }

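  /// Computes the sine and cosine of each lane in a single pass.
  ///
  /// An illustrative sketch with a loose tolerance (not a doctest from the
  /// original source):
  /// ```
  /// # use wide::f64x4;
  /// let (s, c) = f64x4::splat(core::f64::consts::FRAC_PI_2).sin_cos();
  /// assert!((s.to_array()[0] - 1.0).abs() < 1e-12);
  /// assert!(c.to_array()[0].abs() < 1e-12);
  /// ```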
  #[inline]
  #[must_use]
  pub fn sin_cos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h

    const_f64_as_f64x4!(P0sin, -1.66666666666666307295E-1);
    const_f64_as_f64x4!(P1sin, 8.33333333332211858878E-3);
    const_f64_as_f64x4!(P2sin, -1.98412698295895385996E-4);
    const_f64_as_f64x4!(P3sin, 2.75573136213857245213E-6);
    const_f64_as_f64x4!(P4sin, -2.50507477628578072866E-8);
    const_f64_as_f64x4!(P5sin, 1.58962301576546568060E-10);

    const_f64_as_f64x4!(P0cos, 4.16666666666665929218E-2);
    const_f64_as_f64x4!(P1cos, -1.38888888888730564116E-3);
    const_f64_as_f64x4!(P2cos, 2.48015872888517045348E-5);
    const_f64_as_f64x4!(P3cos, -2.75573141792967388112E-7);
    const_f64_as_f64x4!(P4cos, 2.08757008419747316778E-9);
    const_f64_as_f64x4!(P5cos, -1.13585365213876817300E-11);

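    // DP1 + DP2 + DP3 is an extended-precision split of pi/2; the argument
    // reduction below subtracts y * pi/2 in three pieces (Cody-Waite style)
    // to limit rounding error.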
    const_f64_as_f64x4!(DP1, 7.853981554508209228515625E-1 * 2.);
    const_f64_as_f64x4!(DP2, 7.94662735614792836714E-9 * 2.);
    const_f64_as_f64x4!(DP3, 3.06161699786838294307E-17 * 2.);

    const_f64_as_f64x4!(TWO_OVER_PI, 2.0 / core::f64::consts::PI);

    let xa = self.abs();

    let y = (xa * TWO_OVER_PI).round();
    let q = y.round_int();

    let x = y.mul_neg_add(DP3, y.mul_neg_add(DP2, y.mul_neg_add(DP1, xa)));

    let x2 = x * x;
    let mut s = polynomial_5!(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin);
    let mut c = polynomial_5!(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos);
    s = (x * x2).mul_add(s, x);
    c =
      (x2 * x2).mul_add(c, x2.mul_neg_add(f64x4::from(0.5), f64x4::from(1.0)));

    let swap = !((q & i64x4::from(1)).cmp_eq(i64x4::from(0)));

    let mut overflow: f64x4 = cast(q.cmp_gt(i64x4::from(0x80000000000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f64x4::from(0.0), s);
    c = overflow.blend(f64x4::from(1.0), c);

    // calc sin
    let mut sin1 = cast::<_, f64x4>(swap).blend(c, s);
    let sign_sin: i64x4 = (q << 62) ^ cast::<_, i64x4>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    // calc cos
    let mut cos1 = cast::<_, f64x4>(swap).blend(s, c);
    let sign_cos: i64x4 = ((q + i64x4::from(1)) & i64x4::from(2)) << 62;
    cos1 ^= cast::<_, f64x4>(sign_cos);

    (sin1, cos1)
  }
  #[inline]
  #[must_use]
  pub fn sin(self) -> Self {
    let (s, _) = self.sin_cos();
    s
  }
  #[inline]
  #[must_use]
  pub fn cos(self) -> Self {
    let (_, c) = self.sin_cos();
    c
  }
  #[inline]
  #[must_use]
  pub fn tan(self) -> Self {
    let (s, c) = self.sin_cos();
    s / c
  }
  #[inline]
  #[must_use]
  pub fn to_degrees(self) -> Self {
    const_f64_as_f64x4!(RAD_TO_DEG_RATIO, 180.0_f64 / core::f64::consts::PI);
    self * RAD_TO_DEG_RATIO
  }
  #[inline]
  #[must_use]
  pub fn to_radians(self) -> Self {
    const_f64_as_f64x4!(DEG_TO_RAD_RATIO, core::f64::consts::PI / 180.0_f64);
    self * DEG_TO_RAD_RATIO
  }
  #[inline]
  #[must_use]
  pub fn sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sqrt_m256d(self.avx) }
      } else {
        Self {
          a : self.a.sqrt(),
          b : self.b.sqrt(),
        }
      }
    }
  }
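  /// Packs the sign bit of each lane into the low 4 bits of an `i32` (lane 0
  /// in bit 0, lane 3 in bit 3).
  ///
  /// An illustrative sketch (not a doctest from the original source):
  /// ```
  /// # use wide::f64x4;
  /// let m = f64x4::new([-1.0, 1.0, -0.0, 3.0]).move_mask();
  /// assert_eq!(m, 0b0101);
  /// ```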
  #[inline]
  #[must_use]
  pub fn move_mask(self) -> i32 {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256d(self.avx)
      } else {
        (self.b.move_mask() << 2) | self.a.move_mask()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256d(self.avx) != 0
      } else {
        self.a.any() || self.b.any()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256d(self.avx) == 0b1111
      } else {
        self.a.all() && self.b.all()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

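  // Computes 2.0^n for integer-valued lanes n: adding n to (bias + 2^52)
  // leaves n + 1023 in the low mantissa bits, and the shift by 52 moves that
  // value into the exponent field, yielding 2^n with a zero mantissa.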
  #[inline]
  fn vm_pow2n(self) -> Self {
    const_f64_as_f64x4!(pow2_52, 4503599627370496.0);
    const_f64_as_f64x4!(bias, 1023.0);
    let a = self + (bias + pow2_52);
    let c = cast::<_, i64x4>(a) << 52;
    cast::<_, f64x4>(c)
  }

  /// Calculates `e^x` (the exponential function) for each lane of a packed
  /// `f64x4`.
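  ///
  /// An illustrative sketch with a loose tolerance (not a doctest from the
  /// original source):
  /// ```
  /// # use wide::f64x4;
  /// let e = f64x4::splat(1.0).exp().to_array();
  /// assert!((e[0] - core::f64::consts::E).abs() < 1e-12);
  /// ```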
  #[inline]
  #[must_use]
  pub fn exp(self) -> Self {
    const_f64_as_f64x4!(P2, 1.0 / 2.0);
    const_f64_as_f64x4!(P3, 1.0 / 6.0);
    const_f64_as_f64x4!(P4, 1. / 24.);
    const_f64_as_f64x4!(P5, 1. / 120.);
    const_f64_as_f64x4!(P6, 1. / 720.);
    const_f64_as_f64x4!(P7, 1. / 5040.);
    const_f64_as_f64x4!(P8, 1. / 40320.);
    const_f64_as_f64x4!(P9, 1. / 362880.);
    const_f64_as_f64x4!(P10, 1. / 3628800.);
    const_f64_as_f64x4!(P11, 1. / 39916800.);
    const_f64_as_f64x4!(P12, 1. / 479001600.);
    const_f64_as_f64x4!(P13, 1. / 6227020800.);
    const_f64_as_f64x4!(LN2D_HI, 0.693145751953125);
    const_f64_as_f64x4!(LN2D_LO, 1.42860682030941723212E-6);
    let max_x = f64x4::from(708.39);
    let r = (self * Self::LOG2_E).round();
    let x = r.mul_neg_add(LN2D_HI, self);
    let x = r.mul_neg_add(LN2D_LO, x);
    let z =
      polynomial_13!(x, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13);
    let n2 = Self::vm_pow2n(r);
    let z = (z + Self::ONE) * n2;
    // check for overflow
    let in_range = self.abs().cmp_lt(max_x);
    let in_range = in_range & self.is_finite();
    in_range.blend(z, Self::ZERO)
  }

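  // Extracts the unbiased IEEE-754 exponent of each lane as an f64 (e.g. 8.0
  // maps to 3.0): isolate the exponent field, rebase it onto 2^52, and then
  // subtract (2^52 + bias).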
  #[inline]
  fn exponent(self) -> f64x4 {
    const_f64_as_f64x4!(pow2_52, 4503599627370496.0);
    const_f64_as_f64x4!(bias, 1023.0);
    let a = cast::<_, u64x4>(self);
    let b = a >> 52;
    let c = b | cast::<_, u64x4>(pow2_52);
    let d = cast::<_, f64x4>(c);
    let e = d - (pow2_52 + bias);
    e
  }

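  // Keeps each lane's mantissa but forces the exponent field to 0x3FE, so
  // every normal lane is remapped into the interval [0.5, 1.0).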
  #[inline]
  fn fraction_2(self) -> Self {
    let t1 = cast::<_, u64x4>(self);
    let t2 = cast::<_, u64x4>(
      (t1 & u64x4::from(0x000FFFFFFFFFFFFF)) | u64x4::from(0x3FE0000000000000),
    );
    cast::<_, f64x4>(t2)
  }
  #[inline]
  fn is_zero_or_subnormal(self) -> Self {
    let t = cast::<_, i64x4>(self);
    let t = t & i64x4::splat(0x7FF0000000000000);
    i64x4::round_float(t.cmp_eq(i64x4::splat(0)))
  }
  #[inline]
  fn infinity() -> Self {
    cast::<_, f64x4>(i64x4::splat(0x7FF0000000000000))
  }
  #[inline]
  fn nan_log() -> Self {
    cast::<_, f64x4>(i64x4::splat(0x7FF8000000000000 | 0x101 << 29))
  }
  #[inline]
  fn nan_pow() -> Self {
    cast::<_, f64x4>(i64x4::splat(0x7FF8000000000000 | 0x101 << 29))
  }
  #[inline]
  fn sign_bit(self) -> Self {
    let t1 = cast::<_, i64x4>(self);
    let t2 = t1 >> 63;
    !cast::<_, f64x4>(t2).cmp_eq(f64x4::ZERO)
  }

  /// Horizontal add of all the elements of the vector.
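  ///
  /// An illustrative sketch (not a doctest from the original source):
  /// ```
  /// # use wide::f64x4;
  /// let v = f64x4::new([1.0, 2.0, 3.0, 4.0]);
  /// assert_eq!(v.reduce_add(), 10.0);
  /// ```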
  #[inline]
  pub fn reduce_add(self) -> f64 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // From https://stackoverflow.com/questions/49941645/get-sum-of-values-stored-in-m256d-with-sse-avx
        let lo = cast_to_m128d_from_m256d(self.avx);
        let hi = extract_m128d_from_m256d::<1>(self.avx);
        let lo = add_m128d(lo,hi);
        let hi64 = unpack_high_m128d(lo,lo);
        let sum = add_m128d_s(lo,hi64);
        get_f64_from_m128d_s(sum)
      } else {
        self.a.reduce_add() + self.b.reduce_add()
      }
    }
  }

  /// Natural log (ln(x))
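  ///
  /// An illustrative sketch with a loose tolerance (not a doctest from the
  /// original source):
  /// ```
  /// # use wide::f64x4;
  /// let l = f64x4::splat(core::f64::consts::E).ln().to_array();
  /// assert!((l[0] - 1.0).abs() < 1e-12);
  /// ```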
  #[inline]
  #[must_use]
  pub fn ln(self) -> Self {
    const_f64_as_f64x4!(HALF, 0.5);
    const_f64_as_f64x4!(P0, 7.70838733755885391666E0);
    const_f64_as_f64x4!(P1, 1.79368678507819816313E1);
    const_f64_as_f64x4!(P2, 1.44989225341610930846E1);
    const_f64_as_f64x4!(P3, 4.70579119878881725854E0);
    const_f64_as_f64x4!(P4, 4.97494994976747001425E-1);
    const_f64_as_f64x4!(P5, 1.01875663804580931796E-4);

    const_f64_as_f64x4!(Q0, 2.31251620126765340583E1);
    const_f64_as_f64x4!(Q1, 7.11544750618563894466E1);
    const_f64_as_f64x4!(Q2, 8.29875266912776603211E1);
    const_f64_as_f64x4!(Q3, 4.52279145837532221105E1);
    const_f64_as_f64x4!(Q4, 1.12873587189167450590E1);
    const_f64_as_f64x4!(LN2F_HI, 0.693359375);
    const_f64_as_f64x4!(LN2F_LO, -2.12194440e-4);
    const_f64_as_f64x4!(VM_SQRT2, 1.414213562373095048801);
    const_f64_as_f64x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);

    let x1 = self;
    let x = Self::fraction_2(x1);
    let e = Self::exponent(x1);
    let mask = x.cmp_gt(VM_SQRT2 * HALF);
    let x = (!mask).blend(x + x, x);
    let fe = mask.blend(e + Self::ONE, e);
    let x = x - Self::ONE;
    let px = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
    let x2 = x * x;
    let px = x2 * x * px;
    let qx = polynomial_5n!(x, Q0, Q1, Q2, Q3, Q4);
    let res = px / qx;
    let res = fe.mul_add(LN2F_LO, res);
    let res = res + x2.mul_neg_add(HALF, x);
    let res = fe.mul_add(LN2F_HI, res);
    let overflow = !self.is_finite();
    let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL);
    let mask = overflow | underflow;
    if !mask.any() {
      res
    } else {
      let is_zero = self.is_zero_or_subnormal();
      let res = underflow.blend(Self::nan_log(), res);
      let res = is_zero.blend(Self::infinity(), res);
      let res = overflow.blend(self, res);
      res
    }
  }

  #[inline]
  #[must_use]
  pub fn log2(self) -> Self {
    Self::ln(self) * Self::LOG2_E
  }
  #[inline]
  #[must_use]
  pub fn log10(self) -> Self {
    Self::ln(self) * Self::LOG10_E
  }

  #[inline]
  #[must_use]
  pub fn pow_f64x4(self, y: Self) -> Self {
    const_f64_as_f64x4!(ln2d_hi, 0.693145751953125);
    const_f64_as_f64x4!(ln2d_lo, 1.42860682030941723212E-6);
    const_f64_as_f64x4!(P0log, 2.0039553499201281259648E1);
    const_f64_as_f64x4!(P1log, 5.7112963590585538103336E1);
    const_f64_as_f64x4!(P2log, 6.0949667980987787057556E1);
    const_f64_as_f64x4!(P3log, 2.9911919328553073277375E1);
    const_f64_as_f64x4!(P4log, 6.5787325942061044846969E0);
    const_f64_as_f64x4!(P5log, 4.9854102823193375972212E-1);
    const_f64_as_f64x4!(P6log, 4.5270000862445199635215E-5);
    const_f64_as_f64x4!(Q0log, 6.0118660497603843919306E1);
    const_f64_as_f64x4!(Q1log, 2.1642788614495947685003E2);
    const_f64_as_f64x4!(Q2log, 3.0909872225312059774938E2);
    const_f64_as_f64x4!(Q3log, 2.2176239823732856465394E2);
    const_f64_as_f64x4!(Q4log, 8.3047565967967209469434E1);
    const_f64_as_f64x4!(Q5log, 1.5062909083469192043167E1);

    // Taylor expansion constants
    const_f64_as_f64x4!(p2, 1.0 / 2.0); // coefficients for Taylor expansion of exp
    const_f64_as_f64x4!(p3, 1.0 / 6.0);
    const_f64_as_f64x4!(p4, 1.0 / 24.0);
    const_f64_as_f64x4!(p5, 1.0 / 120.0);
    const_f64_as_f64x4!(p6, 1.0 / 720.0);
    const_f64_as_f64x4!(p7, 1.0 / 5040.0);
    const_f64_as_f64x4!(p8, 1.0 / 40320.0);
    const_f64_as_f64x4!(p9, 1.0 / 362880.0);
    const_f64_as_f64x4!(p10, 1.0 / 3628800.0);
    const_f64_as_f64x4!(p11, 1.0 / 39916800.0);
    const_f64_as_f64x4!(p12, 1.0 / 479001600.0);
    const_f64_as_f64x4!(p13, 1.0 / 6227020800.0);

    let x1 = self.abs();
    let x = x1.fraction_2();
    let mask = x.cmp_gt(f64x4::SQRT_2 * f64x4::HALF);
    let x = (!mask).blend(x + x, x);
    let x = x - f64x4::ONE;
    let x2 = x * x;
    let px = polynomial_6!(x, P0log, P1log, P2log, P3log, P4log, P5log, P6log);
    let px = px * x * x2;
    let qx = polynomial_6n!(x, Q0log, Q1log, Q2log, Q3log, Q4log, Q5log);
    let lg1 = px / qx;

    let ef = x1.exponent();
    let ef = mask.blend(ef + f64x4::ONE, ef);
    let e1 = (ef * y).round();
    let yr = ef.mul_sub(y, e1);

    let lg = f64x4::HALF.mul_neg_add(x2, x) + lg1;
    let x2err = (f64x4::HALF * x).mul_sub(x, f64x4::HALF * x2);
    let lg_err = f64x4::HALF.mul_add(x2, lg - x) - lg1;

    let e2 = (lg * y * f64x4::LOG2_E).round();
    let v = lg.mul_sub(y, e2 * ln2d_hi);
    let v = e2.mul_neg_add(ln2d_lo, v);
    let v = v - (lg_err + x2err).mul_sub(y, yr * f64x4::LN_2);

    let x = v;
    let e3 = (x * f64x4::LOG2_E).round();
    let x = e3.mul_neg_add(f64x4::LN_2, x);
    let z =
      polynomial_13m!(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13)
        + f64x4::ONE;
    let ee = e1 + e2 + e3;
    let ei = cast::<_, i64x4>(ee.round_int());
    let ej = cast::<_, i64x4>(ei + (cast::<_, i64x4>(z) >> 52));

    let overflow = cast::<_, f64x4>(!ej.cmp_lt(i64x4::splat(0x07FF)))
      | ee.cmp_gt(f64x4::splat(3000.0));
    let underflow = cast::<_, f64x4>(!ej.cmp_gt(i64x4::splat(0x000)))
      | ee.cmp_lt(f64x4::splat(-3000.0));

    // Add exponent by integer addition
    let z = cast::<_, f64x4>(cast::<_, i64x4>(z) + (ei << 52));

    // Check for overflow/underflow
    let z = if (overflow | underflow).any() {
      let z = underflow.blend(f64x4::ZERO, z);
      overflow.blend(Self::infinity(), z)
    } else {
      z
    };

    // Check for self == 0
    let x_zero = self.is_zero_or_subnormal();
    let z = x_zero.blend(
      y.cmp_lt(f64x4::ZERO).blend(
        Self::infinity(),
        y.cmp_eq(f64x4::ZERO).blend(f64x4::ONE, f64x4::ZERO),
      ),
      z,
    );

    let x_sign = self.sign_bit();

    let z = if x_sign.any() {
      // Is y an integer?
      let yi = y.cmp_eq(y.round());
      // Is y odd?
      let y_odd = cast::<_, i64x4>(y.round_int() << 63).round_float();
      let z1 =
        yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow()));
      x_sign.blend(z1, z)
    } else {
      z
    };

    let x_finite = self.is_finite();
    let y_finite = y.is_finite();
    let e_finite = ee.is_finite();

    if (x_finite & y_finite & (e_finite | x_zero)).all() {
      return z;
    }

    (self.is_nan() | y.is_nan()).blend(self + y, z)
  }
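  /// Raises each lane of `self` to the scalar power `y`.
  ///
  /// An illustrative sketch with a loose tolerance (not a doctest from the
  /// original source):
  /// ```
  /// # use wide::f64x4;
  /// let p = f64x4::splat(2.0).powf(10.0).to_array();
  /// assert!((p[0] - 1024.0).abs() < 1e-9);
  /// ```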
  #[inline]
  pub fn powf(self, y: f64) -> Self {
    Self::pow_f64x4(self, f64x4::splat(y))
  }

  #[inline]
  pub fn to_array(self) -> [f64; 4] {
    cast(self)
  }

  #[inline]
  pub fn as_array_ref(&self) -> &[f64; 4] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_array_mut(&mut self) -> &mut [f64; 4] {
    cast_mut(self)
  }

  #[inline]
  pub fn from_i32x4(v: i32x4) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: convert_to_m256d_from_i32_m128i(v.sse) }
      } else {
        Self::new([
          v.as_array_ref()[0] as f64,
          v.as_array_ref()[1] as f64,
          v.as_array_ref()[2] as f64,
          v.as_array_ref()[3] as f64,
        ])
      }
    }
  }
}

impl From<i32x4> for f64x4 {
  #[inline]
  fn from(v: i32x4) -> Self {
    Self::from_i32x4(v)
  }
}

impl Not for f64x4 {
  type Output = Self;
  #[inline]
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: self.avx.not() }
      } else {
        Self {
          a : self.a.not(),
          b : self.b.not(),
        }
      }
    }
  }
}