use super::*;

pick! {
  if #[cfg(target_feature="sse")] {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(16))]
    pub struct f32x4 { pub(crate) sse: m128 }
  } else if #[cfg(target_feature="simd128")] {
    use core::arch::wasm32::*;

    #[derive(Clone, Copy)]
    #[repr(transparent)]
    pub struct f32x4 { pub(crate) simd: v128 }

    impl Default for f32x4 {
      fn default() -> Self {
        Self::splat(0.0)
      }
    }

    impl PartialEq for f32x4 {
      fn eq(&self, other: &Self) -> bool {
        u32x4_all_true(f32x4_eq(self.simd, other.simd))
      }
    }
  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
    use core::arch::aarch64::*;
    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct f32x4 { pub(crate) neon: float32x4_t }

    impl Default for f32x4 {
      #[inline]
      #[must_use]
      fn default() -> Self {
        unsafe { Self { neon: vdupq_n_f32(0.0) } }
      }
    }

    impl PartialEq for f32x4 {
      #[inline]
      #[must_use]
      fn eq(&self, other: &Self) -> bool {
        unsafe { vminvq_u32(vceqq_f32(self.neon, other.neon)) == u32::MAX }
      }
    }
  } else {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(16))]
    pub struct f32x4 { pub(crate) arr: [f32;4] }
  }
}

macro_rules! const_f32_as_f32x4 {
  ($i:ident, $f:expr) => {
    #[allow(non_upper_case_globals)]
    pub const $i: f32x4 = f32x4::new([$f; 4]);
  };
}

impl f32x4 {
  const_f32_as_f32x4!(ONE, 1.0);
  const_f32_as_f32x4!(ZERO, 0.0);
  const_f32_as_f32x4!(HALF, 0.5);
  const_f32_as_f32x4!(E, core::f32::consts::E);
  const_f32_as_f32x4!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
  const_f32_as_f32x4!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
  const_f32_as_f32x4!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
  const_f32_as_f32x4!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
  const_f32_as_f32x4!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
  const_f32_as_f32x4!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
  const_f32_as_f32x4!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
  const_f32_as_f32x4!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
  const_f32_as_f32x4!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
  const_f32_as_f32x4!(LN_2, core::f32::consts::LN_2);
  const_f32_as_f32x4!(LN_10, core::f32::consts::LN_10);
  const_f32_as_f32x4!(LOG2_E, core::f32::consts::LOG2_E);
  const_f32_as_f32x4!(LOG10_E, core::f32::consts::LOG10_E);
  const_f32_as_f32x4!(LOG10_2, core::f32::consts::LOG10_2);
  const_f32_as_f32x4!(LOG2_10, core::f32::consts::LOG2_10);
  const_f32_as_f32x4!(PI, core::f32::consts::PI);
  const_f32_as_f32x4!(SQRT_2, core::f32::consts::SQRT_2);
  const_f32_as_f32x4!(TAU, core::f32::consts::TAU);
}

unsafe impl Zeroable for f32x4 {}
unsafe impl Pod for f32x4 {}

impl Add for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: add_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_add(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vaddq_f32(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0] + rhs.arr[0],
          self.arr[1] + rhs.arr[1],
          self.arr[2] + rhs.arr[2],
          self.arr[3] + rhs.arr[3],
        ]}
      }
    }
  }
}

impl Sub for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: sub_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_sub(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vsubq_f32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0] - rhs.arr[0],
          self.arr[1] - rhs.arr[1],
          self.arr[2] - rhs.arr[2],
          self.arr[3] - rhs.arr[3],
        ]}
      }
    }
  }
}

impl Mul for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: mul_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_mul(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vmulq_f32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0] * rhs.arr[0],
          self.arr[1] * rhs.arr[1],
          self.arr[2] * rhs.arr[2],
          self.arr[3] * rhs.arr[3],
        ]}
      }
    }
  }
}

impl Div for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: div_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_div(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vdivq_f32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0] / rhs.arr[0],
          self.arr[1] / rhs.arr[1],
          self.arr[2] / rhs.arr[2],
          self.arr[3] / rhs.arr[3],
        ]}
      }
    }
  }
}

impl Add<f32> for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<f32> for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<f32> for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Div<f32> for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32) -> Self::Output {
    self.div(Self::splat(rhs))
  }
}

impl Add<f32x4> for f32 {
  type Output = f32x4;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32x4) -> Self::Output {
    f32x4::splat(self).add(rhs)
  }
}

impl Sub<f32x4> for f32 {
  type Output = f32x4;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32x4) -> Self::Output {
    f32x4::splat(self).sub(rhs)
  }
}

impl Mul<f32x4> for f32 {
  type Output = f32x4;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32x4) -> Self::Output {
    f32x4::splat(self).mul(rhs)
  }
}

impl Div<f32x4> for f32 {
  type Output = f32x4;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32x4) -> Self::Output {
    f32x4::splat(self).div(rhs)
  }
}

impl BitAnd for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: bitand_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_and(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
      } else {
        Self { arr: [
          f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()),
          f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()),
          f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()),
          f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()),
        ]}
      }
    }
  }
}

impl BitOr for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: bitor_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_or(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
      } else {
        Self { arr: [
          f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()),
          f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()),
          f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()),
          f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()),
        ]}
      }
    }
  }
}

impl BitXor for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: bitxor_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_xor(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
      } else {
        Self { arr: [
          f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()),
          f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()),
          f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()),
          f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()),
        ]}
      }
    }
  }
}

impl CmpEq for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_eq_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_eq(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_f32_u32(vceqq_f32(self.neon, rhs.neon)) }}
      } else {
        Self { arr: [
          if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpGe for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ge(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_ge_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_ge(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_f32_u32(vcgeq_f32(self.neon, rhs.neon)) }}
      } else {
        Self { arr: [
          if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpGt for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_gt_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_gt(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_f32_u32(vcgtq_f32(self.neon, rhs.neon)) }}
      } else {
        Self { arr: [
          if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpNe for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ne(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_neq_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_ne(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, rhs.neon))) }}
      } else {
        Self { arr: [
          if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpLe for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_le(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_le_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_le(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_f32_u32(vcleq_f32(self.neon, rhs.neon)) }}
      } else {
        Self { arr: [
          if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpLt for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_lt_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_lt(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_f32_u32(vcltq_f32(self.neon, rhs.neon)) }}
      } else {
        Self { arr: [
          if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl f32x4 {
  #[inline]
  #[must_use]
  pub const fn new(array: [f32; 4]) -> Self {
    #[allow(non_upper_case_globals)]
    unsafe {
      core::intrinsics::transmute(array)
    }
  }

  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: blend_varying_m128(f.sse, t.sse, self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
      } else {
        generic_bit_blend(self, t, f)
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_abs(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vabsq_f32(self.neon) }}
      } else {
        let non_sign_bits = f32x4::from(f32::from_bits(i32::MAX as u32));
        self & non_sign_bits
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn floor(self) -> Self {
    pick! {
      if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_floor(self.simd) }
      } else if #[cfg(target_feature="sse4.1")] {
        Self { sse: floor_m128(self.sse) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vrndmq_f32(self.neon) }}
      } else if #[cfg(feature="std")] {
        let base: [f32; 4] = cast(self);
        cast(base.map(|val| val.floor()))
      } else {
        let base: [f32; 4] = cast(self);
        let rounded: [f32; 4] = cast(self.round());
        cast([
          if base[0] < rounded[0] { rounded[0] - 1.0 } else { rounded[0] },
          if base[1] < rounded[1] { rounded[1] - 1.0 } else { rounded[1] },
          if base[2] < rounded[2] { rounded[2] - 1.0 } else { rounded[2] },
          if base[3] < rounded[3] { rounded[3] - 1.0 } else { rounded[3] },
        ])
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn ceil(self) -> Self {
    pick! {
      if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_ceil(self.simd) }
      } else if #[cfg(target_feature="sse4.1")] {
        Self { sse: ceil_m128(self.sse) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vrndpq_f32(self.neon) }}
      } else if #[cfg(feature="std")] {
        let base: [f32; 4] = cast(self);
        cast(base.map(|val| val.ceil()))
      } else {
        let base: [f32; 4] = cast(self);
        let rounded: [f32; 4] = cast(self.round());
        cast([
          if base[0] > rounded[0] { rounded[0] + 1.0 } else { rounded[0] },
          if base[1] > rounded[1] { rounded[1] + 1.0 } else { rounded[1] },
          if base[2] > rounded[2] { rounded[2] + 1.0 } else { rounded[2] },
          if base[3] > rounded[3] { rounded[3] + 1.0 } else { rounded[3] },
        ])
      }
    }
  }

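  /// Lanewise maximum of `self` and `rhs`, mapping straight to the platform's
  /// max instruction (or a plain comparison in the fallback). Faster than
  /// `max`, but the result for NaN lanes depends on the backend, so don't
  /// rely on any particular NaN behavior here.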
  #[inline]
  #[must_use]
  pub fn fast_max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: max_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self {
          simd: f32x4_pmax(self.simd, rhs.simd),
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vmaxq_f32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
          if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
          if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
          if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
        ]}
      }
    }
  }

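  /// Lanewise maximum of `self` and `rhs`. Each backend adds the extra work
  /// needed so that when exactly one of the two lanes is NaN, the non-NaN
  /// value is returned, which makes this slower than `fast_max`.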
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        rhs.is_nan().blend(self, Self { sse: max_m128(self.sse, rhs.sse) })
      } else if #[cfg(target_feature="simd128")] {
        Self {
          simd: v128_bitselect(
            rhs.simd,
            f32x4_pmax(self.simd, rhs.simd),
            f32x4_ne(self.simd, self.simd),
          ),
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vmaxnmq_f32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].max(rhs.arr[0]),
          self.arr[1].max(rhs.arr[1]),
          self.arr[2].max(rhs.arr[2]),
          self.arr[3].max(rhs.arr[3]),
        ]}
      }
    }
  }

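  /// Lanewise minimum of `self` and `rhs`, mapping straight to the platform's
  /// min instruction (or a plain comparison in the fallback). Faster than
  /// `min`, but the result for NaN lanes depends on the backend.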
  #[inline]
  #[must_use]
  pub fn fast_min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: min_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self {
          simd: f32x4_pmin(self.simd, rhs.simd),
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vminq_f32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          if self.arr[0] < rhs.arr[0] { self.arr[0] } else { rhs.arr[0] },
          if self.arr[1] < rhs.arr[1] { self.arr[1] } else { rhs.arr[1] },
          if self.arr[2] < rhs.arr[2] { self.arr[2] } else { rhs.arr[2] },
          if self.arr[3] < rhs.arr[3] { self.arr[3] } else { rhs.arr[3] },
        ]}
      }
    }
  }

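  /// Lanewise minimum of `self` and `rhs`. When exactly one of the two lanes
  /// is NaN, the non-NaN value is returned, at the cost of being slower than
  /// `fast_min`.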
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        rhs.is_nan().blend(self, Self { sse: min_m128(self.sse, rhs.sse) })
      } else if #[cfg(target_feature="simd128")] {
        Self {
          simd: v128_bitselect(
            rhs.simd,
            f32x4_pmin(self.simd, rhs.simd),
            f32x4_ne(self.simd, self.simd),
          ),
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vminnmq_f32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].min(rhs.arr[0]),
          self.arr[1].min(rhs.arr[1]),
          self.arr[2].min(rhs.arr[2]),
          self.arr[3].min(rhs.arr[3]),
        ]}
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn is_nan(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_unord_mask_m128(self.sse, self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_ne(self.simd, self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, self.neon))) }}
      } else {
        Self { arr: [
          if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn is_finite(self) -> Self {
    let shifted_exp_mask = u32x4::from(0xFF000000);
    let u: u32x4 = cast(self);
    let shift_u = u << 1_u64;
    let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
    cast(out)
  }
  #[inline]
  #[must_use]
  pub fn is_inf(self) -> Self {
    let shifted_inf = u32x4::from(0xFF000000);
    let u: u32x4 = cast(self);
    let shift_u = u << 1_u64;
    let out = (shift_u).cmp_eq(shifted_inf);
    cast(out)
  }

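  /// Rounds each lane to the nearest whole number, with ties resolved the way
  /// the underlying rounding instruction does it (round-half-to-even on the
  /// hardware paths). The final software fallback is noticeably slower.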
  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: round_m128::<{round_op!(Nearest)}>(self.sse) }
      } else if #[cfg(target_feature="sse2")] {
        let mi: m128i = convert_to_i32_m128i_from_m128(self.sse);
        let f: f32x4 = f32x4 { sse: convert_to_m128_from_i32_m128i(mi) };
        let i: i32x4 = cast(mi);
        let mask: f32x4 = cast(i.cmp_eq(i32x4::from(0x80000000_u32 as i32)));
        mask.blend(self, f)
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_nearest(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vrndnq_f32(self.neon) }}
      } else {
        let to_int = f32x4::from(1.0 / f32::EPSILON);
        let u: u32x4 = cast(self);
        let e: i32x4 = cast((u >> 23) & u32x4::from(0xff));
        let mut y: f32x4;

        let no_op_magic = i32x4::from(0x7f + 23);
        let no_op_mask: f32x4 = cast(e.cmp_gt(no_op_magic) | e.cmp_eq(no_op_magic));
        let no_op_val: f32x4 = self;

        let zero_magic = i32x4::from(0x7f - 1);
        let zero_mask: f32x4 = cast(e.cmp_lt(zero_magic));
        let zero_val: f32x4 = self * f32x4::from(0.0);

        let neg_bit: f32x4 = cast(cast::<u32x4, i32x4>(u).cmp_lt(i32x4::default()));
        let x: f32x4 = neg_bit.blend(-self, self);
        y = x + to_int - to_int - x;
        y = y.cmp_gt(f32x4::from(0.5)).blend(
          y + x - f32x4::from(1.0),
          y.cmp_lt(f32x4::from(-0.5)).blend(y + x + f32x4::from(1.0), y + x),
        );
        y = neg_bit.blend(-y, y);

        no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
      }
    }
  }

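  /// Rounds each lane to the nearest integer and returns the result as an
  /// `i32x4`. On SSE2 this is the raw conversion with no NaN or out-of-range
  /// handling, so such lanes give an unspecified value; other backends just
  /// defer to `round_int`.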
  #[inline]
  #[must_use]
  pub fn fast_round_int(self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        cast(convert_to_i32_m128i_from_m128(self.sse))
      } else {
        self.round_int()
      }
    }
  }

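  /// Rounds each lane to the nearest integer and returns the result as an
  /// `i32x4`. Unlike `fast_round_int`, the SSE2 path patches the conversion
  /// so NaN lanes become 0 and lanes too large for an `i32` saturate instead
  /// of producing the conversion's sentinel value.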
  #[inline]
  #[must_use]
  pub fn round_int(self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
        flip_to_max ^ cast
      } else if #[cfg(target_feature="simd128")] {
        cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) })
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtnq_s32_f32(self.neon)) }})
      } else {
        let rounded: [f32; 4] = cast(self.round());
        cast([
          rounded[0] as i32,
          rounded[1] as i32,
          rounded[2] as i32,
          rounded[3] as i32,
        ])
      }
    }
  }

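  /// Truncates each lane toward zero and returns the result as an `i32x4`.
  /// On SSE2 this is the raw conversion with no NaN or out-of-range handling;
  /// other backends defer to `trunc_int`.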
  #[inline]
  #[must_use]
  pub fn fast_trunc_int(self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        cast(truncate_m128_to_m128i(self.sse))
      } else {
        self.trunc_int()
      }
    }
  }

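  /// Truncates each lane toward zero and returns the result as an `i32x4`,
  /// with the same NaN-to-zero and saturation fix-ups as `round_int`.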
  #[inline]
  #[must_use]
  pub fn trunc_int(self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
        flip_to_max ^ cast
      } else if #[cfg(target_feature="simd128")] {
        cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) })
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtq_s32_f32(self.neon)) }})
      } else {
        let n: [f32;4] = cast(self);
        cast([
          n[0] as i32,
          n[1] as i32,
          n[2] as i32,
          n[3] as i32,
        ])
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn mul_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse: fused_mul_add_m128(self.sse, m.sse, a.sse) }
      } else {
        (self * m) + a
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_sub(self, m: Self, s: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse: fused_mul_sub_m128(self.sse, m.sse, s.sse) }
      } else {
        (self * m) - s
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse: fused_mul_neg_add_m128(self.sse, m.sse, a.sse) }
      } else {
        a - (self * m)
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_neg_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse: fused_mul_neg_sub_m128(self.sse, m.sse, a.sse) }
      } else {
        -(self * m) - a
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn flip_signs(self, signs: Self) -> Self {
    self ^ (signs & Self::from(-0.0))
  }

  #[inline]
  #[must_use]
  pub fn copysign(self, sign: Self) -> Self {
    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
    (self & magnitude_mask) | (sign & Self::from(-0.0))
  }

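  /// Computes both `asin` and `acos` of each lane in one pass, sharing the
  /// polynomial evaluation between the two results. Inputs are expected in
  /// the `[-1, 1]` range; the approximation splits at `|x| = 0.5` in the
  /// usual cephes style.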
  #[inline]
  pub fn asin_acos(self) -> (Self, Self) {
    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x4::splat(0.5));

    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    let z3 = f32x4::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    (asin, acos)
  }

  #[inline]
  pub fn asin(self) -> Self {
    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x4::splat(0.5));

    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    let z3 = f32x4::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    asin
  }

  #[inline]
  #[must_use]
  pub fn acos(self) -> Self {
    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x4::splat(0.5));

    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    acos
  }

  #[inline]
  pub fn atan(self) -> Self {
    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);

    let t = self.abs();

    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
    let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE);

    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;

    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    re = (self.sign_bit()).blend(-re, re);

    re
  }

  #[inline]
  pub fn atan2(self, x: Self) -> Self {
    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);

    let y = self;

    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.cmp_gt(x1);
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      let minus_one = -Self::ONE;
      x2 = both_infinite.blend(x2 & minus_one, x2);
      y2 = both_infinite.blend(y2 & minus_one, y2);
    }

    let t = y2 / x2;

    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);

    let a = notsmal.blend(t - Self::ONE, t);
    let b = notsmal.blend(t + Self::ONE, Self::ONE);
    let s = notsmal & Self::FRAC_PI_4;
    let z = a / b;

    let zz = z * z;

    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re);
    re = (x.sign_bit()).blend(Self::PI - re, re);

    re = (y.sign_bit()).blend(-re, re);

    re
  }

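  /// Computes the sine and cosine of each lane (in radians) with a single
  /// shared argument reduction. The reduction uses single-precision
  /// constants, so accuracy drops for very large inputs, and lanes beyond
  /// the internal overflow limit are flushed to `(0.0, 1.0)`.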
  #[inline]
  #[must_use]
  pub fn sin_cos(self) -> (Self, Self) {
    const_f32_as_f32x4!(DP1F, 0.78515625_f32 * 2.0);
    const_f32_as_f32x4!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
    const_f32_as_f32x4!(DP3F, 3.77489497744594108E-8_f32 * 2.0);

    const_f32_as_f32x4!(P0sinf, -1.6666654611E-1);
    const_f32_as_f32x4!(P1sinf, 8.3321608736E-3);
    const_f32_as_f32x4!(P2sinf, -1.9515295891E-4);

    const_f32_as_f32x4!(P0cosf, 4.166664568298827E-2);
    const_f32_as_f32x4!(P1cosf, -1.388731625493765E-3);
    const_f32_as_f32x4!(P2cosf, 2.443315711809948E-5);

    const_f32_as_f32x4!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);

    let xa = self.abs();

    let y = (xa * TWO_OVER_PI).round();
    let q: i32x4 = y.round_int();

    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));

    let x2 = x * x;
    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
      + f32x4::from(0.5).mul_neg_add(x2, f32x4::from(1.0));

    let swap = !(q & i32x4::from(1)).cmp_eq(i32x4::from(0));

    let mut overflow: f32x4 = cast(q.cmp_gt(i32x4::from(0x2000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f32x4::from(0.0), s);
    c = overflow.blend(f32x4::from(1.0), c);

    let mut sin1 = cast::<_, f32x4>(swap).blend(c, s);
    let sign_sin: i32x4 = (q << 30) ^ cast::<_, i32x4>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    let mut cos1 = cast::<_, f32x4>(swap).blend(s, c);
    let sign_cos: i32x4 = ((q + i32x4::from(1)) & i32x4::from(2)) << 30;
    cos1 ^= cast::<_, f32x4>(sign_cos);

    (sin1, cos1)
  }

  #[inline]
  #[must_use]
  pub fn sin(self) -> Self {
    let (s, _) = self.sin_cos();
    s
  }
  #[inline]
  #[must_use]
  pub fn cos(self) -> Self {
    let (_, c) = self.sin_cos();
    c
  }
  #[inline]
  #[must_use]
  pub fn tan(self) -> Self {
    let (s, c) = self.sin_cos();
    s / c
  }
  #[inline]
  #[must_use]
  pub fn to_degrees(self) -> Self {
    const_f32_as_f32x4!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
    self * RAD_TO_DEG_RATIO
  }
  #[inline]
  #[must_use]
  pub fn to_radians(self) -> Self {
    const_f32_as_f32x4!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
    self * DEG_TO_RAD_RATIO
  }
  #[inline]
  #[must_use]
  pub fn recip(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: reciprocal_m128(self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_div(f32x4_splat(1.0), self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), self.neon) }}
      } else {
        Self { arr: [
          1.0 / self.arr[0],
          1.0 / self.arr[1],
          1.0 / self.arr[2],
          1.0 / self.arr[3],
        ]}
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn recip_sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: reciprocal_sqrt_m128(self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.simd)) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), vsqrtq_f32(self.neon)) }}
      } else if #[cfg(feature="std")] {
        Self { arr: [
          1.0 / self.arr[0].sqrt(),
          1.0 / self.arr[1].sqrt(),
          1.0 / self.arr[2].sqrt(),
          1.0 / self.arr[3].sqrt(),
        ]}
      } else {
        Self { arr: [
          1.0 / software_sqrt(self.arr[0] as f64) as f32,
          1.0 / software_sqrt(self.arr[1] as f64) as f32,
          1.0 / software_sqrt(self.arr[2] as f64) as f32,
          1.0 / software_sqrt(self.arr[3] as f64) as f32,
        ]}
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: sqrt_m128(self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_sqrt(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vsqrtq_f32(self.neon) }}
      } else if #[cfg(feature="std")] {
        Self { arr: [
          self.arr[0].sqrt(),
          self.arr[1].sqrt(),
          self.arr[2].sqrt(),
          self.arr[3].sqrt(),
        ]}
      } else {
        Self { arr: [
          software_sqrt(self.arr[0] as f64) as f32,
          software_sqrt(self.arr[1] as f64) as f32,
          software_sqrt(self.arr[2] as f64) as f32,
          software_sqrt(self.arr[3] as f64) as f32,
        ]}
      }
    }
  }

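  /// Packs the sign bit of each lane into the low four bits of an `i32`
  /// (lane 0 becomes bit 0, lane 3 becomes bit 3), like `movmskps` on SSE.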
  #[inline]
  #[must_use]
  pub fn move_mask(self) -> i32 {
    pick! {
      if #[cfg(target_feature="sse")] {
        move_mask_m128(self.sse)
      } else if #[cfg(target_feature="simd128")] {
        u32x4_bitmask(self.simd) as i32
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe
        {
          let masked = vcltq_s32(vreinterpretq_s32_f32(self.neon), vdupq_n_s32(0));

          let selectbit: uint32x4_t = core::intrinsics::transmute([1u32, 2, 4, 8]);
          let r = vandq_u32(masked, selectbit);

          vaddvq_u32(r) as i32
        }
      } else {
        (((self.arr[0].to_bits() as i32) < 0) as i32) << 0 |
        (((self.arr[1].to_bits() as i32) < 0) as i32) << 1 |
        (((self.arr[2].to_bits() as i32) < 0) as i32) << 2 |
        (((self.arr[3].to_bits() as i32) < 0) as i32) << 3
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="simd128")] {
        v128_any_true(self.simd)
      } else {
        self.move_mask() != 0
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="simd128")] {
        u32x4_all_true(self.simd)
      } else {
        self.move_mask() == 0b1111
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

  #[inline]
  fn vm_pow2n(self) -> Self {
    const_f32_as_f32x4!(pow2_23, 8388608.0);
    const_f32_as_f32x4!(bias, 127.0);
    let a = self + (bias + pow2_23);
    let c = cast::<_, i32x4>(a) << 23;
    cast::<_, f32x4>(c)
  }

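  /// Calculates `e^x` for each lane with a polynomial approximation. Lanes
  /// whose magnitude is at or above roughly 87.3, as well as non-finite
  /// lanes, come back as 0.0 rather than overflowing.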
  #[inline]
  #[must_use]
  pub fn exp(self) -> Self {
    const_f32_as_f32x4!(P0, 1.0 / 2.0);
    const_f32_as_f32x4!(P1, 1.0 / 6.0);
    const_f32_as_f32x4!(P2, 1. / 24.);
    const_f32_as_f32x4!(P3, 1. / 120.);
    const_f32_as_f32x4!(P4, 1. / 720.);
    const_f32_as_f32x4!(P5, 1. / 5040.);
    const_f32_as_f32x4!(LN2D_HI, 0.693359375);
    const_f32_as_f32x4!(LN2D_LO, -2.12194440e-4);
    let max_x = f32x4::from(87.3);
    let r = (self * Self::LOG2_E).round();
    let x = r.mul_neg_add(LN2D_HI, self);
    let x = r.mul_neg_add(LN2D_LO, x);
    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
    let x2 = x * x;
    let z = z.mul_add(x2, x);
    let n2 = Self::vm_pow2n(r);
    let z = (z + Self::ONE) * n2;
    let in_range = self.abs().cmp_lt(max_x);
    let in_range = in_range & self.is_finite();
    in_range.blend(z, Self::ZERO)
  }

  #[inline]
  fn exponent(self) -> f32x4 {
    const_f32_as_f32x4!(pow2_23, 8388608.0);
    const_f32_as_f32x4!(bias, 127.0);
    let a = cast::<_, u32x4>(self);
    let b = a >> 23;
    let c = b | cast::<_, u32x4>(pow2_23);
    let d = cast::<_, f32x4>(c);
    let e = d - (pow2_23 + bias);
    e
  }

  #[inline]
  fn fraction_2(self) -> Self {
    let t1 = cast::<_, u32x4>(self);
    let t2 = cast::<_, u32x4>(
      (t1 & u32x4::from(0x007FFFFF)) | u32x4::from(0x3F000000),
    );
    cast::<_, f32x4>(t2)
  }
  #[inline]
  fn is_zero_or_subnormal(self) -> Self {
    let t = cast::<_, i32x4>(self);
    let t = t & i32x4::splat(0x7F800000);
    i32x4::round_float(t.cmp_eq(i32x4::splat(0)))
  }
  #[inline]
  fn infinity() -> Self {
    cast::<_, f32x4>(i32x4::splat(0x7F800000))
  }
  #[inline]
  fn nan_log() -> Self {
    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
  }
  #[inline]
  fn nan_pow() -> Self {
    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
  }
  #[inline]
  pub fn sign_bit(self) -> Self {
    let t1 = cast::<_, i32x4>(self);
    let t2 = t1 >> 31;
    !cast::<_, f32x4>(t2).cmp_eq(f32x4::ZERO)
  }

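  /// Horizontal add: the sum of all four lanes as a single `f32`.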
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> f32 {
    let arr: [f32; 4] = cast(self);
    arr.iter().sum()
  }

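  /// Natural logarithm of each lane, computed as a polynomial over the
  /// mantissa plus a scaled exponent term. Inputs outside the normal
  /// positive range (zero, subnormal, negative, or non-finite) are handled
  /// by the special-case branch at the end instead of the polynomial path.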
  #[inline]
  #[must_use]
  pub fn ln(self) -> Self {
    const_f32_as_f32x4!(HALF, 0.5);
    const_f32_as_f32x4!(P0, 3.3333331174E-1);
    const_f32_as_f32x4!(P1, -2.4999993993E-1);
    const_f32_as_f32x4!(P2, 2.0000714765E-1);
    const_f32_as_f32x4!(P3, -1.6668057665E-1);
    const_f32_as_f32x4!(P4, 1.4249322787E-1);
    const_f32_as_f32x4!(P5, -1.2420140846E-1);
    const_f32_as_f32x4!(P6, 1.1676998740E-1);
    const_f32_as_f32x4!(P7, -1.1514610310E-1);
    const_f32_as_f32x4!(P8, 7.0376836292E-2);
    const_f32_as_f32x4!(LN2F_HI, 0.693359375);
    const_f32_as_f32x4!(LN2F_LO, -2.12194440e-4);
    const_f32_as_f32x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);

    let x1 = self;
    let x = Self::fraction_2(x1);
    let e = Self::exponent(x1);
    let mask = x.cmp_gt(Self::SQRT_2 * HALF);
    let x = (!mask).blend(x + x, x);
    let fe = mask.blend(e + Self::ONE, e);
    let x = x - Self::ONE;
    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
    let x2 = x * x;
    let res = x2 * x * res;
    let res = fe.mul_add(LN2F_LO, res);
    let res = res + x2.mul_neg_add(HALF, x);
    let res = fe.mul_add(LN2F_HI, res);
    let overflow = !self.is_finite();
    let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL);
    let mask = overflow | underflow;
    if !mask.any() {
      res
    } else {
      let is_zero = self.is_zero_or_subnormal();
      let res = underflow.blend(Self::nan_log(), res);
      let res = is_zero.blend(Self::infinity(), res);
      let res = overflow.blend(self, res);
      res
    }
  }

  #[inline]
  #[must_use]
  pub fn log2(self) -> Self {
    Self::ln(self) * Self::LOG2_E
  }
  #[inline]
  #[must_use]
  pub fn log10(self) -> Self {
    Self::ln(self) * Self::LOG10_E
  }

  #[inline]
  #[must_use]
  pub fn pow_f32x4(self, y: f32x4) -> Self {
    const_f32_as_f32x4!(ln2f_hi, 0.693359375);
    const_f32_as_f32x4!(ln2f_lo, -2.12194440e-4);
    const_f32_as_f32x4!(P0logf, 3.3333331174E-1);
    const_f32_as_f32x4!(P1logf, -2.4999993993E-1);
    const_f32_as_f32x4!(P2logf, 2.0000714765E-1);
    const_f32_as_f32x4!(P3logf, -1.6668057665E-1);
    const_f32_as_f32x4!(P4logf, 1.4249322787E-1);
    const_f32_as_f32x4!(P5logf, -1.2420140846E-1);
    const_f32_as_f32x4!(P6logf, 1.1676998740E-1);
    const_f32_as_f32x4!(P7logf, -1.1514610310E-1);
    const_f32_as_f32x4!(P8logf, 7.0376836292E-2);

    const_f32_as_f32x4!(p2expf, 1.0 / 2.0);
    const_f32_as_f32x4!(p3expf, 1.0 / 6.0);
    const_f32_as_f32x4!(p4expf, 1.0 / 24.0);
    const_f32_as_f32x4!(p5expf, 1.0 / 120.0);
    const_f32_as_f32x4!(p6expf, 1.0 / 720.0);
    const_f32_as_f32x4!(p7expf, 1.0 / 5040.0);

    let x1 = self.abs();
    let x = x1.fraction_2();

    let mask = x.cmp_gt(f32x4::SQRT_2 * f32x4::HALF);
    let x = (!mask).blend(x + x, x);

    let x = x - f32x4::ONE;
    let x2 = x * x;
    let lg1 = polynomial_8!(
      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
    );
    let lg1 = lg1 * x2 * x;

    let ef = x1.exponent();
    let ef = mask.blend(ef + f32x4::ONE, ef);

    let e1 = (ef * y).round();
    let yr = ef.mul_sub(y, e1);

    let lg = f32x4::HALF.mul_neg_add(x2, x) + lg1;
    let x2_err = (f32x4::HALF * x).mul_sub(x, f32x4::HALF * x2);
    let lg_err = f32x4::HALF.mul_add(x2, lg - x) - lg1;

    let e2 = (lg * y * f32x4::LOG2_E).round();
    let v = lg.mul_sub(y, e2 * ln2f_hi);
    let v = e2.mul_neg_add(ln2f_lo, v);
    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x4::LN_2);

    let x = v;
    let e3 = (x * f32x4::LOG2_E).round();
    let x = e3.mul_neg_add(f32x4::LN_2, x);
    let x2 = x * x;
    let z = x2.mul_add(
      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
      x + f32x4::ONE,
    );

    let ee = e1 + e2 + e3;
    let ei = cast::<_, i32x4>(ee.round_int());
    let ej = cast::<_, i32x4>(ei + (cast::<_, i32x4>(z) >> 23));

    let overflow = cast::<_, f32x4>(ej.cmp_gt(i32x4::splat(0x0FF)))
      | (ee.cmp_gt(f32x4::splat(300.0)));
    let underflow = cast::<_, f32x4>(ej.cmp_lt(i32x4::splat(0x000)))
      | (ee.cmp_lt(f32x4::splat(-300.0)));

    let z = cast::<_, f32x4>(cast::<_, i32x4>(z) + (ei << 23));

    let z = if (overflow | underflow).any() {
      let z = underflow.blend(f32x4::ZERO, z);
      overflow.blend(Self::infinity(), z)
    } else {
      z
    };

    let x_zero = self.is_zero_or_subnormal();
    let z = x_zero.blend(
      y.cmp_lt(f32x4::ZERO).blend(
        Self::infinity(),
        y.cmp_eq(f32x4::ZERO).blend(f32x4::ONE, f32x4::ZERO),
      ),
      z,
    );

    let x_sign = self.sign_bit();
    let z = if x_sign.any() {
      let yi = y.cmp_eq(y.round());
      let y_odd = cast::<_, i32x4>(y.round_int() << 31).round_float();

      let z1 =
        yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow()));
      x_sign.blend(z1, z)
    } else {
      z
    };

    let x_finite = self.is_finite();
    let y_finite = y.is_finite();
    let e_finite = ee.is_finite();
    if (x_finite & y_finite & (e_finite | x_zero)).all() {
      return z;
    }

    (self.is_nan() | y.is_nan()).blend(self + y, z)
  }

  #[inline]
  pub fn powf(self, y: f32) -> Self {
    Self::pow_f32x4(self, f32x4::splat(y))
  }

  #[inline]
  pub fn to_array(self) -> [f32; 4] {
    cast(self)
  }

  #[inline]
  pub fn as_array_ref(&self) -> &[f32; 4] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_array_mut(&mut self) -> &mut [f32; 4] {
    cast_mut(self)
  }

  #[inline]
  pub fn from_i32x4(v: i32x4) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: convert_to_m128_from_i32_m128i(v.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_convert_i32x4(v.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        Self { neon: unsafe { vcvtq_f32_s32(v.neon) }}
      } else {
        Self { arr: [
          v.as_array_ref()[0] as f32,
          v.as_array_ref()[1] as f32,
          v.as_array_ref()[2] as f32,
          v.as_array_ref()[3] as f32,
        ] }
      }
    }
  }
}