// safe_arch/x86_x64/sse.rs

#![cfg(target_feature = "sse")]

use super::*;

/// Fetches the cache line containing `addr` into all levels of the cache
/// hierarchy.
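///
/// Prefetching is only a performance hint. A minimal sketch of typical use,
/// requesting data a little ahead of the current position in a scan (the
/// look-ahead distance of 8 here is an illustrative guess, not a tuned value):
/// ```
/// # use safe_arch::*;
/// let data = [0.0_f32; 64];
/// let mut sum = 0.0;
/// for i in 0..data.len() {
///   if i + 8 < data.len() {
///     prefetch_t0(&data[i + 8]); // hint that we'll read this soon
///   }
///   sum += data[i];
/// }
/// assert_eq!(sum, 0.0);
/// ```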
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn prefetch_t0<T>(addr: &T) {
  unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_T0) }
}

/// Fetches into L2 and higher.
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn prefetch_t1<T>(addr: &T) {
  unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_T1) }
}

/// Fetches into L3 and higher or an implementation-specific choice (e.g., L2 if
/// there is no L3).
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn prefetch_t2<T>(addr: &T) {
  unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_T2) }
}

/// Fetches data using the non-temporal access (NTA) hint. The data may be
/// placed somewhere closer than main memory but outside of the cache
/// hierarchy. This is used to reduce access latency without polluting the
/// cache.
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn prefetch_nta<T>(addr: &T) {
  unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_NTA) }
}

/// Fetches the cache line containing `addr` into all levels of the cache
/// hierarchy, anticipating a write.
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn prefetch_et0<T>(addr: &T) {
  unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_ET0) }
}

/// Fetches into L2 and higher, anticipating a write.
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn prefetch_et1<T>(addr: &T) {
  unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_ET1) }
}

/// Lanewise `a + b`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]);
/// let c = add_m128(a, b).to_array();
/// assert_eq!(c, [6.0, 8.0, 10.0, 12.5]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn add_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_add_ps(a.0, b.0) })
}

/// Low lane `a + b`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]);
/// let c = add_m128_s(a, b).to_array();
/// assert_eq!(c, [6.0, 2.0, 3.0, 4.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn add_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_add_ss(a.0, b.0) })
}

/// Bitwise `a & b`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]);
/// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]);
/// let c = bitand_m128(a, b).to_array();
/// assert_eq!(c, [1.0, 0.0, 0.0, 0.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn bitand_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_and_ps(a.0, b.0) })
}

/// Bitwise `(!a) & b`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]);
/// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]);
/// let c = bitandnot_m128(a, b).to_array();
/// assert_eq!(c, [0.0, 1.0, 0.0, 0.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn bitandnot_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_andnot_ps(a.0, b.0) })
}

/// Lanewise `a == b`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]);
/// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]);
/// let c = cmp_eq_mask_m128(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, 0, 0, u32::MAX]);
/// ```
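///
/// Masks like this are typically combined with the bitwise ops in this module
/// for a branchless select; a small sketch built only from functions defined
/// in this file:
/// ```
/// # use safe_arch::*;
/// let mask = cmp_eq_mask_m128(
///   m128::from_array([1.0, 0.0, 1.0, 0.0]),
///   m128::from_array([1.0, 1.0, 0.0, 0.0]),
/// );
/// let if_true = set_splat_m128(5.0);
/// let if_false = set_splat_m128(6.0);
/// // (mask & if_true) | (!mask & if_false)
/// let picked = bitor_m128(
///   bitand_m128(mask, if_true),
///   bitandnot_m128(mask, if_false),
/// );
/// assert_eq!(picked.to_array(), [5.0, 6.0, 6.0, 5.0]);
/// ```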
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_eq_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpeq_ps(a.0, b.0) })
}

/// Low lane `a == b`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]);
/// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]);
/// let c = cmp_eq_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, 0, 1_f32.to_bits(), 0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_eq_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpeq_ss(a.0, b.0) })
}

/// Lanewise `a >= b`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_ge_mask_m128(a, b).to_bits();
/// assert_eq!(c, [0, u32::MAX, u32::MAX, u32::MAX]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_ge_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpge_ps(a.0, b.0) })
}

/// Low lane `a >= b`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_ge_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_ge_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpge_ss(a.0, b.0) })
}

/// Lanewise `a > b`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_gt_mask_m128(a, b).to_bits();
/// assert_eq!(c, [0, 0, u32::MAX, u32::MAX]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_gt_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpgt_ps(a.0, b.0) })
}

/// Low lane `a > b`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([2.5, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_gt_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_gt_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpgt_ss(a.0, b.0) })
}

/// Lanewise `a <= b`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_le_mask_m128(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, u32::MAX, 0, 0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_le_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmple_ps(a.0, b.0) })
}

/// Low lane `a <= b`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_le_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_le_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmple_ss(a.0, b.0) })
}

/// Lanewise `a < b`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_lt_mask_m128(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, 0, 0, 0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_lt_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmplt_ps(a.0, b.0) })
}

/// Low lane `a < b`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_lt_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_lt_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmplt_ss(a.0, b.0) })
}

/// Lanewise `a != b`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]);
/// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]);
/// let c = cmp_neq_mask_m128(a, b).to_bits();
/// assert_eq!(c, [0, u32::MAX, u32::MAX, 0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_neq_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpneq_ps(a.0, b.0) })
}

/// Low lane `a != b`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]);
/// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]);
/// let c = cmp_neq_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [0, 0, 1_f32.to_bits(), 0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_neq_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpneq_ss(a.0, b.0) })
}

/// Lanewise `!(a >= b)`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_nge_mask_m128(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, 0, 0, 0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_nge_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpnge_ps(a.0, b.0) })
}

/// Low lane `!(a >= b)`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_nge_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [0, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_nge_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpnge_ss(a.0, b.0) })
}

/// Lanewise `!(a > b)`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_ngt_mask_m128(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, u32::MAX, 0, 0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_ngt_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpngt_ps(a.0, b.0) })
}

/// Low lane `!(a > b)`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([2.5, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_ngt_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [0, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_ngt_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpngt_ss(a.0, b.0) })
}

/// Lanewise `!(a <= b)`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_nle_mask_m128(a, b).to_bits();
/// assert_eq!(c, [0, 0, u32::MAX, u32::MAX]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_nle_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpnle_ps(a.0, b.0) })
}

/// Low lane `!(a <= b)`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_nle_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [0, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_nle_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpnle_ss(a.0, b.0) })
}

/// Lanewise `!(a < b)`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_nlt_mask_m128(a, b).to_bits();
/// assert_eq!(c, [0, u32::MAX, u32::MAX, u32::MAX]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_nlt_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpnlt_ps(a.0, b.0) })
}

/// Low lane `!(a < b)`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]);
/// let c = cmp_nlt_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [0, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_nlt_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpnlt_ss(a.0, b.0) })
}

/// Lanewise `(!a.is_nan()) & (!b.is_nan())`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([0.0, f32::NAN, 0.0, f32::NAN]);
/// let b = m128::from_array([0.0, 0.0, f32::NAN, f32::NAN]);
/// let c = cmp_ordered_mask_m128(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, 0, 0, 0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_ordered_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpord_ps(a.0, b.0) })
}

/// Low lane `(!a.is_nan()) & (!b.is_nan())`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([0.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([0.0, f32::NAN, f32::NAN, f32::NAN]);
/// let c = cmp_ordered_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_ordered_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpord_ss(a.0, b.0) })
}

/// Lanewise `a.is_nan() | b.is_nan()`.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([0.0, f32::NAN, 0.0, f32::NAN]);
/// let b = m128::from_array([0.0, 0.0, f32::NAN, f32::NAN]);
/// let c = cmp_unord_mask_m128(a, b).to_bits();
/// assert_eq!(c, [0, u32::MAX, u32::MAX, u32::MAX]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_unord_mask_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpunord_ps(a.0, b.0) })
}

/// Low lane `a.is_nan() | b.is_nan()`, other lanes unchanged.
///
/// Mask output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([0.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([0.0, f32::NAN, f32::NAN, f32::NAN]);
/// let c = cmp_unord_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [0, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_unord_mask_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_cmpunord_ss(a.0, b.0) })
}

/// Low lane equality.
///
/// `i32` output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]);
/// assert_eq!(1_i32, cmp_eq_i32_m128_s(a, b));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_eq_i32_m128_s(a: m128, b: m128) -> i32 {
  unsafe { _mm_comieq_ss(a.0, b.0) }
}

/// Low lane greater than or equal to.
///
/// `i32` output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]);
/// assert_eq!(1_i32, cmp_ge_i32_m128_s(a, b));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_ge_i32_m128_s(a: m128, b: m128) -> i32 {
  unsafe { _mm_comige_ss(a.0, b.0) }
}

/// Low lane greater than.
///
/// `i32` output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]);
/// assert_eq!(1_i32, cmp_gt_i32_m128_s(a, b));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_gt_i32_m128_s(a: m128, b: m128) -> i32 {
  unsafe { _mm_comigt_ss(a.0, b.0) }
}

/// Low lane less than or equal to.
///
/// `i32` output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([0.5, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]);
/// assert_eq!(1_i32, cmp_le_i32_m128_s(a, b));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_le_i32_m128_s(a: m128, b: m128) -> i32 {
  unsafe { _mm_comile_ss(a.0, b.0) }
}

/// Low lane less than.
///
/// `i32` output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([0.5, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]);
/// assert_eq!(1_i32, cmp_lt_i32_m128_s(a, b));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_lt_i32_m128_s(a: m128, b: m128) -> i32 {
  unsafe { _mm_comilt_ss(a.0, b.0) }
}

/// Low lane not equal to.
///
/// `i32` output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]);
/// assert_eq!(0_i32, cmp_neq_i32_m128_s(a, b));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn cmp_neq_i32_m128_s(a: m128, b: m128) -> i32 {
  unsafe { _mm_comineq_ss(a.0, b.0) }
}

/// Convert `i32` to `f32` and replace the low lane of the input.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = convert_i32_replace_m128_s(a, 5_i32).to_array();
/// assert_eq!(b, [5.0, 2.0, 3.0, 4.0]);
/// ```
/// * **Intrinsic:** [`_mm_cvtsi32_ss`]
/// * **Assembly:** `cvtsi2ss xmm, r32`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn convert_i32_replace_m128_s(a: m128, i: i32) -> m128 {
  m128(unsafe { _mm_cvtsi32_ss(a.0, i) })
}

/// Convert `i64` to `f32` and replace the low lane of the input.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = convert_i64_replace_m128_s(a, 5_i64).to_array();
/// assert_eq!(b, [5.0, 2.0, 3.0, 4.0]);
/// ```
/// * **Intrinsic:** [`_mm_cvtsi64_ss`]
/// * **Assembly:** `cvtsi2ss xmm, r64`
#[must_use]
#[inline(always)]
#[cfg(target_arch = "x86_64")]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn convert_i64_replace_m128_s(a: m128, i: i64) -> m128 {
  m128(unsafe { _mm_cvtsi64_ss(a.0, i) })
}

/// Gets the low lane as an individual `f32` value.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// assert_eq!(1_f32, get_f32_from_m128_s(a));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn get_f32_from_m128_s(a: m128) -> f32 {
  unsafe { _mm_cvtss_f32(a.0) }
}

/// Converts the low lane to `i32` and extracts as an individual value.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// assert_eq!(1_i32, get_i32_from_m128_s(a));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn get_i32_from_m128_s(a: m128) -> i32 {
  unsafe { _mm_cvtss_si32(a.0) }
}

/// Converts the low lane to `i64` and extracts as an individual value.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// assert_eq!(1_i64, get_i64_from_m128_s(a));
/// ```
#[must_use]
#[inline(always)]
#[cfg(target_arch = "x86_64")]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn get_i64_from_m128_s(a: m128) -> i64 {
  unsafe { _mm_cvtss_si64(a.0) }
}

/// Lanewise `a / b`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let b = m128::from_array([2.0, 6.0, 13.0, 2.0]);
/// let c = div_m128(a, b).to_array();
/// assert_eq!(c, [5.0, 2.0, 1.0, 7.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn div_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_div_ps(a.0, b.0) })
}

/// Low lane `a / b`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let b = m128::from_array([2.0, 6.0, 13.0, 2.0]);
/// let c = div_m128_s(a, b).to_array();
/// assert_eq!(c, [5.0, 12.0, 13.0, 14.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn div_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_div_ss(a.0, b.0) })
}

/// Loads the reference into a register.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let b = load_m128(&a);
/// assert_eq!(a.to_bits(), b.to_bits());
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn load_m128(a: &m128) -> m128 {
  m128(unsafe { _mm_load_ps(a as *const m128 as *const f32) })
}

/// Loads the `f32` reference into all lanes of a register.
/// ```
/// # use safe_arch::*;
/// let a = 1.0;
/// let b = load_f32_splat_m128(&a);
/// assert_eq!(m128::from_array([1.0, 1.0, 1.0, 1.0]).to_bits(), b.to_bits());
/// ```
#[must_use]
#[inline(always)]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn load_f32_splat_m128(a: &f32) -> m128 {
  // question: how is this different from _mm_broadcast_ss?
  m128(unsafe { _mm_load_ps1(a) })
}

/// Loads the `f32` reference into the low lane of the register.
/// ```
/// # use safe_arch::*;
/// let a = 1.0;
/// let b = load_f32_m128_s(&a);
/// assert_eq!(m128::from_array([1.0, 0.0, 0.0, 0.0]).to_bits(), b.to_bits());
/// ```
#[must_use]
#[inline(always)]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn load_f32_m128_s(a: &f32) -> m128 {
  m128(unsafe { _mm_load_ss(a) })
}

/// Loads the reference into a register with reversed order.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let b = load_reverse_m128(&a);
/// assert_eq!(m128::from_array([14.0, 13.0, 12.0, 10.0]).to_bits(), b.to_bits());
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn load_reverse_m128(a: &m128) -> m128 {
  m128(unsafe { _mm_loadr_ps(a as *const m128 as *const f32) })
}

/// Loads the reference into a register.
///
/// This generally has no speed penalty if the reference happens to be 16-byte
/// aligned, but there is a slight speed penalty if the reference is only 4-byte
/// aligned.
/// ```
/// # use safe_arch::*;
/// let a = [10.0, 12.0, 13.0, 14.0];
/// let b = load_unaligned_m128(&a);
/// assert_eq!(m128::from_array(a).to_bits(), b.to_bits());
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn load_unaligned_m128(a: &[f32; 4]) -> m128 {
  m128(unsafe { _mm_loadu_ps(a as *const [f32; 4] as *const f32) })
}

/// Lanewise `max(a, b)`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]);
/// let c = max_m128(a, b).to_array();
/// assert_eq!(c, [5.0, 12.0, 7.0, 8.5]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn max_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_max_ps(a.0, b.0) })
}

/// Low lane `max(a, b)`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]);
/// let c = max_m128_s(a, b).to_array();
/// assert_eq!(c, [5.0, 12.0, 3.0, 4.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn max_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_max_ss(a.0, b.0) })
}

/// Lanewise `min(a, b)`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]);
/// let c = min_m128(a, b).to_array();
/// assert_eq!(c, [1.0, 6.0, 3.0, 4.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn min_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_min_ps(a.0, b.0) })
}

/// Low lane `min(a, b)`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]);
/// let b = m128::from_array([0.0, 6.0, 7.0, 8.5]);
/// let c = min_m128_s(a, b).to_array();
/// assert_eq!(c, [0.0, 12.0, 3.0, 4.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn min_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_min_ss(a.0, b.0) })
}

/// Move the low lane of `b` to `a`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]);
/// let b = m128::from_array([8.0, 6.0, 7.0, 8.5]);
/// let c = move_m128_s(a, b).to_array();
/// assert_eq!(c, [8.0, 12.0, 3.0, 4.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn move_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_move_ss(a.0, b.0) })
}

/// Move the high lanes of `b` to the low lanes of `a`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]);
/// let b = m128::from_array([8.0, 6.0, 7.0, 8.5]);
/// let c = move_high_low_m128(a, b).to_array();
/// assert_eq!(c, [7.0, 8.5, 3.0, 4.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn move_high_low_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_movehl_ps(a.0, b.0) })
}

/// Move the low lanes of `b` to the high lanes of `a`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]);
/// let b = m128::from_array([8.0, 6.0, 7.0, 8.5]);
/// let c = move_low_high_m128(a, b).to_array();
/// assert_eq!(c, [1.0, 12.0, 8.0, 6.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn move_low_high_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_movelh_ps(a.0, b.0) })
}

/// Gathers the sign bit of each lane.
///
/// The output has lane 0 as bit 0, lane 1 as bit 1, and so on.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([-1.0, 12.0, -3.0, -4.0]);
/// let i = move_mask_m128(a);
/// assert_eq!(i, 0b1101);
/// ```
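///
/// Combined with a comparison mask this gives "any lane" / "all lanes" style
/// tests; a small sketch:
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = set_splat_m128(2.5);
/// let gt_mask = move_mask_m128(cmp_gt_mask_m128(a, b));
/// assert!(gt_mask != 0); // at least one lane of `a` is > 2.5
/// assert!(gt_mask != 0b1111); // but not all of them
/// ```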
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn move_mask_m128(a: m128) -> i32 {
  unsafe { _mm_movemask_ps(a.0) }
}

/// Lanewise `a * b`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]);
/// let c = mul_m128(a, b).to_array();
/// assert_eq!(c, [5.0, 12.0, 21.0, 34.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn mul_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_mul_ps(a.0, b.0) })
}

/// Low lane `a * b`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]);
/// let c = mul_m128_s(a, b).to_array();
/// assert_eq!(c, [5.0, 2.0, 3.0, 4.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn mul_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_mul_ss(a.0, b.0) })
}

/// Bitwise `a | b`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]);
/// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]);
/// let c = bitor_m128(a, b).to_array();
/// assert_eq!(c, [1.0, 1.0, 1.0, 0.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn bitor_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_or_ps(a.0, b.0) })
}

/// Lanewise `1.0 / a` approximation.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 4.0, 8.0]);
/// let b = reciprocal_m128(a).to_array();
/// let expected = [1.0, 0.5, 0.25, 0.125];
/// for i in 0..4 {
///   assert!((b[i] - expected[i]).abs() < 0.001);
/// }
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn reciprocal_m128(a: m128) -> m128 {
  m128(unsafe { _mm_rcp_ps(a.0) })
}

/// Low lane `1.0 / a` approximation, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 4.0, 8.0]);
/// let b = reciprocal_m128_s(a).to_array();
/// let expected = [1.0, 2.0, 4.0, 8.0];
/// for i in 0..4 {
///   assert!((b[i] - expected[i]).abs() < 0.001);
/// }
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn reciprocal_m128_s(a: m128) -> m128 {
  m128(unsafe { _mm_rcp_ss(a.0) })
}

/// Lanewise `1.0 / sqrt(a)` approximation.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([16.0, 9.0, 4.0, 25.0]);
/// let b = reciprocal_sqrt_m128(a).to_array();
/// let expected = [0.25, 0.33333, 0.5, 0.2];
/// for i in 0..4 {
///   assert!((b[i] - expected[i]).abs() < 0.001);
/// }
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn reciprocal_sqrt_m128(a: m128) -> m128 {
  m128(unsafe { _mm_rsqrt_ps(a.0) })
}

/// Low lane `1.0 / sqrt(a)` approximation, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([16.0, 8.0, 9.0, 10.0]);
/// let b = reciprocal_sqrt_m128_s(a).to_array();
/// let expected = [0.25, 8.0, 9.0, 10.0];
/// for i in 0..4 {
///   assert!((b[i] - expected[i]).abs() < 0.001);
/// }
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn reciprocal_sqrt_m128_s(a: m128) -> m128 {
  m128(unsafe { _mm_rsqrt_ss(a.0) })
}

/// Sets the args into an `m128`, first arg is the high lane.
/// ```
/// # use safe_arch::*;
/// let a = set_m128(1.0, 2.0, 3.0, 4.0).to_array();
/// let b = m128::from_array([4.0, 3.0, 2.0, 1.0]).to_array();
/// assert_eq!(a, b);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn set_m128(three: f32, two: f32, one: f32, zero: f32) -> m128 {
  m128(unsafe { _mm_set_ps(three, two, one, zero) })
}

/// Sets the given `f32` as the low lane of an `m128`, other lanes zero.
/// ```
/// # use safe_arch::*;
/// let a = set_m128_s(1.0).to_array();
/// let b = m128::from_array([1.0, 0.0, 0.0, 0.0]).to_array();
/// assert_eq!(a, b);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn set_m128_s(low: f32) -> m128 {
  m128(unsafe { _mm_set_ss(low) })
}

/// Splats the value to all lanes.
/// ```
/// # use safe_arch::*;
/// let a = set_splat_m128(1.0).to_array();
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]).to_array();
/// assert_eq!(a, b);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn set_splat_m128(all: f32) -> m128 {
  m128(unsafe { _mm_set1_ps(all) })
}

/// Sets the args into an `m128`, first arg is the low lane.
/// ```
/// # use safe_arch::*;
/// let a = set_reversed_m128(1.0, 2.0, 3.0, 4.0).to_array();
/// let b = m128::from_array([1.0, 2.0, 3.0, 4.0]).to_array();
/// assert_eq!(a, b);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn set_reversed_m128(zero: f32, one: f32, two: f32, three: f32) -> m128 {
  m128(unsafe { _mm_setr_ps(zero, one, two, three) })
}

/// All lanes zero.
/// ```
/// # use safe_arch::*;
/// let a = zeroed_m128().to_array();
/// assert_eq!(a, [0.0, 0.0, 0.0, 0.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn zeroed_m128() -> m128 {
  m128(unsafe { _mm_setzero_ps() })
}

/// Shuffle the `f32` lanes from `a` and `b` together using an immediate
/// control value.
///
/// The low two output lanes come from `a` and the high two output lanes come
/// from `b`. Each two-bit field of `MASK`, from low to high, selects which
/// source lane (0 through 3) feeds the corresponding output lane.
///
/// You can pass the same value as both arguments, but if you want to swizzle
/// within only a single register and you have `avx` available, consider using
/// [`shuffle_ai_f32_all_m128`] instead. You'll get much better performance.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.0]);
/// //
/// let c = shuffle_abi_f32_all_m128::<0>(a, b).to_array();
/// assert_eq!(c, [1.0, 1.0, 5.0, 5.0]);
/// //
/// let c = shuffle_abi_f32_all_m128::<0b11_10_01_00>(a, b).to_array();
/// assert_eq!(c, [1.0, 2.0, 7.0, 8.0]);
/// //
/// let c = shuffle_abi_f32_all_m128::<0b00_10_10_01>(a, b).to_array();
/// assert_eq!(c, [2.0, 3.0, 7.0, 5.0]);
/// ```
/// * **Intrinsic:** [`_mm_shuffle_ps`]
/// * **Assembly:** `shufps xmm, xmm, imm8`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn shuffle_abi_f32_all_m128<const MASK: i32>(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_shuffle_ps(a.0, b.0, MASK) })
}

/// Lanewise `sqrt(a)`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([25.0, 16.0, 9.0, 4.0]);
/// let b = sqrt_m128(a).to_array();
/// assert_eq!(b, [5.0, 4.0, 3.0, 2.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn sqrt_m128(a: m128) -> m128 {
  m128(unsafe { _mm_sqrt_ps(a.0) })
}

/// Low lane `sqrt(a)`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([4.0, 8.0, 7.0, 6.0]);
/// let b = sqrt_m128_s(a).to_array();
/// assert_eq!(b, [2.0, 8.0, 7.0, 6.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn sqrt_m128_s(a: m128) -> m128 {
  m128(unsafe { _mm_sqrt_ss(a.0) })
}

/// Stores the value to the reference given.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let mut b = zeroed_m128();
/// store_m128(&mut b, a);
/// let c = b.to_array();
/// assert_eq!(c, [10.0, 12.0, 13.0, 14.0]);
/// ```
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn store_m128(r: &mut m128, a: m128) {
  unsafe { _mm_store_ps(r as *mut m128 as *mut f32, a.0) }
}

/// Stores the low lane value to the reference given.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let mut f = 0.0;
/// store_m128_s(&mut f, a);
/// assert_eq!(f, 10.0);
/// ```
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn store_m128_s(r: &mut f32, a: m128) {
  unsafe { _mm_store_ss(r as *mut f32, a.0) }
}

/// Stores the low lane value to all lanes of the reference given.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let mut b = zeroed_m128();
/// store_splat_m128(&mut b, a);
/// let c = b.to_array();
/// assert_eq!(c, [10.0, 10.0, 10.0, 10.0]);
/// ```
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn store_splat_m128(r: &mut m128, a: m128) {
  unsafe { _mm_store1_ps(r as *mut m128 as *mut f32, a.0) }
}

/// Stores the value to the reference given in reverse order.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let mut b = zeroed_m128();
/// store_reverse_m128(&mut b, a);
/// let c = b.to_array();
/// assert_eq!(c, [14.0, 13.0, 12.0, 10.0]);
/// ```
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn store_reverse_m128(r: &mut m128, a: m128) {
  unsafe { _mm_storer_ps(r as *mut m128 as *mut f32, a.0) }
}

/// Stores the value to the reference given.
///
/// This generally has no speed penalty if the reference happens to be 16-byte
/// aligned, but there is a slight speed penalty if the reference is only 4-byte
/// aligned.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let mut b = [0.0; 4];
/// store_unaligned_m128(&mut b, a);
/// assert_eq!(b, [10.0, 12.0, 13.0, 14.0]);
/// ```
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn store_unaligned_m128(r: &mut [f32; 4], a: m128) {
  unsafe { _mm_storeu_ps(r.as_mut_ptr(), a.0) }
}

/// Lanewise `a - b`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 8.0, 12.0, 3.0]);
/// let c = sub_m128(a, b).to_array();
/// assert_eq!(c, [-4.0, -6.0, -9.0, 1.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn sub_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_sub_ps(a.0, b.0) })
}

/// Low lane `a - b`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 8.0, 12.0, 3.0]);
/// let c = sub_m128_s(a, b).to_array();
/// assert_eq!(c, [-4.0, 2.0, 3.0, 4.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn sub_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_sub_ss(a.0, b.0) })
}

/// Transpose four `m128` as if they were a 4x4 matrix.
/// ```
/// # use safe_arch::*;
/// let mut a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let mut b = m128::from_array([5.0, 6.0, 7.0, 8.0]);
/// let mut c = m128::from_array([9.0, 10.0, 11.0, 12.0]);
/// let mut d = m128::from_array([13.0, 14.0, 15.0, 16.0]);
/// transpose_four_m128(&mut a, &mut b, &mut c, &mut d);
/// assert_eq!(a.to_array(), [1.0, 5.0, 9.0, 13.0]);
/// assert_eq!(b.to_array(), [2.0, 6.0, 10.0, 14.0]);
/// assert_eq!(c.to_array(), [3.0, 7.0, 11.0, 15.0]);
/// assert_eq!(d.to_array(), [4.0, 8.0, 12.0, 16.0]);
/// ```
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn transpose_four_m128(a: &mut m128, b: &mut m128, c: &mut m128, d: &mut m128) {
  unsafe { _MM_TRANSPOSE4_PS(&mut a.0, &mut b.0, &mut c.0, &mut d.0) }
}

/// Unpack and interleave high lanes of `a` and `b`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.0]);
/// let c = unpack_high_m128(a, b).to_array();
/// assert_eq!(c, [3.0, 7.0, 4.0, 8.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn unpack_high_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_unpackhi_ps(a.0, b.0) })
}

/// Unpack and interleave low lanes of `a` and `b`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.0]);
/// let c = unpack_low_m128(a, b).to_array();
/// assert_eq!(c, [1.0, 5.0, 2.0, 6.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn unpack_low_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_unpacklo_ps(a.0, b.0) })
}

/// Bitwise `a ^ b`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]);
/// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]);
/// let c = bitxor_m128(a, b).to_array();
/// assert_eq!(c, [0.0, 1.0, 1.0, 0.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "sse")))]
pub fn bitxor_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_xor_ps(a.0, b.0) })
}

//
// Here we define the operator overloads for `m128`. Each one just calls the
// correct function from above. By putting the impls here rather than with the
// `m128` type, we could theoretically build the crate safely even with no
// `sse` feature enabled; you'd just have an `m128` type without the operator
// overloads. Not that the standard Rust distribution can build properly
// without `sse` enabled, but maybe you're using a custom target or something.
// It doesn't really put us out of our way, so it doesn't hurt to try and
// accommodate the potential use case.
//

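/// The `Add` operator forwards to [`add_m128`]; a small doc-test sketch of the
/// overload in action (values are arbitrary examples):
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.0]);
/// assert_eq!((a + b).to_array(), [6.0, 8.0, 10.0, 12.0]);
/// ```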
impl Add for m128 {
  type Output = Self;
  #[must_use]
  #[inline(always)]
  fn add(self, rhs: Self) -> Self {
    add_m128(self, rhs)
  }
}
impl AddAssign for m128 {
  #[inline(always)]
  fn add_assign(&mut self, rhs: Self) {
    *self = *self + rhs;
  }
}

impl BitAnd for m128 {
  type Output = Self;
  #[must_use]
  #[inline(always)]
  fn bitand(self, rhs: Self) -> Self {
    bitand_m128(self, rhs)
  }
}
impl BitAndAssign for m128 {
  #[inline(always)]
  fn bitand_assign(&mut self, rhs: Self) {
    *self = *self & rhs;
  }
}

impl BitOr for m128 {
  type Output = Self;
  #[must_use]
  #[inline(always)]
  fn bitor(self, rhs: Self) -> Self {
    bitor_m128(self, rhs)
  }
}
impl BitOrAssign for m128 {
  #[inline(always)]
  fn bitor_assign(&mut self, rhs: Self) {
    *self = *self | rhs;
  }
}

impl BitXor for m128 {
  type Output = Self;
  #[must_use]
  #[inline(always)]
  fn bitxor(self, rhs: Self) -> Self {
    bitxor_m128(self, rhs)
  }
}
impl BitXorAssign for m128 {
  #[inline(always)]
  fn bitxor_assign(&mut self, rhs: Self) {
    *self = *self ^ rhs;
  }
}

impl Div for m128 {
  type Output = Self;
  #[must_use]
  #[inline(always)]
  fn div(self, rhs: Self) -> Self {
    div_m128(self, rhs)
  }
}
impl DivAssign for m128 {
  #[inline(always)]
  fn div_assign(&mut self, rhs: Self) {
    *self = *self / rhs;
  }
}

impl Mul for m128 {
  type Output = Self;
  #[must_use]
  #[inline(always)]
  fn mul(self, rhs: Self) -> Self {
    mul_m128(self, rhs)
  }
}
impl MulAssign for m128 {
  #[inline(always)]
  fn mul_assign(&mut self, rhs: Self) {
    *self = *self * rhs;
  }
}

impl Neg for m128 {
  type Output = Self;
  #[must_use]
  #[inline(always)]
  fn neg(self) -> Self {
    sub_m128(zeroed_m128(), self)
  }
}

impl Not for m128 {
  type Output = Self;
  /// Not a direct intrinsic, but it's very useful and the implementation is
  /// simple enough.
  ///
  /// Negates the bits by performing an `xor` with an all-1s bit pattern.
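  ///
  /// A small illustration (all-zero bits become all-one bits):
  /// ```
  /// # use safe_arch::*;
  /// let a = m128::from_array([0.0, 0.0, 0.0, 0.0]);
  /// assert_eq!((!a).to_bits(), [u32::MAX; 4]);
  /// ```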
  #[must_use]
  #[inline(always)]
  fn not(self) -> Self {
    let all_bits = set_splat_m128(f32::from_bits(u32::MAX));
    self ^ all_bits
  }
}

impl Sub for m128 {
  type Output = Self;
  #[must_use]
  #[inline(always)]
  fn sub(self, rhs: Self) -> Self {
    sub_m128(self, rhs)
  }
}
impl SubAssign for m128 {
  #[inline(always)]
  fn sub_assign(&mut self, rhs: Self) {
    *self = *self - rhs;
  }
}

impl PartialEq for m128 {
  /// Not a direct intrinsic, this is a `cmp_eq_mask` and then a `move_mask`.
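  ///
  /// All four lanes must compare equal for the registers to be equal, and a
  /// lane holding NaN never compares equal (so a register containing a NaN is
  /// not even equal to itself). A quick sketch:
  /// ```
  /// # use safe_arch::*;
  /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
  /// assert!(a == a);
  /// assert!(a != m128::from_array([1.0, 2.0, 3.0, 5.0]));
  /// ```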
  #[must_use]
  #[inline(always)]
  fn eq(&self, other: &Self) -> bool {
    move_mask_m128(cmp_eq_mask_m128(*self, *other)) == 0b1111
  }
}