wide/
i32x8_.rs

use super::*;

pick! {
  if #[cfg(target_feature="avx2")] {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(32))]
    pub struct i32x8 { pub(crate) avx2: m256i }
  } else {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(32))]
    pub struct i32x8 { pub(crate) a : i32x4, pub(crate) b : i32x4 }
  }
}

int_uint_consts!(i32, 8, i32x8, 256);

unsafe impl Zeroable for i32x8 {}
unsafe impl Pod for i32x8 {}
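
// Hedged usage sketch (not part of the upstream source): with the `Zeroable`
// and `Pod` impls above, bytemuck's `cast` can move between `[i32; 8]` and
// `i32x8`, and the type is 32 bytes in both the AVX2 and the fallback layout.
#[cfg(test)]
#[test]
fn example_i32x8_layout_and_cast() {
  assert_eq!(core::mem::size_of::<i32x8>(), 32);
  let v: i32x8 = cast([1_i32, 2, 3, 4, 5, 6, 7, 8]);
  assert_eq!(v.to_array(), [1, 2, 3, 4, 5, 6, 7, 8]);
}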

impl Add for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: add_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.add(rhs.a),
          b : self.b.add(rhs.b),
        }
      }
    }
  }
}

impl Sub for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: sub_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.sub(rhs.a),
          b : self.b.sub(rhs.b),
        }
      }
    }
  }
}

impl Mul for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: mul_i32_keep_low_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.mul(rhs.a),
          b : self.b.mul(rhs.b),
        }
      }
    }
  }
}

impl Add<i32> for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: i32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<i32> for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: i32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<i32> for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: i32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Add<i32x8> for i32 {
  type Output = i32x8;
  #[inline]
  #[must_use]
  fn add(self, rhs: i32x8) -> Self::Output {
    i32x8::splat(self) + rhs
  }
}

impl Sub<i32x8> for i32 {
  type Output = i32x8;
  #[inline]
  #[must_use]
  fn sub(self, rhs: i32x8) -> Self::Output {
    i32x8::splat(self) - rhs
  }
}

impl Mul<i32x8> for i32 {
  type Output = i32x8;
  #[inline]
  #[must_use]
  fn mul(self, rhs: i32x8) -> Self::Output {
    i32x8::splat(self) * rhs
  }
}
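
// Hedged usage sketch: the operators above work lane-wise, and the scalar
// overloads splat the scalar across all eight lanes first.
#[cfg(test)]
#[test]
fn example_i32x8_arithmetic() {
  let a = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
  let b = i32x8::splat(10);
  assert_eq!((a + b).to_array(), [11, 12, 13, 14, 15, 16, 17, 18]);
  assert_eq!((a * 2).to_array(), [2, 4, 6, 8, 10, 12, 14, 16]);
  assert_eq!((100 - a).to_array(), [99, 98, 97, 96, 95, 94, 93, 92]);
}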

impl BitAnd for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitand_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitand(rhs.a),
          b : self.b.bitand(rhs.b),
        }
      }
    }
  }
}

impl BitOr for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitor_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitor(rhs.a),
          b : self.b.bitor(rhs.b),
        }
      }
    }
  }
}

impl BitXor for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitxor_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitxor(rhs.a),
          b : self.b.bitxor(rhs.b),
        }
      }
    }
  }
}
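
// Hedged usage sketch: the bitwise operators are lane-wise and are most often
// combined with the all-ones / all-zeros masks produced by the comparisons
// further down.
#[cfg(test)]
#[test]
fn example_i32x8_bitwise() {
  let a = i32x8::new([0b1100, 0b1010, 0, -1, 0b1100, 0b1010, 0, -1]);
  let b = i32x8::splat(0b1010);
  assert_eq!((a & b).to_array(), [0b1000, 0b1010, 0, 0b1010, 0b1000, 0b1010, 0, 0b1010]);
  assert_eq!((a ^ b).to_array(), [0b0110, 0, 0b1010, !0b1010, 0b0110, 0, 0b1010, !0b1010]);
}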

macro_rules! impl_shl_t_for_i32x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shl<$shift_type> for i32x8 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      #[must_use]
      fn shl(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="avx2")] {
            let shift = cast([rhs as u64, 0]);
            Self { avx2: shl_all_u32_m256i(self.avx2, shift) }
          } else {
            Self {
              a : self.a.shl(rhs),
              b : self.b.shl(rhs),
            }
          }
        }
      }
    })+
  };
}
impl_shl_t_for_i32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

macro_rules! impl_shr_t_for_i32x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shr<$shift_type> for i32x8 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      #[must_use]
      fn shr(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="avx2")] {
            let shift = cast([rhs as u64, 0]);
            Self { avx2: shr_all_i32_m256i(self.avx2, shift) }
          } else {
            Self {
              a : self.a.shr(rhs),
              b : self.b.shr(rhs),
            }
          }
        }
      }
    })+
  };
}

impl_shr_t_for_i32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
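
// Hedged usage sketch: the scalar-shift impls generated above shift every
// lane by the same amount; the right shift is arithmetic because the lanes
// are signed.
#[cfg(test)]
#[test]
fn example_i32x8_shift_by_scalar() {
  let v = i32x8::new([1, -2, 3, -4, 5, -6, 7, -8]);
  assert_eq!((v << 1_u32).to_array(), [2, -4, 6, -8, 10, -12, 14, -16]);
  assert_eq!((v >> 1_u8).to_array(), [0, -1, 1, -2, 2, -3, 3, -4]);
}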

/// Shifts lanes by the corresponding lane.
///
/// Bitwise shift-right; yields `self >> mask(rhs)`, where mask removes any
/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
/// of the type. (same as `wrapping_shr`)
impl Shr<i32x8> for i32x8 {
  type Output = Self;

  #[inline]
  #[must_use]
  fn shr(self, rhs: i32x8) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // ensure same behavior as scalar wrapping_shr by masking the shift count
        let shift_by = bitand_m256i(rhs.avx2, set_splat_i32_m256i(31));
        Self { avx2: shr_each_i32_m256i(self.avx2, shift_by) }
      } else {
        Self {
          a : self.a.shr(rhs.a),
          b : self.b.shr(rhs.b),
        }
      }
    }
  }
}
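
// Hedged sketch of the masking rule documented above: a per-lane count of 33
// is reduced to 33 & 31 == 1, matching `i32::wrapping_shr`.
#[cfg(test)]
#[test]
fn example_i32x8_shr_per_lane_masks_count() {
  let v = i32x8::splat(-8);
  assert_eq!((v >> i32x8::splat(33)).to_array(), (v >> i32x8::splat(1)).to_array());
  assert_eq!((v >> i32x8::splat(1)).to_array(), [-4; 8]);
}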

/// Shifts lanes by the corresponding lane.
///
/// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any
/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
/// of the type. (same as `wrapping_shl`)
impl Shl<i32x8> for i32x8 {
  type Output = Self;

  #[inline]
  #[must_use]
  fn shl(self, rhs: i32x8) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // ensure same behavior as scalar wrapping_shl by masking the shift count
        let shift_by = bitand_m256i(rhs.avx2, set_splat_i32_m256i(31));
        // shl is the same for unsigned and signed
        Self { avx2: shl_each_u32_m256i(self.avx2, shift_by) }
      } else {
        Self {
          a : self.a.shl(rhs.a),
          b : self.b.shl(rhs.b),
        }
      }
    }
  }
}
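
// Hedged usage sketch: each lane is shifted by the count in the corresponding
// lane of `rhs`, so a single `1` fans out into powers of two.
#[cfg(test)]
#[test]
fn example_i32x8_shl_per_lane() {
  let ones = i32x8::splat(1);
  let counts = i32x8::new([0, 1, 2, 3, 4, 5, 6, 7]);
  assert_eq!((ones << counts).to_array(), [1, 2, 4, 8, 16, 32, 64, 128]);
}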

impl CmpEq for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: cmp_eq_mask_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.cmp_eq(rhs.a),
          b : self.b.cmp_eq(rhs.b),
        }
      }
    }
  }
}

impl CmpGt for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: cmp_gt_mask_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.cmp_gt(rhs.a),
          b : self.b.cmp_gt(rhs.b),
        }
      }
    }
  }
}

impl CmpLt for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // there is no direct less-than mask; since the gt and eq masks are
        // disjoint, (!gt) ^ eq == !(gt | eq), which is the lt mask.
        Self { avx2: !cmp_gt_mask_i32_m256i(self.avx2, rhs.avx2) ^ cmp_eq_mask_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.cmp_lt(rhs.a),
          b : self.b.cmp_lt(rhs.b),
        }
      }
    }
  }
}
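
// Hedged usage sketch: the comparisons return a lane mask, all ones (`-1`)
// where the predicate holds and `0` where it does not.
#[cfg(test)]
#[test]
fn example_i32x8_comparisons() {
  let a = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
  let b = i32x8::splat(4);
  assert_eq!(a.cmp_eq(b).to_array(), [0, 0, 0, -1, 0, 0, 0, 0]);
  assert_eq!(a.cmp_gt(b).to_array(), [0, 0, 0, 0, -1, -1, -1, -1]);
  assert_eq!(a.cmp_lt(b).to_array(), [-1, -1, -1, 0, 0, 0, 0, 0]);
}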

impl From<i16x8> for i32x8 {
  #[inline]
  #[must_use]
  fn from(value: i16x8) -> Self {
    i32x8::from_i16x8(value)
  }
}
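
// Hedged usage sketch (assuming `i16x8::new` mirrors `i32x8::new`): the
// `From` impl widens each lane with sign extension via `from_i16x8` below.
#[cfg(test)]
#[test]
fn example_i32x8_from_i16x8() {
  let narrow = i16x8::new([-1, -2, 3, 4, -5, 6, -7, 8]);
  assert_eq!(i32x8::from(narrow).to_array(), [-1, -2, 3, 4, -5, 6, -7, 8]);
}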

// Hedged usage sketches for the inherent methods below are collected in the
// `#[cfg(test)]` module at the end of this file.
impl i32x8 {
  #[inline]
  #[must_use]
  pub const fn new(array: [i32; 8]) -> Self {
    unsafe { core::mem::transmute(array) }
  }

  /// widens and sign extends to `i32x8`
  #[inline]
  #[must_use]
  pub fn from_i16x8(v: i16x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        i32x8 { avx2: convert_to_i32_m256i_from_i16_m128i(v.sse) }
      } else if #[cfg(target_feature="sse2")] {
        i32x8 {
          a: i32x4 { sse: shr_imm_i32_m128i::<16>(unpack_low_i16_m128i(v.sse, v.sse)) },
          b: i32x4 { sse: shr_imm_i32_m128i::<16>(unpack_high_i16_m128i(v.sse, v.sse)) },
        }
      } else {
        i32x8::new([
          i32::from(v.as_array_ref()[0]),
          i32::from(v.as_array_ref()[1]),
          i32::from(v.as_array_ref()[2]),
          i32::from(v.as_array_ref()[3]),
          i32::from(v.as_array_ref()[4]),
          i32::from(v.as_array_ref()[5]),
          i32::from(v.as_array_ref()[6]),
          i32::from(v.as_array_ref()[7]),
        ])
      }
    }
  }

  /// widens and zero extends to `i32x8`
  #[inline]
  #[must_use]
  pub fn from_u16x8(v: u16x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        i32x8 { avx2: convert_to_i32_m256i_from_u16_m128i(v.sse) }
      } else if #[cfg(target_feature="sse2")] {
        i32x8 {
          a: i32x4 { sse: shr_imm_u32_m128i::<16>(unpack_low_i16_m128i(v.sse, v.sse)) },
          b: i32x4 { sse: shr_imm_u32_m128i::<16>(unpack_high_i16_m128i(v.sse, v.sse)) },
        }
      } else {
        i32x8::new([
          i32::from(v.as_array_ref()[0]),
          i32::from(v.as_array_ref()[1]),
          i32::from(v.as_array_ref()[2]),
          i32::from(v.as_array_ref()[3]),
          i32::from(v.as_array_ref()[4]),
          i32::from(v.as_array_ref()[5]),
          i32::from(v.as_array_ref()[6]),
          i32::from(v.as_array_ref()[7]),
        ])
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: blend_varying_i8_m256i(f.avx2, t.avx2, self.avx2) }
      } else {
        Self {
          a : self.a.blend(t.a, f.a),
          b : self.b.blend(t.b, f.b)
        }
      }
    }
  }

  /// horizontal add of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> i32 {
    let arr: [i32x4; 2] = cast(self);
    (arr[0] + arr[1]).reduce_add()
  }

  /// horizontal max of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_max(self) -> i32 {
    let arr: [i32x4; 2] = cast(self);
    arr[0].max(arr[1]).reduce_max()
  }

  /// horizontal min of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_min(self) -> i32 {
    let arr: [i32x4; 2] = cast(self);
    arr[0].min(arr[1]).reduce_min()
  }

  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: abs_i32_m256i(self.avx2) }
      } else {
        Self {
          a : self.a.abs(),
          b : self.b.abs(),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn unsigned_abs(self) -> u32x8 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        u32x8 { avx2: abs_i32_m256i(self.avx2) }
      } else {
        u32x8 {
          a : self.a.unsigned_abs(),
          b : self.b.unsigned_abs(),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: max_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.max(rhs.a),
          b : self.b.max(rhs.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: min_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.min(rhs.a),
          b : self.b.min(rhs.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn round_float(self) -> f32x8 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        cast(convert_to_m256_from_i32_m256i(self.avx2))
      } else {
        cast([
          self.a.round_float(),
          self.b.round_float(),
        ])
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn move_mask(self) -> i32 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // use f32 move_mask since it is the same size as i32
        move_mask_m256(cast(self.avx2))
      } else {
        self.a.move_mask() | (self.b.move_mask() << 4)
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx2")] {
        move_mask_m256(cast(self.avx2)) != 0
      } else {
        (self.a | self.b).any()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx2")] {
        move_mask_m256(cast(self.avx2)) == 0b11111111
      } else {
        (self.a & self.b).all()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

  /// Transpose an 8x8 matrix of `i32` values. Currently only accelerated on AVX2.
  #[must_use]
  #[inline]
  pub fn transpose(data: [i32x8; 8]) -> [i32x8; 8] {
    pick! {
      if #[cfg(target_feature="avx2")] {
        let a0 = unpack_low_i32_m256i(data[0].avx2, data[1].avx2);
        let a1 = unpack_high_i32_m256i(data[0].avx2, data[1].avx2);
        let a2 = unpack_low_i32_m256i(data[2].avx2, data[3].avx2);
        let a3 = unpack_high_i32_m256i(data[2].avx2, data[3].avx2);
        let a4 = unpack_low_i32_m256i(data[4].avx2, data[5].avx2);
        let a5 = unpack_high_i32_m256i(data[4].avx2, data[5].avx2);
        let a6 = unpack_low_i32_m256i(data[6].avx2, data[7].avx2);
        let a7 = unpack_high_i32_m256i(data[6].avx2, data[7].avx2);

        pub const fn mm_shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
          (z << 6) | (y << 4) | (x << 2) | w
        }

        const SHUFF_LO : i32 = mm_shuffle(1,0,1,0);
        const SHUFF_HI : i32 = mm_shuffle(3,2,3,2);

        // possible todo: the Intel performance manual suggests an alternative with blend to avoid port 5 pressure
        // (since blend runs on a different port than shuffle)
        let b0 = cast::<m256,m256i>(shuffle_m256::<SHUFF_LO>(cast(a0),cast(a2)));
        let b1 = cast::<m256,m256i>(shuffle_m256::<SHUFF_HI>(cast(a0),cast(a2)));
        let b2 = cast::<m256,m256i>(shuffle_m256::<SHUFF_LO>(cast(a1),cast(a3)));
        let b3 = cast::<m256,m256i>(shuffle_m256::<SHUFF_HI>(cast(a1),cast(a3)));
        let b4 = cast::<m256,m256i>(shuffle_m256::<SHUFF_LO>(cast(a4),cast(a6)));
        let b5 = cast::<m256,m256i>(shuffle_m256::<SHUFF_HI>(cast(a4),cast(a6)));
        let b6 = cast::<m256,m256i>(shuffle_m256::<SHUFF_LO>(cast(a5),cast(a7)));
        let b7 = cast::<m256,m256i>(shuffle_m256::<SHUFF_HI>(cast(a5),cast(a7)));

        [
          i32x8 { avx2: permute2z_m256i::<0x20>(b0, b4) },
          i32x8 { avx2: permute2z_m256i::<0x20>(b1, b5) },
          i32x8 { avx2: permute2z_m256i::<0x20>(b2, b6) },
          i32x8 { avx2: permute2z_m256i::<0x20>(b3, b7) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b0, b4) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b1, b5) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b2, b6) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b3, b7) }
        ]
      } else {
        // possible todo: not sure that 128-bit SIMD gives us a lot of speedup here

        #[inline(always)]
        fn transpose_column(data: &[i32x8; 8], index: usize) -> i32x8 {
          i32x8::new([
            data[0].as_array_ref()[index],
            data[1].as_array_ref()[index],
            data[2].as_array_ref()[index],
            data[3].as_array_ref()[index],
            data[4].as_array_ref()[index],
            data[5].as_array_ref()[index],
            data[6].as_array_ref()[index],
            data[7].as_array_ref()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
          transpose_column(&data, 4),
          transpose_column(&data, 5),
          transpose_column(&data, 6),
          transpose_column(&data, 7),
        ]
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [i32; 8] {
    cast(self)
  }

  #[inline]
  pub fn as_array_ref(&self) -> &[i32; 8] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_array_mut(&mut self) -> &mut [i32; 8] {
    cast_mut(self)
  }
}

impl Not for i32x8 {
  type Output = Self;
  #[inline]
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: self.avx2.not() }
      } else {
        Self {
          a : self.a.not(),
          b : self.b.not(),
        }
      }
    }
  }
}
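
// Hedged usage sketches for the inherent methods above (not part of the
// upstream source). They assume only the public API in this file plus
// `u16x8::new`, `u32x8::to_array`, and `f32x8::to_array` from the sibling
// modules.
#[cfg(test)]
mod i32x8_usage_sketches {
  use super::*;

  #[test]
  fn widen_from_u16x8_is_zero_extending() {
    let v = u16x8::new([u16::MAX, 0, 1, 2, 3, 4, 5, 6]);
    assert_eq!(i32x8::from_u16x8(v).to_array(), [65535, 0, 1, 2, 3, 4, 5, 6]);
  }

  #[test]
  fn blend_selects_lanes_by_mask() {
    let v = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
    let limit = i32x8::splat(4);
    // where `v > 4` take `limit`, otherwise keep `v`: a branch-free clamp.
    assert_eq!(v.cmp_gt(limit).blend(limit, v).to_array(), [1, 2, 3, 4, 4, 4, 4, 4]);
  }

  #[test]
  fn reductions_fold_all_lanes() {
    let v = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
    assert_eq!(v.reduce_add(), 36);
    assert_eq!(v.reduce_max(), 8);
    assert_eq!(v.reduce_min(), 1);
  }

  #[test]
  fn unsigned_abs_covers_i32_min() {
    let v = i32x8::new([-3, 3, i32::MIN, 0, -1, 1, -100, 100]);
    assert_eq!(v.abs().to_array()[0], 3);
    assert_eq!(v.unsigned_abs().to_array(), [3, 3, 2_147_483_648, 0, 1, 1, 100, 100]);
  }

  #[test]
  fn min_max_are_lane_wise() {
    let a = i32x8::new([1, 9, -3, 4, 0, 6, -7, 8]);
    let b = i32x8::splat(2);
    assert_eq!(a.max(b).to_array(), [2, 9, 2, 4, 2, 6, 2, 8]);
    assert_eq!(a.min(b).to_array(), [1, 2, -3, 2, 0, 2, -7, 2]);
  }

  #[test]
  fn round_float_converts_each_lane() {
    let v = i32x8::new([0, 1, -2, 3, -4, 5, -6, 7]);
    assert_eq!(v.round_float().to_array(), [0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0]);
  }

  #[test]
  fn move_mask_packs_sign_bits() {
    // lane 0 ends up in bit 0, lane 7 in bit 7.
    let v = i32x8::new([-1, 0, -1, 0, 0, 0, 0, -1]);
    assert_eq!(v.move_mask(), 0b1000_0101);
  }

  #[test]
  fn any_all_none_look_at_sign_bits() {
    let v = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
    assert!(v.cmp_gt(i32x8::splat(0)).all());
    assert!(v.cmp_eq(i32x8::splat(3)).any());
    assert!(v.cmp_lt(i32x8::splat(0)).none());
  }

  #[test]
  fn transpose_turns_rows_into_columns() {
    let rows: [i32x8; 8] =
      core::array::from_fn(|r| i32x8::new(core::array::from_fn(|c| (r * 8 + c) as i32)));
    let t = i32x8::transpose(rows);
    assert_eq!(t[0].to_array(), [0, 8, 16, 24, 32, 40, 48, 56]);
    assert_eq!(t[7].to_array(), [7, 15, 23, 31, 39, 47, 55, 63]);
  }
}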