wide/i16x8_.rs

use super::*;

pick! {
  if #[cfg(target_feature="sse2")] {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct i16x8 { pub(crate) sse: m128i }
  } else if #[cfg(target_feature="simd128")] {
    use core::arch::wasm32::*;

    #[derive(Clone, Copy)]
    #[repr(transparent)]
    pub struct i16x8 { pub(crate) simd: v128 }

    impl Default for i16x8 {
      fn default() -> Self {
        Self::splat(0)
      }
    }

    impl PartialEq for i16x8 {
      fn eq(&self, other: &Self) -> bool {
        u16x8_all_true(i16x8_eq(self.simd, other.simd))
      }
    }

    impl Eq for i16x8 { }
  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
    use core::arch::aarch64::*;
    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct i16x8 { pub(crate) neon : int16x8_t }

    impl Default for i16x8 {
      #[inline]
      #[must_use]
      fn default() -> Self {
        Self::splat(0)
      }
    }

    impl PartialEq for i16x8 {
      #[inline]
      #[must_use]
      fn eq(&self, other: &Self) -> bool {
        unsafe { vminvq_u16(vceqq_s16(self.neon, other.neon))==u16::MAX }
      }
    }

    impl Eq for i16x8 { }
  } else {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct i16x8 { pub(crate) arr: [i16;8] }
  }
}

int_uint_consts!(i16, 8, i16x8, 128);

unsafe impl Zeroable for i16x8 {}
unsafe impl Pod for i16x8 {}

impl Add for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: add_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_add(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vaddq_s16(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].wrapping_add(rhs.arr[0]),
          self.arr[1].wrapping_add(rhs.arr[1]),
          self.arr[2].wrapping_add(rhs.arr[2]),
          self.arr[3].wrapping_add(rhs.arr[3]),
          self.arr[4].wrapping_add(rhs.arr[4]),
          self.arr[5].wrapping_add(rhs.arr[5]),
          self.arr[6].wrapping_add(rhs.arr[6]),
          self.arr[7].wrapping_add(rhs.arr[7]),
        ]}
      }
    }
  }
}

impl Sub for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: sub_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_sub(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vsubq_s16(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].wrapping_sub(rhs.arr[0]),
          self.arr[1].wrapping_sub(rhs.arr[1]),
          self.arr[2].wrapping_sub(rhs.arr[2]),
          self.arr[3].wrapping_sub(rhs.arr[3]),
          self.arr[4].wrapping_sub(rhs.arr[4]),
          self.arr[5].wrapping_sub(rhs.arr[5]),
          self.arr[6].wrapping_sub(rhs.arr[6]),
          self.arr[7].wrapping_sub(rhs.arr[7]),
        ]}
      }
    }
  }
}

impl Mul for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: mul_i16_keep_low_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_mul(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vmulq_s16(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].wrapping_mul(rhs.arr[0]),
          self.arr[1].wrapping_mul(rhs.arr[1]),
          self.arr[2].wrapping_mul(rhs.arr[2]),
          self.arr[3].wrapping_mul(rhs.arr[3]),
          self.arr[4].wrapping_mul(rhs.arr[4]),
          self.arr[5].wrapping_mul(rhs.arr[5]),
          self.arr[6].wrapping_mul(rhs.arr[6]),
          self.arr[7].wrapping_mul(rhs.arr[7]),
        ]}
      }
    }
  }
}

impl Add<i16> for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: i16) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<i16> for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: i16) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<i16> for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: i16) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Add<i16x8> for i16 {
  type Output = i16x8;
  #[inline]
  #[must_use]
  fn add(self, rhs: i16x8) -> Self::Output {
    i16x8::splat(self).add(rhs)
  }
}

impl Sub<i16x8> for i16 {
  type Output = i16x8;
  #[inline]
  #[must_use]
  fn sub(self, rhs: i16x8) -> Self::Output {
    i16x8::splat(self).sub(rhs)
  }
}

impl Mul<i16x8> for i16 {
  type Output = i16x8;
  #[inline]
  #[must_use]
  fn mul(self, rhs: i16x8) -> Self::Output {
    i16x8::splat(self).mul(rhs)
  }
}

impl BitAnd for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitand_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_and(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vandq_s16(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitand(rhs.arr[0]),
          self.arr[1].bitand(rhs.arr[1]),
          self.arr[2].bitand(rhs.arr[2]),
          self.arr[3].bitand(rhs.arr[3]),
          self.arr[4].bitand(rhs.arr[4]),
          self.arr[5].bitand(rhs.arr[5]),
          self.arr[6].bitand(rhs.arr[6]),
          self.arr[7].bitand(rhs.arr[7]),
        ]}
      }
    }
  }
}

impl BitOr for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitor_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_or(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vorrq_s16(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitor(rhs.arr[0]),
          self.arr[1].bitor(rhs.arr[1]),
          self.arr[2].bitor(rhs.arr[2]),
          self.arr[3].bitor(rhs.arr[3]),
          self.arr[4].bitor(rhs.arr[4]),
          self.arr[5].bitor(rhs.arr[5]),
          self.arr[6].bitor(rhs.arr[6]),
          self.arr[7].bitor(rhs.arr[7]),
        ]}
      }
    }
  }
}

impl BitXor for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitxor_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_xor(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: veorq_s16(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitxor(rhs.arr[0]),
          self.arr[1].bitxor(rhs.arr[1]),
          self.arr[2].bitxor(rhs.arr[2]),
          self.arr[3].bitxor(rhs.arr[3]),
          self.arr[4].bitxor(rhs.arr[4]),
          self.arr[5].bitxor(rhs.arr[5]),
          self.arr[6].bitxor(rhs.arr[6]),
          self.arr[7].bitxor(rhs.arr[7]),
        ]}
      }
    }
  }
}

macro_rules! impl_shl_t_for_i16x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shl<$shift_type> for i16x8 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      #[must_use]
      fn shl(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            let shift = cast([rhs as u64, 0]);
            Self { sse: shl_all_u16_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: i16x8_shl(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            unsafe {Self { neon: vshlq_s16(self.neon, vmovq_n_s16(rhs as i16)) }}
          } else {
            let u = rhs as u64;
            Self { arr: [
              self.arr[0] << u,
              self.arr[1] << u,
              self.arr[2] << u,
              self.arr[3] << u,
              self.arr[4] << u,
              self.arr[5] << u,
              self.arr[6] << u,
              self.arr[7] << u,
            ]}
          }
        }
      }
    })+
  };
}
impl_shl_t_for_i16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
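// Usage sketch (values arbitrary): every lane is shifted by the same amount,
// so `i16x8::splat(1) << 3` yields `i16x8::splat(8)`.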

macro_rules! impl_shr_t_for_i16x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shr<$shift_type> for i16x8 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      #[must_use]
      fn shr(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            let shift = cast([rhs as u64, 0]);
            Self { sse: shr_all_i16_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: i16x8_shr(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            unsafe {Self { neon: vshlq_s16(self.neon, vmovq_n_s16( -(rhs as i16))) }}
          } else {
            let u = rhs as u64;
            Self { arr: [
              self.arr[0] >> u,
              self.arr[1] >> u,
              self.arr[2] >> u,
              self.arr[3] >> u,
              self.arr[4] >> u,
              self.arr[5] >> u,
              self.arr[6] >> u,
              self.arr[7] >> u,
            ]}
          }
        }
      }
    })+
  };
}
impl_shr_t_for_i16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
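// Usage sketch (values arbitrary): the right shift is arithmetic, so the sign
// is kept, e.g. `i16x8::splat(-16) >> 2` yields `i16x8::splat(-4)`.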

impl CmpEq for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: cmp_eq_mask_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_eq(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_s16_u16(vceqq_s16(self.neon, rhs.neon)) }}
      } else {
        Self { arr: [
          if self.arr[0] == rhs.arr[0] { -1 } else { 0 },
          if self.arr[1] == rhs.arr[1] { -1 } else { 0 },
          if self.arr[2] == rhs.arr[2] { -1 } else { 0 },
          if self.arr[3] == rhs.arr[3] { -1 } else { 0 },
          if self.arr[4] == rhs.arr[4] { -1 } else { 0 },
          if self.arr[5] == rhs.arr[5] { -1 } else { 0 },
          if self.arr[6] == rhs.arr[6] { -1 } else { 0 },
          if self.arr[7] == rhs.arr[7] { -1 } else { 0 },
        ]}
      }
    }
  }
}

impl CmpGt for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: cmp_gt_mask_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_gt(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_s16_u16(vcgtq_s16(self.neon, rhs.neon)) }}
      } else {
        Self { arr: [
          if self.arr[0] > rhs.arr[0] { -1 } else { 0 },
          if self.arr[1] > rhs.arr[1] { -1 } else { 0 },
          if self.arr[2] > rhs.arr[2] { -1 } else { 0 },
          if self.arr[3] > rhs.arr[3] { -1 } else { 0 },
          if self.arr[4] > rhs.arr[4] { -1 } else { 0 },
          if self.arr[5] > rhs.arr[5] { -1 } else { 0 },
          if self.arr[6] > rhs.arr[6] { -1 } else { 0 },
          if self.arr[7] > rhs.arr[7] { -1 } else { 0 },
        ]}
      }
    }
  }
}

impl CmpLt for i16x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: cmp_lt_mask_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_lt(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_s16_u16(vcltq_s16(self.neon, rhs.neon)) }}
      } else {
        Self { arr: [
          if self.arr[0] < rhs.arr[0] { -1 } else { 0 },
          if self.arr[1] < rhs.arr[1] { -1 } else { 0 },
          if self.arr[2] < rhs.arr[2] { -1 } else { 0 },
          if self.arr[3] < rhs.arr[3] { -1 } else { 0 },
          if self.arr[4] < rhs.arr[4] { -1 } else { 0 },
          if self.arr[5] < rhs.arr[5] { -1 } else { 0 },
          if self.arr[6] < rhs.arr[6] { -1 } else { 0 },
          if self.arr[7] < rhs.arr[7] { -1 } else { 0 },
        ]}
      }
    }
  }
}

impl i16x8 {
  #[inline]
  #[must_use]
  pub const fn new(array: [i16; 8]) -> Self {
    unsafe { core::intrinsics::transmute(array) }
  }

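  /// Gathers the sign bit of each lane into the low 8 bits of the result:
  /// bit `i` of the return value is set exactly when lane `i` is negative.
  ///
  /// A small illustrative example (lane values arbitrary):
  /// ```
  /// # use wide::*;
  /// let m = i16x8::new([-1, 0, -3, 0, 0, 0, 0, -8]);
  /// assert_eq!(m.move_mask(), 0b1000_0101);
  /// ```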
  #[inline]
  #[must_use]
  pub fn move_mask(self) -> i32 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        move_mask_i8_m128i( pack_i16_to_i8_m128i(self.sse,self.sse)) & 0xff
      } else if #[cfg(target_feature="simd128")] {
        i16x8_bitmask(self.simd) as i32
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe
        {
          // set all to 1 if top bit is set, else 0
          let masked = vcltq_s16(self.neon, vdupq_n_s16(0));

          // select the right bit out of each lane
          let selectbit : uint16x8_t = core::intrinsics::transmute([1u16, 2, 4, 8, 16, 32, 64, 128]);
          let r = vandq_u16(masked, selectbit);

          // horizontally add the 16-bit lanes
          vaddvq_u16(r) as i32
        }
      } else {
        ((self.arr[0] < 0) as i32) << 0 |
        ((self.arr[1] < 0) as i32) << 1 |
        ((self.arr[2] < 0) as i32) << 2 |
        ((self.arr[3] < 0) as i32) << 3 |
        ((self.arr[4] < 0) as i32) << 4 |
        ((self.arr[5] < 0) as i32) << 5 |
        ((self.arr[6] < 0) as i32) << 6 |
        ((self.arr[7] < 0) as i32) << 7
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="sse2")] {
        (move_mask_i8_m128i(self.sse) & 0b1010101010101010) != 0
      } else if #[cfg(target_feature="simd128")] {
        u16x8_bitmask(self.simd) != 0
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe {
          vminvq_s16(self.neon) < 0
        }
      } else {
        let v : [u64;2] = cast(self);
        ((v[0] | v[1]) & 0x8000800080008000) != 0
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="sse2")] {
        (move_mask_i8_m128i(self.sse) & 0b1010101010101010) == 0b1010101010101010
      } else if #[cfg(target_feature="simd128")] {
        u16x8_bitmask(self.simd) == 0b11111111
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe {
          vmaxvq_s16(self.neon) < 0
        }
      } else {
        let v : [u64;2] = cast(self);
        (v[0] & v[1] & 0x8000800080008000) == 0x8000800080008000
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

  /// Unpack the lower half of the input and expand it to `i16` values.
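  ///
  /// Illustrative example (values arbitrary); bytes are zero-extended, so
  /// `255u8` becomes `255i16` rather than `-1`:
  /// ```
  /// # use wide::*;
  /// let u = u8x16::new([255, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
  /// assert_eq!(i16x8::from_u8x16_low(u).to_array(), [255, 1, 2, 3, 4, 5, 6, 7]);
  /// ```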
  #[inline]
  #[must_use]
  pub fn from_u8x16_low(u: u8x16) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self{ sse: unpack_low_i8_m128i(u.sse, m128i::zeroed()) }
      } else {
        let u_arr: [u8; 16] = cast(u);
        cast([
          u_arr[0] as u16 as i16,
          u_arr[1] as u16 as i16,
          u_arr[2] as u16 as i16,
          u_arr[3] as u16 as i16,
          u_arr[4] as u16 as i16,
          u_arr[5] as u16 as i16,
          u_arr[6] as u16 as i16,
          u_arr[7] as u16 as i16,
        ])
      }
    }
  }

  /// Unpack the upper half of the input and expand it to `i16` values.
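  ///
  /// Illustrative example (values arbitrary), taking bytes 8..=15:
  /// ```
  /// # use wide::*;
  /// let u = u8x16::new([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 255]);
  /// assert_eq!(i16x8::from_u8x16_high(u).to_array(), [8, 9, 10, 11, 12, 13, 14, 255]);
  /// ```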
  #[inline]
  #[must_use]
  pub fn from_u8x16_high(u: u8x16) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self{ sse: unpack_high_i8_m128i(u.sse, m128i::zeroed()) }
      } else {
        let u_arr: [u8; 16] = cast(u);
        cast([
          u_arr[8] as u16 as i16,
          u_arr[9] as u16 as i16,
          u_arr[10] as u16 as i16,
          u_arr[11] as u16 as i16,
          u_arr[12] as u16 as i16,
          u_arr[13] as u16 as i16,
          u_arr[14] as u16 as i16,
          u_arr[15] as u16 as i16,
        ])
      }
    }
  }

  /// Narrows each `i32` lane to the low `i16`, saturating values that are out of the `i16` range.
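  ///
  /// Illustrative example (values arbitrary): out-of-range lanes clamp to
  /// `i16::MIN` / `i16::MAX`:
  /// ```
  /// # use wide::*;
  /// let v = i32x8::new([100_000, -100_000, 1, -1, 0, 32_767, -32_768, 40_000]);
  /// assert_eq!(
  ///   i16x8::from_i32x8_saturate(v).to_array(),
  ///   [32_767, -32_768, 1, -1, 0, 32_767, -32_768, 32_767]
  /// );
  /// ```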
  #[inline]
  #[must_use]
  pub fn from_i32x8_saturate(v: i32x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        i16x8 { sse: pack_i32_to_i16_m128i( extract_m128i_from_m256i::<0>(v.avx2), extract_m128i_from_m256i::<1>(v.avx2))  }
      } else if #[cfg(target_feature="sse2")] {
        i16x8 { sse: pack_i32_to_i16_m128i( v.a.sse, v.b.sse ) }
      } else if #[cfg(target_feature="simd128")] {
        use core::arch::wasm32::*;

        i16x8 { simd: i16x8_narrow_i32x4(v.a.simd, v.b.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        use core::arch::aarch64::*;

        unsafe {
          i16x8 { neon: vcombine_s16(vqmovn_s32(v.a.neon), vqmovn_s32(v.b.neon)) }
        }
      } else {
        fn clamp(a : i32) -> i16 {
            if a < i16::MIN as i32 {
                i16::MIN
            }
            else if a > i16::MAX as i32 {
                i16::MAX
            } else {
                a as i16
            }
        }

        i16x8::new([
          clamp(v.as_array_ref()[0]),
          clamp(v.as_array_ref()[1]),
          clamp(v.as_array_ref()[2]),
          clamp(v.as_array_ref()[3]),
          clamp(v.as_array_ref()[4]),
          clamp(v.as_array_ref()[5]),
          clamp(v.as_array_ref()[6]),
          clamp(v.as_array_ref()[7]),
        ])
      }
    }
  }

  /// Narrows each `i32` lane to its low `i16`, truncating (discarding) the upper bits.
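  ///
  /// Illustrative example (values arbitrary): only the low 16 bits of each lane
  /// survive, so `0x1_2345` becomes `0x2345` and `32_768` wraps to `-32_768`:
  /// ```
  /// # use wide::*;
  /// let v = i32x8::new([0x1_2345, -1, 65_536, 65_537, 0, 1, -2, 32_768]);
  /// assert_eq!(
  ///   i16x8::from_i32x8_truncate(v).to_array(),
  ///   [0x2345, -1, 0, 1, 0, 1, -2, -32_768]
  /// );
  /// ```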
  #[inline]
  #[must_use]
  pub fn from_i32x8_truncate(v: i32x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        let a = v.avx2.bitand(set_splat_i32_m256i(0xffff));
        i16x8 { sse: pack_i32_to_u16_m128i( extract_m128i_from_m256i::<0>(a), extract_m128i_from_m256i::<1>(a) ) }
      } else if #[cfg(target_feature="sse2")] {
        let a = shr_imm_i32_m128i::<16>(shl_imm_u32_m128i::<16>(v.a.sse));
        let b = shr_imm_i32_m128i::<16>(shl_imm_u32_m128i::<16>(v.b.sse));

        i16x8 { sse: pack_i32_to_i16_m128i( a, b)  }
      } else {
        i16x8::new([
          v.as_array_ref()[0] as i16,
          v.as_array_ref()[1] as i16,
          v.as_array_ref()[2] as i16,
          v.as_array_ref()[3] as i16,
          v.as_array_ref()[4] as i16,
          v.as_array_ref()[5] as i16,
          v.as_array_ref()[6] as i16,
          v.as_array_ref()[7] as i16,
        ])
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn from_slice_unaligned(input: &[i16]) -> Self {
    assert!(input.len() >= 8);

    pick! {
      if #[cfg(target_feature="sse2")] {
        unsafe { Self { sse: load_unaligned_m128i( &*(input.as_ptr() as * const [u8;16]) ) } }
      } else if #[cfg(target_feature="simd128")] {
        unsafe { Self { simd: v128_load(input.as_ptr() as *const v128 ) } }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vld1q_s16( input.as_ptr() as *const i16 ) } }
      } else {
        // the 2018 edition prelude doesn't include try_into
        unsafe { Self::new( *(input.as_ptr() as * const [i16;8]) ) }
      }
    }
  }

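  /// Lanewise blend using `self` as the mask: where a lane of `self` is all
  /// ones (as produced by the comparison methods) the lane of `t` is taken,
  /// where it is all zeros the lane of `f` is taken. Other bit patterns may
  /// behave differently per backend, so stick to comparison masks.
  ///
  /// Illustrative example (values arbitrary), computing a lanewise maximum:
  /// ```
  /// # use wide::*;
  /// let a = i16x8::new([1, 5, 3, 7, 0, 0, 0, 0]);
  /// let b = i16x8::new([4, 2, 6, 1, 0, 0, 0, 0]);
  /// assert_eq!(a.cmp_gt(b).blend(a, b).to_array(), [4, 5, 6, 7, 0, 0, 0, 0]);
  /// ```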
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vbslq_s16(vreinterpretq_u16_s16(self.neon), t.neon, f.neon) }}
      } else {
        generic_bit_blend(self, t, f)
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn is_negative(self) -> Self {
    self.cmp_lt(Self::zeroed())
  }

  /// horizontal add of all the elements of the vector
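  ///
  /// Illustrative example (values arbitrary):
  /// ```
  /// # use wide::*;
  /// assert_eq!(i16x8::new([1, 2, 3, 4, 5, 6, 7, 8]).reduce_add(), 36);
  /// ```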
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> i16 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        // there is a horizontal add instruction on ssse3, but apparently it is very slow on some AMD CPUs
        let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
        let sum64 = add_i16_m128i(self.sse, hi64);
        let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
        let sum32 = add_i16_m128i(sum64, hi32);
        let lo16 = shr_imm_u32_m128i::<16>(sum32);
        let sum16 = add_i16_m128i(sum32, lo16);
        extract_i16_as_i32_m128i::<0>(sum16) as i16
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { vaddvq_s16(self.neon) }
      } else {
        let arr: [i16; 8] = cast(self);

        // most boring implementation possible so optimizer doesn't overthink this
        let mut r = arr[0];
        r = r.wrapping_add(arr[1]);
        r = r.wrapping_add(arr[2]);
        r = r.wrapping_add(arr[3]);
        r = r.wrapping_add(arr[4]);
        r = r.wrapping_add(arr[5]);
        r = r.wrapping_add(arr[6]);
        r.wrapping_add(arr[7])
      }
    }
  }

  /// horizontal min of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_min(self) -> i16 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
        let sum64 = min_i16_m128i(self.sse, hi64);
        let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
        let sum32 = min_i16_m128i(sum64, hi32);
        let lo16 = shr_imm_u32_m128i::<16>(sum32);
        let sum16 = min_i16_m128i(sum32, lo16);
        extract_i16_as_i32_m128i::<0>(sum16) as i16
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { vminvq_s16(self.neon) }
      } else {
        let arr: [i16; 8] = cast(self);

        // most boring implementation possible so optimizer doesn't overthink this
        let mut r = arr[0];
        r = r.min(arr[1]);
        r = r.min(arr[2]);
        r = r.min(arr[3]);
        r = r.min(arr[4]);
        r = r.min(arr[5]);
        r = r.min(arr[6]);
        r.min(arr[7])
      }
    }
  }

  /// horizontal max of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_max(self) -> i16 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
        let sum64 = max_i16_m128i(self.sse, hi64);
        let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
        let sum32 = max_i16_m128i(sum64, hi32);
        let lo16 = shr_imm_u32_m128i::<16>(sum32);
        let sum16 = max_i16_m128i(sum32, lo16);
        extract_i16_as_i32_m128i::<0>(sum16) as i16
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { vmaxvq_s16(self.neon) }
      } else {
        let arr: [i16; 8] = cast(self);

        // most boring implementation possible so optimizer doesn't overthink this
        let mut r = arr[0];
        r = r.max(arr[1]);
        r = r.max(arr[2]);
        r = r.max(arr[3]);
        r = r.max(arr[4]);
        r = r.max(arr[5]);
        r = r.max(arr[6]);
        r.max(arr[7])
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let mask = shr_imm_i16_m128i::<15>(self.sse);
        Self { sse: bitxor_m128i(add_i16_m128i(self.sse, mask), mask) }
      } else if #[cfg(target_feature="ssse3")] {
        Self { sse: abs_i16_m128i(self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_abs(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vabsq_s16(self.neon) }}
      } else {
        let arr: [i16; 8] = cast(self);
        cast(
          [
            arr[0].wrapping_abs(),
            arr[1].wrapping_abs(),
            arr[2].wrapping_abs(),
            arr[3].wrapping_abs(),
            arr[4].wrapping_abs(),
            arr[5].wrapping_abs(),
            arr[6].wrapping_abs(),
            arr[7].wrapping_abs(),
          ])
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn unsigned_abs(self) -> u16x8 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let mask = shr_imm_i16_m128i::<15>(self.sse);
        u16x8 { sse: bitxor_m128i(add_i16_m128i(self.sse, mask), mask) }
      } else if #[cfg(target_feature="ssse3")] {
        u16x8 { sse: abs_i16_m128i(self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        u16x8 { simd: i16x8_abs(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {u16x8 { neon: vreinterpretq_u16_s16(vabsq_s16(self.neon)) }}
      } else {
        let arr: [i16; 8] = cast(self);
        cast(
          [
            arr[0].unsigned_abs(),
            arr[1].unsigned_abs(),
            arr[2].unsigned_abs(),
            arr[3].unsigned_abs(),
            arr[4].unsigned_abs(),
            arr[5].unsigned_abs(),
            arr[6].unsigned_abs(),
            arr[7].unsigned_abs(),
          ])
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: max_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_max(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vmaxq_s16(self.neon, rhs.neon) }}
      } else {
        self.cmp_lt(rhs).blend(rhs, self)
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: min_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_min(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vminq_s16(self.neon, rhs.neon) }}
      } else {
        self.cmp_lt(rhs).blend(self, rhs)
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn saturating_add(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: add_saturating_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_add_sat(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vqaddq_s16(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].saturating_add(rhs.arr[0]),
          self.arr[1].saturating_add(rhs.arr[1]),
          self.arr[2].saturating_add(rhs.arr[2]),
          self.arr[3].saturating_add(rhs.arr[3]),
          self.arr[4].saturating_add(rhs.arr[4]),
          self.arr[5].saturating_add(rhs.arr[5]),
          self.arr[6].saturating_add(rhs.arr[6]),
          self.arr[7].saturating_add(rhs.arr[7]),
        ]}
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn saturating_sub(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: sub_saturating_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_sub_sat(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vqsubq_s16(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].saturating_sub(rhs.arr[0]),
          self.arr[1].saturating_sub(rhs.arr[1]),
          self.arr[2].saturating_sub(rhs.arr[2]),
          self.arr[3].saturating_sub(rhs.arr[3]),
          self.arr[4].saturating_sub(rhs.arr[4]),
          self.arr[5].saturating_sub(rhs.arr[5]),
          self.arr[6].saturating_sub(rhs.arr[6]),
          self.arr[7].saturating_sub(rhs.arr[7]),
        ]}
      }
    }
  }

  /// Calculates a partial dot product.
  /// Multiplies packed signed 16-bit integers, producing intermediate signed
  /// 32-bit integers, then horizontally adds adjacent pairs of those
  /// intermediate 32-bit integers.
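  ///
  /// Illustrative example (values arbitrary):
  /// ```
  /// # use wide::*;
  /// let a = i16x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
  /// let b = i16x8::new([1, 1, 1, 1, 2, 2, 2, 2]);
  /// assert_eq!(a.dot(b).to_array(), [3, 7, 22, 30]);
  /// ```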
  #[inline]
  #[must_use]
  pub fn dot(self, rhs: Self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        i32x4 { sse:  mul_i16_horizontal_add_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        i32x4 { simd: i32x4_dot_i16x8(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {
          let pl = vmull_s16(vget_low_s16(self.neon),  vget_low_s16(rhs.neon));
          let ph = vmull_high_s16(self.neon, rhs.neon);
          i32x4 { neon: vpaddq_s32(pl, ph) }
        }
      } else {
        i32x4 { arr: [
          (i32::from(self.arr[0]) * i32::from(rhs.arr[0])) + (i32::from(self.arr[1]) * i32::from(rhs.arr[1])),
          (i32::from(self.arr[2]) * i32::from(rhs.arr[2])) + (i32::from(self.arr[3]) * i32::from(rhs.arr[3])),
          (i32::from(self.arr[4]) * i32::from(rhs.arr[4])) + (i32::from(self.arr[5]) * i32::from(rhs.arr[5])),
          (i32::from(self.arr[6]) * i32::from(rhs.arr[6])) + (i32::from(self.arr[7]) * i32::from(rhs.arr[7])),
        ] }
      }
    }
  }

  /// Multiply and scale, equivalent to `((self * rhs) + 0x4000) >> 15` on each
  /// lane, effectively multiplying by a 16 bit fixed point number between `-1`
  /// and `1`. This corresponds to the following instructions:
  /// - `vqrdmulhq_s16` instruction on neon
  /// - `i16x8_q15mulr_sat` on simd128
  /// - `_mm_mulhrs_epi16` on ssse3
  /// - emulated via `mul_i16_*` on sse2
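  ///
  /// Illustrative example: `16_384` is `0.5` in Q15 fixed point, and
  /// `0.5 * 0.5 = 0.25`, i.e. `8_192`:
  /// ```
  /// # use wide::*;
  /// let half = i16x8::splat(16_384);
  /// assert_eq!(half.mul_scale_round(half).to_array(), [8_192; 8]);
  /// ```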
  #[inline]
  #[must_use]
  pub fn mul_scale_round(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="ssse3")] {
        Self { sse:  mul_i16_scale_round_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="sse2")] {
        // unfortunately mul_i16_scale_round_m128i only got added in ssse3
        let hi = mul_i16_keep_high_m128i(self.sse, rhs.sse);
        let lo = mul_i16_keep_low_m128i(self.sse, rhs.sse);
        let mut v1 = unpack_low_i16_m128i(lo, hi);
        let mut v2 = unpack_high_i16_m128i(lo, hi);
        let a = set_splat_i32_m128i(0x4000);
        v1 = shr_imm_i32_m128i::<15>(add_i32_m128i(v1, a));
        v2 = shr_imm_i32_m128i::<15>(add_i32_m128i(v2, a));
        let s = pack_i32_to_i16_m128i(v1, v2);
        Self { sse: s }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_q15mulr_sat(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vqrdmulhq_s16(self.neon, rhs.neon) } }
      } else {
        // compiler does a surprisingly good job of vectorizing this
        Self { arr: [
          ((i32::from(self.arr[0]) * i32::from(rhs.arr[0]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[1]) * i32::from(rhs.arr[1]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[2]) * i32::from(rhs.arr[2]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[3]) * i32::from(rhs.arr[3]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[4]) * i32::from(rhs.arr[4]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[5]) * i32::from(rhs.arr[5]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[6]) * i32::from(rhs.arr[6]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[7]) * i32::from(rhs.arr[7]) + 0x4000) >> 15) as i16,
        ]}
      }
    }
  }

  /// Multiplies two `i16x8` and returns the high part of the intermediate `i32x8`.
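  ///
  /// Illustrative example (values arbitrary): `1_000 * 1_000 = 1_000_000`, and
  /// `1_000_000 >> 16 == 15`:
  /// ```
  /// # use wide::*;
  /// let a = i16x8::splat(1_000);
  /// assert_eq!(i16x8::mul_keep_high(a, a).to_array(), [15; 8]);
  /// ```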
  #[inline]
  #[must_use]
  pub fn mul_keep_high(lhs: Self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: mul_i16_keep_high_m128i(lhs.sse, rhs.sse) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        let lhs_low = unsafe { vget_low_s16(lhs.neon) };
        let rhs_low = unsafe { vget_low_s16(rhs.neon) };

        let lhs_high = unsafe { vget_high_s16(lhs.neon) };
        let rhs_high = unsafe { vget_high_s16(rhs.neon) };

        let low = unsafe { vmull_s16(lhs_low, rhs_low) };
        let high = unsafe { vmull_s16(lhs_high, rhs_high) };

        i16x8 { neon: unsafe { vreinterpretq_s16_u16(vuzpq_u16(vreinterpretq_u16_s32(low), vreinterpretq_u16_s32(high)).1) } }
      } else if #[cfg(target_feature="simd128")] {
        let low =  i32x4_extmul_low_i16x8(lhs.simd, rhs.simd);
        let high = i32x4_extmul_high_i16x8(lhs.simd, rhs.simd);

        Self { simd: i16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(low, high) }
      } else {
        i16x8::new([
          ((i32::from(rhs.as_array_ref()[0]) * i32::from(lhs.as_array_ref()[0])) >> 16) as i16,
          ((i32::from(rhs.as_array_ref()[1]) * i32::from(lhs.as_array_ref()[1])) >> 16) as i16,
          ((i32::from(rhs.as_array_ref()[2]) * i32::from(lhs.as_array_ref()[2])) >> 16) as i16,
          ((i32::from(rhs.as_array_ref()[3]) * i32::from(lhs.as_array_ref()[3])) >> 16) as i16,
          ((i32::from(rhs.as_array_ref()[4]) * i32::from(lhs.as_array_ref()[4])) >> 16) as i16,
          ((i32::from(rhs.as_array_ref()[5]) * i32::from(lhs.as_array_ref()[5])) >> 16) as i16,
          ((i32::from(rhs.as_array_ref()[6]) * i32::from(lhs.as_array_ref()[6])) >> 16) as i16,
          ((i32::from(rhs.as_array_ref()[7]) * i32::from(lhs.as_array_ref()[7])) >> 16) as i16,
        ])
      }
    }
  }

  /// Multiplies two `i16x8` and returns the result as a widened `i32x8`.
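  ///
  /// Illustrative example (values arbitrary): `300 * 300 = 90_000` overflows
  /// `i16` but fits in the widened `i32` lanes:
  /// ```
  /// # use wide::*;
  /// let a = i16x8::splat(300);
  /// assert_eq!(a.mul_widen(a).to_array(), [90_000; 8]);
  /// ```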
  #[inline]
  #[must_use]
  pub fn mul_widen(self, rhs: Self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        let a = convert_to_i32_m256i_from_i16_m128i(self.sse);
        let b = convert_to_i32_m256i_from_i16_m128i(rhs.sse);
        i32x8 { avx2: mul_i32_keep_low_m256i(a,b) }
      } else if #[cfg(target_feature="sse2")] {
        let low = mul_i16_keep_low_m128i(self.sse, rhs.sse);
        let high = mul_i16_keep_high_m128i(self.sse, rhs.sse);
        i32x8 {
          a: i32x4 { sse: unpack_low_i16_m128i(low, high) },
          b: i32x4 { sse: unpack_high_i16_m128i(low, high) }
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        let lhs_low = unsafe { vget_low_s16(self.neon) };
        let rhs_low = unsafe { vget_low_s16(rhs.neon) };

        let lhs_high = unsafe { vget_high_s16(self.neon) };
        let rhs_high = unsafe { vget_high_s16(rhs.neon) };

        let low = unsafe { vmull_s16(lhs_low, rhs_low) };
        let high = unsafe { vmull_s16(lhs_high, rhs_high) };

        i32x8 { a: i32x4 { neon: low }, b: i32x4 { neon: high } }
      } else {
        let a = self.as_array_ref();
        let b = rhs.as_array_ref();
        i32x8::new([
          i32::from(a[0]) * i32::from(b[0]),
          i32::from(a[1]) * i32::from(b[1]),
          i32::from(a[2]) * i32::from(b[2]),
          i32::from(a[3]) * i32::from(b[3]),
          i32::from(a[4]) * i32::from(b[4]),
          i32::from(a[5]) * i32::from(b[5]),
          i32::from(a[6]) * i32::from(b[6]),
          i32::from(a[7]) * i32::from(b[7]),
        ])
      }
    }
  }

  /// Transposes an 8x8 matrix of `i16` values.
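  ///
  /// Illustrative property (values arbitrary): row `r`, column `c` of the input
  /// ends up at row `c`, column `r` of the output:
  /// ```
  /// # use wide::*;
  /// let rows = [i16x8::new([0, 1, 2, 3, 4, 5, 6, 7]); 8];
  /// let t = i16x8::transpose(rows);
  /// assert_eq!(t[3].to_array(), [3; 8]);
  /// ```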
  #[must_use]
  #[inline]
  pub fn transpose(data: [i16x8; 8]) -> [i16x8; 8] {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let a1 = unpack_low_i16_m128i(data[0].sse, data[1].sse);
        let a2 = unpack_high_i16_m128i(data[0].sse, data[1].sse);
        let a3 = unpack_low_i16_m128i(data[2].sse, data[3].sse);
        let a4 = unpack_high_i16_m128i(data[2].sse, data[3].sse);
        let a5 = unpack_low_i16_m128i(data[4].sse, data[5].sse);
        let a6 = unpack_high_i16_m128i(data[4].sse, data[5].sse);
        let a7 = unpack_low_i16_m128i(data[6].sse, data[7].sse);
        let a8 = unpack_high_i16_m128i(data[6].sse, data[7].sse);

        let b1 = unpack_low_i32_m128i(a1, a3);
        let b2 = unpack_high_i32_m128i(a1, a3);
        let b3 = unpack_low_i32_m128i(a2, a4);
        let b4 = unpack_high_i32_m128i(a2, a4);
        let b5 = unpack_low_i32_m128i(a5, a7);
        let b6 = unpack_high_i32_m128i(a5, a7);
        let b7 = unpack_low_i32_m128i(a6, a8);
        let b8 = unpack_high_i32_m128i(a6, a8);

        [
          i16x8 { sse: unpack_low_i64_m128i(b1, b5) },
          i16x8 { sse: unpack_high_i64_m128i(b1, b5) },
          i16x8 { sse: unpack_low_i64_m128i(b2, b6) },
          i16x8 { sse: unpack_high_i64_m128i(b2, b6) },
          i16x8 { sse: unpack_low_i64_m128i(b3, b7) },
          i16x8 { sse: unpack_high_i64_m128i(b3, b7) },
          i16x8 { sse: unpack_low_i64_m128i(b4, b8) },
          i16x8 { sse: unpack_high_i64_m128i(b4, b8) },
        ]
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{

        #[inline] fn vtrq32(a : int16x8_t, b : int16x8_t) -> (int16x8_t, int16x8_t)
        {
          unsafe {
            let r = vtrnq_s32(vreinterpretq_s32_s16(a),vreinterpretq_s32_s16(b));
            (vreinterpretq_s16_s32(r.0), vreinterpretq_s16_s32(r.1))
          }
        }

        unsafe {
          let (q0,q2) = vtrq32(data[0].neon, data[2].neon);
          let (q1,q3) = vtrq32(data[1].neon, data[3].neon);
          let (q4,q6) = vtrq32(data[4].neon, data[6].neon);
          let (q5,q7) = vtrq32(data[5].neon, data[7].neon);

          let b1 = vtrnq_s16(q0, q1);
          let b2 = vtrnq_s16(q2, q3);
          let b3 = vtrnq_s16(q4, q5);
          let b4 = vtrnq_s16(q6, q7);

          // There is no vtrnq_s64 unfortunately, so there's this mess
          // which does a somewhat reasonable job, but not as good as the
          // assembly versions which just swap the 64 bit register aliases.
          [
            i16x8 { neon: vcombine_s16(vget_low_s16(b1.0), vget_low_s16(b3.0)) },
            i16x8 { neon: vcombine_s16(vget_low_s16(b1.1), vget_low_s16(b3.1)) },
            i16x8 { neon: vcombine_s16(vget_low_s16(b2.0), vget_low_s16(b4.0)) },
            i16x8 { neon: vcombine_s16(vget_low_s16(b2.1), vget_low_s16(b4.1)) },
            i16x8 { neon: vcombine_s16(vget_high_s16(b1.0), vget_high_s16(b3.0)) },
            i16x8 { neon: vcombine_s16(vget_high_s16(b1.1), vget_high_s16(b3.1)) },
            i16x8 { neon: vcombine_s16(vget_high_s16(b2.0), vget_high_s16(b4.0)) },
            i16x8 { neon: vcombine_s16(vget_high_s16(b2.1), vget_high_s16(b4.1)) },
          ]
        }
      } else if #[cfg(target_feature="simd128")] {
        #[inline] fn lo_i16(a : v128, b : v128) -> v128 { i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(a,b) }
        #[inline] fn hi_i16(a : v128, b : v128) -> v128 { i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(a,b) }
        #[inline] fn lo_i32(a : v128, b : v128) -> v128 { i32x4_shuffle::<0, 4, 1, 5>(a,b) }
        #[inline] fn hi_i32(a : v128, b : v128) -> v128 { i32x4_shuffle::<2, 6, 3, 7>(a,b) }
        #[inline] fn lo_i64(a : v128, b : v128) -> v128 { i64x2_shuffle::<0, 2>(a,b) }
        #[inline] fn hi_i64(a : v128, b : v128) -> v128 { i64x2_shuffle::<1, 3>(a,b) }

        let a1 = lo_i16(data[0].simd, data[1].simd);
        let a2 = hi_i16(data[0].simd, data[1].simd);
        let a3 = lo_i16(data[2].simd, data[3].simd);
        let a4 = hi_i16(data[2].simd, data[3].simd);
        let a5 = lo_i16(data[4].simd, data[5].simd);
        let a6 = hi_i16(data[4].simd, data[5].simd);
        let a7 = lo_i16(data[6].simd, data[7].simd);
        let a8 = hi_i16(data[6].simd, data[7].simd);

        let b1 = lo_i32(a1, a3);
        let b2 = hi_i32(a1, a3);
        let b3 = lo_i32(a2, a4);
        let b4 = hi_i32(a2, a4);
        let b5 = lo_i32(a5, a7);
        let b6 = hi_i32(a5, a7);
        let b7 = lo_i32(a6, a8);
        let b8 = hi_i32(a6, a8);

        [
          i16x8 { simd: lo_i64(b1, b5) },
          i16x8 { simd: hi_i64(b1, b5) },
          i16x8 { simd: lo_i64(b2, b6) },
          i16x8 { simd: hi_i64(b2, b6) },
          i16x8 { simd: lo_i64(b3, b7) },
          i16x8 { simd: hi_i64(b3, b7) },
          i16x8 { simd: lo_i64(b4, b8) },
          i16x8 { simd: hi_i64(b4, b8) },
        ]

      } else {
        #[inline(always)]
        fn transpose_column(data: &[i16x8; 8], index: usize) -> i16x8 {
          i16x8::new([
            data[0].as_array_ref()[index],
            data[1].as_array_ref()[index],
            data[2].as_array_ref()[index],
            data[3].as_array_ref()[index],
            data[4].as_array_ref()[index],
            data[5].as_array_ref()[index],
            data[6].as_array_ref()[index],
            data[7].as_array_ref()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
          transpose_column(&data, 4),
          transpose_column(&data, 5),
          transpose_column(&data, 6),
          transpose_column(&data, 7),
        ]
      }
    }
  }

  #[inline]
  #[must_use]
  /// Multiply and scale, equivalent to `((self * rhs) + 0x4000) >> 15` on each
  /// lane, effectively multiplying by a 16 bit fixed point number between `-1`
  /// and `1`. This corresponds to the following instructions:
  /// - `vqrdmulhq_n_s16` instruction on neon
  /// - `i16x8_q15mulr_sat` on simd128
  /// - `_mm_mulhrs_epi16` on ssse3
  /// - emulated via `mul_i16_*` on sse2
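  ///
  /// Illustrative example (same rounding as `mul_scale_round`, with the scalar
  /// splatted): `0.5 * 0.5 = 0.25` in Q15:
  /// ```
  /// # use wide::*;
  /// assert_eq!(i16x8::splat(16_384).mul_scale_round_n(16_384).to_array(), [8_192; 8]);
  /// ```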
  pub fn mul_scale_round_n(self, rhs: i16) -> Self {
    pick! {
      if #[cfg(target_feature="ssse3")] {
        Self { sse:  mul_i16_scale_round_m128i(self.sse, set_splat_i16_m128i(rhs)) }
      } else if #[cfg(target_feature="sse2")] {
        // unfortunately mul_i16_scale_round_m128i only got added in ssse3
        let r = set_splat_i16_m128i(rhs);
        let hi = mul_i16_keep_high_m128i(self.sse, r);
        let lo = mul_i16_keep_low_m128i(self.sse, r);
        let mut v1 = unpack_low_i16_m128i(lo, hi);
        let mut v2 = unpack_high_i16_m128i(lo, hi);
        let a = set_splat_i32_m128i(0x4000);
        v1 = shr_imm_i32_m128i::<15>(add_i32_m128i(v1, a));
        v2 = shr_imm_i32_m128i::<15>(add_i32_m128i(v2, a));
        let s = pack_i32_to_i16_m128i(v1, v2);
        Self { sse: s }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_q15mulr_sat(self.simd, i16x8_splat(rhs)) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vqrdmulhq_n_s16(self.neon, rhs) } }
      } else {
        // compiler does a surprisingly good job of vectorizing this
        Self { arr: [
          ((i32::from(self.arr[0]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[1]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[2]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[3]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[4]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[5]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[6]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[7]) * i32::from(rhs) + 0x4000) >> 15) as i16,
        ]}
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [i16; 8] {
    cast(self)
  }

  #[inline]
  pub fn as_array_ref(&self) -> &[i16; 8] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_array_mut(&mut self) -> &mut [i16; 8] {
    cast_mut(self)
  }
}