#if HWY_COMPILER_GCC_ACTUAL

#if HWY_TARGET == HWY_SSSE3

#include <sanitizer/msan_interface.h>
template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }
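  // Usage sketch (illustration only, not part of the header): the compound
  // operators above let callers write scalar-style code on vectors, using the
  // d-tag API defined later in this file. `in`/`out` are hypothetical.
  //   const Full128<float> d;
  //   Vec128<float> v = Load(d, in);
  //   v *= Set(d, 2.0f);   // operator*=, i.e. *this = (*this * other)
  //   Store(v, d, out);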
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;
#if HWY_TARGET <= HWY_AVX3

template <size_t size>

template <typename T, size_t N = 16 / sizeof(T)>

template <typename T, size_t N = 16 / sizeof(T)>
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;
template <typename T, size_t N>

struct BitCastFromInteger128 {

template <>
struct BitCastFromInteger128<float> {
template <typename T, size_t N>

template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
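// Example sketch (not from the original file): BitCast reinterprets lane bits
// without conversion, e.g. to manipulate float sign bits with integer ops.
// `v` and the enclosing function are hypothetical:
//   const Full128<float> df;
//   const RebindToUnsigned<decltype(df)> du;
//   const auto sign = And(BitCast(du, v), Set(du, 0x80000000u));
//   const auto as_float = BitCast(df, sign);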
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
  return Vec128<T, N>{_mm_setzero_si128()};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
  return Vec128<float, N>{_mm_setzero_ps()};
}
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> Zero(Simd<double, N, 0> /* tag */) {
  return Vec128<double, N>{_mm_setzero_pd()};
}
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */,
                               const uint8_t t) {
  return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))};
}
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
                                const uint16_t t) {
  return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))};
}
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
                                const uint32_t t) {
  return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
                                const uint64_t t) {
  return Vec128<uint64_t, N>{
      _mm_set1_epi64x(static_cast<long long>(t))};
}
template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */,
                              const int8_t t) {
  return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))};
}
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */,
                               const int16_t t) {
  return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))};
}
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */,
                               const int32_t t) {
  return Vec128<int32_t, N>{_mm_set1_epi32(t)};
}
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */,
                               const int64_t t) {
  return Vec128<int64_t, N>{
      _mm_set1_epi64x(static_cast<long long>(t))};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
  return Vec128<float, N>{_mm_set1_ps(t)};
}
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> Set(Simd<double, N, 0> /* tag */, const double t) {
  return Vec128<double, N>{_mm_set1_pd(t)};
}
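// Example sketch (not from the original file): Set broadcasts a scalar to all
// lanes. The static_casts above exist because _mm_set1_epi8/16 take
// char/short rather than uint8_t/uint16_t.
//   const Simd<uint16_t, 8, 0> d;
//   const auto v = Set(d, uint16_t{0x7FFF});  // all 8 lanes = 0x7FFF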
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /* tag */) {
  return Vec128<T, N>{_mm_undefined_si128()};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Undefined(Simd<float, N, 0> /* tag */) {
  return Vec128<float, N>{_mm_undefined_ps()};
}
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> Undefined(Simd<double, N, 0> /* tag */) {
  return Vec128<double, N>{_mm_undefined_pd()};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(_mm_cvtsi128_si32(v.raw));
}
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
  return _mm_cvtss_f32(v.raw);
}
template <size_t N>
HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
#if HWY_ARCH_X86_32
  alignas(16) uint64_t lanes[2];
  Store(v, Simd<uint64_t, N, 0>(), lanes);
  return lanes[0];
#else
  return static_cast<uint64_t>(_mm_cvtsi128_si64(v.raw));
#endif
}
template <size_t N>
HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
#if HWY_ARCH_X86_32
  alignas(16) int64_t lanes[2];
  Store(v, Simd<int64_t, N, 0>(), lanes);
  return lanes[0];
#else
  return _mm_cvtsi128_si64(v.raw);
#endif
}
template <size_t N>
HWY_API double GetLane(const Vec128<double, N> v) {
  return _mm_cvtsd_f64(v.raw);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
}

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
}
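// Example sketch (not from the original file): AndNot(not_mask, mask)
// computes (~not_mask) & mask in a single PANDN, so clearing the bits
// selected by a mask `m` (hypothetical) is one op:
//   const auto cleared = AndNot(m, v);  // v & ~m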
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_TARGET <= HWY_AVX3
  const __m128i vu = BitCast(du, v).raw;
  return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
#else
  return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)}));
#endif
}
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(x1)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
  return BitCast(d, VU{ret});
#else
  return Xor(x1, Xor(x2, x3));
#endif
}
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(o1)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
  return BitCast(d, VU{ret});
#else
  return Or(o1, Or(o2, o3));
#endif
}
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(o)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
  return BitCast(d, VU{ret});
#else
  return Or(o, And(a1, a2));
#endif
}
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(
      d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw,
                                   BitCast(du, yes).raw,
                                   BitCast(du, no).raw, 0xCA)});
#else
  return IfThenElse(MaskFromVec(mask), yes, no);
#endif
}
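// Note on the _mm_ternarylogic_epi32/64 immediates used above: the 8-bit
// constant is the truth table of a 3-input boolean function f(a, b, c), where
// bit index (a<<2)|(b<<1)|c of the immediate holds f's output. Hence:
//   0x55 = ~c            (Not: all three operands are the same vector)
//   0x96 = a ^ b ^ c     (Xor3)
//   0xFE = a | b | c     (Or3)
//   0xF8 = a | (b & c)   (OrAnd)
//   0xCA = a ? b : c     (IfVecThenElse)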
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}
#if HWY_TARGET == HWY_AVX3_DL

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif
template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
#if HWY_COMPILER_MSVC
  const auto zero = Zero(Simd<int8_t, N, 0>());
  return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
#else
  return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
  return v & BitCast(Simd<float, N, 0>(), mask);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");

  const DFromV<decltype(magn)> d;
  const auto msb = SignBit(d);

#if HWY_TARGET <= HWY_AVX3
  const RebindToUnsigned<decltype(d)> du;
  // Truth table 0xAC: for operands (msb, magn, sign), msb ? sign : magn.
  const __m128i out = _mm_ternarylogic_epi32(
      BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
  return BitCast(d, decltype(Zero(du)){out});
#else
  return Or(AndNot(msb, magn), And(msb, sign));
#endif
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
#if HWY_TARGET <= HWY_AVX3
  // AVX3 can also handle abs < 0, so no extra action needed.
  return CopySign(abs, sign);
#else
  const DFromV<decltype(abs)> d;
  return Or(abs, And(SignBit(d), sign));
#endif
}
  __msan_unpoison(unaligned, count * sizeof(T));

#if HWY_TARGET <= HWY_AVX3
template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>
#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
    HWY_COMPILER_CLANG >= 800
#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
#else
#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
#endif
#endif
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
}
template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
template <typename T, size_t N>

template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */,
                                 const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}
#if HWY_TARGET == HWY_SSSE3

template <typename T, size_t N>

#else  // HWY_TARGET != HWY_SSSE3

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> IfThenElse(const Mask128<float, N> mask,
                                    const Vec128<float, N> yes,
                                    const Vec128<float, N> no) {
  return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> IfThenElse(const Mask128<double, N> mask,
                                     const Vec128<double, N> yes,
                                     const Vec128<double, N> no) {
  return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
}
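// Example sketch (not from the original file): branchless per-lane select.
// Note _mm_blendv_* take (no, yes, mask) while the Highway op takes
// (mask, yes, no). `v` is hypothetical:
//   const auto m = v > Zero(d);
//   const auto r = IfThenElse(m, v, Neg(v));  // per-lane absolute value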
template <typename T, size_t N>

template <typename T, size_t N>
template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
}
template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}
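// Note: x86 has no 8-bit shift instructions, so the overloads above shift in
// twice-as-wide lanes, then either mask off the bits that crossed a byte
// boundary (unsigned) or re-extend the sign via (x ^ s) - s (signed). E.g.
// ShiftRight<2> of int8 0x80 (-128) yields 0xE0 (-32), as for a scalar >>.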
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
                                        const Vec128<TI, NI> from) {
  return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
}
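// Example sketch (not from the original file): _mm_shuffle_epi8 selects bytes
// of `bytes` by the low nibble of each control byte in `from`; a control byte
// with its MSB set yields zero. A reverse-bytes control vector is a common
// use (`v` and `d8` hypothetical):
//   alignas(16) constexpr uint8_t kRev[16] = {15, 14, 13, 12, 11, 10, 9, 8,
//                                             7, 6, 5, 4, 3, 2, 1, 0};
//   const auto reversed = TableLookupBytes(v, Load(d8, kRev));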
template <class V, class VI>
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
}
template <size_t N>
HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
}
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo2301(const Vec32<T> a, const Vec32<T> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {1, 0, 7, 6};
  return Vec32<T>{TableLookupBytesOr0(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo2301(const Vec64<T> a, const Vec64<T> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0x0302, 0x0100, 0x0f0e, 0x0d0c};
  return Vec64<T>{TableLookupBytesOr0(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo2301(const Vec128<T> a, const Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToFloat<decltype(d)> df;
  constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
  return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
                                                 BitCast(df, b).raw, m)});
}

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo1230(const Vec32<T> a, const Vec32<T> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0, 3, 6, 5};
  return Vec32<T>{TableLookupBytesOr0(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo1230(const Vec64<T> a, const Vec64<T> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0x0100, 0x0706, 0x0d0c, 0x0b0a};
  return Vec64<T>{TableLookupBytesOr0(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo1230(const Vec128<T> a, const Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToFloat<decltype(d)> df;
  constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
  return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
                                                 BitCast(df, b).raw, m)});
}

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo3012(const Vec32<T> a, const Vec32<T> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {2, 1, 4, 7};
  return Vec32<T>{TableLookupBytesOr0(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo3012(const Vec64<T> a, const Vec64<T> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0x0504, 0x0302, 0x0908, 0x0f0e};
  return Vec64<T>{TableLookupBytesOr0(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo3012(const Vec128<T> a, const Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToFloat<decltype(d)> df;
  constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
  return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
                                                 BitCast(df, b).raw, m)});
}
#if HWY_TARGET <= HWY_AVX3

template <typename TFrom, size_t NFrom, typename TTo, size_t NTo>
HWY_API Mask128<TTo, NTo> RebindMask(Simd<TTo, NTo, 0> /* tag */,
                                     Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask128<TTo, NTo>{m.raw};
}
template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
}
template <size_t N>
HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
                                      Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
                                      Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
                                      Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
                                      Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
                                       Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
                                       Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
}
template <size_t N>
HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
}
template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
}
template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */,
                                 const Mask128<T, N> v) {
  return VecFromMask(v);
}
template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /* tag */,
                                   Mask128<TFrom, N> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  const Simd<TFrom, N, 0> d;
  return MaskFromVec(BitCast(Simd<TTo, N, 0>(), VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  const Simd<uint32_t, N * 2, 0> d32;
  const Simd<uint64_t, N, 0> d64;
  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
  return MaskFromVec(BitCast(d64, cmp64));
#else
  return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
#endif
}
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  // Same as unsigned ==; avoids duplicating the SSSE3 workaround.
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return RebindMask(d, BitCast(du, a) == BitCast(du, b));
}
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
}
// Integer operator!= overloads (uint8_t through int64_t): each returns
// Not(a == b).

template <size_t N>
HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a,
                                     Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
                                      Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
                                      Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
                                      const Vec128<int64_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  // If the upper halves are equal, the comparison is decided by the lower
  // halves (as unsigned); otherwise by the upper halves.
  const Simd<int64_t, N, 0> d;
  const RepartitionToNarrow<decltype(d)> d32;
  const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw};
  const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw};
  const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw;
  // Duplicate upper to lower half.
  return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
#else
  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};
#endif
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {  // unsigned
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  // Shift both into the signed range, where cmpgt is available.
  const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
}

template <size_t N>
HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>(Vec128<double, N> a,
                                     Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
}
template <typename T, size_t N>

template <size_t N>
HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
}
template <typename T, size_t N>

template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
#if HWY_TARGET <= HWY_AVX3
  (void)d;
  const uint64_t all = (1ull << N) - 1;
  // BZHI only looks at the lower 8 bits of num.
  const uint64_t bits = (num > 255) ? all : _bzhi_u64(all, num);
  return Mask128<T, N>::FromBits(bits);
#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
#if defined(__clang_analyzer__) || \
    (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_SAFE_PARTIAL_LOAD_STORE 1
#else
#define HWY_SAFE_PARTIAL_LOAD_STORE 0
#endif
#endif
template <typename T>
HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
}

template <typename T>
HWY_API Vec128<T> LoadU(Full128<T> /* tag */, const T* HWY_RESTRICT p) {
  return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
}
HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
                            const float* HWY_RESTRICT p) {
  return Vec128<float>{_mm_loadu_ps(p)};
}
template <typename T>
HWY_API Vec64<T> Load(Full64<T> /* tag */, const T* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128i v = _mm_setzero_si128();
  CopyBytes<8>(p, &v);
  return Vec64<T>{v};
#else
  return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
#endif
}

HWY_API Vec128<float, 2> Load(Full64<float> /* tag */,
                              const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128 v = _mm_setzero_ps();
  CopyBytes<8>(p, &v);
  return Vec128<float, 2>{v};
#else
  const __m128 hi = _mm_setzero_ps();
  return Vec128<float, 2>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
#endif
}

HWY_API Vec64<double> Load(Full64<double> /* tag */,
                           const double* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128d v = _mm_setzero_pd();
  CopyBytes<8>(p, &v);
  return Vec64<double>{v};
#else
  return Vec64<double>{_mm_load_sd(p)};
#endif
}

HWY_API Vec128<float, 1> Load(Full32<float> /* tag */,
                              const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128 v = _mm_setzero_ps();
  CopyBytes<4>(p, &v);
  return Vec128<float, 1>{v};
#else
  return Vec128<float, 1>{_mm_load_ss(p)};
#endif
}
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
  constexpr size_t kSize = sizeof(T) * N;
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128 v = _mm_setzero_ps();
  CopyBytes<kSize>(p, &v);
  return Vec128<T, N>{v};
#else
  int32_t bits = 0;
  CopyBytes<kSize>(p, &bits);
  return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
#endif
}
// For < 128 bit, LoadU == Load.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
  return LoadU(d, p);
}

template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
  HWY_ALIGN T lanes[16 / sizeof(T)];
  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
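// Example sketch (not from the original file): Iota fills lanes with first,
// first + 1, ... and is handy for generating index vectors:
//   const Simd<int32_t, 4, 0> d;
//   const auto idx = Iota(d, 0);  // {0, 1, 2, 3}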
#if HWY_TARGET <= HWY_AVX3

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
                                const T* HWY_RESTRICT p) {
  return Vec128<T, N>{_mm_maskz_loadu_epi8(m.raw, p)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
                                const T* HWY_RESTRICT p) {
  return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, p)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
                                const T* HWY_RESTRICT p) {
  return Vec128<T, N>{_mm_maskz_loadu_epi32(m.raw, p)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
                                const T* HWY_RESTRICT p) {
  return Vec128<T, N>{_mm_maskz_loadu_epi64(m.raw, p)};
}
#elif HWY_TARGET == HWY_AVX2

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
                                const T* HWY_RESTRICT p) {
  auto p_p = reinterpret_cast<const int*>(p);  // NOLINT
  return Vec128<T, N>{_mm_maskload_epi32(p_p, m.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
                                const T* HWY_RESTRICT p) {
  auto p_p = reinterpret_cast<const long long*>(p);  // NOLINT
  return Vec128<T, N>{_mm_maskload_epi64(p_p, m.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m, Simd<float, N, 0> d,
                                    const float* HWY_RESTRICT p) {
  const Vec128<int32_t, N> mi =
      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
  return Vec128<float, N>{_mm_maskload_ps(p, mi.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m,
                                     Simd<double, N, 0> d,
                                     const double* HWY_RESTRICT p) {
  const Vec128<int64_t, N> mi =
      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
  return Vec128<double, N>{_mm_maskload_pd(p, mi.raw)};
}
// There is no maskload_epi8/16, so blend instead.
template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 6)>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
                                const T* HWY_RESTRICT p) {
  return IfThenElseZero(m, Load(d, p));
}

#else  // <= SSE4

template <typename T, size_t N>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
                                const T* HWY_RESTRICT p) {
  return IfThenElseZero(m, Load(d, p));
}

#endif
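// Example sketch (not from the original file): MaskedLoad zeroes lanes whose
// mask bit is false. On AVX-512VL it is a single maskz load; the SSE fallback
// performs a full load and then blends, so callers must ensure the entire
// 16-byte load is safe on those targets. `p` is hypothetical:
//   const auto m = FirstN(d, count);     // count < Lanes(d)
//   const auto v = MaskedLoad(m, d, p);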
template <typename T>
HWY_API void Store(Vec128<T> v, Full128<T> /* tag */,
                   T* HWY_RESTRICT aligned) {
  _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
}
HWY_API void Store(const Vec128<float> v, Full128<float> /* tag */,
                   float* HWY_RESTRICT aligned) {
  _mm_store_ps(aligned, v.raw);
}
HWY_API void Store(const Vec128<double> v, Full128<double> /* tag */,
                   double* HWY_RESTRICT aligned) {
  _mm_store_pd(aligned, v.raw);
}

template <typename T>
HWY_API void StoreU(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT p) {
  _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
}
HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */,
                    float* HWY_RESTRICT p) {
  _mm_storeu_ps(p, v.raw);
}
HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
                    double* HWY_RESTRICT p) {
  _mm_storeu_pd(p, v.raw);
}
template <typename T>
HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  CopyBytes<8>(&v, p);
#else
  _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
#endif
}
HWY_API void Store(const Vec128<float, 2> v, Full64<float> /* tag */,
                   float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  CopyBytes<8>(&v, p);
#else
  _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
#endif
}
HWY_API void Store(const Vec64<double> v, Full64<double> /* tag */,
                   double* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  CopyBytes<8>(&v, p);
#else
  _mm_storel_pd(p, v.raw);
#endif
}

// Any <= 32 bit except <float, 1>
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
                   T* HWY_RESTRICT p) {
  CopyBytes<sizeof(T) * N>(&v, p);
}
HWY_API void Store(const Vec128<float, 1> v, Full32<float> /* tag */,
                   float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  CopyBytes<4>(&v, p);
#else
  _mm_store_ss(p, v.raw);
#endif
}
// For < 128 bit, StoreU == Store.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API void StoreU(const Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}
template <typename T, size_t N>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
                          T* HWY_RESTRICT p) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  alignas(16) TI buf[N];
  alignas(16) TI mask[N];
  Store(BitCast(di, v), di, buf);
  Store(BitCast(di, VecFromMask(d, m)), di, mask);
  for (size_t i = 0; i < N; ++i) {
    if (mask[i]) {
      CopyBytes<sizeof(T)>(buf + i, p + i);
    }
  }
}
#if HWY_TARGET <= HWY_AVX3

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
  _mm_mask_storeu_epi8(p, m.raw, v.raw);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
  _mm_mask_storeu_epi16(p, m.raw, v.raw);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
  auto pi = reinterpret_cast<int*>(p);  // NOLINT
  _mm_mask_storeu_epi32(pi, m.raw, v.raw);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
  auto pi = reinterpret_cast<long long*>(p);  // NOLINT
  _mm_mask_storeu_epi64(pi, m.raw, v.raw);
}
template <size_t N>
HWY_API void BlendedStore(Vec128<float, N> v, Mask128<float, N> m,
                          Simd<float, N, 0> /* tag */, float* HWY_RESTRICT p) {
  _mm_mask_storeu_ps(p, m.raw, v.raw);
}
template <size_t N>
HWY_API void BlendedStore(Vec128<double, N> v, Mask128<double, N> m,
                          Simd<double, N, 0> /* tag */,
                          double* HWY_RESTRICT p) {
  _mm_mask_storeu_pd(p, m.raw, v.raw);
}
#elif HWY_TARGET == HWY_AVX2

// There is no maskstore_epi8/16.
template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 6)>

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
  // For partial vectors, avoid writing other lanes by zeroing their mask.
  if (N < 4) {
    const Full128<T> df;
    const Mask128<T> mf{m.raw};
    m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
  }
  auto pi = reinterpret_cast<int*>(p);  // NOLINT
  _mm_maskstore_epi32(pi, m.raw, v.raw);
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
  if (N < 2) {
    const Full128<T> df;
    const Mask128<T> mf{m.raw};
    m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
  }
  auto pi = reinterpret_cast<long long*>(p);  // NOLINT
  _mm_maskstore_epi64(pi, m.raw, v.raw);
}

template <size_t N>
HWY_API void BlendedStore(Vec128<float, N> v, Mask128<float, N> m,
                          Simd<float, N, 0> d, float* HWY_RESTRICT p) {
  if (N < 4) {
    const Full128<float> df;
    const Mask128<float> mf{m.raw};
    m = Mask128<float, N>{And(mf, FirstN(df, N)).raw};
  }
  const Vec128<MakeSigned<float>, N> mi =
      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
  _mm_maskstore_ps(p, mi.raw, v.raw);
}

template <size_t N>
HWY_API void BlendedStore(Vec128<double, N> v, Mask128<double, N> m,
                          Simd<double, N, 0> d, double* HWY_RESTRICT p) {
  if (N < 2) {
    const Full128<double> df;
    const Mask128<double> mf{m.raw};
    m = Mask128<double, N>{And(mf, FirstN(df, N)).raw};
  }
  const Vec128<MakeSigned<double>, N> mi =
      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
  _mm_maskstore_pd(p, mi.raw, v.raw);
}
template <typename T, size_t N>
template <size_t N>
HWY_API Vec128<uint8_t, N> operator+(Vec128<uint8_t, N> a,
                                     Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator+(Vec128<uint16_t, N> a,
                                      Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator+(Vec128<uint32_t, N> a,
                                      Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator+(Vec128<uint64_t, N> a,
                                      Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> operator+(Vec128<int8_t, N> a,
                                    Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator+(Vec128<int16_t, N> a,
                                     Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator+(Vec128<int32_t, N> a,
                                     Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator+(Vec128<int64_t, N> a,
                                     Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> operator+(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint8_t, N> operator-(Vec128<uint8_t, N> a,
                                     Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
                                      Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator-(Vec128<uint32_t, N> a,
                                      Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator-(Vec128<uint64_t, N> a,
                                      Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> operator-(Vec128<int8_t, N> a,
                                    Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator-(Vec128<int16_t, N> a,
                                     Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator-(Vec128<int32_t, N> a,
                                     Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator-(Vec128<int64_t, N> a,
                                     Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> operator-(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
  return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
}
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedAdd(Vec128<uint8_t, N> a,
                                        Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedAdd(Vec128<uint16_t, N> a,
                                         Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedAdd(Vec128<int8_t, N> a,
                                       Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedAdd(Vec128<int16_t, N> a,
                                        Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedSub(Vec128<uint8_t, N> a,
                                        Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedSub(Vec128<uint16_t, N> a,
                                         Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedSub(Vec128<int8_t, N> a,
                                       Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedSub(Vec128<int16_t, N> a,
                                        Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint8_t, N> AverageRound(Vec128<uint8_t, N> a,
                                        Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> AverageRound(Vec128<uint16_t, N> a,
                                         Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator*(Vec128<uint16_t, N> a,
                                      Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator*(Vec128<int16_t, N> a,
                                     Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
                                    Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a,
                                   Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
}
#if HWY_TARGET == HWY_SSSE3

template <size_t N, HWY_IF_LE64(int32_t, N)>  // N=1 or 2
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  return Set(Simd<int64_t, (N + 1) / 2, 0>(),
             static_cast<int64_t>(GetLane(a)) * GetLane(b));
}
HWY_API Vec128<int64_t> MulEven(const Vec128<int32_t> a,
                                const Vec128<int32_t> b) {
  alignas(16) int32_t a_lanes[4];
  alignas(16) int32_t b_lanes[4];
  const Full128<int32_t> di32;
  Store(a, di32, a_lanes);
  Store(b, di32, b_lanes);
  alignas(16) int64_t mul[2];
  mul[0] = static_cast<int64_t>(a_lanes[0]) * b_lanes[0];
  mul[1] = static_cast<int64_t>(a_lanes[2]) * b_lanes[2];
  return Load(Full128<int64_t>(), mul);
}
#else  // HWY_TARGET != HWY_SSSE3

template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
}

#endif
template <size_t N>
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  // Duplicate the odd lanes into the even positions, multiply pairwise via
  // MulEven, then re-interleave the low 32 bits of each 64-bit product.
  const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
  const auto mullo_x2x0 = MulEven(a, b);
  const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
  const auto mullo_x3x1 =
      MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
  const __m128i mul_20 =
      _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
  const __m128i mul_31 =
      _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
#else
  return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
#endif
}

template <size_t N>
HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  // Same as unsigned; avoids duplicating the SSSE3 code path.
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) * BitCast(du, b));
}
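// Note: SSE2/SSSE3 only provide PMULUDQ/PMULDQ, i.e. 32x32 -> 64-bit products
// of the even lanes (MulEven above). The SSSE3 operator* therefore builds a
// full-width 32-bit multiply from two MulEven calls (even and odd lanes) and
// re-interleaves the low halves; SSE4.1+ has _mm_mullo_epi32 directly.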
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)};
#else
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
#endif
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)};
#else
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
#endif
}
template <size_t N>
HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) {
  return ShiftRight<15>(v);
}
template <size_t N>
HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) {
  return ShiftRight<31>(v);
}
template <size_t N>
HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) {
  const DFromV<decltype(v)> d;
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)};
#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
  return VecFromMask(v < Zero(d));
#else
  // Replicate the sign bit of the upper 32 bits into both halves.
  const RepartitionToNarrow<decltype(d)> d32;
  const auto sign = ShiftRight<31>(BitCast(d32, v));
  return Vec128<int64_t, N>{
      _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
#endif
}
template <size_t N>
HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
#else
  const auto zero = Zero(DFromV<decltype(v)>());
  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
#endif
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, kBits)};
#else
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
  return right | sign;
#endif
}
template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  static_assert(IsFloat<T>(), "Only works for float");
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

#if HWY_TARGET == HWY_SSSE3
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  return Zero(DFromV<decltype(v)>()) - v;
}

template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  return Zero(DFromV<decltype(v)>()) - v;
}
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
                                          const int bits) {
  return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
                                          const int bits) {
  return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
                                          const int bits) {
  return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
}
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
                                         const int bits) {
  return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
                                         const int bits) {
  return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
                                         const int bits) {
  return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{
      ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
}
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
                                           const int bits) {
  return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
                                           const int bits) {
  return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
                                           const int bits) {
  return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
}
template <size_t N>
HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
                                          const int bits) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
}
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
                                          const int bits) {
  return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
                                          const int bits) {
  return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
                                          const int bits) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
#else
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
  return right | sign;
#endif
}
template <size_t N>
HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto shifted_sign =
      BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
  return (shifted ^ shifted_sign) - shifted_sign;
}
template <size_t N>
HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
  return Vec128<float, N>{_mm_rcp_ps(v.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
                                 const Vec128<float, N> b) {
  return Abs(a - b);
}
// Returns mul * x + add
template <size_t N>
HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x,
                                Vec128<float, N> add) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return mul * x + add;
#else
  return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x,
                                 Vec128<double, N> add) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return mul * x + add;
#else
  return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
#endif
}

// Returns add - mul * x
template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x,
                                   Vec128<float, N> add) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return add - mul * x;
#else
  return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<double, N> NegMulAdd(Vec128<double, N> mul, Vec128<double, N> x,
                                    Vec128<double, N> add) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return add - mul * x;
#else
  return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
#endif
}

// Returns mul * x - sub
template <size_t N>
HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x,
                                Vec128<float, N> sub) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return mul * x - sub;
#else
  return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<double, N> MulSub(Vec128<double, N> mul, Vec128<double, N> x,
                                 Vec128<double, N> sub) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return mul * x - sub;
#else
  return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
#endif
}

// Returns Neg(mul) * x - sub
template <size_t N>
HWY_API Vec128<float, N> NegMulSub(Vec128<float, N> mul, Vec128<float, N> x,
                                   Vec128<float, N> sub) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return Neg(mul) * x - sub;
#else
  return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x,
                                    Vec128<double, N> sub) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return Neg(mul) * x - sub;
#else
  return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
#endif
}
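// Example sketch (not from the original file): MulAdd computes mul * x + add,
// fused on FMA targets and split into multiply + add on SSSE3/SSE4 (the
// two-op fallback rounds twice). Typical Horner polynomial evaluation, with
// hypothetical coefficient vectors c0..c2:
//   const auto y = MulAdd(MulAdd(c2, x, c1), x, c0);  // (c2*x + c1)*x + c0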
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
  return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
  return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> MinU(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
  return IfThenElse(gt, b, a);
}
template <size_t N>
HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a,
                               const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a,
                                const Vec128<uint16_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return detail::MinU(a, b);
#else
  return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a,
                                const Vec128<uint32_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return detail::MinU(a, b);
#else
  return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
                                const Vec128<uint64_t, N> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
#else
  return detail::MinU(a, b);
#endif
}
template <size_t N>
HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a,
                              const Vec128<int8_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return IfThenElse(a < b, a, b);
#else
  return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a,
                               const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a,
                               const Vec128<int32_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return IfThenElse(a < b, a, b);
#else
  return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
                               const Vec128<int64_t, N> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
#else
  return IfThenElse(a < b, a, b);
#endif
}
template <size_t N>
HWY_API Vec128<float, N> Min(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> MaxU(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
  return IfThenElse(gt, a, b);
}
template <size_t N>
HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a,
                               const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a,
                                const Vec128<uint16_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return detail::MaxU(a, b);
#else
  return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a,
                                const Vec128<uint32_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return detail::MaxU(a, b);
#else
  return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
                                const Vec128<uint64_t, N> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
#else
  return detail::MaxU(a, b);
#endif
}
template <size_t N>
HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a,
                              const Vec128<int8_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return IfThenElse(a < b, b, a);
#else
  return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a,
                               const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a,
                               const Vec128<int32_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return IfThenElse(a < b, b, a);
#else
  return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
                               const Vec128<int64_t, N> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
#else
  return IfThenElse(a < b, b, a);
#endif
}
template <size_t N>
HWY_API Vec128<float, N> Max(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
}
template <typename T, size_t N>
HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
                    T* HWY_RESTRICT aligned) {
  _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
}
template <size_t N>
HWY_API void Stream(const Vec128<float, N> v, Simd<float, N, 0> /* tag */,
                    float* HWY_RESTRICT aligned) {
  _mm_stream_ps(aligned, v.raw);
}
template <size_t N>
HWY_API void Stream(const Vec128<double, N> v, Simd<double, N, 0> /* tag */,
                    double* HWY_RESTRICT aligned) {
  _mm_stream_pd(aligned, v.raw);
}
static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");

#if HWY_TARGET <= HWY_AVX3
template <typename T, size_t N>
HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
                              Simd<T, N, 0> /* d */, T* HWY_RESTRICT base,
                              const Vec128<int32_t, N> offset) {
  if (N == 4) {
    _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);
  } else {
    const __mmask8 mask = (1u << N) - 1;
    _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
  }
}

template <typename T, size_t N>
HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
                             Simd<T, N, 0> /* d */, T* HWY_RESTRICT base,
                             const Vec128<int32_t, N> index) {
  if (N == 4) {
    _mm_i32scatter_epi32(base, index.raw, v.raw, 4);
  } else {
    const __mmask8 mask = (1u << N) - 1;
    _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
  }
}

template <typename T, size_t N>
HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
                              Simd<T, N, 0> /* d */, T* HWY_RESTRICT base,
                              const Vec128<int64_t, N> offset) {
  if (N == 2) {
    _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);
  } else {
    const __mmask8 mask = (1u << N) - 1;
    _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
  }
}

template <typename T, size_t N>
HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
                             Simd<T, N, 0> /* d */, T* HWY_RESTRICT base,
                             const Vec128<int64_t, N> index) {
  if (N == 2) {
    _mm_i64scatter_epi64(base, index.raw, v.raw, 8);
  } else {
    const __mmask8 mask = (1u << N) - 1;
    _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
  }
}
template <typename T, size_t N, typename Offset>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
                           T* HWY_RESTRICT base,
                           const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
}
template <typename T, size_t N, typename Index>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d,
                          T* HWY_RESTRICT base,
                          const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
}
template <size_t N>
HWY_API void ScatterOffset(Vec128<float, N> v, Simd<float, N, 0> /* tag */,
                           float* HWY_RESTRICT base,
                           const Vec128<int32_t, N> offset) {
  if (N == 4) {
    _mm_i32scatter_ps(base, offset.raw, v.raw, 1);
  } else {
    const __mmask8 mask = (1u << N) - 1;
    _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
  }
}
template <size_t N>
HWY_API void ScatterIndex(Vec128<float, N> v, Simd<float, N, 0> /* tag */,
                          float* HWY_RESTRICT base,
                          const Vec128<int32_t, N> index) {
  if (N == 4) {
    _mm_i32scatter_ps(base, index.raw, v.raw, 4);
  } else {
    const __mmask8 mask = (1u << N) - 1;
    _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
  }
}
template <size_t N>
HWY_API void ScatterOffset(Vec128<double, N> v, Simd<double, N, 0> /* tag */,
                           double* HWY_RESTRICT base,
                           const Vec128<int64_t, N> offset) {
  if (N == 2) {
    _mm_i64scatter_pd(base, offset.raw, v.raw, 1);
  } else {
    const __mmask8 mask = (1u << N) - 1;
    _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
  }
}
template <size_t N>
HWY_API void ScatterIndex(Vec128<double, N> v, Simd<double, N, 0> /* tag */,
                          double* HWY_RESTRICT base,
                          const Vec128<int64_t, N> index) {
  if (N == 2) {
    _mm_i64scatter_pd(base, index.raw, v.raw, 8);
  } else {
    const __mmask8 mask = (1u << N) - 1;
    _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
  }
}
#else  // HWY_TARGET > HWY_AVX3: emulate with scalar stores.

template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
                           T* HWY_RESTRICT base,
                           const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Offset offset_lanes[N];
  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}
template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d,
                          T* HWY_RESTRICT base,
                          const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Index index_lanes[N];
  Store(index, Rebind<Index, decltype(d)>(), index_lanes);

  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

#endif
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4

template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
                                  const T* HWY_RESTRICT base,
                                  const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) Offset offset_lanes[N];
  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);

  alignas(16) T lanes[N];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
                                 const T* HWY_RESTRICT base,
                                 const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) Index index_lanes[N];
  Store(index, Rebind<Index, decltype(d)>(), index_lanes);

  alignas(16) T lanes[N];
  for (size_t i = 0; i < N; ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}
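// Example sketch (not from the original file): on SSSE3/SSE4 the gathers
// above are store-indices/scalar-loop emulations; the semantics match the
// AVX2+ intrinsic path. `base`, `d`, `di` are hypothetical:
//   alignas(16) const int32_t idx[4] = {3, 0, 2, 1};
//   const auto v = GatherIndex(d, base, Load(di, idx));  // lane i = base[idx[i]]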
#else  // AVX2+: native gather instructions.

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<4> /* tag */,
                                     Simd<T, N, 0> /* d */,
                                     const T* HWY_RESTRICT base,
                                     const Vec128<int32_t, N> offset) {
  return Vec128<T, N>{_mm_i32gather_epi32(
      reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<4> /* tag */,
                                    Simd<T, N, 0> /* d */,
                                    const T* HWY_RESTRICT base,
                                    const Vec128<int32_t, N> index) {
  return Vec128<T, N>{_mm_i32gather_epi32(
      reinterpret_cast<const int32_t*>(base), index.raw, 4)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<8> /* tag */,
                                     Simd<T, N, 0> /* d */,
                                     const T* HWY_RESTRICT base,
                                     const Vec128<int64_t, N> offset) {
  return Vec128<T, N>{_mm_i64gather_epi64(
      reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<8> /* tag */,
                                    Simd<T, N, 0> /* d */,
                                    const T* HWY_RESTRICT base,
                                    const Vec128<int64_t, N> index) {
  return Vec128<T, N>{_mm_i64gather_epi64(
      reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
}
template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
                                  const Vec128<Offset, N> offset) {
  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
}
template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
                                 const Vec128<Index, N> index) {
  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
}

template <size_t N>
HWY_API Vec128<float, N> GatherOffset(Simd<float, N, 0> /* tag */,
                                      const float* HWY_RESTRICT base,
                                      const Vec128<int32_t, N> offset) {
  return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)};
}
template <size_t N>
HWY_API Vec128<float, N> GatherIndex(Simd<float, N, 0> /* tag */,
                                     const float* HWY_RESTRICT base,
                                     const Vec128<int32_t, N> index) {
  return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
}
template <size_t N>
HWY_API Vec128<double, N> GatherOffset(Simd<double, N, 0> /* tag */,
                                       const double* HWY_RESTRICT base,
                                       const Vec128<int64_t, N> offset) {
  return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
}
template <size_t N>
HWY_API Vec128<double, N> GatherIndex(Simd<double, N, 0> /* tag */,
                                      const double* HWY_RESTRICT base,
                                      const Vec128<int64_t, N> index) {
  return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
}
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Half<Simd<T, N, 0>> /* tag */,
                                   Vec128<T, N> v) {
  return Vec128<T, N / 2>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  return LowerHalf(Half<DFromV<decltype(v)>>(), v);
}
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  // For partial vectors, clear upper lanes so we shift in zeros.
  if (N != 16 / sizeof(T)) {
    const Vec128<T> vfull{v.raw};
    v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
  }
  return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
}
template <int kLanes, typename T, size_t N>
template <typename T>
HWY_API Vec64<T> UpperHalf(Full64<T> /* tag */, Vec128<T> v) {
  return Vec64<T>{_mm_unpackhi_epi64(v.raw, v.raw)};
}
HWY_API Vec128<float, 2> UpperHalf(Full64<float> /* tag */, Vec128<float> v) {
  return Vec128<float, 2>{_mm_movehl_ps(v.raw, v.raw)};
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
                                         Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const auto upper = ShiftRightBytes<N * sizeof(T) / 2>(d, v);
  return Vec128<T, (N + 1) / 2>{upper.raw};
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  const int pair = _mm_extract_epi16(v.raw, kLane / 2);
  constexpr int kShift = kLane & 1 ? 8 : 0;
  return static_cast<T>((pair >> kShift) & 0xFF);
#else
  return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF);
#endif
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  static_assert(kLane < N, "Lane index out of bounds");
  return static_cast<T>(_mm_extract_epi16(v.raw, kLane) & 0xFFFF);
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  alignas(16) T lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[kLane];
#else
  return static_cast<T>(_mm_extract_epi32(v.raw, kLane));
#endif
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
  alignas(16) T lanes[2];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[kLane];
#else
  return static_cast<T>(_mm_extract_epi64(v.raw, kLane));
#endif
}
template <size_t kLane, size_t N>
HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  alignas(16) float lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[kLane];
#else
  // The intrinsic returns int (the bit pattern), not float.
  const int32_t bits = _mm_extract_ps(v.raw, kLane);
  float ret;
  CopyBytes<4>(&bits, &ret);
  return ret;
#endif
}
template <size_t kLane>
  static_assert(kLane == 0, "Lane index out of bounds");

template <size_t kLane>
  static_assert(kLane < 2, "Lane index out of bounds");
template <typename T>

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0: return detail::ExtractLane<0>(v);
      case 1: return detail::ExtractLane<1>(v);
    }
  }
#endif
  alignas(16) T lanes[2];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0: return detail::ExtractLane<0>(v);
      case 1: return detail::ExtractLane<1>(v);
      case 2: return detail::ExtractLane<2>(v);
      case 3: return detail::ExtractLane<3>(v);
    }
  }
#endif
  alignas(16) T lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0: return detail::ExtractLane<0>(v);
      case 1: return detail::ExtractLane<1>(v);
      case 2: return detail::ExtractLane<2>(v);
      case 3: return detail::ExtractLane<3>(v);
      case 4: return detail::ExtractLane<4>(v);
      case 5: return detail::ExtractLane<5>(v);
      case 6: return detail::ExtractLane<6>(v);
      case 7: return detail::ExtractLane<7>(v);
    }
  }
#endif
  alignas(16) T lanes[8];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0: return detail::ExtractLane<0>(v);
      case 1: return detail::ExtractLane<1>(v);
      case 2: return detail::ExtractLane<2>(v);
      case 3: return detail::ExtractLane<3>(v);
      case 4: return detail::ExtractLane<4>(v);
      case 5: return detail::ExtractLane<5>(v);
      case 6: return detail::ExtractLane<6>(v);
      case 7: return detail::ExtractLane<7>(v);
      case 8: return detail::ExtractLane<8>(v);
      case 9: return detail::ExtractLane<9>(v);
      case 10: return detail::ExtractLane<10>(v);
      case 11: return detail::ExtractLane<11>(v);
      case 12: return detail::ExtractLane<12>(v);
      case 13: return detail::ExtractLane<13>(v);
      case 14: return detail::ExtractLane<14>(v);
      case 15: return detail::ExtractLane<15>(v);
    }
  }
#endif
  alignas(16) T lanes[16];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}
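// Note: the __builtin_constant_p dispatch above lets a runtime index that is
// actually a compile-time constant (after inlining) use the single-instruction
// detail::ExtractLane<kLane> path; otherwise the vector is spilled to a stack
// array and one lane is read back.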
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[16];
  Store(v, d, lanes);
  lanes[kLane] = t;
  return Load(d, lanes);
#else
  return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)};
#endif
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{_mm_insert_epi16(v.raw, t, kLane)};
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[4];
  Store(v, d, lanes);
  lanes[kLane] = t;
  return Load(d, lanes);
#else
  MakeSigned<T> ti;
  CopyBytes<sizeof(T)>(&t, &ti);  // don't just cast because T might be float.
  return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
#endif
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  Store(v, d, lanes);
  lanes[kLane] = t;
  return Load(d, lanes);
#else
  MakeSigned<T> ti;
  CopyBytes<sizeof(T)>(&t, &ti);  // don't just cast because T might be float.
  return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
#endif
}
template <size_t kLane, size_t N>
HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  const DFromV<decltype(v)> d;
  alignas(16) float lanes[4];
  Store(v, d, lanes);
  lanes[kLane] = t;
  return Load(d, lanes);
#else
  return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)};
#endif
}
template <size_t kLane>
  static_assert(kLane == 0, "Lane index out of bounds");

template <size_t kLane>
  static_assert(kLane < 2, "Lane index out of bounds");
template <typename T>

template <typename T>
HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0: return detail::InsertLane<0>(v, t);
      case 1: return detail::InsertLane<1>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}
template <typename T>
HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0: return detail::InsertLane<0>(v, t);
      case 1: return detail::InsertLane<1>(v, t);
      case 2: return detail::InsertLane<2>(v, t);
      case 3: return detail::InsertLane<3>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[4];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}
template <typename T>
HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0: return detail::InsertLane<0>(v, t);
      case 1: return detail::InsertLane<1>(v, t);
      case 2: return detail::InsertLane<2>(v, t);
      case 3: return detail::InsertLane<3>(v, t);
      case 4: return detail::InsertLane<4>(v, t);
      case 5: return detail::InsertLane<5>(v, t);
      case 6: return detail::InsertLane<6>(v, t);
      case 7: return detail::InsertLane<7>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[8];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}
template <typename T>
HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0: return detail::InsertLane<0>(v, t);
      case 1: return detail::InsertLane<1>(v, t);
      case 2: return detail::InsertLane<2>(v, t);
      case 3: return detail::InsertLane<3>(v, t);
      case 4: return detail::InsertLane<4>(v, t);
      case 5: return detail::InsertLane<5>(v, t);
      case 6: return detail::InsertLane<6>(v, t);
      case 7: return detail::InsertLane<7>(v, t);
      case 8: return detail::InsertLane<8>(v, t);
      case 9: return detail::InsertLane<9>(v, t);
      case 10: return detail::InsertLane<10>(v, t);
      case 11: return detail::InsertLane<11>(v, t);
      case 12: return detail::InsertLane<12>(v, t);
      case 13: return detail::InsertLane<13>(v, t);
      case 14: return detail::InsertLane<14>(v, t);
      case 15: return detail::InsertLane<15>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[16];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}
template <int kBytes, typename T, class V = Vec128<T>>
HWY_API V CombineShiftRightBytes(Full128<T> d, V hi, V lo) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
                        BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
}

template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
          class V = Vec128<T, N>>
HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
  constexpr size_t kSize = N * sizeof(T);
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Repartition<uint8_t, decltype(d)> d8;
  const Full128<uint8_t> d_full8;
  using V8 = VFromD<decltype(d_full8)>;
  const V8 hi8{BitCast(d8, hi).raw};
  // Move into most-significant bytes so the shifted-out lower part is `lo`.
  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
  return V{BitCast(Full128<T>(), r).raw};
}
template <int kLane, size_t N>
HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  if (kLane < 4) {
    const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
    return Vec128<uint16_t, N>{_mm_unpacklo_epi64(lo, lo)};
  } else {
    const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
    return Vec128<uint16_t, N>{_mm_unpackhi_epi64(hi, hi)};
  }
}
template <int kLane, size_t N>
HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
}
template <int kLane, size_t N>
HWY_API Vec128<uint64_t, N> Broadcast(const Vec128<uint64_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
}
template <int kLane, size_t N>
HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  if (kLane < 4) {
    const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
    return Vec128<int16_t, N>{_mm_unpacklo_epi64(lo, lo)};
  } else {
    const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
    return Vec128<int16_t, N>{_mm_unpackhi_epi64(hi, hi)};
  }
}
template <int kLane, size_t N>
HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
}
template <int kLane, size_t N>
HWY_API Vec128<int64_t, N> Broadcast(const Vec128<int64_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
}
template <int kLane, size_t N>
HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
}
template <int kLane, size_t N>
HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
}
template <typename T, size_t N = 16 / sizeof(T)>
struct Indices128 {
  __m128i raw;
};

template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
          HWY_IF_LANE_SIZE(T, 4)>
HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const Rebind<TI, decltype(d)> di;
  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
#endif

#if HWY_TARGET <= HWY_AVX2
  (void)d;
  return Indices128<T, N>{vec.raw};
#else
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = VFromD<decltype(d8)>;
  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
                                                    0, 1, 2, 3, 0, 1, 2, 3};

  // Broadcast each lane index to all 4 bytes of T.
  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  const V8 lane_indices =
      TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));

  // Shift to byte indices (4 bytes per lane).
  const Repartition<uint16_t, decltype(d)> d16;
  const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));

  return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
#endif
}

template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
          HWY_IF_LANE_SIZE(T, 8)>
HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const Rebind<TI, decltype(d)> di;
  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
#else
  (void)d;
#endif
  // No change - even without AVX3, we can shuffle+blend.
  return Indices128<T, N>{vec.raw};
}
template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}
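// Example sketch (not from the original file): permuting 32-bit lanes with
// runtime indices (`v` hypothetical):
//   const Simd<float, 4, 0> d;
//   alignas(16) const int32_t idx[4] = {3, 2, 1, 0};
//   const auto perm = TableLookupLanes(v, SetTableIndices(d, idx));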
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
#if HWY_TARGET <= HWY_AVX2
  const DFromV<decltype(v)> d;
  const RebindToFloat<decltype(d)> df;
  const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)};
  return BitCast(d, perm);
#else
  return TableLookupBytes(v, Vec128<T, N>{idx.raw});
#endif
}

template <size_t N, HWY_IF_GE64(float, N)>
HWY_API Vec128<float, N> TableLookupLanes(Vec128<float, N> v,
                                          Indices128<float, N> idx) {
#if HWY_TARGET <= HWY_AVX2
  return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
#else
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;
  return BitCast(df, TableLookupBytes(BitCast(di, v),
                                      Vec128<int32_t, N>{idx.raw}));
#endif
}
template <typename T>

template <typename T, HWY_IF_LANE_SIZE(T, 8)>

#if HWY_TARGET <= HWY_AVX2
template <typename T>

template <typename T>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>

template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4295#if HWY_TARGET <= HWY_AVX3
4296 if (
N == 1)
return v;
4302 alignas(16)
constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
4303 const Vec128<int16_t, N> idx =
Load(di, kReverse + (
N == 8 ? 0 : 4));
4304 return BitCast(
d, Vec128<int16_t, N>{
4305 _mm_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
template <typename T>

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
  alignas(16) const T kShuffle[16] = {1, 0, 3, 2, 5, 4, 7, 6,
                                      9, 8, 11, 10, 13, 12, 15, 14};

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4345template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4350 return BitCast(
d, Vec128<int16_t, N>{_mm_shufflelo_epi16(
4351 BitCast(di,
v).raw, _MM_SHUFFLE(0, 1, 2, 3))});
4354#if HWY_TARGET <= HWY_AVX3
4355 alignas(16)
constexpr int16_t kReverse4[8] = {3, 2, 1, 0, 7, 6, 5, 4};
4356 const Vec128<int16_t, N> idx =
Load(di, kReverse4);
4357 return BitCast(
d, Vec128<int16_t, N>{
4358 _mm_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
4366template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4371template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4378template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4380#if HWY_TARGET <= HWY_AVX3
4382 alignas(32)
constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
4383 15, 14, 13, 12, 11, 10, 9, 8};
4384 const Vec128<int16_t, N> idx =
Load(di, kReverse8);
4385 return BitCast(
d, Vec128<int16_t, N>{
4386 _mm_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
4393template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4404template <
size_t N, HWY_IF_LE128(u
int8_t, N)>
4409template <
size_t N, HWY_IF_LE128(u
int16_t, N)>
4414template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
4419template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
4425template <
size_t N, HWY_IF_LE128(
int8_t, N)>
4430template <
size_t N, HWY_IF_LE128(
int16_t, N)>
4435template <
size_t N, HWY_IF_LE128(
int32_t, N)>
4440template <
size_t N, HWY_IF_LE128(
int64_t, N)>
4446template <
size_t N, HWY_IF_LE128(
float, N)>
4448 const Vec128<float, N> b) {
4449 return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
4451template <
size_t N, HWY_IF_LE128(
double, N)>
4503 const Vec128<float> b) {
4504 return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
4514template <
typename T,
class V = Vec128<T>>
4520template <
typename T,
size_t N, HWY_IF_LE64(T, N),
class V = Vec128<T, N>>
4522 const Half<
decltype(
d)> d2;
4530template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
4534template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4539template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4549template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
4550HWY_API Vec128<T, N>
Combine(Simd<T, N, 0>
d, Vec128<T, N / 2> hi_half,
4551 Vec128<T, N / 2> lo_half) {
4552 const Half<
decltype(
d)> d2;
4556 const VU lo{
BitCast(du2, lo_half).raw};
4557 const VU hi{
BitCast(du2, hi_half).raw};
4566template <
typename T>
4572template <
typename T>
4581template <
typename T>
4586template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4594template <
typename T>
4601template <
typename T>
4608template <
typename T>
4610 const Vec128<T> lo) {
4611 return CombineShiftRightBytes<8>(
d, hi, lo);
4615template <
typename T>
4618#if HWY_TARGET == HWY_SSSE3
4621 _MM_SHUFFLE2(1, 0))});
4630#if HWY_TARGET == HWY_SSSE3
4642#if HWY_TARGET == HWY_SSSE3
4652template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4655 const Half<
decltype(
d)> d2;
4659template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4662 const Half<
decltype(
d)> d2;
4666template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4668 const Vec128<T, N> lo) {
4669 const Half<
decltype(
d)> d2;
4673template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4676 const Half<
decltype(
d)> d2;
4683template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4687 const Vec128<uint16_t> uH = ShiftRight<8>(
BitCast(dw, hi));
4688 const Vec128<uint16_t> uL = ShiftRight<8>(
BitCast(dw, lo));
4689 return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
4693template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4697 alignas(16)
const uint8_t kCompactOddU8[8] = {1, 3, 5, 7};
4705template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4709 alignas(16)
const uint8_t kCompactOddU8[4] = {1, 3};
4717template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4722 const Vec128<int32_t> uH = ShiftRight<16>(
BitCast(dw, hi));
4723 const Vec128<int32_t> uL = ShiftRight<16>(
BitCast(dw, lo));
4724 return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
4728template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4732 alignas(16)
const uint8_t kCompactOddU16[8] = {2, 3, 6, 7};
4740template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4744 d, Vec128<float>{_mm_shuffle_ps(
BitCast(df, lo).raw,
BitCast(df, hi).raw,
4745 _MM_SHUFFLE(3, 1, 3, 1))});
4754template <
typename T>
4763template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4767 const Vec128<uint16_t> mask =
Set(dw, 0x00FF);
4768 const Vec128<uint16_t> uH =
And(
BitCast(dw, hi), mask);
4769 const Vec128<uint16_t> uL =
And(
BitCast(dw, lo), mask);
4770 return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
4774template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4778 alignas(16)
const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6};
4786template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4790 alignas(16)
const uint8_t kCompactEvenU8[4] = {0, 2};
4798template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4800#if HWY_TARGET <= HWY_SSE4
4803 const Vec128<uint32_t> mask =
Set(dw, 0x0000FFFF);
4804 const Vec128<uint32_t> uH =
And(
BitCast(dw, hi), mask);
4805 const Vec128<uint32_t> uL =
And(
BitCast(dw, lo), mask);
4806 return Vec128<T>{_mm_packus_epi32(uL.raw, uH.raw)};
4810 alignas(16)
const T kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
4811 const Vec128<T> shuf =
BitCast(
d,
Load(
d, kCompactEvenU16));
4819template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4823 alignas(16)
const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5};
4831template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4835 d, Vec128<float>{_mm_shuffle_ps(
BitCast(df, lo).raw,
BitCast(df, hi).raw,
4836 _MM_SHUFFLE(2, 0, 2, 0))});
4844template <
typename T>
4852template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4854 return Vec128<T, N>{_mm_shuffle_epi32(
v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4859 _mm_shuffle_ps(
v.raw,
v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4862template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4869template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4871 return Vec128<T, N>{_mm_shuffle_epi32(
v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4876 _mm_shuffle_ps(
v.raw,
v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4879template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4886template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
4890 alignas(16)
constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
4891 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
4895template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4897#if HWY_TARGET == HWY_SSSE3
4900 alignas(16)
constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
4901 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
4904 return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
4908template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4910#if HWY_TARGET == HWY_SSSE3
4911 const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4912 const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4913 return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
4923template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4929#if HWY_TARGET == HWY_SSSE3
4931 d, Vec128<double, N>{_mm_shuffle_pd(
4941HWY_API Vec128<float, N>
OddEven(Vec128<float, N> a, Vec128<float, N> b) {
4942#if HWY_TARGET == HWY_SSSE3
4945 const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4946 const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4947 return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
4949 return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
4954template <
typename T,
size_t N>
4961template <
typename T,
size_t N>
4973#if HWY_TARGET > HWY_AVX3
4976template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4980 const Rebind<float,
decltype(dw)> df;
4981 const auto zero =
Zero(
d);
4984 const auto upper = exp +
Set(
d, 0x3F80);
4986 const auto f0 =
ZipLower(dw, zero, upper);
4987 const auto f1 =
ZipUpper(dw, zero, upper);
4989 const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(
BitCast(df, f0).raw)};
4990 const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(
BitCast(df, f1).raw)};
4991 return Vec128<MakeUnsigned<T>,
N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
4995template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4998 const auto exp = ShiftLeft<23>(
v);
4999 const auto f = exp +
Set(
d, 0x3F800000);
5003 return Vec128<MakeUnsigned<T>,
N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
5011#if HWY_TARGET <= HWY_AVX3
5014 return v *
Pow2(bits);
5025#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
5026 return v *
Pow2(bits);
5038#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
5041 const __m128i bits1 = _mm_unpackhi_epi64(bits.
raw, bits.
raw);
5054template <
typename T,
size_t N>
5065template <
typename T,
size_t N>
5080#if HWY_TARGET <= HWY_AVX3
5098#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
5106 const auto out20 = ShiftRight<32>(
MulEven(in, mul));
5123 const Vec128<uint64_t> bits) {
5124#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
5126 const Vec128<uint64_t> out0{_mm_srl_epi64(
v.raw, bits.raw)};
5127 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
5128 const Vec128<uint64_t> out1{_mm_srl_epi64(
v.raw, bits1)};
5131 return Vec128<uint64_t>{_mm_srlv_epi64(
v.raw, bits.raw)};
5135 const Vec64<uint64_t> bits) {
5136 return Vec64<uint64_t>{_mm_srl_epi64(
v.raw, bits.raw)};
5139#if HWY_TARGET > HWY_AVX3
5143template <
class DI,
class V>
5144HWY_INLINE V SignedShr(
const DI di,
const V
v,
const V count_i) {
5145 const RebindToUnsigned<DI> du;
5146 const auto count =
BitCast(du, count_i);
5150 const auto abs =
BitCast(du,
v ^ sign);
5151 return BitCast(di, abs >> count) ^ sign;
5160#if HWY_TARGET <= HWY_AVX3
5174#if HWY_TARGET <= HWY_AVX3
5188#if HWY_TARGET <= HWY_AVX3
5198 const Vec128<uint64_t> b) {
5199 alignas(16) uint64_t mul[2];
5201 return Load(Full128<uint64_t>(), mul);
5205 const Vec128<uint64_t> b) {
5206 alignas(16) uint64_t mul[2];
5207 const Half<Full128<uint64_t>> d2;
5210 return Load(Full128<uint64_t>(), mul);
5215template <
class V,
size_t N,
class D16 = Simd<b
float16_t, 2 * N, 0>>
5223 using VU32 =
VFromD<
decltype(du32)>;
5224 const VU32 odd =
Set(du32, 0xFFFF0000u);
5225 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
5227 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
5236 Simd<int32_t, N, 0> , Vec128<int16_t, 2 * N> a,
5237 Vec128<int16_t, 2 * N> b,
const Vec128<int32_t, N> sum0,
5238 Vec128<int32_t, N>& ) {
5239 return sum0 + Vec128<int32_t, N>{_mm_madd_epi16(a.raw, b.raw)};
5245 Vec128<int32_t, N> ) {
5251 return Add(sum0, sum1);
5262#if HWY_TARGET == HWY_SSSE3
5263 const __m128i zero = _mm_setzero_si128();
5271 const Vec128<uint16_t, N>
v) {
5272#if HWY_TARGET == HWY_SSSE3
5273 return Vec128<uint32_t, N>{_mm_unpacklo_epi16(
v.raw, _mm_setzero_si128())};
5275 return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(
v.raw)};
5281#if HWY_TARGET == HWY_SSSE3
5290#if HWY_TARGET == HWY_SSSE3
5291 const __m128i zero = _mm_setzero_si128();
5292 const __m128i u16 = _mm_unpacklo_epi8(
v.raw, zero);
5319 const Vec128<int8_t, N>
v) {
5320#if HWY_TARGET == HWY_SSSE3
5321 return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(
v.raw,
v.raw)});
5323 return Vec128<int16_t, N>{_mm_cvtepi8_epi16(
v.raw)};
5328 const Vec128<int16_t, N>
v) {
5329#if HWY_TARGET == HWY_SSSE3
5330 return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(
v.raw,
v.raw)});
5332 return Vec128<int32_t, N>{_mm_cvtepi16_epi32(
v.raw)};
5337 const Vec128<int32_t, N>
v) {
5338#if HWY_TARGET == HWY_SSSE3
5339 return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(
v.raw,
v.raw)});
5341 return Vec128<int64_t, N>{_mm_cvtepi32_epi64(
v.raw)};
5346 const Vec128<int8_t, N>
v) {
5347#if HWY_TARGET == HWY_SSSE3
5348 const __m128i x2 = _mm_unpacklo_epi8(
v.raw,
v.raw);
5349 const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
5350 return ShiftRight<24>(Vec128<int32_t, N>{x4});
5352 return Vec128<int32_t, N>{_mm_cvtepi8_epi32(
v.raw)};
5358#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
5359#define HWY_INLINE_F16 HWY_NOINLINE
5361#define HWY_INLINE_F16 HWY_INLINE
5366#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
5371 const auto sign = ShiftRight<15>(bits16);
5372 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
5373 const auto mantissa = bits16 &
Set(du32, 0x3FF);
5374 const auto subnormal =
5376 Set(df32, 1.0f / 16384 / 1024));
5378 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
5379 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
5380 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
5381 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
5382 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
5391 const Vec128<bfloat16_t, N>
v) {
5392 const Rebind<uint16_t,
decltype(df32)> du16;
5413 const Vec128<int32_t, N>
v) {
5414#if HWY_TARGET == HWY_SSSE3
5415 const Simd<int32_t, N, 0> di32;
5416 const Simd<uint16_t, N * 2, 0> du16;
5417 const auto zero_if_neg =
AndNot(ShiftRight<31>(
v),
v);
5419 const auto clamped =
Or(zero_if_neg, too_big);
5421 alignas(16)
constexpr uint16_t kLower2Bytes[16] = {
5422 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
5423 const auto lo2 =
Load(du16, kLower2Bytes);
5426 return Vec128<uint16_t, N>{_mm_packus_epi32(
v.raw,
v.raw)};
5432 const Vec128<int32_t, N>
v) {
5433 return Vec128<int16_t, N>{_mm_packs_epi32(
v.raw,
v.raw)};
5438 const Vec128<int32_t, N>
v) {
5439 const __m128i i16 = _mm_packs_epi32(
v.raw,
v.raw);
5440 return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
5445 const Vec128<int16_t, N>
v) {
5446 return Vec128<uint8_t, N>{_mm_packus_epi16(
v.raw,
v.raw)};
5451 const Vec128<int32_t, N>
v) {
5452 const __m128i i16 = _mm_packs_epi32(
v.raw,
v.raw);
5453 return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
5458 const Vec128<int16_t, N>
v) {
5459 return Vec128<int8_t, N>{_mm_packs_epi16(
v.raw,
v.raw)};
5469 const Vec128<float, N>
v) {
5470#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
5472 const Rebind<uint32_t,
decltype(df16)> du;
5474 const auto bits32 =
BitCast(du,
v);
5475 const auto sign = ShiftRight<31>(bits32);
5476 const auto biased_exp32 = ShiftRight<23>(bits32) &
Set(du, 0xFF);
5477 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
5479 const auto k15 =
Set(di, 15);
5480 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
5481 const auto is_tiny = exp <
Set(di, -24);
5483 const auto is_subnormal = exp <
Set(di, -14);
5484 const auto biased_exp16 =
5486 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
5487 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
5488 (mantissa32 >> (
Set(du, 13) + sub_exp));
5490 ShiftRight<13>(mantissa32));
5492 const auto sign16 = ShiftLeft<15>(sign);
5493 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
5498 return Vec128<float16_t, N>{_mm_cvtps_ph(
v.raw, _MM_FROUND_NO_EXC)};
5506 const Vec128<float, N>
v) {
5508 const Rebind<int32_t,
decltype(dbf16)> di32;
5509 const Rebind<uint32_t,
decltype(dbf16)> du32;
5510 const Rebind<uint16_t,
decltype(dbf16)> du16;
5511 const auto bits_in_32 =
BitCast(di32, ShiftRight<16>(
BitCast(du32,
v)));
5517 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
5520 const Repartition<uint32_t,
decltype(dbf16)> du32;
5521 const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(
BitCast(du32, b));
5527 Vec128<int32_t, 1> a,
5528 Vec128<int32_t, 1> b) {
5529 const Half<
decltype(dn)> dnh;
5531 const Vec128<int16_t, 2> an{
DemoteTo(dnh, a).raw};
5532 const Vec128<int16_t, 2> bn{
DemoteTo(dnh, b).raw};
5536 Vec128<int32_t, 2> a,
5537 Vec128<int32_t, 2> b) {
5538 const Half<
decltype(dn)> dnh;
5540 const Vec128<int16_t, 4> an{
DemoteTo(dnh, a).raw};
5541 const Vec128<int16_t, 4> bn{
DemoteTo(dnh, b).raw};
5545 Vec128<int32_t> a, Vec128<int32_t> b) {
5546 return Vec128<int16_t>{_mm_packs_epi32(a.raw, b.raw)};
5551 const Vec128<double, N>
v) {
5552 return Vec128<float, N>{_mm_cvtpd_ps(
v.raw)};
5561 ->
decltype(
Zero(
d)) {
5564 return Min(
v,
Set(
d, 2147483647.0));
5570template <
class DI,
class DF = RebindToFloat<DI>>
5572 decltype(
Zero(di).raw) converted_raw)
5579 const auto converted =
VFromD<DI>{converted_raw};
5580 const auto sign_wrong =
AndNot(
BitCast(di, original), converted);
5581#if HWY_COMPILER_GCC_ACTUAL
5597 const Vec128<double, N>
v) {
5599 return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
5605 const Simd<uint32_t, N, 0> d32;
5606 const Simd<uint8_t, N * 4, 0> d8;
5607 alignas(16)
static constexpr uint32_t k8From32[4] = {
5608 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
5616template <
typename From,
typename To,
5620 static_assert(!IsSigned<To>() && !IsSigned<From>(),
"Unsigned only");
5627 const Vec128<uint64_t, 2>
v) {
5628 const Full128<uint8_t> d8;
5629 alignas(16)
static constexpr uint8_t kMap[16] = {0, 8, 0, 8, 0, 8, 0, 8,
5630 0, 8, 0, 8, 0, 8, 0, 8};
5635 const Vec128<uint64_t, 2>
v) {
5636 const Full128<uint16_t> d16;
5637 alignas(16)
static constexpr uint16_t kMap[8] = {
5638 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u};
5643 const Vec128<uint64_t, 2>
v) {
5644 return Vec128<uint32_t, 2>{_mm_shuffle_epi32(
v.raw, 0x88)};
5647template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
5649 const Vec128<uint32_t, N>
v) {
5651 alignas(16)
static constexpr uint8_t kMap[16] = {
5652 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu,
5653 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu};
5657template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
5659 const Vec128<uint32_t, N>
v) {
5665template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
5667 const Vec128<uint16_t, N>
v) {
5677 const Vec128<int32_t, N>
v) {
5678 return Vec128<float, N>{_mm_cvtepi32_ps(
v.raw)};
5684#if HWY_TARGET <= HWY_AVX3
5691 const auto msk_lo =
Set(du32, 0xFFFF);
5692 const auto cnst2_16_flt =
Set(df, 65536.0f);
5696 const auto v_hi =
BitCast(d32, ShiftRight<16>(
v));
5704#if HWY_TARGET <= HWY_AVX3
5713 const auto k84_63 =
Set(d64, 0x4530000080000000ULL);
5714 const auto v_upper =
BitCast(dd, ShiftRight<32>(
BitCast(d64,
v)) ^ k84_63);
5717 const auto k52 =
Set(d32, 0x43300000);
5720 const auto k84_63_52 =
BitCast(dd,
Set(d64, 0x4530000080100000ULL));
5721 return (v_upper - k84_63_52) + v_lower;
5728#if HWY_TARGET <= HWY_AVX3
5733 using VU =
VFromD<
decltype(d64)>;
5735 const VU msk_lo =
Set(d64, 0xFFFFFFFF);
5736 const auto cnst2_32_dbl =
Set(dd, 4294967296.0);
5739 const VU v_lo =
And(
v, msk_lo);
5740 const VU v_hi = ShiftRight<32>(
v);
5742 auto uint64_to_double128_fast = [&dd](VU w)
HWY_ATTR {
5744 return BitCast(dd, w) -
Set(dd, 0x0010000000000000);
5747 const auto v_lo_dbl = uint64_to_double128_fast(v_lo);
5748 return MulAdd(cnst2_32_dbl, uint64_to_double128_fast(v_hi), v_lo_dbl);
5755 const Vec128<float, N>
v) {
5761#if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
5763#elif HWY_ARCH_X86_64
5764 const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(
v.raw));
5766 const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(
UpperHalf(dd2,
v).raw));
5769 using VI =
VFromD<
decltype(di)>;
5770 const VI k0 =
Zero(di);
5771 const VI k1 =
Set(di, 1);
5772 const VI k51 =
Set(di, 51);
5775 const VI biased_exp = ShiftRight<52>(
BitCast(di,
v)) &
Set(di, 0x7FF);
5776 const VI exp = biased_exp -
Set(di, 0x3FF);
5777 const auto in_range = exp <
Set(di, 63);
5785 const VI shift_mnt =
Max(k51 - exp, k0);
5786 const VI shift_int =
Max(exp - k51, k0);
5787 const VI mantissa =
BitCast(di,
v) &
Set(di, (1ULL << 52) - 1);
5789 const VI int52 = (mantissa |
Set(di, 1ULL << 52)) >> (shift_mnt + k1);
5791 const VI shifted = int52 << shift_int;
5793 const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
5797 const VI limit =
Set(di, LimitsMax<int64_t>()) - sign_mask;
5798 const VI magnitude =
IfThenElse(in_range, restored, limit);
5801 return (magnitude ^ sign_mask) - sign_mask;
5806#if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
5818 const Simd<int32_t, N, 0> di;
5824#if HWY_TARGET == HWY_SSSE3
5827template <
typename T,
size_t N>
5829 static_assert(IsFloat<T>(),
"Only for float");
5833 const Simd<T, N, 0> df;
5834 const auto max =
Set(df, MantissaEnd<T>());
5836 const auto added = large +
v;
5837 const auto rounded = added - large;
5847template <
typename T,
size_t N>
5849 static_assert(IsFloat<T>(),
"Only for float");
5856template <
typename T,
size_t N>
5858 static_assert(IsFloat<T>(),
"Only for float");
5859 const Simd<T, N, 0> df;
5863 const auto int_f =
ConvertTo(df, integer);
5869template <
typename T,
size_t N>
5871 static_assert(IsFloat<T>(),
"Only for float");
5876 const auto int_f =
ConvertTo(df, integer);
5885template <
typename T,
size_t N>
5887 static_assert(IsFloat<T>(),
"Only for float");
5892 const auto int_f =
ConvertTo(df, integer);
5905 return Vec128<float, N>{
5906 _mm_round_ps(
v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5910 return Vec128<double, N>{
5911 _mm_round_pd(
v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5917 return Vec128<float, N>{
5918 _mm_round_ps(
v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5922 return Vec128<double, N>{
5923 _mm_round_pd(
v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5928HWY_API Vec128<float, N>
Ceil(
const Vec128<float, N>
v) {
5929 return Vec128<float, N>{
5930 _mm_round_ps(
v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5933HWY_API Vec128<double, N>
Ceil(
const Vec128<double, N>
v) {
5934 return Vec128<double, N>{
5935 _mm_round_pd(
v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5941 return Vec128<float, N>{
5942 _mm_round_ps(
v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5946 return Vec128<double, N>{
5947 _mm_round_pd(
v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5956#if HWY_TARGET <= HWY_AVX3
5964#if HWY_TARGET <= HWY_AVX3
5971#if HWY_TARGET <= HWY_AVX3
5996template <
typename T,
size_t N>
5998 static_assert(IsFloat<T>(),
"Only for float");
5999 const Simd<T, N, 0>
d;
6007template <
typename T,
size_t N>
6009 static_assert(IsFloat<T>(),
"Only for float");
6010 const Simd<T, N, 0>
d;
6018 const VFromD<
decltype(di)> exp =
6020 return RebindMask(
d, Lt(exp,
Set(di, hwy::MaxExponentField<T>())));
6027#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
6030#ifdef HWY_NATIVE_AES
6031#undef HWY_NATIVE_AES
6033#define HWY_NATIVE_AES
6037 Vec128<uint8_t> round_key) {
6038 return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
6042 Vec128<uint8_t> round_key) {
6043 return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
6046template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
6048 Vec128<uint64_t, N> b) {
6049 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
6052template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
6054 Vec128<uint64_t, N> b) {
6055 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
6064#if HWY_TARGET > HWY_AVX3
6067template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
6072 const Vec128<T, N> vbits{_mm_cvtsi32_si128(
static_cast<int>(mask_bits))};
6075 alignas(16)
constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
6076 1, 1, 1, 1, 1, 1, 1, 1};
6079 alignas(16)
constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
6080 1, 2, 4, 8, 16, 32, 64, 128};
6084template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
6087 alignas(16)
constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
6088 const auto vmask_bits =
Set(du,
static_cast<uint16_t
>(mask_bits));
6092template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
6095 alignas(16)
constexpr uint32_t kBit[8] = {1, 2, 4, 8};
6096 const auto vmask_bits =
Set(du,
static_cast<uint32_t
>(mask_bits));
6100template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
6103 alignas(16)
constexpr uint64_t kBit[8] = {1, 2};
6111template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6114#if HWY_TARGET <= HWY_AVX3
6116 uint64_t mask_bits = 0;
6117 constexpr size_t kNumBytes = (
N + 7) / 8;
6118 CopyBytes<kNumBytes>(bits, &mask_bits);
6120 mask_bits &= (1ull <<
N) - 1;
6125 uint64_t mask_bits = 0;
6126 constexpr size_t kNumBytes = (
N + 7) / 8;
6127 CopyBytes<kNumBytes>(bits, &mask_bits);
6129 mask_bits &= (1ull <<
N) - 1;
6136template <
typename T>
6137struct CompressIsPartition {
6138#if HWY_TARGET <= HWY_AVX3
6144 enum {
value = (
sizeof(T) == 8) };
6147 enum {
value = (
sizeof(T) != 1) };
6151#if HWY_TARGET <= HWY_AVX3
6156template <
typename T,
size_t N>
6158 const Mask128<T, N> mask, uint8_t* bits) {
6159 constexpr size_t kNumBytes = (
N + 7) / 8;
6160 CopyBytes<kNumBytes>(&mask.raw, bits);
6164 const int mask_bits = (1 <<
N) - 1;
6165 bits[0] =
static_cast<uint8_t
>(bits[0] & mask_bits);
6175template <
typename T,
size_t N>
6177 const Mask128<T, N> mask) {
6178 const uint64_t mask_bits =
static_cast<uint64_t
>(mask.raw) & ((1u <<
N) - 1);
6182template <
typename T,
size_t N>
6184 const Mask128<T, N> mask) {
6185 const uint32_t mask_bits =
static_cast<uint32_t
>(mask.raw) & ((1u <<
N) - 1);
6189template <
typename T,
size_t N>
6191 const Mask128<T, N> mask) {
6192 const uint32_t mask_bits =
static_cast<uint32_t
>(mask.raw) & ((1u <<
N) - 1);
6196template <
typename T,
size_t N>
6197HWY_API bool AllFalse(
const Simd<T, N, 0> ,
const Mask128<T, N> mask) {
6198 const uint64_t mask_bits =
static_cast<uint64_t
>(mask.raw) & ((1u <<
N) - 1);
6199 return mask_bits == 0;
6202template <
typename T,
size_t N>
6203HWY_API bool AllTrue(
const Simd<T, N, 0> ,
const Mask128<T, N> mask) {
6204 const uint64_t mask_bits =
static_cast<uint64_t
>(mask.raw) & ((1u <<
N) - 1);
6206 return mask_bits == (1u <<
N) - 1;
6214template <
typename T>
6219template <
size_t N, HWY_IF_GE64(
float, N)>
6224template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6229 alignas(16)
constexpr uint8_t u8_indices[64] = {
6230 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6231 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6232 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6233 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6237 const auto index =
Load(d8, u8_indices + 16 * mask.raw);
6244template <
typename T>
6249template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6252 alignas(16)
constexpr uint64_t packed_array[16] = {0x00000010, 0x00000001,
6253 0x00000010, 0x00000010};
6259 const auto packed =
Set(du64, packed_array[mask.raw]);
6260 alignas(16)
constexpr uint64_t shifts[2] = {0, 4};
6261 const auto indices = Indices128<T>{(packed >>
Load(du64, shifts)).raw};
6267 Mask128<uint64_t> ) {
6273template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
6277 _mm_mask_compressstoreu_epi32(unaligned, mask.raw,
v.raw);
6278 const size_t count =
PopCount(uint64_t{mask.raw} & ((1ull <<
N) - 1));
6283template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
6287 _mm_mask_compressstoreu_epi64(unaligned, mask.raw,
v.raw);
6288 const size_t count =
PopCount(uint64_t{mask.raw} & ((1ull <<
N) - 1));
6293template <
size_t N, HWY_IF_LE128(
float, N)>
6297 _mm_mask_compressstoreu_ps(unaligned, mask.
raw,
v.raw);
6298 const size_t count =
PopCount(uint64_t{mask.
raw} & ((1ull <<
N) - 1));
6303template <
size_t N, HWY_IF_LE128(
double, N)>
6307 _mm_mask_compressstoreu_pd(unaligned, mask.
raw,
v.raw);
6308 const size_t count =
PopCount(uint64_t{mask.
raw} & ((1ull <<
N) - 1));
6314template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6322 if (
N != 16 /
sizeof(T)) {
6328 const Vec128<T, N> compressed =
Compress(
v, m);
6329#if HWY_MEM_OPS_MIGHT_FAULT
6332 alignas(16) T buf[
N];
6333 Store(compressed,
d, buf);
6334 memcpy(unaligned, buf, count *
sizeof(T));
6345template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6358constexpr HWY_INLINE uint64_t U64FromInt(
int mask_bits) {
6359 return static_cast<uint64_t
>(
static_cast<unsigned>(mask_bits));
6362template <
typename T,
size_t N>
6364 const Mask128<T, N> mask) {
6365 const Simd<T, N, 0>
d;
6367 return U64FromInt(_mm_movemask_epi8(sign_bits));
6370template <
typename T,
size_t N>
6372 const Mask128<T, N> mask) {
6374 const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
6375 return U64FromInt(_mm_movemask_epi8(sign_bits));
6378template <
typename T,
size_t N>
6380 const Mask128<T, N> mask) {
6381 const Simd<T, N, 0>
d;
6382 const Simd<float, N, 0> df;
6384 return U64FromInt(_mm_movemask_ps(sign_bits.raw));
6387template <
typename T,
size_t N>
6389 const Mask128<T, N> mask) {
6390 const Simd<T, N, 0>
d;
6391 const Simd<double, N, 0> df;
6393 return U64FromInt(_mm_movemask_pd(sign_bits.raw));
6397template <
typename T,
size_t N>
6398constexpr uint64_t
OnlyActive(uint64_t mask_bits) {
6399 return ((
N *
sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull <<
N) - 1);
6402template <
typename T,
size_t N>
6410template <
typename T,
size_t N>
6412 const Mask128<T, N> mask, uint8_t* bits) {
6413 constexpr size_t kNumBytes = (
N + 7) / 8;
6415 CopyBytes<kNumBytes>(&mask_bits, bits);
6421template <
typename T,
size_t N>
6422HWY_API bool AllFalse(
const Simd<T, N, 0> ,
const Mask128<T, N> mask) {
6427template <
typename T,
size_t N>
6428HWY_API bool AllTrue(
const Simd<T, N, 0> ,
const Mask128<T, N> mask) {
6429 constexpr uint64_t kAllBits =
6430 detail::OnlyActive<T, N>((1ull << (16 /
sizeof(T))) - 1);
6434template <
typename T,
size_t N>
6436 const Mask128<T, N> mask) {
6440template <
typename T,
size_t N>
6442 const Mask128<T, N> mask) {
6447template <
typename T,
size_t N>
6449 const Mask128<T, N> mask) {
6459template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
6460HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0>
d, uint64_t mask_bits) {
6462 const Rebind<uint8_t,
decltype(
d)> d8;
6463 const Simd<uint16_t, N, 0> du;
6473 alignas(16)
constexpr uint8_t table[2048] = {
6475 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6476 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6477 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
6478 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6479 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
6480 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
6481 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
6482 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6483 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
6484 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
6485 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
6486 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
6487 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
6488 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
6489 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
6490 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6491 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
6492 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
6493 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
6494 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
6495 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
6496 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
6497 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
6498 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
6499 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
6500 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
6501 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
6502 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
6503 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
6504 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
6505 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
6506 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6507 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
6508 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
6509 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
6510 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
6511 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
6512 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
6513 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
6514 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
6515 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
6516 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
6517 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
6518 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
6519 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
6520 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
6521 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
6522 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
6523 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
6524 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
6525 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
6526 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
6527 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
6528 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
6529 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
6530 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
6531 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
6532 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
6533 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
6534 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
6535 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
6536 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
6537 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
6538 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6539 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
6540 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
6541 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
6542 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
6543 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
6544 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
6545 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
6546 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
6547 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
6548 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
6549 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
6550 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
6551 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
6552 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
6553 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
6554 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
6555 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
6556 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
6557 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
6558 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
6559 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
6560 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
6561 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
6562 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
6563 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
6564 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
6565 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
6566 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
6567 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
6568 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
6569 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
6570 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
6571 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
6572 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
6573 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
6574 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
6575 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
6576 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
6577 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
6578 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
6579 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
6580 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
6581 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
6582 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
6583 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
6584 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
6585 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
6586 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
6587 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
6588 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
6589 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
6590 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
6591 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
6592 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
6593 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
6594 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
6595 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
6596 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
6597 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
6598 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
6599 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
6600 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
6601 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
6602 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
6604 const Vec128<uint8_t, 2 * N> byte_idx{
Load(d8, table + mask_bits * 8).raw};
6605 const Vec128<uint16_t, N> pairs =
ZipLower(byte_idx, byte_idx);
6609template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
6610HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0>
d,
6611 uint64_t mask_bits) {
6613 const Rebind<uint8_t,
decltype(
d)> d8;
6614 const Simd<uint16_t, N, 0> du;
6624 alignas(16)
constexpr uint8_t table[2048] = {
6626 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
6627 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
6628 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
6629 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
6630 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
6631 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
6632 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
6633 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
6634 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
6635 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
6636 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
6637 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
6638 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
6639 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
6640 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
6641 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
6642 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
6643 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
6644 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
6645 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
6646 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
6647 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
6648 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
6649 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
6650 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
6651 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
6652 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
6653 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
6654 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
6655 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
6656 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
6657 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
6658 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
6659 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
6660 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
6661 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
6662 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
6663 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
6664 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
6665 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
6666 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
6667 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
6668 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
6669 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
6670 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
6671 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
6672 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
6673 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
6674 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
6675 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
6676 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
6677 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
6678 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
6679 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
6680 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
6681 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
6682 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
6683 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
6684 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
6685 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
6686 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
6687 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
6688 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
6689 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
6690 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
6691 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
6692 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
6693 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
6694 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
6695 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
6696 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
6697 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
6698 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
6699 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
6700 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
6701 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
6702 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
6703 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
6704 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
6705 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
6706 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
6707 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
6708 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
6709 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
6710 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
6711 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
6712 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
6713 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
6714 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
6715 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
6716 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
6717 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
6718 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
6719 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
6720 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
6721 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
6722 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
6723 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
6724 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
6725 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
6726 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
6727 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
6728 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
6729 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
6730 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
6731 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
6732 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
6733 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
6734 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
6735 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
6736 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
6737 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
6738 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
6739 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
6740 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
6741 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
6742 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
6743 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
6744 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
6745 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
6746 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
6747 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
6748 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
6749 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
6750 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
6751 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
6752 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
6753 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
6755 const Vec128<uint8_t, 2 * N> byte_idx{
Load(d8, table + mask_bits * 8).raw};
6756 const Vec128<uint16_t, N> pairs =
ZipLower(byte_idx, byte_idx);
6760template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
6761HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0>
d, uint64_t mask_bits) {
6765 alignas(16)
constexpr uint8_t u8_indices[256] = {
6767 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6768 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6769 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
6770 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6771 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
6772 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
6773 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
6774 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6775 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6776 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
6777 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
6778 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6779 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6780 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
6781 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
6782 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6785 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
6788template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
6789HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0>
d,
6790 uint64_t mask_bits) {
6794 alignas(16)
constexpr uint8_t u8_indices[256] = {
6796 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
6797 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
6798 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
6799 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
6800 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
6801 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
6802 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6803 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6804 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
6805 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6806 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
6807 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
6808 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6809 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6813 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
6816template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
6817HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0>
d, uint64_t mask_bits) {
6821 alignas(16)
constexpr uint8_t u8_indices[64] = {
6823 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6824 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6825 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6826 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6829 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
6832template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
6833HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0>
d,
6834 uint64_t mask_bits) {
6838 alignas(16)
constexpr uint8_t u8_indices[64] = {
6840 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6841 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6842 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6843 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6846 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
6849template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6851 const Simd<T, N, 0>
d;
6855 const auto indices =
BitCast(du, detail::IndicesFromBits(
d, mask_bits));
6859template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6860HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N>
v, uint64_t mask_bits) {
6861 const Simd<T, N, 0>
d;
6865 const auto indices =
BitCast(du, detail::IndicesFromNotBits(
d, mask_bits));
6872template <
typename T>
6878template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6883 const Vec128<T> maskL =
DupEven(m);
6884 const Vec128<T> maskH =
DupOdd(m);
6885 const Vec128<T> swap =
AndNot(maskL, maskH);
6890template <
typename T,
size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
6896template <
typename T>
6902template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6907 const Vec128<T> maskL =
DupEven(m);
6908 const Vec128<T> maskH =
DupOdd(m);
6909 const Vec128<T> swap =
AndNot(maskH, maskL);
6914template <
typename T,
size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
6918 if (
N < 16 /
sizeof(T)) {
6926 Mask128<uint64_t> ) {
6930template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6933 uint64_t mask_bits = 0;
6934 constexpr size_t kNumBytes = (
N + 7) / 8;
6935 CopyBytes<kNumBytes>(bits, &mask_bits);
6937 mask_bits &= (1ull <<
N) - 1;
6945template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6952 const size_t count =
PopCount(mask_bits);
6955 const auto indices =
BitCast(du, detail::IndicesFromBits(
d, mask_bits));
6957 StoreU(compressed,
d, unaligned);
6962template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6970 const size_t count =
PopCount(mask_bits);
6973 const auto indices =
BitCast(du, detail::IndicesFromBits(
d, mask_bits));
6980template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6986 uint64_t mask_bits = 0;
6987 constexpr size_t kNumBytes = (
N + 7) / 8;
6988 CopyBytes<kNumBytes>(bits, &mask_bits);
6990 mask_bits &= (1ull <<
N) - 1;
6992 const size_t count =
PopCount(mask_bits);
6995 const auto indices =
BitCast(du, detail::IndicesFromBits(
d, mask_bits));
6997 StoreU(compressed,
d, unaligned);
7015template <
typename T>
7017 const Vec128<T, 1>
v) {
7020template <
typename T>
7022 const Vec128<T, 1>
v) {
7025template <
typename T>
7027 const Vec128<T, 1>
v) {
7034template <
typename T>
7036 const Vec128<T, 2> v10) {
7039template <
typename T>
7041 const Vec128<T, 2> v10) {
7044template <
typename T>
7046 const Vec128<T, 2> v10) {
7051template <
typename T>
7053 const Vec128<T> v3210) {
7055 const Vec128<T> v31_20_31_20 = v3210 + v1032;
7056 const Vec128<T> v20_31_20_31 =
Shuffle0321(v31_20_31_20);
7057 return v20_31_20_31 + v31_20_31_20;
7059template <
typename T>
7061 const Vec128<T> v3210) {
7063 const Vec128<T> v31_20_31_20 =
Min(v3210, v1032);
7064 const Vec128<T> v20_31_20_31 =
Shuffle0321(v31_20_31_20);
7065 return Min(v20_31_20_31, v31_20_31_20);
7067template <
typename T>
7069 const Vec128<T> v3210) {
7071 const Vec128<T> v31_20_31_20 =
Max(v3210, v1032);
7072 const Vec128<T> v20_31_20_31 =
Shuffle0321(v31_20_31_20);
7073 return Max(v20_31_20_31, v31_20_31_20);
7079template <
typename T>
7081 const Vec128<T> v10) {
7085template <
typename T>
7087 const Vec128<T> v10) {
7089 return Min(v10, v01);
7091template <
typename T>
7093 const Vec128<T> v10) {
7095 return Max(v10, v01);
7098template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
7100 Vec128<uint16_t, N>
v) {
7101 const Simd<uint16_t, N, 0>
d;
7104 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
7109template <
size_t N, HWY_IF_GE32(
int16_t, N)>
7111 Vec128<int16_t, N>
v) {
7112 const Simd<int16_t, N, 0>
d;
7115 const auto even = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32,
v)));
7116 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
7131 return Set(
d,
static_cast<uint8_t
>(
GetLane(sums) & 0xFF));
7134template <
size_t N, HWY_IF_GE64(
int8_t, N)>
7139 const auto is_neg =
v <
Zero(
d);
7149#if HWY_TARGET <= HWY_SSE4
7152 using V =
decltype(
v);
7153 return Broadcast<0>(V{_mm_minpos_epu16(
v.raw)});
7180#elif HWY_TARGET == HWY_SSSE3
7181template <
size_t N, HWY_IF_GE64(u
int8_t, N)>
7197template <
size_t N, HWY_IF_GE64(u
int8_t, N)>
7199 const Vec128<uint8_t, N>
v) {
7215template <
size_t N, HWY_IF_GE64(
int8_t, N)>
7220 const auto mask =
SignBit(du);
7224template <
size_t N, HWY_IF_GE64(
int8_t, N)>
7229 const auto mask =
SignBit(du);
7234template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
7240 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
7245template <
size_t N, HWY_IF_GE32(
int16_t, N)>
7247 Vec128<int16_t, N>
v) {
7248 const Simd<int16_t, N, 0>
d;
7251 const auto even = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32,
v)));
7252 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
7258template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
7260 Vec128<uint16_t, N>
v) {
7261 const Simd<uint16_t, N, 0>
d;
7264 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
7269template <
size_t N, HWY_IF_GE32(
int16_t, N)>
7271 Vec128<int16_t, N>
v) {
7272 const Simd<int16_t, N, 0>
d;
7275 const auto even = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32,
v)));
7276 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
7285template <
typename T,
size_t N>
7289template <
typename T,
size_t N>
7293template <
typename T,
size_t N>
7303template <
class D,
class V = VFromD<D>>
7305 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
7320 const auto eqHL = Eq(a, b);
7322 const V ltLX = ShiftLeftLanes<1>(ltHL);
7323 const V vecHx =
IfThenElse(eqHL, ltLX, ltHL);
7328template <
class D,
class V = VFromD<D>>
7330 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
7334 return And(eqHL, eqLH);
7337template <
class D,
class V = VFromD<D>>
7339 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
7343 return Or(neHL, neLH);
7346template <
class D,
class V = VFromD<D>>
7354template <
class D,
class V = VFromD<D>>
7362template <
class D,
class V = VFromD<D>>
7372template <
class D,
class V = VFromD<D>>
7377template <
class D,
class V = VFromD<D>>
7382template <
class D,
class V = VFromD<D>>
7387template <
class D,
class V = VFromD<D>>
7392template <
class D,
class V = VFromD<D>>
7397template <
class D,
class V = VFromD<D>>
7405template <
class D,
class V = VFromD<D>>
7410template <
class D,
class V = VFromD<D>>
7415template <
class D,
class V = VFromD<D>>
7420template <
class D,
class V = VFromD<D>>