9#if defined(JPH_USE_SSE)
10 mValue = _mm_set_epi32(
int(inW),
int(inZ),
int(inY),
int(inX));
11#elif defined(JPH_USE_NEON)
12 uint32x2_t xy = vcreate_u32(
static_cast<uint64>(inX) | (
static_cast<uint64>(inY) << 32));
13 uint32x2_t zw = vcreate_u32(
static_cast<uint64>(inZ) | (
static_cast<uint64>(inW) << 32));
14 mValue = vcombine_u32(xy, zw);
28template<u
int32 SwizzleX, u
int32 SwizzleY, u
int32 SwizzleZ, u
int32 SwizzleW>
31 static_assert(SwizzleX <= 3,
"SwizzleX template parameter out of range");
32 static_assert(SwizzleY <= 3,
"SwizzleY template parameter out of range");
33 static_assert(SwizzleZ <= 3,
"SwizzleZ template parameter out of range");
34 static_assert(SwizzleW <= 3,
"SwizzleW template parameter out of range");
36#if defined(JPH_USE_SSE)
37 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
38#elif defined(JPH_USE_NEON)
39 return JPH_NEON_SHUFFLE_U32x4(
mValue,
mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
47#if defined(JPH_USE_SSE)
48 return _mm_setzero_si128();
49#elif defined(JPH_USE_NEON)
50 return vdupq_n_u32(0);
52 return UVec4(0, 0, 0, 0);
58#if defined(JPH_USE_SSE)
59 return _mm_set1_epi32(
int(inV));
60#elif defined(JPH_USE_NEON)
61 return vdupq_n_u32(inV);
63 return UVec4(inV, inV, inV, inV);
69#if defined(JPH_USE_SSE)
70 return _mm_castps_si128(_mm_load_ss(
reinterpret_cast<const float*
>(inV)));
71#elif defined(JPH_USE_NEON)
72 return vsetq_lane_u32(*inV, vdupq_n_u32(0), 0);
74 return UVec4(*inV, 0, 0, 0);
80#if defined(JPH_USE_SSE)
81 return _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(inV));
82#elif defined(JPH_USE_NEON)
83 return vld1q_u32(inV);
85 return UVec4(inV[0], inV[1], inV[2], inV[3]);
91#if defined(JPH_USE_SSE)
92 return _mm_load_si128(
reinterpret_cast<const __m128i *
>(inV));
93#elif defined(JPH_USE_NEON)
94 return vld1q_u32(inV);
96 return UVec4(inV[0], inV[1], inV[2], inV[3]);
100template <const
int Scale>
104 return _mm_i32gather_epi32(
reinterpret_cast<const int *
>(inBase), inOffsets.
mValue, Scale);
106 const uint8 *base =
reinterpret_cast<const uint8 *
>(inBase);
107 uint32 x = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetX() * Scale);
108 uint32 y = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetY() * Scale);
109 uint32 z = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetZ() * Scale);
110 uint32 w = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetW() * Scale);
111 return UVec4(x, y, z, w);
117#if defined(JPH_USE_SSE4_1)
119#elif defined(JPH_USE_NEON)
123 for (
int i = 0; i < 4; i++)
131#if defined(JPH_USE_SSE4_1)
133#elif defined(JPH_USE_NEON)
137 for (
int i = 0; i < 4; i++)
145#if defined(JPH_USE_SSE)
147#elif defined(JPH_USE_NEON)
151 inV1.
mU32[1] == inV2.
mU32[1]? 0xffffffffu : 0,
152 inV1.
mU32[2] == inV2.
mU32[2]? 0xffffffffu : 0,
153 inV1.
mU32[3] == inV2.
mU32[3]? 0xffffffffu : 0);
159#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM)
160 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inNotSet.
mValue), _mm_castsi128_ps(inSet.
mValue), _mm_castsi128_ps(inControl.
mValue)));
161#elif defined(JPH_USE_SSE)
162 __m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.
mValue, 31));
163 return _mm_castps_si128(_mm_or_ps(_mm_and_ps(is_set, _mm_castsi128_ps(inSet.
mValue)), _mm_andnot_ps(is_set, _mm_castsi128_ps(inNotSet.
mValue))));
164#elif defined(JPH_USE_NEON)
165 return vbslq_u32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.
mValue), 31)), inSet.
mValue, inNotSet.
mValue);
168 for (
int i = 0; i < 4; i++)
169 result.
mU32[i] = (inControl.
mU32[i] & 0x80000000u) ? inSet.
mU32[i] : inNotSet.
mU32[i];
176#if defined(JPH_USE_SSE)
178#elif defined(JPH_USE_NEON)
190#if defined(JPH_USE_SSE)
192#elif defined(JPH_USE_NEON)
204#if defined(JPH_USE_SSE)
206#elif defined(JPH_USE_NEON)
219#if defined(JPH_USE_AVX512)
221#elif defined(JPH_USE_SSE)
223#elif defined(JPH_USE_NEON)
224 return vmvnq_u32(inV1.
mValue);
246#if defined(JPH_USE_SSE4_1)
248#elif defined(JPH_USE_NEON)
252 for (
int i = 0; i < 4; i++)
260#if defined(JPH_USE_SSE)
262#elif defined(JPH_USE_NEON)
274#if defined(JPH_USE_SSE)
276#elif defined(JPH_USE_NEON)
279 for (
int i = 0; i < 4; ++i)
287#if defined(JPH_USE_SSE)
288 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(0, 0, 0, 0));
289#elif defined(JPH_USE_NEON)
290 return vdupq_laneq_u32(
mValue, 0);
298#if defined(JPH_USE_SSE)
299 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(1, 1, 1, 1));
300#elif defined(JPH_USE_NEON)
301 return vdupq_laneq_u32(
mValue, 1);
309#if defined(JPH_USE_SSE)
310 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(2, 2, 2, 2));
311#elif defined(JPH_USE_NEON)
312 return vdupq_laneq_u32(
mValue, 2);
320#if defined(JPH_USE_SSE)
321 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(3, 3, 3, 3));
322#elif defined(JPH_USE_NEON)
323 return vdupq_laneq_u32(
mValue, 3);
331#if defined(JPH_USE_SSE)
332 return _mm_cvtepi32_ps(
mValue);
333#elif defined(JPH_USE_NEON)
334 return vcvtq_f32_u32(
mValue);
342#if defined(JPH_USE_SSE)
344#elif defined(JPH_USE_NEON)
345 return vreinterpretq_f32_u32(
mValue);
347 return *
reinterpret_cast<const Vec4 *
>(
this);
353#if defined(JPH_USE_SSE)
354 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(outV),
mValue);
355#elif defined(JPH_USE_NEON)
358 for (
int i = 0; i < 4; ++i)
365#if defined(JPH_USE_SSE)
366 _mm_store_si128(
reinterpret_cast<__m128i *
>(outV),
mValue);
367#elif defined(JPH_USE_NEON)
370 for (
int i = 0; i < 4; ++i)
377#if defined(JPH_USE_SSE)
379#elif defined(JPH_USE_NEON)
380 return vaddvq_u32(vshrq_n_u32(
mValue, 31));
382 return (
mU32[0] >> 31) + (
mU32[1] >> 31) + (
mU32[2] >> 31) + (
mU32[3] >> 31);
388#if defined(JPH_USE_SSE)
389 return _mm_movemask_ps(_mm_castsi128_ps(
mValue));
390#elif defined(JPH_USE_NEON)
391 int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
392 return vaddvq_u32(vshlq_u32(vshrq_n_u32(
mValue, 31), shift));
394 return (
mU32[0] >> 31) | ((
mU32[1] >> 31) << 1) | ((
mU32[2] >> 31) << 2) | ((
mU32[3] >> 31) << 3);
415 return (
GetTrues() & 0b111) == 0b111;
418template <const u
int Count>
421 static_assert(Count <= 31,
"Invalid shift");
423#if defined(JPH_USE_SSE)
424 return _mm_slli_epi32(
mValue, Count);
425#elif defined(JPH_USE_NEON)
426 return vshlq_n_u32(
mValue, Count);
432template <const u
int Count>
435 static_assert(Count <= 31,
"Invalid shift");
437#if defined(JPH_USE_SSE)
438 return _mm_srli_epi32(
mValue, Count);
439#elif defined(JPH_USE_NEON)
440 return vshrq_n_u32(
mValue, Count);
446template <const u
int Count>
449 static_assert(Count <= 31,
"Invalid shift");
451#if defined(JPH_USE_SSE)
452 return _mm_srai_epi32(
mValue, Count);
453#elif defined(JPH_USE_NEON)
454 return vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(
mValue), Count));
465#if defined(JPH_USE_SSE)
466 return _mm_unpacklo_epi16(
mValue, _mm_castps_si128(_mm_setzero_ps()));
467#elif defined(JPH_USE_NEON)
468 uint16x4_t value = vget_low_u16(vreinterpretq_u16_u32(
mValue));
469 uint16x4_t zero = vdup_n_u16(0);
470 return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
473 (
mU32[0] >> 16) & 0xffff,
475 (
mU32[1] >> 16) & 0xffff);
481#if defined(JPH_USE_SSE)
482 return _mm_unpackhi_epi16(
mValue, _mm_castps_si128(_mm_setzero_ps()));
483#elif defined(JPH_USE_NEON)
484 uint16x4_t value = vget_high_u16(vreinterpretq_u16_u32(
mValue));
485 uint16x4_t zero = vdup_n_u16(0);
486 return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
489 (
mU32[2] >> 16) & 0xffff,
491 (
mU32[3] >> 16) & 0xffff);
497#if defined(JPH_USE_SSE4_1)
498 return _mm_shuffle_epi8(
mValue, _mm_set_epi32(
int(0xffffff03),
int(0xffffff02),
int(0xffffff01),
int(0xffffff00)));
499#elif defined(JPH_USE_NEON)
500 uint8x16_t idx = JPH_NEON_UINT8x16(0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f);
501 return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(
mValue), idx));
504 for (
int i = 0; i < 4; i++)
505 result.
mU32[i] = (
mU32[0] >> (i * 8)) & 0xff;
512#if defined(JPH_USE_SSE4_1)
513 return _mm_shuffle_epi8(
mValue, _mm_set_epi32(
int(0xffffff07),
int(0xffffff06),
int(0xffffff05),
int(0xffffff04)));
514#elif defined(JPH_USE_NEON)
515 uint8x16_t idx = JPH_NEON_UINT8x16(0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f);
516 return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(
mValue), idx));
519 for (
int i = 0; i < 4; i++)
520 result.
mU32[i] = (
mU32[1] >> (i * 8)) & 0xff;
527#if defined(JPH_USE_SSE4_1)
528 return _mm_shuffle_epi8(
mValue, _mm_set_epi32(
int(0xffffff0b),
int(0xffffff0a),
int(0xffffff09),
int(0xffffff08)));
529#elif defined(JPH_USE_NEON)
530 uint8x16_t idx = JPH_NEON_UINT8x16(0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f);
531 return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(
mValue), idx));
534 for (
int i = 0; i < 4; i++)
535 result.
mU32[i] = (
mU32[2] >> (i * 8)) & 0xff;
542#if defined(JPH_USE_SSE4_1)
543 return _mm_shuffle_epi8(
mValue, _mm_set_epi32(
int(0xffffff0f),
int(0xffffff0e),
int(0xffffff0d),
int(0xffffff0c)));
544#elif defined(JPH_USE_NEON)
545 uint8x16_t idx = JPH_NEON_UINT8x16(0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f);
546 return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(
mValue), idx));
549 for (
int i = 0; i < 4; i++)
550 result.
mU32[i] = (
mU32[3] >> (i * 8)) & 0xff;
557#if defined(JPH_USE_SSE4_1) || defined(JPH_USE_NEON)
558 alignas(
UVec4)
static constexpr uint32 sFourMinusXShuffle[5][4] =
560 { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
561 { 0x0f0e0d0c, 0xffffffff, 0xffffffff, 0xffffffff },
562 { 0x0b0a0908, 0x0f0e0d0c, 0xffffffff, 0xffffffff },
563 { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0xffffffff },
564 { 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c }
568#if defined(JPH_USE_SSE4_1)
569 return _mm_shuffle_epi8(
mValue, *
reinterpret_cast<const UVec4::Type *
>(sFourMinusXShuffle[inCount]));
570#elif defined(JPH_USE_NEON)
571 uint8x16_t idx = vreinterpretq_u8_u32(*
reinterpret_cast<const UVec4::Type *
>(sFourMinusXShuffle[inCount]));
572 return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(
mValue), idx));
575 for (
int i = 0; i < inCount; i++)
576 result.
mU32[i] =
mU32[i + 4 - inCount];
std::uint8_t uint8
Definition Core.h:482
std::uint64_t uint64
Definition Core.h:485
#define JPH_NAMESPACE_END
Definition Core.h:414
std::uint32_t uint32
Definition Core.h:484
#define JPH_NAMESPACE_BEGIN
Definition Core.h:408
uint CountBits(uint32 inValue)
Count the number of 1 bits in a value.
Definition Math.h:164
@ SWIZZLE_Z
Use the Z component.
Definition Swizzle.h:14
@ SWIZZLE_W
Use the W component.
Definition Swizzle.h:15
@ SWIZZLE_X
Use the X component.
Definition Swizzle.h:12
@ SWIZZLE_Y
Use the Y component.
Definition Swizzle.h:13
JPH_INLINE UVec4 Swizzle() const
Swizzle the elements of this vector.
static JPH_INLINE UVec4 sNot(UVec4Arg inV1)
Logical not (component wise)
Definition UVec4.inl:217
JPH_INLINE uint32 GetZ() const
Definition UVec4.h:104
static JPH_INLINE UVec4 sMin(UVec4Arg inV1, UVec4Arg inV2)
Return the minimum value of each of the components.
Definition UVec4.inl:115
JPH_INLINE UVec4 LogicalShiftLeft() const
Shift all components by Count bits to the left (filling with zeros from the right)
JPH_INLINE int CountTrues() const
Count the number of components that are true (true is when highest bit of component is set)
Definition UVec4.inl:375
JPH_INLINE UVec4 SplatY() const
Replicate the Y component to all components.
Definition UVec4.inl:296
static JPH_INLINE UVec4 sSelect(UVec4Arg inNotSet, UVec4Arg inSet, UVec4Arg inControl)
Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit of inControl = 1.
Definition UVec4.inl:157
static JPH_INLINE UVec4 sLoadInt(const uint32 *inV)
Load 1 int from memory and place it in the X component, zeros Y, Z and W.
Definition UVec4.inl:67
JPH_INLINE UVec4 Expand4Uint16Lo() const
Takes the lower four 16-bit values and expands them to X, Y, Z and W.
Definition UVec4.inl:463
static JPH_INLINE UVec4 sSort4True(UVec4Arg inValue, UVec4Arg inIndex)
Definition UVec4.inl:230
JPH_INLINE uint32 GetY() const
Definition UVec4.h:103
JPH_INLINE UVec4 LogicalShiftRight() const
Shift all components by Count bits to the right (filling with zeros from the left)
static JPH_INLINE UVec4 sReplicate(uint32 inV)
Replicate int inV across all components.
Definition UVec4.inl:56
JPH_INLINE UVec4 SplatX() const
Replicate the X component to all components.
Definition UVec4.inl:285
JPH_INLINE UVec4 Expand4Byte4() const
Takes byte 4 .. 7 and expands them to X, Y, Z and W.
Definition UVec4.inl:510
JPH_INLINE bool TestAllTrue() const
Test if all components are true (true is when highest bit of component is set)
Definition UVec4.inl:408
JPH_INLINE UVec4 Expand4Byte0() const
Takes byte 0 .. 3 and expands them to X, Y, Z and W.
Definition UVec4.inl:495
JPH_INLINE int GetTrues() const
Store if X is true in bit 0, Y in bit 1, Z in bit 2 and W in bit 3 (true is when highest bit of component is set)
Definition UVec4.inl:386
JPH_INLINE bool TestAnyXYZTrue() const
Test if any of X, Y or Z components are true (true is when highest bit of component is set)
Definition UVec4.inl:403
JPH_INLINE UVec4 & operator+=(UVec4Arg inV2)
Add two integer vectors (component wise)
Definition UVec4.inl:272
static JPH_INLINE UVec4 sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets)
Gather 4 ints from memory at inBase + inOffsets[i] * Scale.
static JPH_INLINE UVec4 sAnd(UVec4Arg inV1, UVec4Arg inV2)
Logical and (component wise)
Definition UVec4.inl:202
{ uint32 mData[4] Type
Definition UVec4.h:22
static JPH_INLINE UVec4 sEquals(UVec4Arg inV1, UVec4Arg inV2)
Equals (component wise)
Definition UVec4.inl:143
static JPH_INLINE UVec4 sOr(UVec4Arg inV1, UVec4Arg inV2)
Logical or (component wise)
Definition UVec4.inl:174
JPH_INLINE uint32 GetW() const
Definition UVec4.h:105
JPH_INLINE bool TestAllXYZTrue() const
Test if X, Y and Z components are true (true is when highest bit of component is set)
Definition UVec4.inl:413
JPH_INLINE UVec4 ShiftComponents4Minus(int inCount) const
Shift vector components by 4 - Count elements to the left, so if Count = 1 the resulting vector is (W, 0, 0, 0), when Count = 3 the resulting vector is (Y, Z, W, 0)
Definition UVec4.inl:555
JPH_INLINE bool operator==(UVec4Arg inV2) const
Comparison.
Definition UVec4.inl:23
static JPH_INLINE UVec4 sMax(UVec4Arg inV1, UVec4Arg inV2)
Return the maximum of each of the components.
Definition UVec4.inl:129
JPH_INLINE UVec4 SplatZ() const
Replicate the Z component to all components.
Definition UVec4.inl:307
Type mValue
Definition UVec4.h:211
JPH_INLINE UVec4 SplatW() const
Replicate the W component to all components.
Definition UVec4.inl:318
JPH_INLINE void StoreInt4(uint32 *outV) const
Store 4 ints to memory.
Definition UVec4.inl:351
JPH_INLINE uint32 GetX() const
Get individual components.
Definition UVec4.h:102
JPH_INLINE UVec4 Expand4Byte8() const
Takes byte 8 .. 11 and expands them to X, Y, Z and W.
Definition UVec4.inl:525
static JPH_INLINE UVec4 sLoadInt4Aligned(const uint32 *inV)
Load 4 ints from memory, aligned to 16 bytes.
Definition UVec4.inl:89
static JPH_INLINE UVec4 sLoadInt4(const uint32 *inV)
Load 4 ints from memory.
Definition UVec4.inl:78
JPH_INLINE UVec4 Expand4Byte12() const
Takes byte 12 .. 15 and expands them to X, Y, Z and W.
Definition UVec4.inl:540
static JPH_INLINE UVec4 sXor(UVec4Arg inV1, UVec4Arg inV2)
Logical xor (component wise)
Definition UVec4.inl:188
JPH_INLINE UVec4 Expand4Uint16Hi() const
Takes the upper four 16-bit values and expands them to X, Y, Z and W.
Definition UVec4.inl:479
static JPH_INLINE UVec4 sZero()
Vector with all zeros.
Definition UVec4.inl:45
JPH_INLINE UVec4 operator+(UVec4Arg inV2)
Add two integer vectors (component wise, discards any overflow)
Definition UVec4.inl:258
JPH_INLINE UVec4 ArithmeticShiftRight() const
Shift all components by Count bits to the right (shifting in the value of the highest bit)
UVec4()=default
Constructor.
JPH_INLINE UVec4 operator*(UVec4Arg inV2) const
Multiplies each of the 4 integer components with an integer (discards any overflow)
Definition UVec4.inl:244
JPH_INLINE Vec4 ToFloat() const
Convert each component from an int to a float.
Definition UVec4.inl:329
JPH_INLINE Vec4 ReinterpretAsFloat() const
Reinterpret UVec4 as a Vec4 (doesn't change the bits)
Definition UVec4.inl:340
JPH_INLINE void StoreInt4Aligned(uint32 *outV) const
Store 4 ints to memory, aligned to 16 bytes.
Definition UVec4.inl:363
JPH_INLINE bool TestAnyTrue() const
Test if any of the components are true (true is when highest bit of component is set)
Definition UVec4.inl:398
uint32 mU32[4]
Definition UVec4.h:212