9#if defined(JPH_USE_SSE)
10 mValue = _mm_set_epi32(
int(inW),
int(inZ),
int(inY),
int(inX));
11#elif defined(JPH_USE_NEON)
12 uint32x2_t xy = vcreate_u32(
static_cast<uint64>(inX) | (
static_cast<uint64>(inY) << 32));
13 uint32x2_t zw = vcreate_u32(
static_cast<uint64>(inZ) | (
static_cast<uint64>(inW) << 32));
14 mValue = vcombine_u32(xy, zw);
28template<u
int32 SwizzleX, u
int32 SwizzleY, u
int32 SwizzleZ, u
int32 SwizzleW>
31 static_assert(SwizzleX <= 3,
"SwizzleX template parameter out of range");
32 static_assert(SwizzleY <= 3,
"SwizzleY template parameter out of range");
33 static_assert(SwizzleZ <= 3,
"SwizzleZ template parameter out of range");
34 static_assert(SwizzleW <= 3,
"SwizzleW template parameter out of range");
36#if defined(JPH_USE_SSE)
37 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
38#elif defined(JPH_USE_NEON)
39 return JPH_NEON_SHUFFLE_U32x4(
mValue,
mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
47#if defined(JPH_USE_SSE)
48 return _mm_setzero_si128();
49#elif defined(JPH_USE_NEON)
50 return vdupq_n_u32(0);
52 return UVec4(0, 0, 0, 0);
58#if defined(JPH_USE_SSE)
59 return _mm_set1_epi32(
int(inV));
60#elif defined(JPH_USE_NEON)
61 return vdupq_n_u32(inV);
63 return UVec4(inV, inV, inV, inV);
69#if defined(JPH_USE_SSE)
70 return _mm_castps_si128(_mm_load_ss(
reinterpret_cast<const float*
>(inV)));
71#elif defined(JPH_USE_NEON)
72 return vsetq_lane_u32(*inV, vdupq_n_u32(0), 0);
74 return UVec4(*inV, 0, 0, 0);
80#if defined(JPH_USE_SSE)
81 return _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(inV));
82#elif defined(JPH_USE_NEON)
83 return vld1q_u32(inV);
85 return UVec4(inV[0], inV[1], inV[2], inV[3]);
91#if defined(JPH_USE_SSE)
92 return _mm_load_si128(
reinterpret_cast<const __m128i *
>(inV));
93#elif defined(JPH_USE_NEON)
94 return vld1q_u32(inV);
96 return UVec4(inV[0], inV[1], inV[2], inV[3]);
100template <const
int Scale>
104 return _mm_i32gather_epi32(
reinterpret_cast<const int *
>(inBase), inOffsets.
mValue, Scale);
106 const uint8 *base =
reinterpret_cast<const uint8 *
>(inBase);
107 uint32 x = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetX() * Scale);
108 uint32 y = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetY() * Scale);
109 uint32 z = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetZ() * Scale);
110 uint32 w = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetW() * Scale);
111 return UVec4(x, y, z, w);
117#if defined(JPH_USE_SSE4_1)
119#elif defined(JPH_USE_NEON)
123 for (
int i = 0; i < 4; i++)
131#if defined(JPH_USE_SSE4_1)
133#elif defined(JPH_USE_NEON)
137 for (
int i = 0; i < 4; i++)
145#if defined(JPH_USE_SSE)
147#elif defined(JPH_USE_NEON)
151 inV1.
mU32[1] == inV2.
mU32[1]? 0xffffffffu : 0,
152 inV1.
mU32[2] == inV2.
mU32[2]? 0xffffffffu : 0,
153 inV1.
mU32[3] == inV2.
mU32[3]? 0xffffffffu : 0);
159#if defined(JPH_USE_SSE4_1)
160 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inV1.
mValue), _mm_castsi128_ps(inV2.
mValue), _mm_castsi128_ps(inControl.
mValue)));
161#elif defined(JPH_USE_NEON)
162 return vbslq_u32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.
mValue), 31)), inV2.
mValue, inV1.
mValue);
165 for (
int i = 0; i < 4; i++)
173#if defined(JPH_USE_SSE)
175#elif defined(JPH_USE_NEON)
187#if defined(JPH_USE_SSE)
189#elif defined(JPH_USE_NEON)
201#if defined(JPH_USE_SSE)
203#elif defined(JPH_USE_NEON)
216#if defined(JPH_USE_AVX512)
218#elif defined(JPH_USE_SSE)
220#elif defined(JPH_USE_NEON)
221 return vmvnq_u32(inV1.
mValue);
243#if defined(JPH_USE_SSE4_1)
245#elif defined(JPH_USE_NEON)
249 for (
int i = 0; i < 4; i++)
257#if defined(JPH_USE_SSE)
259#elif defined(JPH_USE_NEON)
271#if defined(JPH_USE_SSE)
273#elif defined(JPH_USE_NEON)
276 for (
int i = 0; i < 4; ++i)
284#if defined(JPH_USE_SSE)
285 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(0, 0, 0, 0));
286#elif defined(JPH_USE_NEON)
287 return vdupq_laneq_u32(
mValue, 0);
295#if defined(JPH_USE_SSE)
296 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(1, 1, 1, 1));
297#elif defined(JPH_USE_NEON)
298 return vdupq_laneq_u32(
mValue, 1);
306#if defined(JPH_USE_SSE)
307 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(2, 2, 2, 2));
308#elif defined(JPH_USE_NEON)
309 return vdupq_laneq_u32(
mValue, 2);
317#if defined(JPH_USE_SSE)
318 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(3, 3, 3, 3));
319#elif defined(JPH_USE_NEON)
320 return vdupq_laneq_u32(
mValue, 3);
328#if defined(JPH_USE_SSE)
329 return _mm_cvtepi32_ps(
mValue);
330#elif defined(JPH_USE_NEON)
331 return vcvtq_f32_u32(
mValue);
339#if defined(JPH_USE_SSE)
341#elif defined(JPH_USE_NEON)
342 return vreinterpretq_f32_u32(
mValue);
344 return *
reinterpret_cast<const Vec4 *
>(
this);
350#if defined(JPH_USE_SSE)
351 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(outV),
mValue);
352#elif defined(JPH_USE_NEON)
355 for (
int i = 0; i < 4; ++i)
362#if defined(JPH_USE_SSE)
363 _mm_store_si128(
reinterpret_cast<__m128i *
>(outV),
mValue);
364#elif defined(JPH_USE_NEON)
367 for (
int i = 0; i < 4; ++i)
374#if defined(JPH_USE_SSE)
376#elif defined(JPH_USE_NEON)
377 return vaddvq_u32(vshrq_n_u32(
mValue, 31));
379 return (
mU32[0] >> 31) + (
mU32[1] >> 31) + (
mU32[2] >> 31) + (
mU32[3] >> 31);
385#if defined(JPH_USE_SSE)
386 return _mm_movemask_ps(_mm_castsi128_ps(
mValue));
387#elif defined(JPH_USE_NEON)
388 int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
389 return vaddvq_u32(vshlq_u32(vshrq_n_u32(
mValue, 31), shift));
391 return (
mU32[0] >> 31) | ((
mU32[1] >> 31) << 1) | ((
mU32[2] >> 31) << 2) | ((
mU32[3] >> 31) << 3);
412 return (
GetTrues() & 0b111) == 0b111;
415template <const u
int Count>
418 static_assert(Count <= 31,
"Invalid shift");
420#if defined(JPH_USE_SSE)
421 return _mm_slli_epi32(
mValue, Count);
422#elif defined(JPH_USE_NEON)
423 return vshlq_n_u32(
mValue, Count);
429template <const u
int Count>
432 static_assert(Count <= 31,
"Invalid shift");
434#if defined(JPH_USE_SSE)
435 return _mm_srli_epi32(
mValue, Count);
436#elif defined(JPH_USE_NEON)
437 return vshrq_n_u32(
mValue, Count);
443template <const u
int Count>
446 static_assert(Count <= 31,
"Invalid shift");
448#if defined(JPH_USE_SSE)
449 return _mm_srai_epi32(
mValue, Count);
450#elif defined(JPH_USE_NEON)
451 return vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(
mValue), Count));
462#if defined(JPH_USE_SSE)
463 return _mm_unpacklo_epi16(
mValue, _mm_castps_si128(_mm_setzero_ps()));
464#elif defined(JPH_USE_NEON)
465 uint16x4_t value = vget_low_u16(vreinterpretq_u16_u32(
mValue));
466 uint16x4_t zero = vdup_n_u16(0);
467 return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
470 (
mU32[0] >> 16) & 0xffff,
472 (
mU32[1] >> 16) & 0xffff);
478#if defined(JPH_USE_SSE)
479 return _mm_unpackhi_epi16(
mValue, _mm_castps_si128(_mm_setzero_ps()));
480#elif defined(JPH_USE_NEON)
481 uint16x4_t value = vget_high_u16(vreinterpretq_u16_u32(
mValue));
482 uint16x4_t zero = vdup_n_u16(0);
483 return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
486 (
mU32[2] >> 16) & 0xffff,
488 (
mU32[3] >> 16) & 0xffff);
494#if defined(JPH_USE_SSE4_1)
495 return _mm_shuffle_epi8(
mValue, _mm_set_epi32(
int(0xffffff03),
int(0xffffff02),
int(0xffffff01),
int(0xffffff00)));
496#elif defined(JPH_USE_NEON)
497 uint8x16_t idx = JPH_NEON_UINT8x16(0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f);
498 return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(
mValue), idx));
501 for (
int i = 0; i < 4; i++)
502 result.
mU32[i] = (
mU32[0] >> (i * 8)) & 0xff;
509#if defined(JPH_USE_SSE4_1)
510 return _mm_shuffle_epi8(
mValue, _mm_set_epi32(
int(0xffffff07),
int(0xffffff06),
int(0xffffff05),
int(0xffffff04)));
511#elif defined(JPH_USE_NEON)
512 uint8x16_t idx = JPH_NEON_UINT8x16(0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f);
513 return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(
mValue), idx));
516 for (
int i = 0; i < 4; i++)
517 result.
mU32[i] = (
mU32[1] >> (i * 8)) & 0xff;
524#if defined(JPH_USE_SSE4_1)
525 return _mm_shuffle_epi8(
mValue, _mm_set_epi32(
int(0xffffff0b),
int(0xffffff0a),
int(0xffffff09),
int(0xffffff08)));
526#elif defined(JPH_USE_NEON)
527 uint8x16_t idx = JPH_NEON_UINT8x16(0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f);
528 return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(
mValue), idx));
531 for (
int i = 0; i < 4; i++)
532 result.
mU32[i] = (
mU32[2] >> (i * 8)) & 0xff;
539#if defined(JPH_USE_SSE4_1)
540 return _mm_shuffle_epi8(
mValue, _mm_set_epi32(
int(0xffffff0f),
int(0xffffff0e),
int(0xffffff0d),
int(0xffffff0c)));
541#elif defined(JPH_USE_NEON)
542 uint8x16_t idx = JPH_NEON_UINT8x16(0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f);
543 return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(
mValue), idx));
546 for (
int i = 0; i < 4; i++)
547 result.
mU32[i] = (
mU32[3] >> (i * 8)) & 0xff;
554#if defined(JPH_USE_SSE4_1) || defined(JPH_USE_NEON)
555 alignas(
UVec4)
static constexpr uint32 sFourMinusXShuffle[5][4] =
557 { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
558 { 0x0f0e0d0c, 0xffffffff, 0xffffffff, 0xffffffff },
559 { 0x0b0a0908, 0x0f0e0d0c, 0xffffffff, 0xffffffff },
560 { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0xffffffff },
561 { 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c }
565#if defined(JPH_USE_SSE4_1)
566 return _mm_shuffle_epi8(
mValue, *
reinterpret_cast<const UVec4::Type *
>(sFourMinusXShuffle[inCount]));
567#elif defined(JPH_USE_NEON)
568 uint8x16_t idx = vreinterpretq_u8_u32(*
reinterpret_cast<const UVec4::Type *
>(sFourMinusXShuffle[inCount]));
569 return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(
mValue), idx));
572 for (
int i = 0; i < inCount; i++)
573 result.
mU32[i] =
mU32[i + 4 - inCount];
std::uint8_t uint8
Definition Core.h:453
std::uint64_t uint64
Definition Core.h:456
#define JPH_NAMESPACE_END
Definition Core.h:378
std::uint32_t uint32
Definition Core.h:455
#define JPH_NAMESPACE_BEGIN
Definition Core.h:372
uint CountBits(uint32 inValue)
Count the number of 1 bits in a value.
Definition Math.h:161
@ SWIZZLE_Z
Use the Z component.
Definition Swizzle.h:14
@ SWIZZLE_W
Use the W component.
Definition Swizzle.h:15
@ SWIZZLE_X
Use the X component.
Definition Swizzle.h:12
@ SWIZZLE_Y
Use the Y component.
Definition Swizzle.h:13
JPH_INLINE UVec4 Swizzle() const
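Swizzle the elements of this vector (each template parameter selects the source component, 0..3, for the corresponding output component).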
static JPH_INLINE UVec4 sNot(UVec4Arg inV1)
Logical not (component wise)
Definition UVec4.inl:214
JPH_INLINE uint32 GetZ() const
Definition UVec4.h:104
static JPH_INLINE UVec4 sMin(UVec4Arg inV1, UVec4Arg inV2)
Return the minimum value of each of the components.
Definition UVec4.inl:115
JPH_INLINE UVec4 LogicalShiftLeft() const
Shift all components by Count bits to the left (filling with zeros from the right)
JPH_INLINE int CountTrues() const
Count the number of components that are true (true is when highest bit of component is set)
Definition UVec4.inl:372
JPH_INLINE UVec4 SplatY() const
Replicate the Y component to all components.
Definition UVec4.inl:293
static JPH_INLINE UVec4 sLoadInt(const uint32 *inV)
Load 1 int from memory and place it in the X component, zeros Y, Z and W.
Definition UVec4.inl:67
JPH_INLINE UVec4 Expand4Uint16Lo() const
Takes the lower four 16-bit values and expands them to X, Y, Z and W.
Definition UVec4.inl:460
static JPH_INLINE UVec4 sSort4True(UVec4Arg inValue, UVec4Arg inIndex)
Definition UVec4.inl:227
JPH_INLINE uint32 GetY() const
Definition UVec4.h:103
JPH_INLINE UVec4 LogicalShiftRight() const
Shift all components by Count bits to the right (filling with zeros from the left)
static JPH_INLINE UVec4 sReplicate(uint32 inV)
Replicate int inV across all components.
Definition UVec4.inl:56
JPH_INLINE UVec4 SplatX() const
Replicate the X component to all components.
Definition UVec4.inl:282
JPH_INLINE UVec4 Expand4Byte4() const
Takes byte 4 .. 7 and expands them to X, Y, Z and W.
Definition UVec4.inl:507
JPH_INLINE bool TestAllTrue() const
Test if all components are true (true is when highest bit of component is set)
Definition UVec4.inl:405
JPH_INLINE UVec4 Expand4Byte0() const
Takes byte 0 .. 3 and expands them to X, Y, Z and W.
Definition UVec4.inl:492
JPH_INLINE int GetTrues() const
Store if X is true in bit 0, Y in bit 1, Z in bit 2 and W in bit 3 (true is when highest bit of component is set)
Definition UVec4.inl:383
JPH_INLINE bool TestAnyXYZTrue() const
Test if any of X, Y or Z components are true (true is when highest bit of component is set)
Definition UVec4.inl:400
JPH_INLINE UVec4 & operator+=(UVec4Arg inV2)
Add two integer vectors (component wise)
Definition UVec4.inl:269
static JPH_INLINE UVec4 sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets)
Gather 4 ints from memory at inBase + inOffsets[i] * Scale.
static JPH_INLINE UVec4 sAnd(UVec4Arg inV1, UVec4Arg inV2)
Logical and (component wise)
Definition UVec4.inl:199
{ uint32 mData[4] Type
Definition UVec4.h:22
static JPH_INLINE UVec4 sEquals(UVec4Arg inV1, UVec4Arg inV2)
Equals (component wise)
Definition UVec4.inl:143
static JPH_INLINE UVec4 sOr(UVec4Arg inV1, UVec4Arg inV2)
Logical or (component wise)
Definition UVec4.inl:171
JPH_INLINE uint32 GetW() const
Definition UVec4.h:105
JPH_INLINE bool TestAllXYZTrue() const
Test if X, Y and Z components are true (true is when highest bit of component is set)
Definition UVec4.inl:410
JPH_INLINE UVec4 ShiftComponents4Minus(int inCount) const
Shift vector components by 4 - inCount components to the left (filling with zeros), so if inCount = 1 the resulting vector is (W, 0, 0, 0)
Definition UVec4.inl:552
JPH_INLINE bool operator==(UVec4Arg inV2) const
Comparison.
Definition UVec4.inl:23
static JPH_INLINE UVec4 sMax(UVec4Arg inV1, UVec4Arg inV2)
Return the maximum of each of the components.
Definition UVec4.inl:129
JPH_INLINE UVec4 SplatZ() const
Replicate the Z component to all components.
Definition UVec4.inl:304
Type mValue
Definition UVec4.h:211
JPH_INLINE UVec4 SplatW() const
Replicate the W component to all components.
Definition UVec4.inl:315
JPH_INLINE void StoreInt4(uint32 *outV) const
Store 4 ints to memory.
Definition UVec4.inl:348
JPH_INLINE uint32 GetX() const
Get individual components.
Definition UVec4.h:102
JPH_INLINE UVec4 Expand4Byte8() const
Takes byte 8 .. 11 and expands them to X, Y, Z and W.
Definition UVec4.inl:522
static JPH_INLINE UVec4 sLoadInt4Aligned(const uint32 *inV)
Load 4 ints from memory, aligned to 16 bytes.
Definition UVec4.inl:89
static JPH_INLINE UVec4 sLoadInt4(const uint32 *inV)
Load 4 ints from memory.
Definition UVec4.inl:78
JPH_INLINE UVec4 Expand4Byte12() const
Takes byte 12 .. 15 and expands them to X, Y, Z and W.
Definition UVec4.inl:537
static JPH_INLINE UVec4 sXor(UVec4Arg inV1, UVec4Arg inV2)
Logical xor (component wise)
Definition UVec4.inl:185
JPH_INLINE UVec4 Expand4Uint16Hi() const
Takes the upper four 16-bit values and expands them to X, Y, Z and W.
Definition UVec4.inl:476
static JPH_INLINE UVec4 sZero()
Vector with all zeros.
Definition UVec4.inl:45
JPH_INLINE UVec4 operator+(UVec4Arg inV2)
Adds two integer vectors component wise (discards any overflow)
Definition UVec4.inl:255
JPH_INLINE UVec4 ArithmeticShiftRight() const
Shift all components by Count bits to the right (shifting in the value of the highest bit)
UVec4()=default
Constructor.
JPH_INLINE UVec4 operator*(UVec4Arg inV2) const
Multiplies each of the 4 integer components with the corresponding component of inV2 (discards any overflow)
Definition UVec4.inl:241
static JPH_INLINE UVec4 sSelect(UVec4Arg inV1, UVec4Arg inV2, UVec4Arg inControl)
Component wise select, returns inV1 when highest bit of inControl = 0 and inV2 when highest bit of inControl = 1.
Definition UVec4.inl:157
JPH_INLINE Vec4 ToFloat() const
Convert each component from an int to a float.
Definition UVec4.inl:326
JPH_INLINE Vec4 ReinterpretAsFloat() const
Reinterpret UVec4 as a Vec4 (doesn't change the bits)
Definition UVec4.inl:337
JPH_INLINE void StoreInt4Aligned(uint32 *outV) const
Store 4 ints to memory, aligned to 16 bytes.
Definition UVec4.inl:360
JPH_INLINE bool TestAnyTrue() const
Test if any of the components are true (true is when highest bit of component is set)
Definition UVec4.inl:395
uint32 mU32[4]
Definition UVec4.h:212