JPH_NAMESPACE_BEGIN

UVec4::UVec4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_set_epi32(int(inW), int(inZ), int(inY), int(inX));
#elif defined(JPH_USE_NEON)
	uint32x2_t xy = vcreate_u32(static_cast<uint64>(inX) | (static_cast<uint64>(inY) << 32));
	uint32x2_t zw = vcreate_u32(static_cast<uint64>(inZ) | (static_cast<uint64>(inW) << 32));
	mValue = vcombine_u32(xy, zw);
#elif defined(JPH_USE_RVV)
	vuint32m1_t v = __riscv_vmv_v_x_u32m1(inW, 4);
	v = __riscv_vslide1up_vx_u32m1(v, inZ, 4);
	v = __riscv_vslide1up_vx_u32m1(v, inY, 4);
	v = __riscv_vslide1up_vx_u32m1(v, inX, 4);
	__riscv_vse32_v_u32m1(mU32, v, 4);
#else
	mU32[0] = inX;
	mU32[1] = inY;
	mU32[2] = inZ;
	mU32[3] = inW;
#endif
}
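// Illustrative usage (not part of the original file): lane 0 is X on all backends;
// note that _mm_set_epi32 above takes its arguments in W..X order.
//
//	UVec4 v(1, 2, 3, 4);
//	uint32 x = v.GetX(); // 1
//	uint32 w = v.GetW(); // 4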
34template<u
int32 SwizzleX, u
int32 SwizzleY, u
int32 SwizzleZ, u
int32 SwizzleW>
37 static_assert(SwizzleX <= 3,
"SwizzleX template parameter out of range");
38 static_assert(SwizzleY <= 3,
"SwizzleY template parameter out of range");
39 static_assert(SwizzleZ <= 3,
"SwizzleZ template parameter out of range");
40 static_assert(SwizzleW <= 3,
"SwizzleW template parameter out of range");
42#if defined(JPH_USE_SSE)
43 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
44#elif defined(JPH_USE_NEON)
45 return JPH_NEON_SHUFFLE_U32x4(
mValue,
mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
46#elif defined(JPH_USE_RVV)
48 const vuint32m1_t data = __riscv_vle32_v_u32m1(
mU32, 4);
49 const uint32 stored_indices[4] = { SwizzleX, SwizzleY, SwizzleZ, SwizzleW };
50 const vuint32m1_t index = __riscv_vle32_v_u32m1(stored_indices, 4);
51 const vuint32m1_t swizzled = __riscv_vrgather_vv_u32m1(data, index, 4);
52 __riscv_vse32_v_u32m1(v.
mU32, swizzled, 4);
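// Illustrative usage (not part of the original file): the SWIZZLE_* constants from
// Swizzle.h select which source lane feeds each destination component.
//
//	UVec4 v(1, 2, 3, 4);
//	UVec4 r = v.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(); // (4, 3, 2, 1)
//	UVec4 s = v.Swizzle<SWIZZLE_X, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y>(); // (1, 1, 2, 2)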
UVec4 UVec4::sZero()
{
#if defined(JPH_USE_SSE)
	return _mm_setzero_si128();
#elif defined(JPH_USE_NEON)
	return vdupq_n_u32(0);
#elif defined(JPH_USE_RVV)
	UVec4 v;
	const vuint32m1_t zero_vec = __riscv_vmv_v_x_u32m1(0, 4);
	__riscv_vse32_v_u32m1(v.mU32, zero_vec, 4);
	return v;
#else
	return UVec4(0, 0, 0, 0);
#endif
}
// Replicate int inV across all components
UVec4 UVec4::sReplicate(uint32 inV)
{
#if defined(JPH_USE_SSE)
	return _mm_set1_epi32(int(inV));
#elif defined(JPH_USE_NEON)
	return vdupq_n_u32(inV);
#elif defined(JPH_USE_RVV)
	UVec4 vec;
	const vuint32m1_t v = __riscv_vmv_v_x_u32m1(inV, 4);
	__riscv_vse32_v_u32m1(vec.mU32, v, 4);
	return vec;
#else
	return UVec4(inV, inV, inV, inV);
#endif
}
// Load 1 int from memory and place it in the X component, zeros Y, Z and W
UVec4 UVec4::sLoadInt(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_load_ss(reinterpret_cast<const float *>(inV)));
#elif defined(JPH_USE_NEON)
	return vsetq_lane_u32(*inV, vdupq_n_u32(0), 0);
#else
	return UVec4(*inV, 0, 0, 0);
#endif
}
// Load 4 ints from memory
UVec4 UVec4::sLoadInt4(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_loadu_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
	return vld1q_u32(inV);
#elif defined(JPH_USE_RVV)
	UVec4 vector;
	const vuint32m1_t v = __riscv_vle32_v_u32m1(inV, 4);
	__riscv_vse32_v_u32m1(vector.mU32, v, 4);
	return vector;
#else
	return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}
// Load 4 ints from memory, aligned to 16 bytes
UVec4 UVec4::sLoadInt4Aligned(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_load_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
	return vld1q_u32(inV);
#elif defined(JPH_USE_RVV)
	UVec4 vector;
	const vuint32m1_t v = __riscv_vle32_v_u32m1(inV, 4);
	__riscv_vse32_v_u32m1(vector.mU32, v, 4);
	return vector;
#else
	return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}
134template <const
int Scale>
138 return _mm_i32gather_epi32(
reinterpret_cast<const int *
>(inBase), inOffsets.
mValue, Scale);
139#elif defined(JPH_USE_RVV)
141 const vuint32m1_t offsets = __riscv_vle32_v_u32m1(inOffsets.
mU32, 4);
142 const vuint32m1_t scaled_offsets = __riscv_vmul_vx_u32m1(offsets, Scale, 4);
143 const vuint32m1_t gathered = __riscv_vluxei32_v_u32m1(inBase, scaled_offsets, 4);
144 __riscv_vse32_v_u32m1(v.
mU32, gathered, 4);
147 const uint8 *base =
reinterpret_cast<const uint8 *
>(inBase);
148 uint32 x = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetX() * Scale);
149 uint32 y = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetY() * Scale);
150 uint32 z = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetZ() * Scale);
151 uint32 w = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetW() * Scale);
152 return UVec4(x, y, z, w);
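// Illustrative usage (not part of the original file): offsets are scaled by the
// Scale template parameter in bytes, so Scale = sizeof(uint32) makes the offsets
// plain element indices into the base array.
//
//	uint32 data[] = { 10, 20, 30, 40 };
//	UVec4 idx(3, 2, 1, 0);
//	UVec4 r = UVec4::sGatherInt4<sizeof(uint32)>(data, idx); // (40, 30, 20, 10)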
// Return the minimum value of each of the components
UVec4 UVec4::sMin(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
	return _mm_min_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vminq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t min = __riscv_vminu_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(res.mU32, min, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = min(inV1.mU32[i], inV2.mU32[i]);
	return result;
#endif
}
// Return the maximum of each of the components
UVec4 UVec4::sMax(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
	return _mm_max_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmaxq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t max = __riscv_vmaxu_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(res.mU32, max, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = max(inV1.mU32[i], inV2.mU32[i]);
	return result;
#endif
}
// Equals (component wise)
UVec4 UVec4::sEquals(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_cmpeq_epi32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vceqq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vbool32_t mask = __riscv_vmseq_vv_u32m1_b32(v1, v2, 4);
	const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 4);
	const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 4);
	__riscv_vse32_v_u32m1(res.mU32, merged, 4);
	return res;
#else
	return UVec4(inV1.mU32[0] == inV2.mU32[0]? 0xffffffffu : 0,
				 inV1.mU32[1] == inV2.mU32[1]? 0xffffffffu : 0,
				 inV1.mU32[2] == inV2.mU32[2]? 0xffffffffu : 0,
				 inV1.mU32[3] == inV2.mU32[3]? 0xffffffffu : 0);
#endif
}
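// Illustrative usage (not part of the original file): each result lane is all ones
// (0xffffffff) where the inputs match and 0 where they differ, so the result can be
// fed directly into sSelect below as a control mask.
//
//	UVec4 mask = UVec4::sEquals(UVec4(1, 2, 3, 4), UVec4(1, 0, 3, 0)); // (0xffffffff, 0, 0xffffffff, 0)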
// Component wise select, returns inNotSet when the highest bit of inControl = 0 and inSet when it is 1
UVec4 UVec4::sSelect(UVec4Arg inNotSet, UVec4Arg inSet, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM)
	return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inNotSet.mValue), _mm_castsi128_ps(inSet.mValue), _mm_castsi128_ps(inControl.mValue)));
#elif defined(JPH_USE_SSE)
	__m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
	return _mm_castps_si128(_mm_or_ps(_mm_and_ps(is_set, _mm_castsi128_ps(inSet.mValue)), _mm_andnot_ps(is_set, _mm_castsi128_ps(inNotSet.mValue))));
#elif defined(JPH_USE_NEON)
	return vbslq_u32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 masked;
	const vuint32m1_t control = __riscv_vle32_v_u32m1(inControl.mU32, 4);
	const vuint32m1_t not_set = __riscv_vle32_v_u32m1(inNotSet.mU32, 4);
	const vuint32m1_t set = __riscv_vle32_v_u32m1(inSet.mU32, 4);
	// Build a mask from the highest bit of each control lane
	const vuint32m1_t r = __riscv_vand_vx_u32m1(control, 0x80000000u, 4);
	const vbool32_t rvv_mask = __riscv_vmsne_vx_u32m1_b32(r, 0x0, 4);
	const vuint32m1_t merged = __riscv_vmerge_vvm_u32m1(not_set, set, rvv_mask, 4);
	__riscv_vse32_v_u32m1(masked.mU32, merged, 4);
	return masked;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mU32[i] : inNotSet.mU32[i];
	return result;
#endif
}
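// Illustrative usage (not part of the original file): only the highest bit of each
// control lane matters, so any lane value >= 0x80000000 selects from inSet.
//
//	UVec4 a(1, 2, 3, 4), b(5, 6, 7, 8);
//	UVec4 control = UVec4::sEquals(a, UVec4(1, 0, 3, 0));
//	UVec4 r = UVec4::sSelect(b, a, control); // (1, 6, 3, 8)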
// Logical or (component wise)
UVec4 UVec4::sOr(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_or_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vorrq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 or_result;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t res = __riscv_vor_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(or_result.mU32, res, 4);
	return or_result;
#else
	return UVec4(inV1.mU32[0] | inV2.mU32[0], inV1.mU32[1] | inV2.mU32[1], inV1.mU32[2] | inV2.mU32[2], inV1.mU32[3] | inV2.mU32[3]);
#endif
}
// Logical xor (component wise)
UVec4 UVec4::sXor(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_xor_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return veorq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 xor_result;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t res = __riscv_vxor_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(xor_result.mU32, res, 4);
	return xor_result;
#else
	return UVec4(inV1.mU32[0] ^ inV2.mU32[0], inV1.mU32[1] ^ inV2.mU32[1], inV1.mU32[2] ^ inV2.mU32[2], inV1.mU32[3] ^ inV2.mU32[3]);
#endif
}
// Logical and (component wise)
UVec4 UVec4::sAnd(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_and_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vandq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 and_result;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t res = __riscv_vand_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(and_result.mU32, res, 4);
	return and_result;
#else
	return UVec4(inV1.mU32[0] & inV2.mU32[0], inV1.mU32[1] & inV2.mU32[1], inV1.mU32[2] & inV2.mU32[2], inV1.mU32[3] & inV2.mU32[3]);
#endif
}
// Logical not (component wise)
UVec4 UVec4::sNot(UVec4Arg inV1)
{
#if defined(JPH_USE_AVX512)
	return _mm_ternarylogic_epi32(inV1.mValue, inV1.mValue, inV1.mValue, 0b01010101);
#elif defined(JPH_USE_SSE)
	return sXor(inV1, sReplicate(0xffffffff));
#elif defined(JPH_USE_NEON)
	return vmvnq_u32(inV1.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 v;
	const vuint32m1_t src = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t rvv_not = __riscv_vxor_vx_u32m1(src, -1, 4);
	__riscv_vse32_v_u32m1(v.mU32, rvv_not, 4);
	return v;
#else
	return UVec4(~inV1.mU32[0], ~inV1.mU32[1], ~inV1.mU32[2], ~inV1.mU32[3]);
#endif
}
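// Illustrative usage (not part of the original file): the logical operations above
// combine naturally for bit masking, e.g. isolating or clearing the low byte.
//
//	UVec4 v = UVec4::sReplicate(0x12345678);
//	UVec4 low = UVec4::sAnd(v, UVec4::sReplicate(0xff));              // each lane 0x78
//	UVec4 rest = UVec4::sAnd(v, UVec4::sNot(UVec4::sReplicate(0xff))); // each lane 0x12345600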
// Component wise multiplication of two integer vectors (keeps only the low 32 bits of each product)
UVec4 UVec4::operator * (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_mullo_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_u32(mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t mul = __riscv_vmul_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(res.mU32, mul, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = mU32[i] * inV2.mU32[i];
	return result;
#endif
}
// Add two integer vectors (component wise)
UVec4 UVec4::operator + (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vaddq_u32(mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t rvv_add = __riscv_vadd_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(res.mU32, rvv_add, 4);
	return res;
#else
	return UVec4(mU32[0] + inV2.mU32[0], mU32[1] + inV2.mU32[1], mU32[2] + inV2.mU32[2], mU32[3] + inV2.mU32[3]);
#endif
}
// Add two integer vectors (component wise)
UVec4 &UVec4::operator += (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vaddq_u32(mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t rvv_add = __riscv_vadd_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(mU32, rvv_add, 4);
#else
	for (int i = 0; i < 4; ++i)
		mU32[i] += inV2.mU32[i];
#endif
	return *this;
}
// Subtract two integer vectors (component wise)
UVec4 UVec4::operator - (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vsubq_u32(mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t rvv_sub = __riscv_vsub_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(res.mU32, rvv_sub, 4);
	return res;
#else
	return UVec4(mU32[0] - inV2.mU32[0], mU32[1] - inV2.mU32[1], mU32[2] - inV2.mU32[2], mU32[3] - inV2.mU32[3]);
#endif
}
// Subtract two integer vectors (component wise)
UVec4 &UVec4::operator -= (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_sub_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vsubq_u32(mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t rvv_sub = __riscv_vsub_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(mU32, rvv_sub, 4);
#else
	for (int i = 0; i < 4; ++i)
		mU32[i] -= inV2.mU32[i];
#endif
	return *this;
}
// Replicate the X component to all components
UVec4 UVec4::SplatX() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_u32(mValue, 0);
#elif defined(JPH_USE_RVV)
	UVec4 vec;
	const vuint32m1_t splat = __riscv_vmv_v_x_u32m1(mU32[0], 4);
	__riscv_vse32_v_u32m1(vec.mU32, splat, 4);
	return vec;
#else
	return UVec4(mU32[0], mU32[0], mU32[0], mU32[0]);
#endif
}
// Replicate the Y component to all components
UVec4 UVec4::SplatY() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_u32(mValue, 1);
#elif defined(JPH_USE_RVV)
	UVec4 vec;
	const vuint32m1_t splat = __riscv_vmv_v_x_u32m1(mU32[1], 4);
	__riscv_vse32_v_u32m1(vec.mU32, splat, 4);
	return vec;
#else
	return UVec4(mU32[1], mU32[1], mU32[1], mU32[1]);
#endif
}
// Replicate the Z component to all components
UVec4 UVec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_u32(mValue, 2);
#elif defined(JPH_USE_RVV)
	UVec4 vec;
	const vuint32m1_t splat = __riscv_vmv_v_x_u32m1(mU32[2], 4);
	__riscv_vse32_v_u32m1(vec.mU32, splat, 4);
	return vec;
#else
	return UVec4(mU32[2], mU32[2], mU32[2], mU32[2]);
#endif
}
// Replicate the W component to all components
UVec4 UVec4::SplatW() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_u32(mValue, 3);
#elif defined(JPH_USE_RVV)
	UVec4 vec;
	const vuint32m1_t splat = __riscv_vmv_v_x_u32m1(mU32[3], 4);
	__riscv_vse32_v_u32m1(vec.mU32, splat, 4);
	return vec;
#else
	return UVec4(mU32[3], mU32[3], mU32[3], mU32[3]);
#endif
}
// Convert each component from an int to a float
Vec4 UVec4::ToFloat() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvtepi32_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vcvtq_f32_u32(mValue);
#elif defined(JPH_USE_RVV)
	Vec4 res;
	const vuint32m1_t v = __riscv_vle32_v_u32m1(mU32, 4);
	const vfloat32m1_t v_float = __riscv_vfcvt_f_xu_v_f32m1(v, 4);
	__riscv_vse32_v_f32m1(res.mF32, v_float, 4);
	return res;
#else
	return Vec4(float(mU32[0]), float(mU32[1]), float(mU32[2]), float(mU32[3]));
#endif
}
// Reinterpret UVec4 as a Vec4 (doesn't change the bits)
Vec4 UVec4::ReinterpretAsFloat() const
{
#if defined(JPH_USE_SSE)
	return _mm_castsi128_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(mValue);
#else
	return *reinterpret_cast<const Vec4 *>(this);
#endif
}
// Dot product, returns the dot product in X, Y, Z and W components
UVec4 UVec4::DotV(UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	__m128i mul = _mm_mullo_epi32(mValue, inV2.mValue);
	__m128i sum = _mm_add_epi32(mul, _mm_shuffle_epi32(mul, _MM_SHUFFLE(2, 3, 0, 1)));
	return _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(JPH_USE_NEON)
	uint32x4_t mul = vmulq_u32(mValue, inV2.mValue);
	return vdupq_n_u32(vaddvq_u32(mul));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0, 4);
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t mul = __riscv_vmul_vv_u32m1(v1, v2, 4);
	const vuint32m1_t sum = __riscv_vredsum_vs_u32m1_u32m1(mul, zeros, 4);
	const vuint32m1_t splat = __riscv_vrgather_vx_u32m1(sum, 0, 4);
	__riscv_vse32_v_u32m1(res.mU32, splat, 4);
	return res;
#else
	return sReplicate(mU32[0] * inV2.mU32[0] + mU32[1] * inV2.mU32[1] + mU32[2] * inV2.mU32[2] + mU32[3] * inV2.mU32[3]);
#endif
}
// Dot product
uint32 UVec4::Dot(UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	__m128i mul = _mm_mullo_epi32(mValue, inV2.mValue);
	__m128i sum = _mm_add_epi32(mul, _mm_shuffle_epi32(mul, _MM_SHUFFLE(2, 3, 0, 1)));
	return _mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2))));
#elif defined(JPH_USE_NEON)
	uint32x4_t mul = vmulq_u32(mValue, inV2.mValue);
	return vaddvq_u32(mul);
#elif defined(JPH_USE_RVV)
	const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0, 4);
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t mul = __riscv_vmul_vv_u32m1(v1, v2, 4);
	const vuint32m1_t sum = __riscv_vredsum_vs_u32m1_u32m1(mul, zeros, 4);
	return __riscv_vmv_x_s_u32m1_u32(sum);
#else
	return mU32[0] * inV2.mU32[0] + mU32[1] * inV2.mU32[1] + mU32[2] * inV2.mU32[2] + mU32[3] * inV2.mU32[3];
#endif
}
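// Illustrative usage (not part of the original file): Dot returns the scalar sum of
// the lane products; DotV broadcasts the same value to all four lanes.
//
//	uint32 d = UVec4(1, 2, 3, 4).Dot(UVec4(5, 6, 7, 8)); // 5 + 12 + 21 + 32 = 70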
// Store 4 ints to memory
void UVec4::StoreInt4(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_storeu_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
	vst1q_u32(outV, mValue);
#elif defined(JPH_USE_RVV)
	const vuint32m1_t v = __riscv_vle32_v_u32m1(mU32, 4);
	__riscv_vse32_v_u32m1(outV, v, 4);
#else
	for (int i = 0; i < 4; ++i)
		outV[i] = mU32[i];
#endif
}
// Store 4 ints to memory, aligned to 16 bytes
void UVec4::StoreInt4Aligned(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_store_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
	vst1q_u32(outV, mValue);
#elif defined(JPH_USE_RVV)
	const vuint32m1_t v = __riscv_vle32_v_u32m1(mU32, 4);
	__riscv_vse32_v_u32m1(outV, v, 4);
#else
	for (int i = 0; i < 4; ++i)
		outV[i] = mU32[i];
#endif
}
// Count the number of components that are true (true is when the highest bit of a component is set)
int UVec4::CountTrues() const
{
#if defined(JPH_USE_SSE)
	return CountBits(_mm_movemask_ps(_mm_castsi128_ps(mValue)));
#elif defined(JPH_USE_NEON)
	return vaddvq_u32(vshrq_n_u32(mValue, 31));
#elif defined(JPH_USE_RVV)
	const vuint32m1_t src = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t filter = __riscv_vand_vx_u32m1(src, 0x80000000, 4);
	const vbool32_t mask = __riscv_vmsne_vx_u32m1_b32(filter, 0, 4);
	return __riscv_vcpop_m_b32(mask, 4);
#else
	return (mU32[0] >> 31) + (mU32[1] >> 31) + (mU32[2] >> 31) + (mU32[3] >> 31);
#endif
}
// Store if X is true in bit 0, Y in bit 1, Z in bit 2 and W in bit 3 (true is when the highest bit of a component is set)
int UVec4::GetTrues() const
{
#if defined(JPH_USE_SSE)
	return _mm_movemask_ps(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
	int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
	return vaddvq_u32(vshlq_u32(vshrq_n_u32(mValue, 31), shift));
#elif defined(JPH_USE_RVV)
	const vuint32m1_t src = __riscv_vle32_v_u32m1(mU32, 4);
	const vbool32_t mask = __riscv_vmsgeu_vx_u32m1_b32(src, 0x80000000, 4);
	const vuint32m1_t as_int = __riscv_vreinterpret_v_b32_u32m1(mask);
	const uint32 result = __riscv_vmv_x_s_u32m1_u32(as_int) & 0xF;
	return int(result);
#else
	return (mU32[0] >> 31) | ((mU32[1] >> 31) << 1) | ((mU32[2] >> 31) << 2) | ((mU32[3] >> 31) << 3);
#endif
}
// Test if X, Y and Z components are true (true is when the highest bit of a component is set)
bool UVec4::TestAllXYZTrue() const
{
	return (GetTrues() & 0b111) == 0b111;
}
668template <const u
int Count>
671 static_assert(Count <= 31,
"Invalid shift");
673#if defined(JPH_USE_SSE)
674 return _mm_slli_epi32(
mValue, Count);
675#elif defined(JPH_USE_NEON)
676 return vshlq_n_u32(
mValue, Count);
677#elif defined(JPH_USE_RVV)
678 const vuint32m1_t v = __riscv_vle32_v_u32m1(
mU32, 4);
679 const vuint32m1_t shifted = __riscv_vsll_vx_u32m1(v, Count, 4);
682 __riscv_vse32_v_u32m1(vec.
mU32, shifted, 4);
689template <const u
int Count>
692 static_assert(Count <= 31,
"Invalid shift");
694#if defined(JPH_USE_SSE)
695 return _mm_srli_epi32(
mValue, Count);
696#elif defined(JPH_USE_NEON)
697 return vshrq_n_u32(
mValue, Count);
698#elif defined(JPH_USE_RVV)
699 const vuint32m1_t v = __riscv_vle32_v_u32m1(
mU32, 4);
700 const vuint32m1_t shifted = __riscv_vsrl_vx_u32m1(v, Count, 4);
703 __riscv_vse32_v_u32m1(vec.
mU32, shifted, 4);
710template <const u
int Count>
713 static_assert(Count <= 31,
"Invalid shift");
715#if defined(JPH_USE_SSE)
716 return _mm_srai_epi32(
mValue, Count);
717#elif defined(JPH_USE_NEON)
718 return vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(
mValue), Count));
719#elif defined(JPH_USE_RVV)
720 const vint32m1_t v = __riscv_vle32_v_i32m1(
reinterpret_cast<const int32 *
>(
mU32), 4);
721 const vint32m1_t shifted = __riscv_vsra_vx_i32m1(v, Count, 4);
724 __riscv_vse32_v_i32m1(
reinterpret_cast<int32 *
>(vec.
mU32), shifted, 4);
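// Illustrative note (not part of the original file): the arithmetic shift copies the
// highest bit into the vacated positions, unlike LogicalShiftRight which shifts in zeros.
//
//	UVec4 v = UVec4::sReplicate(0x80000000);
//	UVec4 a = v.ArithmeticShiftRight<4>(); // each lane 0xf8000000
//	UVec4 l = v.LogicalShiftRight<4>();    // each lane 0x08000000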
// Takes the lower 4 16 bit values and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Uint16Lo() const
{
#if defined(JPH_USE_SSE)
	return _mm_unpacklo_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
	uint16x4_t value = vget_low_u16(vreinterpretq_u16_u32(mValue));
	uint16x4_t zero = vdup_n_u16(0);
	return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint16mf2_t v = __riscv_vle16_v_u16mf2(reinterpret_cast<const uint16 *>(mU32), 4);
	const vuint32m1_t zext = __riscv_vzext_vf2_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	return UVec4(mU32[0] & 0xffff,
				 (mU32[0] >> 16) & 0xffff,
				 mU32[1] & 0xffff,
				 (mU32[1] >> 16) & 0xffff);
#endif
}
// Takes the upper 4 16 bit values and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Uint16Hi() const
{
#if defined(JPH_USE_SSE)
	return _mm_unpackhi_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
	uint16x4_t value = vget_high_u16(vreinterpretq_u16_u32(mValue));
	uint16x4_t zero = vdup_n_u16(0);
	return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint16mf2_t v = __riscv_vle16_v_u16mf2(reinterpret_cast<const uint16 *>(&mU32[2]), 4);
	const vuint32m1_t zext = __riscv_vzext_vf2_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	return UVec4(mU32[2] & 0xffff,
				 (mU32[2] >> 16) & 0xffff,
				 mU32[3] & 0xffff,
				 (mU32[3] >> 16) & 0xffff);
#endif
}
// Takes byte 0 .. 3 and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Byte0() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff03), int(0xffffff02), int(0xffffff01), int(0xffffff00)));
#elif defined(JPH_USE_NEON)
	uint8x16_t idx = JPH_NEON_UINT8x16(0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f);
	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint8mf4_t v = __riscv_vle8_v_u8mf4(reinterpret_cast<const uint8 *>(mU32), 4);
	const vuint32m1_t zext = __riscv_vzext_vf4_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = (mU32[0] >> (i * 8)) & 0xff;
	return result;
#endif
}
// Takes byte 4 .. 7 and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Byte4() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff07), int(0xffffff06), int(0xffffff05), int(0xffffff04)));
#elif defined(JPH_USE_NEON)
	uint8x16_t idx = JPH_NEON_UINT8x16(0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f);
	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint8mf4_t v = __riscv_vle8_v_u8mf4(reinterpret_cast<const uint8 *>(&mU32[1]), 4);
	const vuint32m1_t zext = __riscv_vzext_vf4_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = (mU32[1] >> (i * 8)) & 0xff;
	return result;
#endif
}
// Takes byte 8 .. 11 and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Byte8() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0b), int(0xffffff0a), int(0xffffff09), int(0xffffff08)));
#elif defined(JPH_USE_NEON)
	uint8x16_t idx = JPH_NEON_UINT8x16(0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f);
	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint8mf4_t v = __riscv_vle8_v_u8mf4(reinterpret_cast<const uint8 *>(&mU32[2]), 4);
	const vuint32m1_t zext = __riscv_vzext_vf4_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = (mU32[2] >> (i * 8)) & 0xff;
	return result;
#endif
}
// Takes byte 12 .. 15 and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Byte12() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0f), int(0xffffff0e), int(0xffffff0d), int(0xffffff0c)));
#elif defined(JPH_USE_NEON)
	uint8x16_t idx = JPH_NEON_UINT8x16(0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f);
	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint8mf4_t v = __riscv_vle8_v_u8mf4(reinterpret_cast<const uint8 *>(&mU32[3]), 4);
	const vuint32m1_t zext = __riscv_vzext_vf4_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = (mU32[3] >> (i * 8)) & 0xff;
	return result;
#endif
}
// Shift vector components by 4 - inCount to the left, so if inCount = 1 the resulting vector is (W, 0, 0, 0)
UVec4 UVec4::ShiftComponents4Minus(int inCount) const
{
#if defined(JPH_USE_SSE4_1) || defined(JPH_USE_NEON)
	alignas(UVec4) static constexpr uint32 sFourMinusXShuffle[5][4] =
	{
		{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
		{ 0x0f0e0d0c, 0xffffffff, 0xffffffff, 0xffffffff },
		{ 0x0b0a0908, 0x0f0e0d0c, 0xffffffff, 0xffffffff },
		{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0xffffffff },
		{ 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c }
	};
#endif

#if defined(JPH_USE_SSE4_1)
	return _mm_shuffle_epi8(mValue, *reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
#elif defined(JPH_USE_NEON)
	uint8x16_t idx = vreinterpretq_u8_u32(*reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#elif defined(JPH_USE_RVV)
	UVec4 res = sZero(); // lanes not written below must stay zero
	const uint32 *start_ptr = mU32 + (4 - inCount);
	const vuint32m1_t v = __riscv_vle32_v_u32m1(start_ptr, inCount);
	__riscv_vse32_v_u32m1(res.mU32, v, inCount);
	return res;
#else
	UVec4 result = sZero();
	for (int i = 0; i < inCount; i++)
		result.mU32[i] = mU32[i + 4 - inCount];
	return result;
#endif
}
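// Illustrative usage (not part of the original file): the result keeps the top
// inCount components, moved to the front, with zeros behind them.
//
//	UVec4 v(1, 2, 3, 4);
//	UVec4 a = v.ShiftComponents4Minus(1); // (4, 0, 0, 0)
//	UVec4 b = v.ShiftComponents4Minus(4); // (1, 2, 3, 4)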
JPH_NAMESPACE_END