JPH_NAMESPACE_BEGIN

UVec4::UVec4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_set_epi32(int(inW), int(inZ), int(inY), int(inX));
#elif defined(JPH_USE_NEON)
	uint32x2_t xy = vcreate_u32(static_cast<uint64>(inX) | (static_cast<uint64>(inY) << 32));
	uint32x2_t zw = vcreate_u32(static_cast<uint64>(inZ) | (static_cast<uint64>(inW) << 32));
	mValue = vcombine_u32(xy, zw);
#elif defined(JPH_USE_RVV)
	vuint32m1_t v = __riscv_vmv_v_x_u32m1(inW, 4);
	v = __riscv_vslide1up_vx_u32m1(v, inZ, 4);
	v = __riscv_vslide1up_vx_u32m1(v, inY, 4);
	v = __riscv_vslide1up_vx_u32m1(v, inX, 4);
	__riscv_vse32_v_u32m1(mU32, v, 4);
#else
	mU32[0] = inX;
	mU32[1] = inY;
	mU32[2] = inZ;
	mU32[3] = inW;
#endif
}
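// Illustrative usage (not part of the original file): lane 0 is X on all backends;
// note that _mm_set_epi32 above takes its arguments in W..X order.
//
//	UVec4 v(1, 2, 3, 4);
//	uint32 x = v.GetX(); // 1
//	uint32 w = v.GetW(); // 4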
34template<u
int32 SwizzleX, u
int32 SwizzleY, u
int32 SwizzleZ, u
int32 SwizzleW>
37 static_assert(SwizzleX <= 3,
"SwizzleX template parameter out of range");
38 static_assert(SwizzleY <= 3,
"SwizzleY template parameter out of range");
39 static_assert(SwizzleZ <= 3,
"SwizzleZ template parameter out of range");
40 static_assert(SwizzleW <= 3,
"SwizzleW template parameter out of range");
42#if defined(JPH_USE_SSE)
43 return _mm_shuffle_epi32(
mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
44#elif defined(JPH_USE_NEON)
45 return JPH_NEON_SHUFFLE_U32x4(
mValue,
mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
46#elif defined(JPH_USE_RVV)
48 const vuint32m1_t data = __riscv_vle32_v_u32m1(
mU32, 4);
49 const uint32 stored_indices[4] = { SwizzleX, SwizzleY, SwizzleZ, SwizzleW };
50 const vuint32m1_t index = __riscv_vle32_v_u32m1(stored_indices, 4);
51 const vuint32m1_t swizzled = __riscv_vrgather_vv_u32m1(data, index, 4);
52 __riscv_vse32_v_u32m1(v.
mU32, swizzled, 4);
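// Illustrative usage (not part of the original file): the SWIZZLE_* constants from
// Swizzle.h select which source lane feeds each destination component.
//
//	UVec4 v(1, 2, 3, 4);
//	UVec4 r = v.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(); // (4, 3, 2, 1)
//	UVec4 s = v.Swizzle<SWIZZLE_X, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y>(); // (1, 1, 2, 2)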
UVec4 UVec4::sZero()
{
#if defined(JPH_USE_SSE)
	return _mm_setzero_si128();
#elif defined(JPH_USE_NEON)
	return vdupq_n_u32(0);
#elif defined(JPH_USE_RVV)
	UVec4 v;
	const vuint32m1_t zero_vec = __riscv_vmv_v_x_u32m1(0, 4);
	__riscv_vse32_v_u32m1(v.mU32, zero_vec, 4);
	return v;
#else
	return UVec4(0, 0, 0, 0);
#endif
}
// Replicate int inV across all components
UVec4 UVec4::sReplicate(uint32 inV)
{
#if defined(JPH_USE_SSE)
	return _mm_set1_epi32(int(inV));
#elif defined(JPH_USE_NEON)
	return vdupq_n_u32(inV);
#elif defined(JPH_USE_RVV)
	UVec4 vec;
	const vuint32m1_t v = __riscv_vmv_v_x_u32m1(inV, 4);
	__riscv_vse32_v_u32m1(vec.mU32, v, 4);
	return vec;
#else
	return UVec4(inV, inV, inV, inV);
#endif
}
// Load 1 int from memory and place it in the X component, zeros Y, Z and W
UVec4 UVec4::sLoadInt(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_load_ss(reinterpret_cast<const float *>(inV)));
#elif defined(JPH_USE_NEON)
	return vsetq_lane_u32(*inV, vdupq_n_u32(0), 0);
#else
	return UVec4(*inV, 0, 0, 0);
#endif
}
// Load 4 ints from memory
UVec4 UVec4::sLoadInt4(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_loadu_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
	return vld1q_u32(inV);
#elif defined(JPH_USE_RVV)
	UVec4 vector;
	const vuint32m1_t v = __riscv_vle32_v_u32m1(inV, 4);
	__riscv_vse32_v_u32m1(vector.mU32, v, 4);
	return vector;
#else
	return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}
// Load 4 ints from memory, aligned to 16 bytes
UVec4 UVec4::sLoadInt4Aligned(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_load_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
	return vld1q_u32(inV);
#elif defined(JPH_USE_RVV)
	UVec4 vector;
	const vuint32m1_t v = __riscv_vle32_v_u32m1(inV, 4);
	__riscv_vse32_v_u32m1(vector.mU32, v, 4);
	return vector;
#else
	return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}
134template <const
int Scale>
138 return _mm_i32gather_epi32(
reinterpret_cast<const int *
>(inBase), inOffsets.
mValue, Scale);
139#elif defined(JPH_USE_RVV)
141 const vuint32m1_t offsets = __riscv_vle32_v_u32m1(inOffsets.
mU32, 4);
142 const vuint32m1_t scaled_offsets = __riscv_vmul_vx_u32m1(offsets, Scale, 4);
143 const vuint32m1_t gathered = __riscv_vluxei32_v_u32m1(inBase, scaled_offsets, 4);
144 __riscv_vse32_v_u32m1(v.
mU32, gathered, 4);
147 const uint8 *base =
reinterpret_cast<const uint8 *
>(inBase);
148 uint32 x = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetX() * Scale);
149 uint32 y = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetY() * Scale);
150 uint32 z = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetZ() * Scale);
151 uint32 w = *
reinterpret_cast<const uint32 *
>(base + inOffsets.
GetW() * Scale);
152 return UVec4(x, y, z, w);
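// Illustrative usage (not part of the original file): offsets are scaled by the
// Scale template parameter in bytes, so Scale = sizeof(uint32) makes the offsets
// plain element indices into the base array.
//
//	uint32 data[] = { 10, 20, 30, 40 };
//	UVec4 idx(3, 2, 1, 0);
//	UVec4 r = UVec4::sGatherInt4<sizeof(uint32)>(data, idx); // (40, 30, 20, 10)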
// Return the minimum value of each of the components
UVec4 UVec4::sMin(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
	return _mm_min_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vminq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t min = __riscv_vminu_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(res.mU32, min, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = min(inV1.mU32[i], inV2.mU32[i]);
	return result;
#endif
}
// Return the maximum of each of the components
UVec4 UVec4::sMax(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
	return _mm_max_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmaxq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t max = __riscv_vmaxu_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(res.mU32, max, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = max(inV1.mU32[i], inV2.mU32[i]);
	return result;
#endif
}
// Equals (component wise)
UVec4 UVec4::sEquals(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_cmpeq_epi32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vceqq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vbool32_t mask = __riscv_vmseq_vv_u32m1_b32(v1, v2, 4);
	const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 4);
	const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 4);
	__riscv_vse32_v_u32m1(res.mU32, merged, 4);
	return res;
#else
	return UVec4(inV1.mU32[0] == inV2.mU32[0]? 0xffffffffu : 0,
				 inV1.mU32[1] == inV2.mU32[1]? 0xffffffffu : 0,
				 inV1.mU32[2] == inV2.mU32[2]? 0xffffffffu : 0,
				 inV1.mU32[3] == inV2.mU32[3]? 0xffffffffu : 0);
#endif
}
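// Illustrative usage (not part of the original file): each result lane is all ones
// (0xffffffff) where the inputs match and 0 where they differ, so the result can be
// fed directly into sSelect below as a control mask.
//
//	UVec4 mask = UVec4::sEquals(UVec4(1, 2, 3, 4), UVec4(1, 0, 3, 0)); // (0xffffffff, 0, 0xffffffff, 0)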
// Component wise select, returns inNotSet when the highest bit of inControl = 0 and inSet when it is 1
UVec4 UVec4::sSelect(UVec4Arg inNotSet, UVec4Arg inSet, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM)
	return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inNotSet.mValue), _mm_castsi128_ps(inSet.mValue), _mm_castsi128_ps(inControl.mValue)));
#elif defined(JPH_USE_SSE)
	__m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
	return _mm_castps_si128(_mm_or_ps(_mm_and_ps(is_set, _mm_castsi128_ps(inSet.mValue)), _mm_andnot_ps(is_set, _mm_castsi128_ps(inNotSet.mValue))));
#elif defined(JPH_USE_NEON)
	return vbslq_u32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 masked;
	const vuint32m1_t control = __riscv_vle32_v_u32m1(inControl.mU32, 4);
	const vuint32m1_t not_set = __riscv_vle32_v_u32m1(inNotSet.mU32, 4);
	const vuint32m1_t set = __riscv_vle32_v_u32m1(inSet.mU32, 4);
	// Build a mask from the highest bit of each control lane
	const vuint32m1_t r = __riscv_vand_vx_u32m1(control, 0x80000000u, 4);
	const vbool32_t rvv_mask = __riscv_vmsne_vx_u32m1_b32(r, 0x0, 4);
	const vuint32m1_t merged = __riscv_vmerge_vvm_u32m1(not_set, set, rvv_mask, 4);
	__riscv_vse32_v_u32m1(masked.mU32, merged, 4);
	return masked;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mU32[i] : inNotSet.mU32[i];
	return result;
#endif
}
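// Illustrative usage (not part of the original file): only the highest bit of each
// control lane matters, so any lane value >= 0x80000000 selects from inSet.
//
//	UVec4 a(1, 2, 3, 4), b(5, 6, 7, 8);
//	UVec4 control = UVec4::sEquals(a, UVec4(1, 0, 3, 0));
//	UVec4 r = UVec4::sSelect(b, a, control); // (1, 6, 3, 8)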
// Logical or (component wise)
UVec4 UVec4::sOr(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_or_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vorrq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 or_result;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t res = __riscv_vor_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(or_result.mU32, res, 4);
	return or_result;
#else
	return UVec4(inV1.mU32[0] | inV2.mU32[0], inV1.mU32[1] | inV2.mU32[1], inV1.mU32[2] | inV2.mU32[2], inV1.mU32[3] | inV2.mU32[3]);
#endif
}
// Logical xor (component wise)
UVec4 UVec4::sXor(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_xor_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return veorq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 xor_result;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t res = __riscv_vxor_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(xor_result.mU32, res, 4);
	return xor_result;
#else
	return UVec4(inV1.mU32[0] ^ inV2.mU32[0], inV1.mU32[1] ^ inV2.mU32[1], inV1.mU32[2] ^ inV2.mU32[2], inV1.mU32[3] ^ inV2.mU32[3]);
#endif
}
// Logical and (component wise)
UVec4 UVec4::sAnd(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_and_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vandq_u32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 and_result;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t res = __riscv_vand_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(and_result.mU32, res, 4);
	return and_result;
#else
	return UVec4(inV1.mU32[0] & inV2.mU32[0], inV1.mU32[1] & inV2.mU32[1], inV1.mU32[2] & inV2.mU32[2], inV1.mU32[3] & inV2.mU32[3]);
#endif
}
// Logical not (component wise)
UVec4 UVec4::sNot(UVec4Arg inV1)
{
#if defined(JPH_USE_AVX512)
	return _mm_ternarylogic_epi32(inV1.mValue, inV1.mValue, inV1.mValue, 0b01010101);
#elif defined(JPH_USE_SSE)
	return sXor(inV1, sReplicate(0xffffffff));
#elif defined(JPH_USE_NEON)
	return vmvnq_u32(inV1.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 v;
	const vuint32m1_t src = __riscv_vle32_v_u32m1(inV1.mU32, 4);
	const vuint32m1_t rvv_not = __riscv_vxor_vx_u32m1(src, -1, 4);
	__riscv_vse32_v_u32m1(v.mU32, rvv_not, 4);
	return v;
#else
	return UVec4(~inV1.mU32[0], ~inV1.mU32[1], ~inV1.mU32[2], ~inV1.mU32[3]);
#endif
}
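// Illustrative usage (not part of the original file): the logical operations above
// combine naturally for bit masking, e.g. isolating or clearing the low byte.
//
//	UVec4 v = UVec4::sReplicate(0x12345678);
//	UVec4 low = UVec4::sAnd(v, UVec4::sReplicate(0xff));              // each lane 0x78
//	UVec4 rest = UVec4::sAnd(v, UVec4::sNot(UVec4::sReplicate(0xff))); // each lane 0x12345600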
// Component wise multiplication of two integer vectors (keeps only the low 32 bits of each product)
UVec4 UVec4::operator * (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_mullo_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_u32(mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t mul = __riscv_vmul_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(res.mU32, mul, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = mU32[i] * inV2.mU32[i];
	return result;
#endif
}
// Add two integer vectors (component wise)
UVec4 UVec4::operator + (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vaddq_u32(mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t rvv_add = __riscv_vadd_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(res.mU32, rvv_add, 4);
	return res;
#else
	return UVec4(mU32[0] + inV2.mU32[0], mU32[1] + inV2.mU32[1], mU32[2] + inV2.mU32[2], mU32[3] + inV2.mU32[3]);
#endif
}
// Add two integer vectors (component wise)
UVec4 &UVec4::operator += (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vaddq_u32(mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t rvv_add = __riscv_vadd_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(mU32, rvv_add, 4);
#else
	for (int i = 0; i < 4; ++i)
		mU32[i] += inV2.mU32[i];
#endif
	return *this;
}
// Subtract two integer vectors (component wise)
UVec4 UVec4::operator - (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vsubq_u32(mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t rvv_sub = __riscv_vsub_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(res.mU32, rvv_sub, 4);
	return res;
#else
	return UVec4(mU32[0] - inV2.mU32[0], mU32[1] - inV2.mU32[1], mU32[2] - inV2.mU32[2], mU32[3] - inV2.mU32[3]);
#endif
}
// Subtract two integer vectors (component wise)
UVec4 &UVec4::operator -= (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_sub_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vsubq_u32(mValue, inV2.mValue);
#elif defined(JPH_USE_RVV)
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t rvv_sub = __riscv_vsub_vv_u32m1(v1, v2, 4);
	__riscv_vse32_v_u32m1(mU32, rvv_sub, 4);
#else
	for (int i = 0; i < 4; ++i)
		mU32[i] -= inV2.mU32[i];
#endif
	return *this;
}
// Replicate the X component to all components
UVec4 UVec4::SplatX() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_u32(mValue, 0);
#elif defined(JPH_USE_RVV)
	UVec4 vec;
	const vuint32m1_t splat = __riscv_vmv_v_x_u32m1(mU32[0], 4);
	__riscv_vse32_v_u32m1(vec.mU32, splat, 4);
	return vec;
#else
	return UVec4(mU32[0], mU32[0], mU32[0], mU32[0]);
#endif
}
// Replicate the Y component to all components
UVec4 UVec4::SplatY() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_u32(mValue, 1);
#elif defined(JPH_USE_RVV)
	UVec4 vec;
	const vuint32m1_t splat = __riscv_vmv_v_x_u32m1(mU32[1], 4);
	__riscv_vse32_v_u32m1(vec.mU32, splat, 4);
	return vec;
#else
	return UVec4(mU32[1], mU32[1], mU32[1], mU32[1]);
#endif
}
// Replicate the Z component to all components
UVec4 UVec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_u32(mValue, 2);
#elif defined(JPH_USE_RVV)
	UVec4 vec;
	const vuint32m1_t splat = __riscv_vmv_v_x_u32m1(mU32[2], 4);
	__riscv_vse32_v_u32m1(vec.mU32, splat, 4);
	return vec;
#else
	return UVec4(mU32[2], mU32[2], mU32[2], mU32[2]);
#endif
}
// Replicate the W component to all components
UVec4 UVec4::SplatW() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_u32(mValue, 3);
#elif defined(JPH_USE_RVV)
	UVec4 vec;
	const vuint32m1_t splat = __riscv_vmv_v_x_u32m1(mU32[3], 4);
	__riscv_vse32_v_u32m1(vec.mU32, splat, 4);
	return vec;
#else
	return UVec4(mU32[3], mU32[3], mU32[3], mU32[3]);
#endif
}
// Convert each component from an int to a float
Vec4 UVec4::ToFloat() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvtepi32_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vcvtq_f32_u32(mValue);
#elif defined(JPH_USE_RVV)
	Vec4 res;
	const vuint32m1_t v = __riscv_vle32_v_u32m1(mU32, 4);
	const vfloat32m1_t v_float = __riscv_vfcvt_f_xu_v_f32m1(v, 4);
	__riscv_vse32_v_f32m1(res.mF32, v_float, 4);
	return res;
#else
	return Vec4(float(mU32[0]), float(mU32[1]), float(mU32[2]), float(mU32[3]));
#endif
}
// Reinterpret UVec4 as a Vec4 (doesn't change the bits)
Vec4 UVec4::ReinterpretAsFloat() const
{
#if defined(JPH_USE_SSE)
	return _mm_castsi128_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(mValue);
#else
	return *reinterpret_cast<const Vec4 *>(this);
#endif
}
// Dot product, returns the dot product in X, Y, Z and W components
UVec4 UVec4::DotV(UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	__m128i mul = _mm_mullo_epi32(mValue, inV2.mValue);
	__m128i sum = _mm_add_epi32(mul, _mm_shuffle_epi32(mul, _MM_SHUFFLE(2, 3, 0, 1)));
	return _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(JPH_USE_NEON)
	uint32x4_t mul = vmulq_u32(mValue, inV2.mValue);
	return vdupq_n_u32(vaddvq_u32(mul));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0, 4);
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t mul = __riscv_vmul_vv_u32m1(v1, v2, 4);
	const vuint32m1_t sum = __riscv_vredsum_vs_u32m1_u32m1(mul, zeros, 4);
	const vuint32m1_t splat = __riscv_vrgather_vx_u32m1(sum, 0, 4);
	__riscv_vse32_v_u32m1(res.mU32, splat, 4);
	return res;
#else
	return sReplicate(mU32[0] * inV2.mU32[0] + mU32[1] * inV2.mU32[1] + mU32[2] * inV2.mU32[2] + mU32[3] * inV2.mU32[3]);
#endif
}
// Dot product
uint32 UVec4::Dot(UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	__m128i mul = _mm_mullo_epi32(mValue, inV2.mValue);
	__m128i sum = _mm_add_epi32(mul, _mm_shuffle_epi32(mul, _MM_SHUFFLE(2, 3, 0, 1)));
	return _mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2))));
#elif defined(JPH_USE_NEON)
	uint32x4_t mul = vmulq_u32(mValue, inV2.mValue);
	return vaddvq_u32(mul);
#elif defined(JPH_USE_RVV)
	const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0, 4);
	const vuint32m1_t v1 = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t v2 = __riscv_vle32_v_u32m1(inV2.mU32, 4);
	const vuint32m1_t mul = __riscv_vmul_vv_u32m1(v1, v2, 4);
	const vuint32m1_t sum = __riscv_vredsum_vs_u32m1_u32m1(mul, zeros, 4);
	return __riscv_vmv_x_s_u32m1_u32(sum);
#else
	return mU32[0] * inV2.mU32[0] + mU32[1] * inV2.mU32[1] + mU32[2] * inV2.mU32[2] + mU32[3] * inV2.mU32[3];
#endif
}
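// Illustrative usage (not part of the original file): Dot returns the scalar sum of
// the lane products; DotV broadcasts the same value to all four lanes.
//
//	uint32 d = UVec4(1, 2, 3, 4).Dot(UVec4(5, 6, 7, 8)); // 5 + 12 + 21 + 32 = 70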
// Store 4 ints to memory
void UVec4::StoreInt4(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_storeu_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
	vst1q_u32(outV, mValue);
#elif defined(JPH_USE_RVV)
	const vuint32m1_t v = __riscv_vle32_v_u32m1(mU32, 4);
	__riscv_vse32_v_u32m1(outV, v, 4);
#else
	for (int i = 0; i < 4; ++i)
		outV[i] = mU32[i];
#endif
}
// Store 4 ints to memory, aligned to 16 bytes
void UVec4::StoreInt4Aligned(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_store_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
	vst1q_u32(outV, mValue);
#elif defined(JPH_USE_RVV)
	const vuint32m1_t v = __riscv_vle32_v_u32m1(mU32, 4);
	__riscv_vse32_v_u32m1(outV, v, 4);
#else
	for (int i = 0; i < 4; ++i)
		outV[i] = mU32[i];
#endif
}
// Count the number of components that are true (true is when the highest bit of a component is set)
int UVec4::CountTrues() const
{
#if defined(JPH_USE_SSE)
	return CountBits(_mm_movemask_ps(_mm_castsi128_ps(mValue)));
#elif defined(JPH_USE_NEON)
	return vaddvq_u32(vshrq_n_u32(mValue, 31));
#elif defined(JPH_USE_RVV)
	const vuint32m1_t src = __riscv_vle32_v_u32m1(mU32, 4);
	const vuint32m1_t filter = __riscv_vand_vx_u32m1(src, 0x80000000, 4);
	const vbool32_t mask = __riscv_vmsne_vx_u32m1_b32(filter, 0, 4);
	return __riscv_vcpop_m_b32(mask, 4);
#else
	return (mU32[0] >> 31) + (mU32[1] >> 31) + (mU32[2] >> 31) + (mU32[3] >> 31);
#endif
}
// Store if X is true in bit 0, Y in bit 1, Z in bit 2 and W in bit 3 (true is when the highest bit of a component is set)
int UVec4::GetTrues() const
{
#if defined(JPH_USE_SSE)
	return _mm_movemask_ps(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
	int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
	return vaddvq_u32(vshlq_u32(vshrq_n_u32(mValue, 31), shift));
#elif defined(JPH_USE_RVV)
	const vuint32m1_t src = __riscv_vle32_v_u32m1(mU32, 4);
	const vbool32_t mask = __riscv_vmsgeu_vx_u32m1_b32(src, 0x80000000, 4);
	const vuint32m1_t as_int = __riscv_vreinterpret_v_b32_u32m1(mask);
	const uint32 result = __riscv_vmv_x_s_u32m1_u32(as_int) & 0xF;
	return int(result);
#else
	return (mU32[0] >> 31) | ((mU32[1] >> 31) << 1) | ((mU32[2] >> 31) << 2) | ((mU32[3] >> 31) << 3);
#endif
}
// Test if X, Y and Z components are true (true is when the highest bit of a component is set)
bool UVec4::TestAllXYZTrue() const
{
	return (GetTrues() & 0b111) == 0b111;
}
668template <const u
int Count>
671 static_assert(Count <= 31,
"Invalid shift");
673#if defined(JPH_USE_SSE)
674 return _mm_slli_epi32(
mValue, Count);
675#elif defined(JPH_USE_NEON)
676 return vshlq_n_u32(
mValue, Count);
677#elif defined(JPH_USE_RVV)
678 const vuint32m1_t v = __riscv_vle32_v_u32m1(
mU32, 4);
679 const vuint32m1_t shifted = __riscv_vsll_vx_u32m1(v, Count, 4);
682 __riscv_vse32_v_u32m1(vec.
mU32, shifted, 4);
689template <const u
int Count>
692 static_assert(Count <= 31,
"Invalid shift");
694#if defined(JPH_USE_SSE)
695 return _mm_srli_epi32(
mValue, Count);
696#elif defined(JPH_USE_NEON)
697 return vshrq_n_u32(
mValue, Count);
698#elif defined(JPH_USE_RVV)
699 const vuint32m1_t v = __riscv_vle32_v_u32m1(
mU32, 4);
700 const vuint32m1_t shifted = __riscv_vsrl_vx_u32m1(v, Count, 4);
703 __riscv_vse32_v_u32m1(vec.
mU32, shifted, 4);
710template <const u
int Count>
713 static_assert(Count <= 31,
"Invalid shift");
715#if defined(JPH_USE_SSE)
716 return _mm_srai_epi32(
mValue, Count);
717#elif defined(JPH_USE_NEON)
718 return vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(
mValue), Count));
719#elif defined(JPH_USE_RVV)
720 const vint32m1_t v = __riscv_vle32_v_i32m1(
reinterpret_cast<const int32 *
>(
mU32), 4);
721 const vint32m1_t shifted = __riscv_vsra_vx_i32m1(v, Count, 4);
724 __riscv_vse32_v_i32m1(
reinterpret_cast<int32 *
>(vec.
mU32), shifted, 4);
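// Illustrative note (not part of the original file): the arithmetic shift copies the
// highest bit into the vacated positions, unlike LogicalShiftRight which shifts in zeros.
//
//	UVec4 v = UVec4::sReplicate(0x80000000);
//	UVec4 a = v.ArithmeticShiftRight<4>(); // each lane 0xf8000000
//	UVec4 l = v.LogicalShiftRight<4>();    // each lane 0x08000000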
// Takes the lower 4 16 bit values and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Uint16Lo() const
{
#if defined(JPH_USE_SSE)
	return _mm_unpacklo_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
	uint16x4_t value = vget_low_u16(vreinterpretq_u16_u32(mValue));
	uint16x4_t zero = vdup_n_u16(0);
	return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint16mf2_t v = __riscv_vle16_v_u16mf2(reinterpret_cast<const uint16 *>(mU32), 4);
	const vuint32m1_t zext = __riscv_vzext_vf2_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	return UVec4(mU32[0] & 0xffff,
				 (mU32[0] >> 16) & 0xffff,
				 mU32[1] & 0xffff,
				 (mU32[1] >> 16) & 0xffff);
#endif
}
// Takes the upper 4 16 bit values and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Uint16Hi() const
{
#if defined(JPH_USE_SSE)
	return _mm_unpackhi_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
	uint16x4_t value = vget_high_u16(vreinterpretq_u16_u32(mValue));
	uint16x4_t zero = vdup_n_u16(0);
	return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint16mf2_t v = __riscv_vle16_v_u16mf2(reinterpret_cast<const uint16 *>(&mU32[2]), 4);
	const vuint32m1_t zext = __riscv_vzext_vf2_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	return UVec4(mU32[2] & 0xffff,
				 (mU32[2] >> 16) & 0xffff,
				 mU32[3] & 0xffff,
				 (mU32[3] >> 16) & 0xffff);
#endif
}
// Takes byte 0 .. 3 and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Byte0() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff03), int(0xffffff02), int(0xffffff01), int(0xffffff00)));
#elif defined(JPH_USE_NEON)
	uint8x16_t idx = JPH_NEON_UINT8x16(0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f);
	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint8mf4_t v = __riscv_vle8_v_u8mf4(reinterpret_cast<const uint8 *>(mU32), 4);
	const vuint32m1_t zext = __riscv_vzext_vf4_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = (mU32[0] >> (i * 8)) & 0xff;
	return result;
#endif
}
// Takes byte 4 .. 7 and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Byte4() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff07), int(0xffffff06), int(0xffffff05), int(0xffffff04)));
#elif defined(JPH_USE_NEON)
	uint8x16_t idx = JPH_NEON_UINT8x16(0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f);
	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint8mf4_t v = __riscv_vle8_v_u8mf4(reinterpret_cast<const uint8 *>(&mU32[1]), 4);
	const vuint32m1_t zext = __riscv_vzext_vf4_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = (mU32[1] >> (i * 8)) & 0xff;
	return result;
#endif
}
// Takes byte 8 .. 11 and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Byte8() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0b), int(0xffffff0a), int(0xffffff09), int(0xffffff08)));
#elif defined(JPH_USE_NEON)
	uint8x16_t idx = JPH_NEON_UINT8x16(0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f);
	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint8mf4_t v = __riscv_vle8_v_u8mf4(reinterpret_cast<const uint8 *>(&mU32[2]), 4);
	const vuint32m1_t zext = __riscv_vzext_vf4_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = (mU32[2] >> (i * 8)) & 0xff;
	return result;
#endif
}
// Takes byte 12 .. 15 and expands them to X, Y, Z and W
UVec4 UVec4::Expand4Byte12() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0f), int(0xffffff0e), int(0xffffff0d), int(0xffffff0c)));
#elif defined(JPH_USE_NEON)
	uint8x16_t idx = JPH_NEON_UINT8x16(0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f);
	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vuint8mf4_t v = __riscv_vle8_v_u8mf4(reinterpret_cast<const uint8 *>(&mU32[3]), 4);
	const vuint32m1_t zext = __riscv_vzext_vf4_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, zext, 4);
	return res;
#else
	UVec4 result;
	for (int i = 0; i < 4; i++)
		result.mU32[i] = (mU32[3] >> (i * 8)) & 0xff;
	return result;
#endif
}
// Shift vector components by 4 - inCount to the left, so if inCount = 1 the resulting vector is (W, 0, 0, 0)
UVec4 UVec4::ShiftComponents4Minus(int inCount) const
{
#if defined(JPH_USE_SSE4_1) || defined(JPH_USE_NEON)
	alignas(UVec4) static constexpr uint32 sFourMinusXShuffle[5][4] =
	{
		{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
		{ 0x0f0e0d0c, 0xffffffff, 0xffffffff, 0xffffffff },
		{ 0x0b0a0908, 0x0f0e0d0c, 0xffffffff, 0xffffffff },
		{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0xffffffff },
		{ 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c }
	};
#endif

#if defined(JPH_USE_SSE4_1)
	return _mm_shuffle_epi8(mValue, *reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
#elif defined(JPH_USE_NEON)
	uint8x16_t idx = vreinterpretq_u8_u32(*reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#elif defined(JPH_USE_RVV)
	UVec4 res = sZero(); // lanes not written below must stay zero
	const uint32 *start_ptr = mU32 + (4 - inCount);
	const vuint32m1_t v = __riscv_vle32_v_u32m1(start_ptr, inCount);
	__riscv_vse32_v_u32m1(res.mU32, v, inCount);
	return res;
#else
	UVec4 result = sZero();
	for (int i = 0; i < inCount; i++)
		result.mU32[i] = mU32[i + 4 - inCount];
	return result;
#endif
}
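// Illustrative usage (not part of the original file): the result keeps the top
// inCount components, moved to the front, with zeros behind them.
//
//	UVec4 v(1, 2, 3, 4);
//	UVec4 a = v.ShiftComponents4Minus(1); // (4, 0, 0, 0)
//	UVec4 b = v.ShiftComponents4Minus(4); // (1, 2, 3, 4)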
JPH_NAMESPACE_END