Jolt Physics
A multi core friendly Game Physics Engine
Loading...
Searching...
No Matches
Vec3.inl
Go to the documentation of this file.
1// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
2// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
3// SPDX-License-Identifier: MIT
4
5#include <Jolt/Math/Vec4.h>
6#include <Jolt/Math/UVec4.h>
8
10#include <random>
12
13// Create a std::hash/JPH::Hash for Vec3
14JPH_MAKE_HASHABLE(JPH::Vec3, t.GetX(), t.GetY(), t.GetZ())
15
17
18void Vec3::CheckW() const
19{
20#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
21 // Avoid asserts when both components are NaN
22 JPH_ASSERT(reinterpret_cast<const uint32 *>(mF32)[2] == reinterpret_cast<const uint32 *>(mF32)[3]);
23#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
24}
25
26JPH_INLINE Vec3::Type Vec3::sFixW(Type inValue)
27{
28#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
29 #if defined(JPH_USE_SSE)
30 return _mm_shuffle_ps(inValue, inValue, _MM_SHUFFLE(2, 2, 1, 0));
31 #elif defined(JPH_USE_NEON)
32 return JPH_NEON_SHUFFLE_F32x4(inValue, inValue, 0, 1, 2, 2);
33 #elif defined(JPH_USE_RVV)
34 Type value;
35 const vfloat32m1_t v = __riscv_vle32_v_f32m1(inValue.mData, 3);
36 __riscv_vse32_v_f32m1(value.mData, v, 3);
37 value.mData[3] = value.mData[2];
38 return value;
39 #else
40 Type value;
41 value.mData[0] = inValue.mData[0];
42 value.mData[1] = inValue.mData[1];
43 value.mData[2] = inValue.mData[2];
44 value.mData[3] = inValue.mData[2];
45 return value;
46 #endif
47#else
48 return inValue;
49#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
50}
51
53 mValue(sFixW(inRHS.mValue))
54{
55}
56
57Vec3::Vec3(const Float3 &inV)
58{
59#if defined(JPH_USE_SSE)
60 Type x = _mm_load_ss(&inV.x);
61 Type y = _mm_load_ss(&inV.y);
62 Type z = _mm_load_ss(&inV.z);
63 Type xy = _mm_unpacklo_ps(x, y);
64 mValue = _mm_shuffle_ps(xy, z, _MM_SHUFFLE(0, 0, 1, 0)); // Assure Z and W are the same
65#elif defined(JPH_USE_NEON)
66 float32x2_t xy = vld1_f32(&inV.x);
67 float32x2_t zz = vdup_n_f32(inV.z); // Assure Z and W are the same
68 mValue = vcombine_f32(xy, zz);
69#elif defined(JPH_USE_RVV)
70 const vfloat32m1_t v = __riscv_vle32_v_f32m1(&inV.x, 3);
71 __riscv_vse32_v_f32m1(mF32, v, 3);
72 mF32[3] = inV.z;
73#else
74 mF32[0] = inV.x;
75 mF32[1] = inV.y;
76 mF32[2] = inV.z;
77 mF32[3] = inV.z; // Not strictly needed when JPH_FLOATING_POINT_EXCEPTIONS_ENABLED is off but prevents warnings about uninitialized variables
78#endif
79}
80
81Vec3::Vec3(float inX, float inY, float inZ)
82{
83#if defined(JPH_USE_SSE)
84 mValue = _mm_set_ps(inZ, inZ, inY, inX);
85#elif defined(JPH_USE_NEON)
86 uint32x2_t xy = vcreate_u32(static_cast<uint64>(BitCast<uint32>(inX)) | (static_cast<uint64>(BitCast<uint32>(inY)) << 32));
87 uint32x2_t zz = vreinterpret_u32_f32(vdup_n_f32(inZ));
88 mValue = vreinterpretq_f32_u32(vcombine_u32(xy, zz));
89#elif defined(JPH_USE_RVV)
90 const float aggregated[4] = { inX, inY, inZ, inZ };
91 const vfloat32m1_t v = __riscv_vle32_v_f32m1(aggregated, 4);
92 __riscv_vse32_v_f32m1(mF32, v, 4);
93#else
94 mF32[0] = inX;
95 mF32[1] = inY;
96 mF32[2] = inZ;
97 mF32[3] = inZ; // Not strictly needed when JPH_FLOATING_POINT_EXCEPTIONS_ENABLED is off but prevents warnings about uninitialized variables
98#endif
99}
100
101template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ>
103{
104 static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
105 static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
106 static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
107
108#if defined(JPH_USE_SSE)
109 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleZ, SwizzleZ, SwizzleY, SwizzleX)); // Assure Z and W are the same
110#elif defined(JPH_USE_NEON)
111 return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleZ);
112#elif defined(JPH_USE_RVV)
113 Vec3 v;
114 const vfloat32m1_t data = __riscv_vle32_v_f32m1(mF32, 4);
115 const uint32 stored_indices[4] = { SwizzleX, SwizzleY, SwizzleZ, SwizzleZ };
116 const vuint32m1_t index = __riscv_vle32_v_u32m1(stored_indices, 4);
117 const vfloat32m1_t swizzled = __riscv_vrgather_vv_f32m1(data, index, 4);
118 __riscv_vse32_v_f32m1(v.mF32, swizzled, 4);
119 return v;
120#else
121 return Vec3(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ]);
122#endif
123}
124
126{
127#if defined(JPH_USE_SSE)
128 return _mm_setzero_ps();
129#elif defined(JPH_USE_NEON)
130 return vdupq_n_f32(0);
131#elif defined(JPH_USE_RVV)
132 Vec3 v;
133 const vfloat32m1_t zero_vec = __riscv_vfmv_v_f_f32m1(0.0f, 3);
134 __riscv_vse32_v_f32m1(v.mF32, zero_vec, 3);
135 return v;
136#else
137 return Vec3(0, 0, 0);
138#endif
139}
140
142{
143#if defined(JPH_USE_SSE)
144 return _mm_set1_ps(inV);
145#elif defined(JPH_USE_NEON)
146 return vdupq_n_f32(inV);
147#elif defined(JPH_USE_RVV)
148 Vec3 vec;
149 const vfloat32m1_t v = __riscv_vfmv_v_f_f32m1(inV, 3);
150 __riscv_vse32_v_f32m1(vec.mF32, v, 3);
151 return vec;
152#else
153 return Vec3(inV, inV, inV);
154#endif
155}
156
158{
159 return sReplicate(1.0f);
160}
161
163{
164 return sReplicate(numeric_limits<float>::quiet_NaN());
165}
166
168{
169#if defined(JPH_USE_SSE)
170 Type v = _mm_loadu_ps(&inV.x);
171#elif defined(JPH_USE_NEON)
172 Type v = vld1q_f32(&inV.x);
173#elif defined(JPH_USE_RVV)
174 Type v;
175 const vfloat32m1_t rvv = __riscv_vle32_v_f32m1(&inV.x, 3);
176 __riscv_vse32_v_f32m1(v.mData, rvv, 3);
177#else
178 Type v = { inV.x, inV.y, inV.z };
179#endif
180 return sFixW(v);
181}
182
184{
185#if defined(JPH_USE_SSE)
186 return _mm_min_ps(inV1.mValue, inV2.mValue);
187#elif defined(JPH_USE_NEON)
188 return vminq_f32(inV1.mValue, inV2.mValue);
189#elif defined(JPH_USE_RVV)
190 Vec3 res;
191 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 3);
192 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
193 const vfloat32m1_t min = __riscv_vfmin_vv_f32m1(v1, v2, 3);
194 __riscv_vse32_v_f32m1(res.mF32, min, 3);
195 return res;
196#else
197 return Vec3(min(inV1.mF32[0], inV2.mF32[0]),
198 min(inV1.mF32[1], inV2.mF32[1]),
199 min(inV1.mF32[2], inV2.mF32[2]));
200#endif
201}
202
204{
205#if defined(JPH_USE_SSE)
206 return _mm_max_ps(inV1.mValue, inV2.mValue);
207#elif defined(JPH_USE_NEON)
208 return vmaxq_f32(inV1.mValue, inV2.mValue);
209#elif defined(JPH_USE_RVV)
210 Vec3 res;
211 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 3);
212 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
213 const vfloat32m1_t max = __riscv_vfmax_vv_f32m1(v1, v2, 3);
214 __riscv_vse32_v_f32m1(res.mF32, max, 3);
215 return res;
216#else
217 return Vec3(max(inV1.mF32[0], inV2.mF32[0]),
218 max(inV1.mF32[1], inV2.mF32[1]),
219 max(inV1.mF32[2], inV2.mF32[2]));
220#endif
221}
222
224{
225 return sMax(sMin(inV, inMax), inMin);
226}
227
229{
230#if defined(JPH_USE_SSE)
231 return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
232#elif defined(JPH_USE_NEON)
233 return vceqq_f32(inV1.mValue, inV2.mValue);
234#elif defined(JPH_USE_RVV)
235 UVec4 res;
236 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 3);
237 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
238 const vbool32_t mask = __riscv_vmfeq_vv_f32m1_b32(v1, v2, 3);
239 const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 3);
240 const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 3);
241 __riscv_vse32_v_u32m1(res.mU32, merged, 3);
242 res.mU32[3] = res.mU32[2];
243 return res;
244#else
245 uint32 z = inV1.mF32[2] == inV2.mF32[2]? 0xffffffffu : 0;
246 return UVec4(inV1.mF32[0] == inV2.mF32[0]? 0xffffffffu : 0,
247 inV1.mF32[1] == inV2.mF32[1]? 0xffffffffu : 0,
248 z,
249 z);
250#endif
251}
252
254{
255#if defined(JPH_USE_SSE)
256 return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
257#elif defined(JPH_USE_NEON)
258 return vcltq_f32(inV1.mValue, inV2.mValue);
259#elif defined(JPH_USE_RVV)
260 UVec4 res;
261 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 3);
262 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
263 const vbool32_t mask = __riscv_vmflt_vv_f32m1_b32(v1, v2, 3);
264 const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 3);
265 const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 3);
266 __riscv_vse32_v_u32m1(res.mU32, merged, 3);
267 res.mU32[3] = res.mU32[2];
268 return res;
269#else
270 uint32 z = inV1.mF32[2] < inV2.mF32[2]? 0xffffffffu : 0;
271 return UVec4(inV1.mF32[0] < inV2.mF32[0]? 0xffffffffu : 0,
272 inV1.mF32[1] < inV2.mF32[1]? 0xffffffffu : 0,
273 z,
274 z);
275#endif
276}
277
279{
280#if defined(JPH_USE_SSE)
281 return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
282#elif defined(JPH_USE_NEON)
283 return vcleq_f32(inV1.mValue, inV2.mValue);
284#elif defined(JPH_USE_RVV)
285 UVec4 res;
286 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 3);
287 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
288 const vbool32_t mask = __riscv_vmfle_vv_f32m1_b32(v1, v2, 3);
289 const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 3);
290 const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 3);
291 __riscv_vse32_v_u32m1(res.mU32, merged, 3);
292 res.mU32[3] = res.mU32[2];
293 return res;
294#else
295 uint32 z = inV1.mF32[2] <= inV2.mF32[2]? 0xffffffffu : 0;
296 return UVec4(inV1.mF32[0] <= inV2.mF32[0]? 0xffffffffu : 0,
297 inV1.mF32[1] <= inV2.mF32[1]? 0xffffffffu : 0,
298 z,
299 z);
300#endif
301}
302
304{
305#if defined(JPH_USE_SSE)
306 return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
307#elif defined(JPH_USE_NEON)
308 return vcgtq_f32(inV1.mValue, inV2.mValue);
309#elif defined(JPH_USE_RVV)
310 UVec4 res;
311 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 3);
312 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
313 const vbool32_t mask = __riscv_vmfgt_vv_f32m1_b32(v1, v2, 3);
314 const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 3);
315 const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 3);
316 __riscv_vse32_v_u32m1(res.mU32, merged, 3);
317 res.mU32[3] = res.mU32[2];
318 return res;
319#else
320 uint32 z = inV1.mF32[2] > inV2.mF32[2]? 0xffffffffu : 0;
321 return UVec4(inV1.mF32[0] > inV2.mF32[0]? 0xffffffffu : 0,
322 inV1.mF32[1] > inV2.mF32[1]? 0xffffffffu : 0,
323 z,
324 z);
325#endif
326}
327
329{
330#if defined(JPH_USE_SSE)
331 return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
332#elif defined(JPH_USE_NEON)
333 return vcgeq_f32(inV1.mValue, inV2.mValue);
334#elif defined(JPH_USE_RVV)
335 UVec4 res;
336 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 3);
337 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
338 const vbool32_t mask = __riscv_vmfge_vv_f32m1_b32(v1, v2, 3);
339 const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 3);
340 const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 3);
341 __riscv_vse32_v_u32m1(res.mU32, merged, 3);
342 res.mU32[3] = res.mU32[2];
343 return res;
344#else
345 uint32 z = inV1.mF32[2] >= inV2.mF32[2]? 0xffffffffu : 0;
346 return UVec4(inV1.mF32[0] >= inV2.mF32[0]? 0xffffffffu : 0,
347 inV1.mF32[1] >= inV2.mF32[1]? 0xffffffffu : 0,
348 z,
349 z);
350#endif
351}
352
354{
355#ifdef JPH_USE_FMADD
356 #ifdef JPH_USE_SSE
357 return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
358 #elif defined(JPH_USE_NEON)
359 return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
360 #elif defined(JPH_USE_RVV)
361 Vec3 res;
362 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inMul1.mF32, 3);
363 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inMul2.mF32, 3);
364 const vfloat32m1_t rvv_add = __riscv_vle32_v_f32m1(inAdd.mF32, 3);
365 const vfloat32m1_t fmadd = __riscv_vfmacc_vv_f32m1(rvv_add, v1, v2, 3);
366 __riscv_vse32_v_f32m1(res.mF32, fmadd, 3);
367 return res;
368 #else
369 return inMul1 * inMul2 + inAdd;
370 #endif
371#else
372 return inMul1 * inMul2 + inAdd;
373#endif
374}
375
376Vec3 Vec3::sSelect(Vec3Arg inNotSet, Vec3Arg inSet, UVec4Arg inControl)
377{
378#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
379 Type v = _mm_blendv_ps(inNotSet.mValue, inSet.mValue, _mm_castsi128_ps(inControl.mValue));
380 return sFixW(v);
381#elif defined(JPH_USE_SSE)
382 __m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
383 Type v = _mm_or_ps(_mm_and_ps(is_set, inSet.mValue), _mm_andnot_ps(is_set, inNotSet.mValue));
384 return sFixW(v);
385#elif defined(JPH_USE_NEON)
386 Type v = vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
387 return sFixW(v);
388#elif defined(JPH_USE_RVV)
389 Vec3 masked;
390 const vuint32m1_t control = __riscv_vle32_v_u32m1(inControl.mU32, 3);
391 const vfloat32m1_t not_set = __riscv_vle32_v_f32m1(inNotSet.mF32, 3);
392 const vfloat32m1_t set = __riscv_vle32_v_f32m1(inSet.mF32, 3);
393
394 // Generate RVV bool mask from UVec4
395 const vuint32m1_t r = __riscv_vand_vx_u32m1(control, 0x80000000u, 3);
396 const vbool32_t rvv_mask = __riscv_vmsne_vx_u32m1_b32(r, 0x0, 3);
397 const vfloat32m1_t merged = __riscv_vmerge_vvm_f32m1(not_set, set, rvv_mask, 3);
398 __riscv_vse32_v_f32m1(masked.mF32, merged, 3);
399 return masked;
400#else
401 Vec3 result;
402 for (int i = 0; i < 3; i++)
403 result.mF32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mF32[i] : inNotSet.mF32[i];
404#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
405 result.mF32[3] = result.mF32[2];
406#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
407 return result;
408#endif
409}
410
412{
413#if defined(JPH_USE_SSE)
414 return _mm_or_ps(inV1.mValue, inV2.mValue);
415#elif defined(JPH_USE_NEON)
416 return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
417#elif defined(JPH_USE_RVV)
418 Vec3 or_result;
419 const vuint32m1_t v1 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV1.mF32), 3);
420 const vuint32m1_t v2 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV2.mF32), 3);
421 const vuint32m1_t res = __riscv_vor_vv_u32m1(v1, v2, 3);
422 __riscv_vse32_v_u32m1(reinterpret_cast<uint32 *>(or_result.mF32), res, 3);
423 return or_result;
424#else
426#endif
427}
428
430{
431#if defined(JPH_USE_SSE)
432 return _mm_xor_ps(inV1.mValue, inV2.mValue);
433#elif defined(JPH_USE_NEON)
434 return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
435#elif defined(JPH_USE_RVV)
436 Vec3 xor_result;
437 const vuint32m1_t v1 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV1.mF32), 3);
438 const vuint32m1_t v2 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV2.mF32), 3);
439 const vuint32m1_t res = __riscv_vxor_vv_u32m1(v1, v2, 3);
440 __riscv_vse32_v_u32m1(reinterpret_cast<uint32 *>(xor_result.mF32), res, 3);
441 return xor_result;
442#else
444#endif
445}
446
448{
449#if defined(JPH_USE_SSE)
450 return _mm_and_ps(inV1.mValue, inV2.mValue);
451#elif defined(JPH_USE_NEON)
452 return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
453#elif defined(JPH_USE_RVV)
454 Vec3 and_result;
455 const vuint32m1_t v1 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV1.mF32), 3);
456 const vuint32m1_t v2 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV2.mF32), 3);
457 const vuint32m1_t res = __riscv_vand_vv_u32m1(v1, v2, 3);
458 __riscv_vse32_v_u32m1(reinterpret_cast<uint32 *>(and_result.mF32), res, 3);
459 return and_result;
460#else
462#endif
463}
464
465Vec3 Vec3::sUnitSpherical(float inTheta, float inPhi)
466{
467 Vec4 s, c;
468 Vec4(inTheta, inPhi, 0, 0).SinCos(s, c);
469 return Vec3(s.GetX() * c.GetY(), s.GetX() * s.GetY(), c.GetX());
470}
471
472template <class Random>
473Vec3 Vec3::sRandom(Random &inRandom)
474{
475 // Generating uniform unit random vectors in Rn - Andersen Ang
476 // See: https://angms.science/doc/RM/randUnitVec.pdf
477 float z = -1.0f + 2.0f * float(inRandom() - inRandom.min()) / float(inRandom.max() - inRandom.min());
478 float r = JPH::Sqrt(1.0f - Square(z));
479 float theta = 2.0f * JPH_PI * float(inRandom() - inRandom.min()) / float(inRandom.max() - inRandom.min());
480 Vec4 s, c;
481 Vec4::sReplicate(theta).SinCos(s, c);
482 return Vec3(r * s.GetX(), r * c.GetX(), z);
483}
484
486{
487 return sEquals(*this, inV2).TestAllXYZTrue();
488}
489
490bool Vec3::IsClose(Vec3Arg inV2, float inMaxDistSq) const
491{
492 return (inV2 - *this).LengthSq() <= inMaxDistSq;
493}
494
495bool Vec3::IsNearZero(float inMaxDistSq) const
496{
497 return LengthSq() <= inMaxDistSq;
498}
499
501{
502#if defined(JPH_USE_SSE)
503 return _mm_mul_ps(mValue, inV2.mValue);
504#elif defined(JPH_USE_NEON)
505 return vmulq_f32(mValue, inV2.mValue);
506#elif defined(JPH_USE_RVV)
507 Vec3 res;
508 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 3);
509 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
510 const vfloat32m1_t mul = __riscv_vfmul_vv_f32m1(v1, v2, 3);
511 __riscv_vse32_v_f32m1(res.mF32, mul, 3);
512 return res;
513#else
514 return Vec3(mF32[0] * inV2.mF32[0], mF32[1] * inV2.mF32[1], mF32[2] * inV2.mF32[2]);
515#endif
516}
517
518Vec3 Vec3::operator * (float inV2) const
519{
520#if defined(JPH_USE_SSE)
521 return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
522#elif defined(JPH_USE_NEON)
523 return vmulq_n_f32(mValue, inV2);
524#elif defined(JPH_USE_RVV)
525 Vec3 res;
526 const vfloat32m1_t src = __riscv_vle32_v_f32m1(mF32, 3);
527 const vfloat32m1_t mul = __riscv_vfmul_vf_f32m1(src, inV2, 3);
528 __riscv_vse32_v_f32m1(res.mF32, mul, 3);
529 return res;
530#else
531 return Vec3(mF32[0] * inV2, mF32[1] * inV2, mF32[2] * inV2);
532#endif
533}
534
535Vec3 operator * (float inV1, Vec3Arg inV2)
536{
537#if defined(JPH_USE_SSE)
538 return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
539#elif defined(JPH_USE_NEON)
540 return vmulq_n_f32(inV2.mValue, inV1);
541#elif defined(JPH_USE_RVV)
542 Vec3 res;
543 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
544 const vfloat32m1_t mul = __riscv_vfmul_vf_f32m1(v1, inV1, 3);
545 __riscv_vse32_v_f32m1(res.mF32, mul, 3);
546 return res;
547#else
548 return Vec3(inV1 * inV2.mF32[0], inV1 * inV2.mF32[1], inV1 * inV2.mF32[2]);
549#endif
550}
551
552Vec3 Vec3::operator / (float inV2) const
553{
554#if defined(JPH_USE_SSE)
555 return _mm_div_ps(mValue, _mm_set1_ps(inV2));
556#elif defined(JPH_USE_NEON)
557 return vdivq_f32(mValue, vdupq_n_f32(inV2));
558#elif defined(JPH_USE_RVV)
559 Vec3 res;
560 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 3);
561 const vfloat32m1_t div = __riscv_vfdiv_vf_f32m1(v1, inV2, 3);
562 __riscv_vse32_v_f32m1(res.mF32, div, 3);
563 return res;
564#else
565 return Vec3(mF32[0] / inV2, mF32[1] / inV2, mF32[2] / inV2);
566#endif
567}
568
570{
571#if defined(JPH_USE_SSE)
572 mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
573#elif defined(JPH_USE_NEON)
574 mValue = vmulq_n_f32(mValue, inV2);
575#elif defined(JPH_USE_RVV)
576 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 3);
577 const vfloat32m1_t res = __riscv_vfmul_vf_f32m1(v1, inV2, 3);
578 __riscv_vse32_v_f32m1(mF32, res, 3);
579#else
580 for (int i = 0; i < 3; ++i)
581 mF32[i] *= inV2;
582 #ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
583 mF32[3] = mF32[2];
584 #endif
585#endif
586 return *this;
587}
588
590{
591#if defined(JPH_USE_SSE)
592 mValue = _mm_mul_ps(mValue, inV2.mValue);
593#elif defined(JPH_USE_NEON)
594 mValue = vmulq_f32(mValue, inV2.mValue);
595#elif defined(JPH_USE_RVV)
596 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 3);
597 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
598 const vfloat32m1_t rvv_res = __riscv_vfmul_vv_f32m1(v1, v2, 3);
599 __riscv_vse32_v_f32m1(mF32, rvv_res, 3);
600#else
601 for (int i = 0; i < 3; ++i)
602 mF32[i] *= inV2.mF32[i];
603 #ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
604 mF32[3] = mF32[2];
605 #endif
606#endif
607 return *this;
608}
609
611{
612#if defined(JPH_USE_SSE)
613 mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
614#elif defined(JPH_USE_NEON)
615 mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
616#elif defined(JPH_USE_RVV)
617 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 3);
618 const vfloat32m1_t res = __riscv_vfdiv_vf_f32m1(v, inV2, 3);
619 __riscv_vse32_v_f32m1(mF32, res, 3);
620#else
621 for (int i = 0; i < 3; ++i)
622 mF32[i] /= inV2;
623 #ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
624 mF32[3] = mF32[2];
625 #endif
626#endif
627 return *this;
628}
629
631{
632#if defined(JPH_USE_SSE)
633 return _mm_add_ps(mValue, inV2.mValue);
634#elif defined(JPH_USE_NEON)
635 return vaddq_f32(mValue, inV2.mValue);
636#elif defined(JPH_USE_RVV)
637 Vec3 res;
638 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 3);
639 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
640 const vfloat32m1_t rvv_add = __riscv_vfadd_vv_f32m1(v1, v2, 3);
641 __riscv_vse32_v_f32m1(res.mF32, rvv_add, 3);
642 return res;
643#else
644 return Vec3(mF32[0] + inV2.mF32[0], mF32[1] + inV2.mF32[1], mF32[2] + inV2.mF32[2]);
645#endif
646}
647
649{
650#if defined(JPH_USE_SSE)
651 mValue = _mm_add_ps(mValue, inV2.mValue);
652#elif defined(JPH_USE_NEON)
653 mValue = vaddq_f32(mValue, inV2.mValue);
654#elif defined(JPH_USE_RVV)
655 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 3);
656 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
657 const vfloat32m1_t rvv_add = __riscv_vfadd_vv_f32m1(v1, v2, 3);
658 __riscv_vse32_v_f32m1(mF32, rvv_add, 3);
659#else
660 for (int i = 0; i < 3; ++i)
661 mF32[i] += inV2.mF32[i];
662 #ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
663 mF32[3] = mF32[2];
664 #endif
665#endif
666 return *this;
667}
668
670{
671#if defined(JPH_USE_SSE)
672 return _mm_sub_ps(_mm_setzero_ps(), mValue);
673#elif defined(JPH_USE_NEON)
674 #ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
675 return vsubq_f32(vdupq_n_f32(0), mValue);
676 #else
677 return vnegq_f32(mValue);
678 #endif
679#elif defined(JPH_USE_RVV)
680 #ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
681 Vec3 res;
682 const vfloat32m1_t rvv_zero = __riscv_vfmv_v_f_f32m1(0.0f, 3);
683 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 3);
684 const vfloat32m1_t rvv_neg = __riscv_vfsub_vv_f32m1(rvv_zero, v, 3);
685 __riscv_vse32_v_f32m1(res.mF32, rvv_neg, 3);
686 return res;
687 #else
688 Vec3 res;
689 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 3);
690 const vfloat32m1_t rvv_neg = __riscv_vfsgnjn_vv_f32m1(v, v, 3);
691 __riscv_vse32_v_f32m1(res.mF32, rvv_neg, 3);
692 return res;
693 #endif
694#else
695 #ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
696 return Vec3(0.0f - mF32[0], 0.0f - mF32[1], 0.0f - mF32[2]);
697 #else
698 return Vec3(-mF32[0], -mF32[1], -mF32[2]);
699 #endif
700#endif
701}
702
704{
705#if defined(JPH_USE_SSE)
706 return _mm_sub_ps(mValue, inV2.mValue);
707#elif defined(JPH_USE_NEON)
708 return vsubq_f32(mValue, inV2.mValue);
709#elif defined(JPH_USE_RVV)
710 Vec3 res;
711 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 3);
712 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
713 const vfloat32m1_t rvv_sub = __riscv_vfsub_vv_f32m1(v1, v2, 3);
714 __riscv_vse32_v_f32m1(res.mF32, rvv_sub, 3);
715 return res;
716#else
717 return Vec3(mF32[0] - inV2.mF32[0], mF32[1] - inV2.mF32[1], mF32[2] - inV2.mF32[2]);
718#endif
719}
720
722{
723#if defined(JPH_USE_SSE)
724 mValue = _mm_sub_ps(mValue, inV2.mValue);
725#elif defined(JPH_USE_NEON)
726 mValue = vsubq_f32(mValue, inV2.mValue);
727#elif defined(JPH_USE_RVV)
728 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 3);
729 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
730 const vfloat32m1_t rvv_sub = __riscv_vfsub_vv_f32m1(v1, v2, 3);
731 __riscv_vse32_v_f32m1(mF32, rvv_sub, 3);
732#else
733 for (int i = 0; i < 3; ++i)
734 mF32[i] -= inV2.mF32[i];
735 #ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
736 mF32[3] = mF32[2];
737 #endif
738#endif
739 return *this;
740}
741
743{
744 inV2.CheckW(); // Check W equals Z to avoid div by zero
745#if defined(JPH_USE_SSE)
746 return _mm_div_ps(mValue, inV2.mValue);
747#elif defined(JPH_USE_NEON)
748 return vdivq_f32(mValue, inV2.mValue);
749#elif defined(JPH_USE_RVV)
750 Vec3 res;
751 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 3);
752 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
753 const vfloat32m1_t rvv_div = __riscv_vfdiv_vv_f32m1(v1, v2, 3);
754 __riscv_vse32_v_f32m1(res.mF32, rvv_div, 3);
755 return res;
756#else
757 return Vec3(mF32[0] / inV2.mF32[0], mF32[1] / inV2.mF32[1], mF32[2] / inV2.mF32[2]);
758#endif
759}
760
762{
763#if defined(JPH_USE_SSE)
764 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
765#elif defined(JPH_USE_NEON)
766 return vdupq_laneq_f32(mValue, 0);
767#elif defined(JPH_USE_RVV)
768 Vec4 vec;
769 const vfloat32m1_t splat = __riscv_vfmv_v_f_f32m1(mF32[0], 4);
770 __riscv_vse32_v_f32m1(vec.mF32, splat, 4);
771 return vec;
772#else
773 return Vec4(mF32[0], mF32[0], mF32[0], mF32[0]);
774#endif
775}
776
778{
779#if defined(JPH_USE_SSE)
780 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
781#elif defined(JPH_USE_NEON)
782 return vdupq_laneq_f32(mValue, 1);
783#elif defined(JPH_USE_RVV)
784 Vec4 vec;
785 const vfloat32m1_t splat = __riscv_vfmv_v_f_f32m1(mF32[1], 4);
786 __riscv_vse32_v_f32m1(vec.mF32, splat, 4);
787 return vec;
788#else
789 return Vec4(mF32[1], mF32[1], mF32[1], mF32[1]);
790#endif
791}
792
794{
795#if defined(JPH_USE_SSE)
796 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
797#elif defined(JPH_USE_NEON)
798 return vdupq_laneq_f32(mValue, 2);
799#elif defined(JPH_USE_RVV)
800 Vec4 vec;
801 const vfloat32m1_t splat = __riscv_vfmv_v_f_f32m1(mF32[2], 4);
802 __riscv_vse32_v_f32m1(vec.mF32, splat, 4);
803 return vec;
804#else
805 return Vec4(mF32[2], mF32[2], mF32[2], mF32[2]);
806#endif
807}
808
810{
811 return GetX() < GetY() ? (GetZ() < GetX() ? 2 : 0) : (GetZ() < GetY() ? 2 : 1);
812}
813
815{
816 return GetX() > GetY() ? (GetZ() > GetX() ? 2 : 0) : (GetZ() > GetY() ? 2 : 1);
817}
818
820{
821#if defined(JPH_USE_AVX512)
822 return _mm_range_ps(mValue, mValue, 0b1000);
823#elif defined(JPH_USE_SSE)
824 return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
825#elif defined(JPH_USE_NEON)
826 return vabsq_f32(mValue);
827#elif defined(JPH_USE_RVV)
828 Vec3 res;
829 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 3);
830 const vfloat32m1_t rvv_abs = __riscv_vfsgnj_vf_f32m1(v, 1.0, 3);
831 __riscv_vse32_v_f32m1(res.mF32, rvv_abs, 3);
832 return res;
833#else
834 return Vec3(abs(mF32[0]), abs(mF32[1]), abs(mF32[2]));
835#endif
836}
837
839{
840 return sOne() / mValue;
841}
842
844{
845#ifdef JPH_USE_FMADD
846 Vec3 cd = inC * inD;
847 Vec3 err = Vec3::sFusedMultiplyAdd(-inC, inD, cd);
848 Vec3 dop = Vec3::sFusedMultiplyAdd(inA, inB, -cd);
849 return dop + err;
850#else
851 return inA * inB - inC * inD;
852#endif
853}
854
856{
857#if defined(JPH_USE_SSE)
858 Type t1 = _mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
859 t1 = _mm_mul_ps(t1, mValue);
860 Type t2 = _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
861 t2 = _mm_mul_ps(t2, inV2.mValue);
862 Type t3 = _mm_sub_ps(t1, t2);
863 return _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
864#elif defined(JPH_USE_NEON)
865 Type t1 = JPH_NEON_SHUFFLE_F32x4(inV2.mValue, inV2.mValue, 1, 2, 0, 0); // Assure Z and W are the same
866 t1 = vmulq_f32(t1, mValue);
867 Type t2 = JPH_NEON_SHUFFLE_F32x4(mValue, mValue, 1, 2, 0, 0); // Assure Z and W are the same
868 t2 = vmulq_f32(t2, inV2.mValue);
869 Type t3 = vsubq_f32(t1, t2);
870 return JPH_NEON_SHUFFLE_F32x4(t3, t3, 1, 2, 0, 0); // Assure Z and W are the same
871#elif defined(JPH_USE_RVV)
872 const uint32 indices[3] = { 1, 2, 0 };
873 const vuint32m1_t gather_indices = __riscv_vle32_v_u32m1(indices, 3);
874 const vfloat32m1_t v0 = __riscv_vle32_v_f32m1(mF32, 3);
875 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV2.mF32, 3);
876 vfloat32m1_t t0 = __riscv_vrgather_vv_f32m1(v1, gather_indices, 3);
877 t0 = __riscv_vfmul_vv_f32m1(t0, v0, 3);
878 vfloat32m1_t t1 = __riscv_vrgather_vv_f32m1(v0, gather_indices, 3);
879 t1 = __riscv_vfmul_vv_f32m1(t1, v1, 3);
880 const vfloat32m1_t sub = __riscv_vfsub_vv_f32m1(t0, t1, 3);
881 const vfloat32m1_t cross = __riscv_vrgather_vv_f32m1(sub, gather_indices, 3);
882
883 Vec3 cross_result;
884 __riscv_vse32_v_f32m1(cross_result.mF32, cross, 3);
885 return cross_result;
886#else
887 return Vec3(mF32[1] * inV2.mF32[2] - mF32[2] * inV2.mF32[1],
888 mF32[2] * inV2.mF32[0] - mF32[0] * inV2.mF32[2],
889 mF32[0] * inV2.mF32[1] - mF32[1] * inV2.mF32[0]);
890#endif
891}
892
897
898float Vec3::ReduceSum() const
899{
900 // Ensure that we handle -0.0f correctly when cross platform deterministic behavior is required.
901#if defined(JPH_USE_SSE4_1)
902 #ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
903 Type val = _mm_blend_ps(mValue, _mm_setzero_ps(), 0x8); // [x, y, z, 0]
904 Type shuf = _mm_movehdup_ps(val); // [y, y, 0, 0]
905 Type sums = _mm_add_ps(val, shuf); // [x + y, y + y, z + 0, 0]
906 shuf = _mm_movehl_ps(shuf, sums); // [z + 0, 0, 0, 0]
907 #else
908 Type shuf = _mm_movehdup_ps(mValue); // [y, y, w, w]
909 Type sums = _mm_add_ps(mValue, shuf); // [x + y, y + y, z + w, w + w]
910 shuf = _mm_movehl_ps(mValue, mValue); // [z, w, z, w]
911 #endif
912 sums = _mm_add_ps(sums, shuf); // Deterministic: [(x + y) + (z + 0), ...], non-deterministic: [(x + y) + z, ...]
913 return _mm_cvtss_f32(sums);
914#elif defined(JPH_USE_NEON)
915 Type v = vsetq_lane_f32(0, mValue, 3);
916 return vaddvq_f32(v);
917#elif defined(JPH_USE_RVV)
918 const vfloat32m1_t zeros = __riscv_vfmv_v_f_f32m1(0.0f, 3);
919 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 3);
920 const vfloat32m1_t sum = __riscv_vfredosum_vs_f32m1_f32m1(v, zeros, 3);
921 return __riscv_vfmv_f_s_f32m1_f32(sum);
922#else
923 #ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
924 return (mF32[0] + mF32[1]) + (mF32[2] + 0.0f);
925 #else
926 return mF32[0] + mF32[1] + mF32[2];
927 #endif
928#endif
929}
930
931float Vec3::Dot(Vec3Arg inV2) const
932{
933 return (*this * inV2).ReduceSum();
934}
935
937{
938 return Vec3::sReplicate(Dot(inV2));
939}
940
942{
943 return Vec4::sReplicate(Dot(inV2));
944}
945
946float Vec3::LengthSq() const
947{
948 return Dot(*this);
949}
950
951float Vec3::Length() const
952{
953 return JPH::Sqrt(LengthSq());
954}
955
957{
958#if defined(JPH_USE_SSE)
959 return _mm_sqrt_ps(mValue);
960#elif defined(JPH_USE_NEON)
961 return vsqrtq_f32(mValue);
962#elif defined(JPH_USE_RVV)
963 Vec3 res;
964 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 3);
965 const vfloat32m1_t rvv_sqrt = __riscv_vfsqrt_v_f32m1(v, 3);
966 __riscv_vse32_v_f32m1(res.mF32, rvv_sqrt, 3);
967 return res;
968#else
969 return Vec3(JPH::Sqrt(mF32[0]), JPH::Sqrt(mF32[1]), JPH::Sqrt(mF32[2]));
970#endif
971}
972
974{
975 return *this / Length();
976}
977
979{
980#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
981 Type mul = _mm_mul_ps(mValue, mValue);
982 Type shuf = _mm_movehdup_ps(mul);
983 Type sums = _mm_add_ps(mul, shuf);
984 shuf = _mm_movehl_ps(mul, mul);
985 sums = _mm_add_ps(sums, shuf);
986 Type len_sq = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(0, 0, 0, 0));
987 // clang with '-ffast-math' (which you should not use!) can generate _mm_rsqrt_ps
988 // instructions which produce INFs/NaNs when they get a denormal float as input.
989 // We therefore treat denormals as zero here.
990 Type is_zero = _mm_cmple_ps(len_sq, _mm_set1_ps(FLT_MIN));
991#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
992 if (_mm_movemask_ps(is_zero) == 0xf)
993 return inZeroValue;
994 else
995 return _mm_div_ps(mValue, _mm_sqrt_ps(len_sq));
996#else
997 return _mm_blendv_ps(_mm_div_ps(mValue, _mm_sqrt_ps(len_sq)), inZeroValue.mValue, is_zero);
998#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
999#elif defined(JPH_USE_NEON)
1000 float32x4_t mul = vmulq_f32(mValue, mValue);
1001 mul = vsetq_lane_f32(0, mul, 3);
1002 float32x4_t len_sq = vdupq_n_f32(vaddvq_f32(mul));
1003 uint32x4_t is_zero = vcleq_f32(len_sq, vdupq_n_f32(FLT_MIN));
1004 return vbslq_f32(is_zero, inZeroValue.mValue, vdivq_f32(mValue, vsqrtq_f32(len_sq)));
1005#elif defined(JPH_USE_RVV)
1006 const vfloat32m1_t src = __riscv_vle32_v_f32m1(mF32, 3);
1007 const vfloat32m1_t zeros = __riscv_vfmv_v_f_f32m1(0.0f, 3);
1008 const vfloat32m1_t mul = __riscv_vfmul_vv_f32m1(src, src, 3);
1009 const vfloat32m1_t sum = __riscv_vfredosum_vs_f32m1_f32m1(mul, zeros, 3);
1010 const float dot = __riscv_vfmv_f_s_f32m1_f32(sum);
1011 if (dot <= FLT_MIN)
1012 return inZeroValue;
1013
1014 const vfloat32m1_t splat = __riscv_vrgather_vx_f32m1(sum, 0, 3);
1015 const vfloat32m1_t length = __riscv_vfsqrt_v_f32m1(splat, 3);
1016
1017 Vec3 v;
1018 const vfloat32m1_t norm = __riscv_vfdiv_vv_f32m1(src, length, 3);
1019 __riscv_vse32_v_f32m1(v.mF32, norm, 3);
1020 return v;
1021#else
1022 float len_sq = LengthSq();
1023 if (len_sq <= FLT_MIN)
1024 return inZeroValue;
1025 else
1026 return *this / JPH::Sqrt(len_sq);
1027#endif
1028}
1029
1030bool Vec3::IsNormalized(float inTolerance) const
1031{
1032 return abs(LengthSq() - 1.0f) <= inTolerance;
1033}
1034
1035bool Vec3::IsNaN() const
1036{
1037#if defined(JPH_USE_AVX512)
1038 return (_mm_fpclass_ps_mask(mValue, 0b10000001) & 0x7) != 0;
1039#elif defined(JPH_USE_SSE)
1040 return (_mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) & 0x7) != 0;
1041#elif defined(JPH_USE_NEON)
1042 uint32x4_t mask = JPH_NEON_UINT32x4(1, 1, 1, 0);
1043 uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
1044 return vaddvq_u32(vandq_u32(is_equal, mask)) != 3;
1045#elif defined(JPH_USE_RVV)
1046 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 3);
1047 const vbool32_t mask = __riscv_vmfeq_vv_f32m1_b32(v, v, 3);
1048 const uint32 eq = __riscv_vcpop_m_b32(mask, 3);
1049 return eq != 3;
1050#else
1051 return isnan(mF32[0]) || isnan(mF32[1]) || isnan(mF32[2]);
1052#endif
1053}
1054
1055void Vec3::StoreFloat3(Float3 *outV) const
1056{
1057#if defined(JPH_USE_SSE)
1058 _mm_store_ss(&outV->x, mValue);
1060 _mm_store_ss(&outV->y, t.mValue);
1062 _mm_store_ss(&outV->z, t.mValue);
1063#elif defined(JPH_USE_NEON)
1064 float32x2_t xy = vget_low_f32(mValue);
1065 vst1_f32(&outV->x, xy);
1066 vst1q_lane_f32(&outV->z, mValue, 2);
1067#elif defined(JPH_USE_RVV)
1068 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 3);
1069 __riscv_vse32_v_f32m1(&outV->x, v, 3);
1070#else
1071 outV->x = mF32[0];
1072 outV->y = mF32[1];
1073 outV->z = mF32[2];
1074#endif
1075}
1076
1078{
1079#if defined(JPH_USE_SSE)
1080 return _mm_cvttps_epi32(mValue);
1081#elif defined(JPH_USE_NEON)
1082 return vcvtq_u32_f32(mValue);
1083#elif defined(JPH_USE_RVV)
1084 UVec4 res;
1085 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 4);
1086 const vuint32m1_t cast = __riscv_vfcvt_rtz_xu_f_v_u32m1(v, 4);
1087 __riscv_vse32_v_u32m1(res.mU32, cast, 4);
1088 return res;
1089#else
1090 return UVec4(uint32(mF32[0]), uint32(mF32[1]), uint32(mF32[2]), uint32(mF32[3]));
1091#endif
1092}
1093
1095{
1096#if defined(JPH_USE_SSE)
1097 return UVec4(_mm_castps_si128(mValue));
1098#elif defined(JPH_USE_NEON)
1099 return vreinterpretq_u32_f32(mValue);
1100#else
1101 return *reinterpret_cast<const UVec4 *>(this);
1102#endif
1103}
1104
1105float Vec3::ReduceMin() const
1106{
1109 return v.GetX();
1110}
1111
1112float Vec3::ReduceMax() const
1113{
1116 return v.GetX();
1117}
1118
1120{
1121#if defined(JPH_USE_SSE)
1122 // Build both perpendicular candidates without explicit masking:
1123 // perp_x = [z, 0, -x, 0] (used when |x| > |y|)
1124 // perp_y = [0, z, -y, 0] (used when |x| <= |y|)
1125 __m128 zero = _mm_setzero_ps();
1126 __m128 neg = _mm_sub_ps(zero, mValue);
1127 __m128 perp_x = _mm_shuffle_ps(_mm_unpackhi_ps(mValue, zero), neg, _MM_SHUFFLE(0, 0, 1, 0));
1128 __m128 perp_y = _mm_shuffle_ps(_mm_unpackhi_ps(zero, mValue), neg, _MM_SHUFFLE(1, 1, 1, 0));
1129
1130 // Compare squared components instead of absolute values (saves the abs computation).
1131 __m128 sq = _mm_mul_ps(mValue, mValue);
1132 __m128 xx = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 0));
1133 __m128 yy = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(1, 1, 1, 1));
1134 __m128 zz = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(2, 2, 2, 2));
1135 __m128 x_gt_y = _mm_cmpgt_ps(xx, yy);
1136
1137 // Select perpendicular based on |x| > |y|.
1138#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
1139 __m128 result = _mm_blendv_ps(perp_y, perp_x, x_gt_y);
1140#else
1141 __m128 result = _mm_or_ps(_mm_and_ps(x_gt_y, perp_x), _mm_andnot_ps(x_gt_y, perp_y));
1142#endif
1143
1144 // Normalize. Since the result has only two nonzero components; one of x^2 / y^2 plus z^2; the squared length is max(xx, yy) + zz. All lanes of the sqrt input are identical.
1145 __m128 len = _mm_sqrt_ps(_mm_add_ps(_mm_max_ps(xx, yy), zz));
1146 return _mm_div_ps(result, len);
1147#else
1148 float x = mF32[0], y = mF32[1], z = mF32[2];
1149 float xx = x * x, yy = y * y, zz = z * z;
1150#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
1151 Vec3 perp_x(z, 0.0f, 0.0f - x);
1152 Vec3 perp_y(0.0f, z, 0.0f - y);
1153#else
1154 Vec3 perp_x(z, 0.0f, -x);
1155 Vec3 perp_y(0.0f, z, -y);
1156#endif // JPH_CROSS_PLATFORM_DETERMINISTIC
1157 return (xx > yy ? perp_x : perp_y) / JPH::Sqrt(max(xx, yy) + zz);
1158#endif // JPH_USE_SSE
1159}
1160
1162{
1163#if defined(JPH_USE_AVX512)
1164 Type one = _mm_set1_ps(1.0f);
1165 return _mm_or_ps(_mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90100), 0), one);
1166#elif defined(JPH_USE_SSE)
1167 Type minus_one = _mm_set1_ps(-1.0f);
1168 Type one = _mm_set1_ps(1.0f);
1169 return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
1170#elif defined(JPH_USE_NEON)
1171 Type minus_one = vdupq_n_f32(-1.0f);
1172 Type one = vdupq_n_f32(1.0f);
1173 return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
1174#elif defined(JPH_USE_RVV)
1175 Vec3 res;
1176 const vfloat32m1_t rvv_in = __riscv_vle32_v_f32m1(mF32, 3);
1177 const vfloat32m1_t rvv_one = __riscv_vfmv_v_f_f32m1(1.0, 3);
1178 const vfloat32m1_t rvv_signs = __riscv_vfsgnj_vv_f32m1(rvv_one, rvv_in, 3);
1179 __riscv_vse32_v_f32m1(res.mF32, rvv_signs, 3);
1180 return res;
1181#else
1182 return Vec3(std::signbit(mF32[0])? -1.0f : 1.0f,
1183 std::signbit(mF32[1])? -1.0f : 1.0f,
1184 std::signbit(mF32[2])? -1.0f : 1.0f);
1185#endif
1186}
1187
1188template <int X, int Y, int Z>
1189JPH_INLINE Vec3 Vec3::FlipSign() const
1190{
1191 static_assert(X == 1 || X == -1, "X must be 1 or -1");
1192 static_assert(Y == 1 || Y == -1, "Y must be 1 or -1");
1193 static_assert(Z == 1 || Z == -1, "Z must be 1 or -1");
1194 return Vec3::sXor(*this, Vec3(X > 0? 0.0f : -0.0f, Y > 0? 0.0f : -0.0f, Z > 0? 0.0f : -0.0f));
1195}
1196
1198{
1199 constexpr float cOneOverSqrt2 = 0.70710678f;
1200 constexpr uint cNumBits = 14;
1201 constexpr uint cMask = (1 << cNumBits) - 1;
1202 constexpr uint cMaxValue = cMask - 1; // Need odd number of buckets to quantize to or else we can't encode 0
1203 constexpr float cScale = float(cMaxValue) / (2.0f * cOneOverSqrt2);
1204
1205 // Store sign bit
1206 Vec3 v = *this;
1207 uint32 max_element = v.Abs().GetHighestComponentIndex();
1208 uint32 value = 0;
1209 if (v[max_element] < 0.0f)
1210 {
1211 value = 0x80000000u;
1212 v = -v;
1213 }
1214
1215 // Store highest component
1216 value |= max_element << 29;
1217
1218 // Store the other two components in a compressed format
1219 UVec4 compressed = Vec3::sClamp((v + Vec3::sReplicate(cOneOverSqrt2)) * cScale + Vec3::sReplicate(0.5f), Vec3::sZero(), Vec3::sReplicate(cMaxValue)).ToInt();
1220 switch (max_element)
1221 {
1222 case 0:
1223 compressed = compressed.Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED>();
1224 break;
1225
1226 case 1:
1227 compressed = compressed.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED>();
1228 break;
1229 }
1230
1231 value |= compressed.GetX();
1232 value |= compressed.GetY() << cNumBits;
1233 return value;
1234}
1235
1237{
1238 constexpr float cOneOverSqrt2 = 0.70710678f;
1239 constexpr uint cNumBits = 14;
1240 constexpr uint cMask = (1u << cNumBits) - 1;
1241 constexpr uint cMaxValue = cMask - 1; // Need odd number of buckets to quantize to or else we can't encode 0
1242 constexpr int cHalfMaxValue = int(cMaxValue >> 1);
1243 constexpr float cScale = 2.0f * cOneOverSqrt2 / float(cMaxValue);
1244
1245 // Restore two components
1246 Vec3 v = Vec3(float(int(inValue & cMask) - cHalfMaxValue), float(int((inValue >> cNumBits) & cMask) - cHalfMaxValue), 0) * cScale;
1247 JPH_ASSERT(v.GetZ() == 0.0f);
1248
1249 // Restore the highest component
1250 v.SetZ(JPH::Sqrt(max(1.0f - v.LengthSq(), 0.0f)));
1251
1252 // Extract sign
1253 if ((inValue & 0x80000000u) != 0)
1254 v = -v;
1255
1256 // Swizzle the components in place
1257 switch ((inValue >> 29) & 3)
1258 {
1259 case 0:
1261 break;
1262
1263 case 1:
1265 break;
1266 }
1267
1268 return v;
1269}
1270
#define JPH_SUPPRESS_WARNINGS_STD_BEGIN
Definition Core.h:439
#define JPH_SUPPRESS_WARNINGS_STD_END
Definition Core.h:452
std::uint64_t uint64
Definition Core.h:515
unsigned int uint
Definition Core.h:510
#define JPH_NAMESPACE_END
Definition Core.h:434
std::uint32_t uint32
Definition Core.h:513
#define JPH_NAMESPACE_BEGIN
Definition Core.h:428
#define xy
Definition HLSLToCPP.h:511
#define JPH_MAKE_HASHABLE(type,...)
Definition HashCombine.h:223
#define JPH_ASSERT(...)
Definition IssueReporting.h:33
JPH_INLINE constexpr T Square(T inV)
Square a value.
Definition Math.h:70
JPH_INLINE constexpr To BitCast(const From &inValue)
Simple implementation of C++20 std::bit_cast.
Definition Math.h:239
@ SWIZZLE_Z
Use the Z component.
Definition Swizzle.h:14
@ SWIZZLE_X
Use the X component.
Definition Swizzle.h:12
@ SWIZZLE_UNUSED
We always use the Z component when we don't specifically want to initialize a value,...
Definition Swizzle.h:16
@ SWIZZLE_Y
Use the Y component.
Definition Swizzle.h:13
Vec3 operator*(float inV1, Vec3Arg inV2)
Definition Vec3.inl:535
Class that holds 3 floats. Used as a storage class. Convert to Vec3 for calculations.
Definition Float3.h:13
float y
Definition Float3.h:39
float z
Definition Float3.h:40
float x
Definition Float3.h:38
Definition UVec4.h:12
JPH_INLINE UVec4 Swizzle() const
Swizzle the elements in inV.
JPH_INLINE uint32 GetY() const
Definition UVec4.h:103
static JPH_INLINE UVec4 sAnd(UVec4Arg inV1, UVec4Arg inV2)
Logical and (component wise)
Definition UVec4.inl:292
static JPH_INLINE UVec4 sOr(UVec4Arg inV1, UVec4Arg inV2)
Logical or (component wise)
Definition UVec4.inl:250
JPH_INLINE bool TestAllXYZTrue() const
Test if X, Y and Z components are true (true is when highest bit of component is set)
Definition UVec4.inl:663
Type mValue
Definition UVec4.h:223
JPH_INLINE uint32 GetX() const
Get individual components.
Definition UVec4.h:102
static JPH_INLINE UVec4 sXor(UVec4Arg inV1, UVec4Arg inV2)
Logical xor (component wise)
Definition UVec4.inl:271
JPH_INLINE Vec4 ReinterpretAsFloat() const
Reinterpret UVec4 as a Vec4 (doesn't change the bits)
Definition UVec4.inl:527
uint32 mU32[4]
Definition UVec4.h:224
Definition Vec3.h:17
JPH_INLINE bool IsClose(Vec3Arg inV2, float inMaxDistSq=1.0e-12f) const
Test if two vectors are close.
Definition Vec3.inl:490
static JPH_INLINE Vec3 sMax(Vec3Arg inV1, Vec3Arg inV2)
Return the maximum of each of the components.
Definition Vec3.inl:203
JPH_INLINE float Dot(Vec3Arg inV2) const
Dot product.
Definition Vec3.inl:931
JPH_INLINE Vec3 Normalized() const
Normalize vector.
Definition Vec3.inl:973
static JPH_INLINE Type sFixW(Type inValue)
Internal helper function that ensures that the Z component is replicated to the W component to prevent divisions by zero.
Vec4::Type Type
Definition Vec3.h:27
JPH_INLINE bool operator==(Vec3Arg inV2) const
Comparison.
Definition Vec3.inl:485
JPH_INLINE Vec4 SplatX() const
Replicate the X component to all components.
Definition Vec3.inl:761
static JPH_INLINE Vec3 sMin(Vec3Arg inV1, Vec3Arg inV2)
Return the minimum value of each of the components.
Definition Vec3.inl:183
JPH_INLINE Vec3 Cross(Vec3Arg inV2) const
Cross product.
Definition Vec3.inl:855
JPH_INLINE Vec3 GetNormalizedPerpendicular() const
Get normalized vector that is perpendicular to this vector.
Definition Vec3.inl:1119
static Vec3 sRandom(Random &inRandom)
Get random unit vector.
Definition Vec3.inl:473
JPH_INLINE float GetX() const
Get individual components.
Definition Vec3.h:127
JPH_INLINE bool IsNormalized(float inTolerance=1.0e-6f) const
Test if vector is normalized.
Definition Vec3.inl:1030
static JPH_INLINE Vec3 sXor(Vec3Arg inV1, Vec3Arg inV2)
Logical xor (component wise)
Definition Vec3.inl:429
static JPH_INLINE Vec3 sDecompressUnitVector(uint32 inValue)
Decompress a unit vector from a 32 bit value.
Definition Vec3.inl:1236
JPH_INLINE float Length() const
Length of vector.
Definition Vec3.inl:951
static JPH_INLINE UVec4 sGreaterOrEqual(Vec3Arg inV1, Vec3Arg inV2)
Greater than or equal (component wise)
Definition Vec3.inl:328
JPH_INLINE float ReduceMin() const
Get the minimum of X, Y and Z.
Definition Vec3.inl:1105
JPH_INLINE Vec3 & operator-=(Vec3Arg inV2)
Subtract two float vectors (component wise)
Definition Vec3.inl:721
JPH_INLINE float ReduceMax() const
Get the maximum of X, Y and Z.
Definition Vec3.inl:1112
static JPH_INLINE Vec3 sDifferenceOfProducts(Vec3Arg inA, Vec3Arg inB, Vec3Arg inC, Vec3Arg inD)
Calculates inA * inB - inC * inD with more precision when FMA instructions are available....
Definition Vec3.inl:843
static JPH_INLINE UVec4 sLessOrEqual(Vec3Arg inV1, Vec3Arg inV2)
Less than or equal (component wise)
Definition Vec3.inl:278
JPH_INLINE Vec3 operator/(float inV2) const
Divide vector by float.
Definition Vec3.inl:552
friend JPH_INLINE Vec3 operator*(float inV1, Vec3Arg inV2)
Multiply vector with float.
Definition Vec3.inl:535
JPH_INLINE int GetLowestComponentIndex() const
Get index of component with lowest value.
Definition Vec3.inl:809
JPH_INLINE Vec3 & operator/=(float inV2)
Divide vector by float.
Definition Vec3.inl:610
JPH_INLINE Vec4 DotV4(Vec3Arg inV2) const
Dot product, returns the dot product in X, Y, Z and W components.
Definition Vec3.inl:941
JPH_INLINE Vec3 Abs() const
Return the absolute value of each of the components.
Definition Vec3.inl:819
static JPH_INLINE Vec3 sOne()
Vector with all ones.
Definition Vec3.inl:157
JPH_INLINE Vec3 Reciprocal() const
Reciprocal vector (1 / value) for each of the components.
Definition Vec3.inl:838
JPH_INLINE Vec3 NormalizedOr(Vec3Arg inZeroValue) const
Normalize vector or return inZeroValue if the length of the vector is zero.
Definition Vec3.inl:978
JPH_INLINE Vec3 FlipSign() const
Flips the signs of the components, e.g. FlipSign<-1, 1, -1>() will flip the signs of the X and Z components.
Definition Vec3.inl:1189
JPH_INLINE Vec3 operator+(Vec3Arg inV2) const
Add two float vectors (component wise)
Definition Vec3.inl:630
JPH_INLINE float ReduceSum() const
Get the sum of X, Y and Z.
Definition Vec3.inl:898
JPH_INLINE uint32 CompressUnitVector() const
Compress a unit vector to a 32 bit value, precision is around 10^-4.
Definition Vec3.inl:1197
JPH_INLINE Vec4 SplatZ() const
Replicate the Z component to all components.
Definition Vec3.inl:793
JPH_INLINE Vec3 CrossPrecise(Vec3Arg inV2) const
Cross product (more precise version when FMA is available)
Definition Vec3.inl:893
static JPH_INLINE Vec3 sOr(Vec3Arg inV1, Vec3Arg inV2)
Logical or (component wise)
Definition Vec3.inl:411
static JPH_INLINE UVec4 sGreater(Vec3Arg inV1, Vec3Arg inV2)
Greater than (component wise)
Definition Vec3.inl:303
JPH_INLINE void SetZ(float inZ)
Definition Vec3.h:135
static JPH_INLINE Vec3 sAnd(Vec3Arg inV1, Vec3Arg inV2)
Logical and (component wise)
Definition Vec3.inl:447
JPH_INLINE void CheckW() const
Internal helper function that checks that W is equal to Z, so e.g. dividing by it should not generate a division by zero.
static JPH_INLINE Vec3 sUnitSpherical(float inTheta, float inPhi)
Definition Vec3.inl:465
JPH_INLINE UVec4 ToInt() const
Convert each component from a float to an int.
Definition Vec3.inl:1077
Type mValue
Definition Vec3.h:308
JPH_INLINE float GetY() const
Definition Vec3.h:128
JPH_INLINE Vec4 SplatY() const
Replicate the Y component to all components.
Definition Vec3.inl:777
JPH_INLINE Vec3 operator-() const
Negate.
Definition Vec3.inl:669
JPH_INLINE void StoreFloat3(Float3 *outV) const
Store 3 floats to memory.
Definition Vec3.inl:1055
JPH_INLINE float LengthSq() const
Squared length of vector.
Definition Vec3.inl:946
float mF32[4]
Definition Vec3.h:309
static JPH_INLINE UVec4 sEquals(Vec3Arg inV1, Vec3Arg inV2)
Equals (component wise)
Definition Vec3.inl:228
JPH_INLINE bool IsNearZero(float inMaxDistSq=1.0e-12f) const
Test if vector is near zero.
Definition Vec3.inl:495
static JPH_INLINE Vec3 sZero()
Vector with all zeros.
Definition Vec3.inl:125
static JPH_INLINE UVec4 sLess(Vec3Arg inV1, Vec3Arg inV2)
Less than (component wise)
Definition Vec3.inl:253
static JPH_INLINE Vec3 sReplicate(float inV)
Replicate inV across all components.
Definition Vec3.inl:141
static JPH_INLINE Vec3 sClamp(Vec3Arg inV, Vec3Arg inMin, Vec3Arg inMax)
Clamp a vector between min and max (component wise)
Definition Vec3.inl:223
JPH_INLINE Vec3 & operator*=(float inV2)
Multiply vector with float.
Definition Vec3.inl:569
JPH_INLINE Vec3 & operator+=(Vec3Arg inV2)
Add two float vectors (component wise)
Definition Vec3.inl:648
static JPH_INLINE Vec3 sSelect(Vec3Arg inNotSet, Vec3Arg inSet, UVec4Arg inControl)
Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit of inControl = 1.
Definition Vec3.inl:376
JPH_INLINE bool IsNaN() const
Test if vector contains NaN elements.
Definition Vec3.inl:1035
JPH_INLINE Vec3 Sqrt() const
Component wise square root.
Definition Vec3.inl:956
JPH_INLINE UVec4 ReinterpretAsInt() const
Reinterpret Vec3 as a UVec4 (doesn't change the bits)
Definition Vec3.inl:1094
JPH_INLINE Vec3 DotV(Vec3Arg inV2) const
Dot product, returns the dot product in X, Y and Z components.
Definition Vec3.inl:936
static JPH_INLINE Vec3 sLoadFloat3Unsafe(const Float3 &inV)
Load 3 floats from memory (reads 32 bits extra which it doesn't use)
Definition Vec3.inl:167
JPH_INLINE float GetZ() const
Definition Vec3.h:129
JPH_INLINE Vec3 GetSign() const
Get vector that contains the sign of each element (returns 1.0f if positive, -1.0f if negative)
Definition Vec3.inl:1161
static JPH_INLINE Vec3 sNaN()
Vector with all NaN's.
Definition Vec3.inl:162
Vec3()=default
Constructor.
JPH_INLINE int GetHighestComponentIndex() const
Get index of component with highest value.
Definition Vec3.inl:814
static JPH_INLINE Vec3 sFusedMultiplyAdd(Vec3Arg inMul1, Vec3Arg inMul2, Vec3Arg inAdd)
Calculates inMul1 * inMul2 + inAdd.
Definition Vec3.inl:353
JPH_INLINE Vec3 Swizzle() const
Swizzle the elements in inV.
Definition Vec4.h:14
float mF32[4]
Definition Vec4.h:318
JPH_INLINE float GetX() const
Get individual components.
Definition Vec4.h:119
JPH_INLINE float GetY() const
Definition Vec4.h:120
static JPH_INLINE Vec4 sReplicate(float inV)
Replicate inV across all components.
Definition Vec4.inl:97
void SinCos(Vec4 &outSin, Vec4 &outCos) const
Calculate the sine and cosine for each element of this vector (input in radians)
Definition Vec4.inl:1171