Jolt Physics
A multi core friendly Game Physics Engine
Loading...
Searching...
No Matches
Vec4.inl
Go to the documentation of this file.
1// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
2// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
3// SPDX-License-Identifier: MIT
4
6#include <Jolt/Math/Vec3.h>
7#include <Jolt/Math/UVec4.h>
8
10
11// Constructor
13 mValue(inRHS.mValue)
14{
15}
16
17Vec4::Vec4(Vec3Arg inRHS, float inW)
18{
19#if defined(JPH_USE_SSE4_1)
20 mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
21#elif defined(JPH_USE_NEON)
22 mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
23#elif defined(JPH_USE_RVV)
24 const vfloat32m1_t v = __riscv_vle32_v_f32m1(inRHS.mF32, 4);
25 __riscv_vse32_v_f32m1(mF32, v, 4);
26 mF32[3] = inW;
27#else
28 for (int i = 0; i < 3; i++)
29 mF32[i] = inRHS.mF32[i];
30 mF32[3] = inW;
31#endif
32}
33
34Vec4::Vec4(float inX, float inY, float inZ, float inW)
35{
36#if defined(JPH_USE_SSE)
37 mValue = _mm_set_ps(inW, inZ, inY, inX);
38#elif defined(JPH_USE_NEON)
39 uint32x2_t xy = vcreate_u32(static_cast<uint64>(BitCast<uint32>(inX)) | (static_cast<uint64>(BitCast<uint32>(inY)) << 32));
40 uint32x2_t zw = vcreate_u32(static_cast<uint64>(BitCast<uint32>(inZ)) | (static_cast<uint64>(BitCast<uint32>(inW)) << 32));
41 mValue = vreinterpretq_f32_u32(vcombine_u32(xy, zw));
42#elif defined(JPH_USE_RVV)
43 vfloat32m1_t v = __riscv_vfmv_v_f_f32m1(inW, 4);
44 v = __riscv_vfslide1up_vf_f32m1(v, inZ, 4);
45 v = __riscv_vfslide1up_vf_f32m1(v, inY, 4);
46 v = __riscv_vfslide1up_vf_f32m1(v, inX, 4);
47 __riscv_vse32_v_f32m1(mF32, v, 4);
48#else
49 mF32[0] = inX;
50 mF32[1] = inY;
51 mF32[2] = inZ;
52 mF32[3] = inW;
53#endif
54}
55
56template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
58{
59 static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
60 static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
61 static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
62 static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
63
64#if defined(JPH_USE_SSE)
65 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
66#elif defined(JPH_USE_NEON)
67 return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
68#elif defined(JPH_USE_RVV)
69 Vec4 v;
70 const vfloat32m1_t data = __riscv_vle32_v_f32m1(mF32, 4);
71 const uint32 stored_indices[4] = { SwizzleX, SwizzleY, SwizzleZ, SwizzleW };
72 const vuint32m1_t index = __riscv_vle32_v_u32m1(stored_indices, 4);
73 const vfloat32m1_t swizzled = __riscv_vrgather_vv_f32m1(data, index, 4);
74 __riscv_vse32_v_f32m1(v.mF32, swizzled, 4);
75 return v;
76#else
77 return Vec4(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ], mF32[SwizzleW]);
78#endif
79}
80
82{
83#if defined(JPH_USE_SSE)
84 return _mm_setzero_ps();
85#elif defined(JPH_USE_NEON)
86 return vdupq_n_f32(0);
87#elif defined(JPH_USE_RVV)
88 Vec4 v;
89 const vfloat32m1_t zero_vec = __riscv_vfmv_v_f_f32m1(0.0f, 4);
90 __riscv_vse32_v_f32m1(v.mF32, zero_vec, 4);
91 return v;
92#else
93 return Vec4(0, 0, 0, 0);
94#endif
95}
96
98{
99#if defined(JPH_USE_SSE)
100 return _mm_set1_ps(inV);
101#elif defined(JPH_USE_NEON)
102 return vdupq_n_f32(inV);
103#elif defined(JPH_USE_RVV)
104 Vec4 vec;
105 const vfloat32m1_t v = __riscv_vfmv_v_f_f32m1(inV, 4);
106 __riscv_vse32_v_f32m1(vec.mF32, v, 4);
107 return vec;
108#else
109 return Vec4(inV, inV, inV, inV);
110#endif
111}
112
114{
115 return sReplicate(1.0f);
116}
117
119{
120 return sReplicate(numeric_limits<float>::quiet_NaN());
121}
122
124{
125#if defined(JPH_USE_SSE)
126 return _mm_loadu_ps(&inV->x);
127#elif defined(JPH_USE_NEON)
128 return vld1q_f32(&inV->x);
129#elif defined(JPH_USE_RVV)
130 Vec4 vector;
131 const vfloat32m1_t v = __riscv_vle32_v_f32m1(&inV->x, 4);
132 __riscv_vse32_v_f32m1(vector.mF32, v, 4);
133 return vector;
134#else
135 return Vec4(inV->x, inV->y, inV->z, inV->w);
136#endif
137}
138
140{
141#if defined(JPH_USE_SSE)
142 return _mm_load_ps(&inV->x);
143#elif defined(JPH_USE_NEON)
144 return vld1q_f32(&inV->x);
145#elif defined(JPH_USE_RVV)
146 Vec4 vector;
147 vfloat32m1_t v = __riscv_vle32_v_f32m1(&inV->x, 4);
148 __riscv_vse32_v_f32m1(vector.mF32, v, 4);
149 return vector;
150#else
151 return Vec4(inV->x, inV->y, inV->z, inV->w);
152#endif
153}
154
155template <const int Scale>
156Vec4 Vec4::sGatherFloat4(const float *inBase, UVec4Arg inOffsets)
157{
158#if defined(JPH_USE_SSE)
159 #ifdef JPH_USE_AVX2
160 return _mm_i32gather_ps(inBase, inOffsets.mValue, Scale);
161 #else
162 const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
163 Type x = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale));
164 Type y = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale));
165 Type xy = _mm_unpacklo_ps(x, y);
166 Type z = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale));
167 Type w = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale));
168 Type zw = _mm_unpacklo_ps(z, w);
169 return _mm_movelh_ps(xy, zw);
170 #endif
171#elif defined(JPH_USE_RVV)
172 Vec4 v;
173 const vuint32m1_t offsets = __riscv_vle32_v_u32m1(inOffsets.mU32, 4);
174 const vuint32m1_t scaled_offsets = __riscv_vmul_vx_u32m1(offsets, Scale, 4);
175 const vfloat32m1_t gathered = __riscv_vluxei32_v_f32m1(inBase, scaled_offsets, 4);
176 __riscv_vse32_v_f32m1(v.mF32, gathered, 4);
177 return v;
178#else
179 const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
180 float x = *reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale);
181 float y = *reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale);
182 float z = *reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale);
183 float w = *reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale);
184 return Vec4(x, y, z, w);
185#endif
186}
187
189{
190#if defined(JPH_USE_SSE)
191 return _mm_min_ps(inV1.mValue, inV2.mValue);
192#elif defined(JPH_USE_NEON)
193 return vminq_f32(inV1.mValue, inV2.mValue);
194#elif defined(JPH_USE_RVV)
195 Vec4 res;
196 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 4);
197 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
198 const vfloat32m1_t min = __riscv_vfmin_vv_f32m1(v1, v2, 4);
199 __riscv_vse32_v_f32m1(res.mF32, min, 4);
200 return res;
201#else
202 return Vec4(min(inV1.mF32[0], inV2.mF32[0]),
203 min(inV1.mF32[1], inV2.mF32[1]),
204 min(inV1.mF32[2], inV2.mF32[2]),
205 min(inV1.mF32[3], inV2.mF32[3]));
206#endif
207}
208
210{
211#if defined(JPH_USE_SSE)
212 return _mm_max_ps(inV1.mValue, inV2.mValue);
213#elif defined(JPH_USE_NEON)
214 return vmaxq_f32(inV1.mValue, inV2.mValue);
215#elif defined(JPH_USE_RVV)
216 Vec4 res;
217 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 4);
218 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
219 const vfloat32m1_t max = __riscv_vfmax_vv_f32m1(v1, v2, 4);
220 __riscv_vse32_v_f32m1(res.mF32, max, 4);
221 return res;
222#else
223 return Vec4(max(inV1.mF32[0], inV2.mF32[0]),
224 max(inV1.mF32[1], inV2.mF32[1]),
225 max(inV1.mF32[2], inV2.mF32[2]),
226 max(inV1.mF32[3], inV2.mF32[3]));
227#endif
228}
229
231{
232 return sMax(sMin(inV, inMax), inMin);
233}
234
236{
237#if defined(JPH_USE_SSE)
238 return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
239#elif defined(JPH_USE_NEON)
240 return vceqq_f32(inV1.mValue, inV2.mValue);
241#elif defined(JPH_USE_RVV)
242 UVec4 res;
243 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 4);
244 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
245 const vbool32_t mask = __riscv_vmfeq_vv_f32m1_b32(v1, v2, 4);
246 const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 4);
247 const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 4);
248 __riscv_vse32_v_u32m1(res.mU32, merged, 4);
249 return res;
250#else
251 return UVec4(inV1.mF32[0] == inV2.mF32[0]? 0xffffffffu : 0,
252 inV1.mF32[1] == inV2.mF32[1]? 0xffffffffu : 0,
253 inV1.mF32[2] == inV2.mF32[2]? 0xffffffffu : 0,
254 inV1.mF32[3] == inV2.mF32[3]? 0xffffffffu : 0);
255#endif
256}
257
259{
260#if defined(JPH_USE_SSE)
261 return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
262#elif defined(JPH_USE_NEON)
263 return vcltq_f32(inV1.mValue, inV2.mValue);
264#elif defined(JPH_USE_RVV)
265 UVec4 res;
266 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 4);
267 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
268 const vbool32_t mask = __riscv_vmflt_vv_f32m1_b32(v1, v2, 4);
269 const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 4);
270 const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 4);
271 __riscv_vse32_v_u32m1(res.mU32, merged, 4);
272 return res;
273#else
274 return UVec4(inV1.mF32[0] < inV2.mF32[0]? 0xffffffffu : 0,
275 inV1.mF32[1] < inV2.mF32[1]? 0xffffffffu : 0,
276 inV1.mF32[2] < inV2.mF32[2]? 0xffffffffu : 0,
277 inV1.mF32[3] < inV2.mF32[3]? 0xffffffffu : 0);
278#endif
279}
280
282{
283#if defined(JPH_USE_SSE)
284 return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
285#elif defined(JPH_USE_NEON)
286 return vcleq_f32(inV1.mValue, inV2.mValue);
287#elif defined(JPH_USE_RVV)
288 UVec4 res;
289 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 4);
290 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
291 const vbool32_t mask = __riscv_vmfle_vv_f32m1_b32(v1, v2, 4);
292 const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 4);
293 const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 4);
294 __riscv_vse32_v_u32m1(res.mU32, merged, 4);
295 return res;
296#else
297 return UVec4(inV1.mF32[0] <= inV2.mF32[0]? 0xffffffffu : 0,
298 inV1.mF32[1] <= inV2.mF32[1]? 0xffffffffu : 0,
299 inV1.mF32[2] <= inV2.mF32[2]? 0xffffffffu : 0,
300 inV1.mF32[3] <= inV2.mF32[3]? 0xffffffffu : 0);
301#endif
302}
303
305{
306#if defined(JPH_USE_SSE)
307 return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
308#elif defined(JPH_USE_NEON)
309 return vcgtq_f32(inV1.mValue, inV2.mValue);
310#elif defined(JPH_USE_RVV)
311 UVec4 res;
312 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 4);
313 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
314 const vbool32_t mask = __riscv_vmfgt_vv_f32m1_b32(v1, v2, 4);
315 const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 4);
316 const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 4);
317 __riscv_vse32_v_u32m1(res.mU32, merged, 4);
318 return res;
319#else
320 return UVec4(inV1.mF32[0] > inV2.mF32[0]? 0xffffffffu : 0,
321 inV1.mF32[1] > inV2.mF32[1]? 0xffffffffu : 0,
322 inV1.mF32[2] > inV2.mF32[2]? 0xffffffffu : 0,
323 inV1.mF32[3] > inV2.mF32[3]? 0xffffffffu : 0);
324#endif
325}
326
328{
329#if defined(JPH_USE_SSE)
330 return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
331#elif defined(JPH_USE_NEON)
332 return vcgeq_f32(inV1.mValue, inV2.mValue);
333#elif defined(JPH_USE_RVV)
334 UVec4 res;
335 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV1.mF32, 4);
336 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
337 const vbool32_t mask = __riscv_vmfge_vv_f32m1_b32(v1, v2, 4);
338 const vuint32m1_t zeros = __riscv_vmv_v_x_u32m1(0x0, 4);
339 const vuint32m1_t merged = __riscv_vmerge_vxm_u32m1(zeros, 0xFFFFFFFF, mask, 4);
340 __riscv_vse32_v_u32m1(res.mU32, merged, 4);
341 return res;
342#else
343 return UVec4(inV1.mF32[0] >= inV2.mF32[0]? 0xffffffffu : 0,
344 inV1.mF32[1] >= inV2.mF32[1]? 0xffffffffu : 0,
345 inV1.mF32[2] >= inV2.mF32[2]? 0xffffffffu : 0,
346 inV1.mF32[3] >= inV2.mF32[3]? 0xffffffffu : 0);
347#endif
348}
349
351{
352#ifdef JPH_USE_FMADD
353 #ifdef JPH_USE_SSE
354 return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
355 #elif defined(JPH_USE_NEON)
356 return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
357 #elif defined(JPH_USE_RVV)
358 Vec4 res;
359 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inMul1.mF32, 4);
360 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inMul2.mF32, 4);
361 const vfloat32m1_t rvv_add = __riscv_vle32_v_f32m1(inAdd.mF32, 4);
362 const vfloat32m1_t mul = __riscv_vfmul_vv_f32m1(v1, v2, 4);
363 const vfloat32m1_t fmadd = __riscv_vfadd_vv_f32m1(rvv_add, mul, 4);
364 __riscv_vse32_v_f32m1(res.mF32, fmadd, 4);
365 return res;
366 #else
367 return inMul1 * inMul2 + inAdd;
368 #endif
369#else
370 return inMul1 * inMul2 + inAdd;
371#endif
372}
373
374Vec4 Vec4::sSelect(Vec4Arg inNotSet, Vec4Arg inSet, UVec4Arg inControl)
375{
376#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
377 return _mm_blendv_ps(inNotSet.mValue, inSet.mValue, _mm_castsi128_ps(inControl.mValue));
378#elif defined(JPH_USE_SSE)
379 __m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
380 return _mm_or_ps(_mm_and_ps(is_set, inSet.mValue), _mm_andnot_ps(is_set, inNotSet.mValue));
381#elif defined(JPH_USE_NEON)
382 return vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
383#elif defined(JPH_USE_RVV)
384 Vec4 masked;
385 const vuint32m1_t control = __riscv_vle32_v_u32m1(inControl.mU32, 4);
386 const vfloat32m1_t not_set = __riscv_vle32_v_f32m1(inNotSet.mF32, 4);
387 const vfloat32m1_t set = __riscv_vle32_v_f32m1(inSet.mF32, 4);
388
389 // Generate RVV bool mask from UVec4
390 const vuint32m1_t r = __riscv_vand_vx_u32m1(control, 0x80000000u, 4);
391 const vbool32_t rvv_mask = __riscv_vmsne_vx_u32m1_b32(r, 0x0, 4);
392 const vfloat32m1_t merged = __riscv_vmerge_vvm_f32m1(not_set, set, rvv_mask, 4);
393 __riscv_vse32_v_f32m1(masked.mF32, merged, 4);
394 return masked;
395#else
396 Vec4 result;
397 for (int i = 0; i < 4; i++)
398 result.mF32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mF32[i] : inNotSet.mF32[i];
399 return result;
400#endif
401}
402
404{
405#if defined(JPH_USE_SSE)
406 return _mm_or_ps(inV1.mValue, inV2.mValue);
407#elif defined(JPH_USE_NEON)
408 return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
409#elif defined(JPH_USE_RVV)
410 Vec4 or_result;
411 const vuint32m1_t v1 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV1.mF32), 4);
412 const vuint32m1_t v2 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV2.mF32), 4);
413 const vuint32m1_t res = __riscv_vor_vv_u32m1(v1, v2, 4);
414 __riscv_vse32_v_u32m1(reinterpret_cast<uint32 *>(or_result.mF32), res, 4);
415 return or_result;
416#else
418#endif
419}
420
422{
423#if defined(JPH_USE_SSE)
424 return _mm_xor_ps(inV1.mValue, inV2.mValue);
425#elif defined(JPH_USE_NEON)
426 return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
427#elif defined(JPH_USE_RVV)
428 Vec4 xor_result;
429 const vuint32m1_t v1 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV1.mF32), 4);
430 const vuint32m1_t v2 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV2.mF32), 4);
431 const vuint32m1_t res = __riscv_vxor_vv_u32m1(v1, v2, 4);
432 __riscv_vse32_v_u32m1(reinterpret_cast<uint32 *>(xor_result.mF32), res, 4);
433 return xor_result;
434#else
436#endif
437}
438
440{
441#if defined(JPH_USE_SSE)
442 return _mm_and_ps(inV1.mValue, inV2.mValue);
443#elif defined(JPH_USE_NEON)
444 return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
445#elif defined(JPH_USE_RVV)
446 Vec4 and_result;
447 const vuint32m1_t v1 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV1.mF32), 4);
448 const vuint32m1_t v2 = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(inV2.mF32), 4);
449 const vuint32m1_t res = __riscv_vand_vv_u32m1(v1, v2, 4);
450 __riscv_vse32_v_u32m1(reinterpret_cast<uint32 *>(and_result.mF32), res, 4);
451 return and_result;
452#else
454#endif
455}
456
457void Vec4::sSort4(Vec4 &ioValue, UVec4 &ioIndex)
458{
459 // Pass 1, test 1st vs 3rd, 2nd vs 4th
462 UVec4 c1 = sLess(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
463 ioValue = sSelect(ioValue, v1, c1);
464 ioIndex = UVec4::sSelect(ioIndex, i1, c1);
465
466 // Pass 2, test 1st vs 2nd, 3rd vs 4th
469 UVec4 c2 = sLess(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
470 ioValue = sSelect(ioValue, v2, c2);
471 ioIndex = UVec4::sSelect(ioIndex, i2, c2);
472
473 // Pass 3, test 2nd vs 3rd component
476 UVec4 c3 = sLess(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
477 ioValue = sSelect(ioValue, v3, c3);
478 ioIndex = UVec4::sSelect(ioIndex, i3, c3);
479}
480
481void Vec4::sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex)
482{
483 // Pass 1, test 1st vs 3rd, 2nd vs 4th
487 ioValue = sSelect(ioValue, v1, c1);
488 ioIndex = UVec4::sSelect(ioIndex, i1, c1);
489
490 // Pass 2, test 1st vs 2nd, 3rd vs 4th
494 ioValue = sSelect(ioValue, v2, c2);
495 ioIndex = UVec4::sSelect(ioIndex, i2, c2);
496
497 // Pass 3, test 2nd vs 3rd component
501 ioValue = sSelect(ioValue, v3, c3);
502 ioIndex = UVec4::sSelect(ioIndex, i3, c3);
503}
504
506{
507 return sEquals(*this, inV2).TestAllTrue();
508}
509
510bool Vec4::IsClose(Vec4Arg inV2, float inMaxDistSq) const
511{
512 return (inV2 - *this).LengthSq() <= inMaxDistSq;
513}
514
515bool Vec4::IsNearZero(float inMaxDistSq) const
516{
517 return LengthSq() <= inMaxDistSq;
518}
519
520bool Vec4::IsNormalized(float inTolerance) const
521{
522 return abs(LengthSq() - 1.0f) <= inTolerance;
523}
524
525bool Vec4::IsNaN() const
526{
527#if defined(JPH_USE_AVX512)
528 return _mm_fpclass_ps_mask(mValue, 0b10000001) != 0;
529#elif defined(JPH_USE_SSE)
530 return _mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) != 0;
531#elif defined(JPH_USE_NEON)
532 uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
533 return vaddvq_u32(vshrq_n_u32(is_equal, 31)) != 4;
534#elif defined(JPH_USE_RVV)
535 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 4);
536 const vbool32_t mask = __riscv_vmfeq_vv_f32m1_b32(v, v, 4);
537 const uint32 eq = __riscv_vcpop_m_b32(mask, 4);
538 return eq != 4;
539#else
540 return isnan(mF32[0]) || isnan(mF32[1]) || isnan(mF32[2]) || isnan(mF32[3]);
541#endif
542}
543
545{
546#if defined(JPH_USE_SSE)
547 return _mm_mul_ps(mValue, inV2.mValue);
548#elif defined(JPH_USE_NEON)
549 return vmulq_f32(mValue, inV2.mValue);
550#elif defined(JPH_USE_RVV)
551 Vec4 res;
552 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 4);
553 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
554 const vfloat32m1_t mul = __riscv_vfmul_vv_f32m1(v1, v2, 4);
555 __riscv_vse32_v_f32m1(res.mF32, mul, 4);
556 return res;
557#else
558 return Vec4(mF32[0] * inV2.mF32[0],
559 mF32[1] * inV2.mF32[1],
560 mF32[2] * inV2.mF32[2],
561 mF32[3] * inV2.mF32[3]);
562#endif
563}
564
565Vec4 Vec4::operator * (float inV2) const
566{
567#if defined(JPH_USE_SSE)
568 return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
569#elif defined(JPH_USE_NEON)
570 return vmulq_n_f32(mValue, inV2);
571#elif defined(JPH_USE_RVV)
572 Vec4 res;
573 const vfloat32m1_t src = __riscv_vle32_v_f32m1(mF32, 4);
574 const vfloat32m1_t mul = __riscv_vfmul_vf_f32m1(src, inV2, 4);
575 __riscv_vse32_v_f32m1(res.mF32, mul, 4);
576 return res;
577#else
578 return Vec4(mF32[0] * inV2, mF32[1] * inV2, mF32[2] * inV2, mF32[3] * inV2);
579#endif
580}
581
583Vec4 operator * (float inV1, Vec4Arg inV2)
584{
585#if defined(JPH_USE_SSE)
586 return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
587#elif defined(JPH_USE_NEON)
588 return vmulq_n_f32(inV2.mValue, inV1);
589#elif defined(JPH_USE_RVV)
590 Vec4 res;
591 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
592 const vfloat32m1_t mul = __riscv_vfmul_vf_f32m1(v1, inV1, 4);
593 __riscv_vse32_v_f32m1(res.mF32, mul, 4);
594 return res;
595#else
596 return Vec4(inV1 * inV2.mF32[0],
597 inV1 * inV2.mF32[1],
598 inV1 * inV2.mF32[2],
599 inV1 * inV2.mF32[3]);
600#endif
601}
602
603Vec4 Vec4::operator / (float inV2) const
604{
605#if defined(JPH_USE_SSE)
606 return _mm_div_ps(mValue, _mm_set1_ps(inV2));
607#elif defined(JPH_USE_NEON)
608 return vdivq_f32(mValue, vdupq_n_f32(inV2));
609#elif defined(JPH_USE_RVV)
610 Vec4 res;
611 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 4);
612 const vfloat32m1_t div = __riscv_vfdiv_vf_f32m1(v1, inV2, 4);
613 __riscv_vse32_v_f32m1(res.mF32, div, 4);
614 return res;
615#else
616 return Vec4(mF32[0] / inV2, mF32[1] / inV2, mF32[2] / inV2, mF32[3] / inV2);
617#endif
618}
619
621{
622#if defined(JPH_USE_SSE)
623 mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
624#elif defined(JPH_USE_NEON)
625 mValue = vmulq_n_f32(mValue, inV2);
626#elif defined(JPH_USE_RVV)
627 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 4);
628 const vfloat32m1_t res = __riscv_vfmul_vf_f32m1(v1, inV2, 4);
629 __riscv_vse32_v_f32m1(mF32, res, 4);
630#else
631 for (int i = 0; i < 4; ++i)
632 mF32[i] *= inV2;
633#endif
634 return *this;
635}
636
638{
639#if defined(JPH_USE_SSE)
640 mValue = _mm_mul_ps(mValue, inV2.mValue);
641#elif defined(JPH_USE_NEON)
642 mValue = vmulq_f32(mValue, inV2.mValue);
643#elif defined(JPH_USE_RVV)
644 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 4);
645 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
646 const vfloat32m1_t rvv_res = __riscv_vfmul_vv_f32m1(v1, v2, 4);
647 __riscv_vse32_v_f32m1(mF32, rvv_res, 4);
648#else
649 for (int i = 0; i < 4; ++i)
650 mF32[i] *= inV2.mF32[i];
651#endif
652 return *this;
653}
654
656{
657#if defined(JPH_USE_SSE)
658 mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
659#elif defined(JPH_USE_NEON)
660 mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
661#elif defined(JPH_USE_RVV)
662 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 4);
663 const vfloat32m1_t res = __riscv_vfdiv_vf_f32m1(v, inV2, 4);
664 __riscv_vse32_v_f32m1(mF32, res, 4);
665#else
666 for (int i = 0; i < 4; ++i)
667 mF32[i] /= inV2;
668#endif
669 return *this;
670}
671
673{
674#if defined(JPH_USE_SSE)
675 return _mm_add_ps(mValue, inV2.mValue);
676#elif defined(JPH_USE_NEON)
677 return vaddq_f32(mValue, inV2.mValue);
678#elif defined(JPH_USE_RVV)
679 Vec4 res;
680 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 4);
681 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
682 const vfloat32m1_t rvv_add = __riscv_vfadd_vv_f32m1(v1, v2, 4);
683 __riscv_vse32_v_f32m1(res.mF32, rvv_add, 4);
684 return res;
685#else
686 return Vec4(mF32[0] + inV2.mF32[0],
687 mF32[1] + inV2.mF32[1],
688 mF32[2] + inV2.mF32[2],
689 mF32[3] + inV2.mF32[3]);
690#endif
691}
692
694{
695#if defined(JPH_USE_SSE)
696 mValue = _mm_add_ps(mValue, inV2.mValue);
697#elif defined(JPH_USE_NEON)
698 mValue = vaddq_f32(mValue, inV2.mValue);
699#elif defined(JPH_USE_RVV)
700 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 4);
701 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
702 const vfloat32m1_t rvv_add = __riscv_vfadd_vv_f32m1(v1, v2, 4);
703 __riscv_vse32_v_f32m1(mF32, rvv_add, 4);
704#else
705 for (int i = 0; i < 4; ++i)
706 mF32[i] += inV2.mF32[i];
707#endif
708 return *this;
709}
710
712{
713#if defined(JPH_USE_SSE)
714 return _mm_sub_ps(_mm_setzero_ps(), mValue);
715#elif defined(JPH_USE_NEON)
716 #ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
717 return vsubq_f32(vdupq_n_f32(0), mValue);
718 #else
719 return vnegq_f32(mValue);
720 #endif
721#elif defined(JPH_USE_RVV)
722 #ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
723 Vec4 res;
724 const vfloat32m1_t rvv_zero = __riscv_vfmv_v_f_f32m1(0.0f, 4);
725 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 4);
726 const vfloat32m1_t rvv_neg = __riscv_vfsub_vv_f32m1(rvv_zero, v, 4);
727 __riscv_vse32_v_f32m1(res.mF32, rvv_neg, 4);
728 return res;
729 #else
730 Vec4 res;
731 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 4);
732 const vfloat32m1_t rvv_neg = __riscv_vfsgnjn_vv_f32m1(v, v, 4);
733 __riscv_vse32_v_f32m1(res.mF32, rvv_neg, 4);
734 return res;
735 #endif
736#else
737 #ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
738 return Vec4(0.0f - mF32[0], 0.0f - mF32[1], 0.0f - mF32[2], 0.0f - mF32[3]);
739 #else
740 return Vec4(-mF32[0], -mF32[1], -mF32[2], -mF32[3]);
741 #endif
742#endif
743}
744
746{
747#if defined(JPH_USE_SSE)
748 return _mm_sub_ps(mValue, inV2.mValue);
749#elif defined(JPH_USE_NEON)
750 return vsubq_f32(mValue, inV2.mValue);
751#elif defined(JPH_USE_RVV)
752 Vec4 res;
753 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 4);
754 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
755 const vfloat32m1_t rvv_sub = __riscv_vfsub_vv_f32m1(v1, v2, 4);
756 __riscv_vse32_v_f32m1(res.mF32, rvv_sub, 4);
757 return res;
758#else
759 return Vec4(mF32[0] - inV2.mF32[0],
760 mF32[1] - inV2.mF32[1],
761 mF32[2] - inV2.mF32[2],
762 mF32[3] - inV2.mF32[3]);
763#endif
764}
765
767{
768#if defined(JPH_USE_SSE)
769 mValue = _mm_sub_ps(mValue, inV2.mValue);
770#elif defined(JPH_USE_NEON)
771 mValue = vsubq_f32(mValue, inV2.mValue);
772#elif defined(JPH_USE_RVV)
773 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 4);
774 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
775 const vfloat32m1_t rvv_sub = __riscv_vfsub_vv_f32m1(v1, v2, 4);
776 __riscv_vse32_v_f32m1(mF32, rvv_sub, 4);
777#else
778 for (int i = 0; i < 4; ++i)
779 mF32[i] -= inV2.mF32[i];
780#endif
781 return *this;
782}
783
785{
786#if defined(JPH_USE_SSE)
787 return _mm_div_ps(mValue, inV2.mValue);
788#elif defined(JPH_USE_NEON)
789 return vdivq_f32(mValue, inV2.mValue);
790#elif defined(JPH_USE_RVV)
791 Vec4 res;
792 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 4);
793 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
794 const vfloat32m1_t rvv_div = __riscv_vfdiv_vv_f32m1(v1, v2, 4);
795 __riscv_vse32_v_f32m1(res.mF32, rvv_div, 4);
796 return res;
797#else
798 return Vec4(mF32[0] / inV2.mF32[0],
799 mF32[1] / inV2.mF32[1],
800 mF32[2] / inV2.mF32[2],
801 mF32[3] / inV2.mF32[3]);
802#endif
803}
804
806{
807#if defined(JPH_USE_SSE)
808 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
809#elif defined(JPH_USE_NEON)
810 return vdupq_laneq_f32(mValue, 0);
811#elif defined(JPH_USE_RVV)
812 Vec4 vec;
813 const vfloat32m1_t splat = __riscv_vfmv_v_f_f32m1(mF32[0], 4);
814 __riscv_vse32_v_f32m1(vec.mF32, splat, 4);
815 return vec;
816#else
817 return Vec4(mF32[0], mF32[0], mF32[0], mF32[0]);
818#endif
819}
820
822{
823#if defined(JPH_USE_SSE)
824 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
825#elif defined(JPH_USE_NEON)
826 return vdupq_laneq_f32(mValue, 1);
827#elif defined(JPH_USE_RVV)
828 Vec4 vec;
829 const vfloat32m1_t splat = __riscv_vfmv_v_f_f32m1(mF32[1], 4);
830 __riscv_vse32_v_f32m1(vec.mF32, splat, 4);
831 return vec;
832#else
833 return Vec4(mF32[1], mF32[1], mF32[1], mF32[1]);
834#endif
835}
836
838{
839#if defined(JPH_USE_SSE)
840 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
841#elif defined(JPH_USE_NEON)
842 return vdupq_laneq_f32(mValue, 2);
843#elif defined(JPH_USE_RVV)
844 Vec4 vec;
845 const vfloat32m1_t splat = __riscv_vfmv_v_f_f32m1(mF32[2], 4);
846 __riscv_vse32_v_f32m1(vec.mF32, splat, 4);
847 return vec;
848#else
849 return Vec4(mF32[2], mF32[2], mF32[2], mF32[2]);
850#endif
851}
852
854{
855#if defined(JPH_USE_SSE)
856 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
857#elif defined(JPH_USE_NEON)
858 return vdupq_laneq_f32(mValue, 3);
859#elif defined(JPH_USE_RVV)
860 Vec4 vec;
861 const vfloat32m1_t splat = __riscv_vfmv_v_f_f32m1(mF32[3], 4);
862 __riscv_vse32_v_f32m1(vec.mF32, splat, 4);
863 return vec;
864#else
865 return Vec4(mF32[3], mF32[3], mF32[3], mF32[3]);
866#endif
867}
868
870{
871#if defined(JPH_USE_SSE)
872 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
873#elif defined(JPH_USE_NEON)
874 return vdupq_laneq_f32(mValue, 0);
875#elif defined(JPH_USE_RVV)
876 Vec3 vec;
877 const vfloat32m1_t splat = __riscv_vfmv_v_f_f32m1(mF32[0], 3);
878 __riscv_vse32_v_f32m1(vec.mF32, splat, 3);
879 return vec;
880#else
881 return Vec3(mF32[0], mF32[0], mF32[0]);
882#endif
883}
884
886{
887#if defined(JPH_USE_SSE)
888 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
889#elif defined(JPH_USE_NEON)
890 return vdupq_laneq_f32(mValue, 1);
891#elif defined(JPH_USE_RVV)
892 Vec3 vec;
893 const vfloat32m1_t splat = __riscv_vfmv_v_f_f32m1(mF32[1], 3);
894 __riscv_vse32_v_f32m1(vec.mF32, splat, 3);
895 return vec;
896#else
897 return Vec3(mF32[1], mF32[1], mF32[1]);
898#endif
899}
900
902{
903#if defined(JPH_USE_SSE)
904 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
905#elif defined(JPH_USE_NEON)
906 return vdupq_laneq_f32(mValue, 2);
907#elif defined(JPH_USE_RVV)
908 Vec3 vec;
909 const vfloat32m1_t splat = __riscv_vfmv_v_f_f32m1(mF32[2], 3);
910 __riscv_vse32_v_f32m1(vec.mF32, splat, 3);
911 return vec;
912#else
913 return Vec3(mF32[2], mF32[2], mF32[2]);
914#endif
915}
916
918{
919#if defined(JPH_USE_SSE)
920 return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
921#elif defined(JPH_USE_NEON)
922 return vdupq_laneq_f32(mValue, 3);
923#elif defined(JPH_USE_RVV)
924 Vec3 vec;
925 const vfloat32m1_t splat = __riscv_vfmv_v_f_f32m1(mF32[3], 3);
926 __riscv_vse32_v_f32m1(vec.mF32, splat, 3);
927 return vec;
928#else
929 return Vec3(mF32[3], mF32[3], mF32[3]);
930#endif
931}
932
934{
935 // Get the minimum value in all 4 components
937 value = Vec4::sMin(value, value.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>());
938
939 // Compare with the original vector to find which component is equal to the minimum value
940 return CountTrailingZeros(Vec4::sEquals(*this, value).GetTrues());
941}
942
944{
945 // Get the maximum value in all 4 components
947 value = Vec4::sMax(value, value.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>());
948
949 // Compare with the original vector to find which component is equal to the maximum value
950 return CountTrailingZeros(Vec4::sEquals(*this, value).GetTrues());
951}
952
954{
955#if defined(JPH_USE_AVX512)
956 return _mm_range_ps(mValue, mValue, 0b1000);
957#elif defined(JPH_USE_SSE)
958 return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
959#elif defined(JPH_USE_NEON)
960 return vabsq_f32(mValue);
961#elif defined(JPH_USE_RVV)
962 Vec4 res;
963 const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 4);
964 const vfloat32m1_t rvv_abs = __riscv_vfsgnj_vf_f32m1(v, 1.0, 4);
965 __riscv_vse32_v_f32m1(res.mF32, rvv_abs, 4);
966 return res;
967#else
968 return Vec4(abs(mF32[0]), abs(mF32[1]), abs(mF32[2]), abs(mF32[3]));
969#endif
970}
971
973{
974 return sOne() / mValue;
975}
976
978{
979#ifdef JPH_USE_FMADD
980 Vec4 cd = inC * inD;
981 Vec4 err = Vec4::sFusedMultiplyAdd(-inC, inD, cd);
982 Vec4 dop = Vec4::sFusedMultiplyAdd(inA, inB, -cd);
983 return dop + err;
984#else
985 return inA * inB - inC * inD;
986#endif
987}
988
990{
991#if defined(JPH_USE_SSE4_1)
992 return _mm_dp_ps(mValue, inV2.mValue, 0xff);
993#elif defined(JPH_USE_NEON)
994 float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
995 return vdupq_n_f32(vaddvq_f32(mul));
996#elif defined(JPH_USE_RVV)
997 Vec4 res;
998 const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 4);
999 const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
1000 const vfloat32m1_t mul = __riscv_vfmul_vv_f32m1(v1, v2, 4);
1001 vfloat32m1_t dot = RVVSumElementsFloat32x4(mul);
1002 const vfloat32m1_t splat = __riscv_vrgather_vx_f32m1(dot, 0, 4);
1003 __riscv_vse32_v_f32m1(res.mF32, splat, 4);
1004 return res;
1005#else
1006 // Brackets placed so that the order is consistent with the vectorized version
1007 return Vec4::sReplicate((mF32[0] * inV2.mF32[0] + mF32[1] * inV2.mF32[1]) + (mF32[2] * inV2.mF32[2] + mF32[3] * inV2.mF32[3]));
1008#endif
1009}
1010
// Dot product of this and inV2, returned as a scalar (all four components participate)
float Vec4::Dot(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	// Mask 0xff: multiply all 4 lanes, sum goes to lane 0 which we extract
	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
#elif defined(JPH_USE_NEON)
	// Lane-wise multiply followed by a horizontal add across the vector
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vaddvq_f32(mul);
#elif defined(JPH_USE_RVV)
	const vfloat32m1_t v1 = __riscv_vle32_v_f32m1(mF32, 4);
	const vfloat32m1_t v2 = __riscv_vle32_v_f32m1(inV2.mF32, 4);
	const vfloat32m1_t mul = __riscv_vfmul_vv_f32m1(v1, v2, 4);
	// Extract element 0 (the reduced sum) as a scalar
	return __riscv_vfmv_f_s_f32m1_f32(RVVSumElementsFloat32x4(mul));
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return (mF32[0] * inV2.mF32[0] + mF32[1] * inV2.mF32[1]) + (mF32[2] * inV2.mF32[2] + mF32[3] * inV2.mF32[3]);
#endif
}
1028
// Squared length of the vector: the dot product of the vector with itself
float Vec4::LengthSq() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	return vaddvq_f32(mul);
#elif defined(JPH_USE_RVV)
	const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 4);
	const vfloat32m1_t mul = __riscv_vfmul_vv_f32m1(v, v, 4);
	return __riscv_vfmv_f_s_f32m1_f32(RVVSumElementsFloat32x4(mul));
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return (mF32[0] * mF32[0] + mF32[1] * mF32[1]) + (mF32[2] * mF32[2] + mF32[3] * mF32[3]);
#endif
}
1045
// Length (Euclidean norm) of the vector: sqrt of the dot product with itself
float Vec4::Length() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
	return vget_lane_f32(vsqrt_f32(sum), 0);
#elif defined(JPH_USE_RVV)
	const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 4);
	const vfloat32m1_t mul = __riscv_vfmul_vv_f32m1(v, v, 4);
	const vfloat32m1_t sum = RVVSumElementsFloat32x4(mul);
	// vl = 1: only element 0 holds the sum, so only that lane needs the sqrt
	const vfloat32m1_t sqrt = __riscv_vfsqrt_v_f32m1(sum, 1);
	return __riscv_vfmv_f_s_f32m1_f32(sqrt);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return JPH::Sqrt((mF32[0] * mF32[0] + mF32[1] * mF32[1]) + (mF32[2] * mF32[2] + mF32[3] * mF32[3]));
#endif
}
1065
{
	// Component wise square root; each lane is replaced by sqrt(lane)
#if defined(JPH_USE_SSE)
	return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vsqrtq_f32(mValue);
#elif defined(JPH_USE_RVV)
	Vec4 res;
	const vfloat32m1_t rvv_v = __riscv_vle32_v_f32m1(mF32, 4);
	const vfloat32m1_t rvv_sqrt = __riscv_vfsqrt_v_f32m1(rvv_v, 4);
	__riscv_vse32_v_f32m1(res.mF32, rvv_sqrt, 4);
	return res;
#else
	return Vec4(JPH::Sqrt(mF32[0]), JPH::Sqrt(mF32[1]), JPH::Sqrt(mF32[2]), JPH::Sqrt(mF32[3]));
#endif
}
1082
1083
{
	// Returns +1.0f for lanes with the sign bit clear and -1.0f for lanes with it set
#if defined(JPH_USE_AVX512)
	// _mm_fixupimm_ps classifies each lane and maps it through the 0xA9A90100 lookup table;
	// OR-ing with 1.0f then forces the magnitude to exactly one.
	// NOTE(review): the table encoding is taken on trust - confirm against the Intel intrinsics guide.
	Type one = _mm_set1_ps(1.0f);
	return _mm_or_ps(_mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90100), 0), one);
#elif defined(JPH_USE_SSE)
	// value & -1.0f keeps the sign bit (the other surviving bits are a subset of 1.0f's
	// bit pattern), OR-ing in 1.0f then yields exactly +/-1.0f
	Type minus_one = _mm_set1_ps(-1.0f);
	Type one = _mm_set1_ps(1.0f);
	return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
#elif defined(JPH_USE_NEON)
	// Same bit trick as the SSE path, expressed through integer reinterpretation
	Type minus_one = vdupq_n_f32(-1.0f);
	Type one = vdupq_n_f32(1.0f);
	return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
#elif defined(JPH_USE_RVV)
	// vfsgnj copies the sign of each input lane onto 1.0f
	Vec4 res;
	const vfloat32m1_t rvv_in = __riscv_vle32_v_f32m1(mF32, 4);
	const vfloat32m1_t rvv_one = __riscv_vfmv_v_f_f32m1(1.0, 4);
	const vfloat32m1_t rvv_signs = __riscv_vfsgnj_vv_f32m1(rvv_one, rvv_in, 4);
	__riscv_vse32_v_f32m1(res.mF32, rvv_signs, 4);
	return res;
#else
	// signbit distinguishes -0.0f from +0.0f, matching the bit tests of the SIMD paths
	return Vec4(std::signbit(mF32[0])? -1.0f : 1.0f,
				std::signbit(mF32[1])? -1.0f : 1.0f,
				std::signbit(mF32[2])? -1.0f : 1.0f,
				std::signbit(mF32[3])? -1.0f : 1.0f);
#endif
}
1111
1112template <int X, int Y, int Z, int W>
1113JPH_INLINE Vec4 Vec4::FlipSign() const
1114{
1115 static_assert(X == 1 || X == -1, "X must be 1 or -1");
1116 static_assert(Y == 1 || Y == -1, "Y must be 1 or -1");
1117 static_assert(Z == 1 || Z == -1, "Z must be 1 or -1");
1118 static_assert(W == 1 || W == -1, "W must be 1 or -1");
1119 return Vec4::sXor(*this, Vec4(X > 0? 0.0f : -0.0f, Y > 0? 0.0f : -0.0f, Z > 0? 0.0f : -0.0f, W > 0? 0.0f : -0.0f));
1120}
1121
{
	// Returns this vector scaled to unit length (divides by Length()).
	// NOTE(review): there is no zero-length guard in any branch - a zero vector divides by zero; callers presumably ensure a non-zero input.
#if defined(JPH_USE_SSE4_1)
	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
	return vdivq_f32(mValue, vsqrtq_f32(sum));
#elif defined(JPH_USE_RVV)
	const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 4);
	const vfloat32m1_t mul = __riscv_vfmul_vv_f32m1(v, v, 4);

	// Reduce to the squared length, broadcast it, take the sqrt and divide
	const vfloat32m1_t sum = RVVSumElementsFloat32x4(mul);
	const vfloat32m1_t sum_splat = __riscv_vrgather_vx_f32m1(sum, 0, 4);
	const vfloat32m1_t sqrt = __riscv_vfsqrt_v_f32m1(sum_splat, 4);
	const vfloat32m1_t norm_v = __riscv_vfdiv_vv_f32m1(v, sqrt, 4);

	Vec4 vec;
	__riscv_vse32_v_f32m1(vec.mF32, norm_v, 4);
	return vec;
#else
	return *this / Length();
#endif
}
1146
// Store the 4 components to the (possibly unaligned) Float4 pointed to by outV
void Vec4::StoreFloat4(Float4 *outV) const
{
#if defined(JPH_USE_SSE)
	// Unaligned store: Float4 carries no alignment guarantee
	_mm_storeu_ps(&outV->x, mValue);
#elif defined(JPH_USE_NEON)
	vst1q_f32(&outV->x, mValue);
#elif defined(JPH_USE_RVV)
	const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 4);
	__riscv_vse32_v_f32m1(&outV->x, v, 4);
#else
	// Relies on x, y, z, w being laid out consecutively in Float4
	for (int i = 0; i < 4; ++i)
		(&outV->x)[i] = mF32[i];
#endif
}
1161
{
	// Convert each component from float to integer, truncating towards zero.
	// NOTE(review): the SSE path truncates via a *signed* conversion (_mm_cvttps_epi32) while the
	// NEON/RVV/scalar paths convert to *unsigned*; results differ for negative or >= 2^31 inputs.
	// Presumably callers only pass values representable in both - confirm.
#if defined(JPH_USE_SSE)
	return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
	return vcvtq_u32_f32(mValue);
#elif defined(JPH_USE_RVV)
	UVec4 res;
	const vfloat32m1_t v = __riscv_vle32_v_f32m1(mF32, 4);
	// rtz = round towards zero, matching C's float-to-int truncation
	const vuint32m1_t cast = __riscv_vfcvt_rtz_xu_f_v_u32m1(v, 4);
	__riscv_vse32_v_u32m1(res.mU32, cast, 4);
	return res;
#else
	return UVec4(uint32(mF32[0]), uint32(mF32[1]), uint32(mF32[2]), uint32(mF32[3]));
#endif
}
1178
{
	// Reinterpret the 128 bits of this Vec4 as a UVec4 without any conversion
#if defined(JPH_USE_SSE)
	return UVec4(_mm_castps_si128(mValue));
#elif defined(JPH_USE_NEON)
	return vreinterpretq_u32_f32(mValue);
#else
	// NOTE(review): type-punning through reinterpret_cast relies on Vec4 and UVec4 sharing layout;
	// strictly this skirts aliasing rules but matches the library's existing convention.
	return *reinterpret_cast<const UVec4 *>(this);
#endif
}
1189
{
	// Pack the sign bit of each component into an int: X -> bit 0, Y -> bit 1, Z -> bit 2, W -> bit 3
#if defined(JPH_USE_SSE)
	return _mm_movemask_ps(mValue);
#elif defined(JPH_USE_NEON)
	// Shift each sign bit down to bit 0, shift it into its lane's position, then sum the lanes
	int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
	return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
#elif defined(JPH_USE_RVV)
	// Compare the isolated sign bits against zero to form a mask register, then read the
	// packed mask bits (one bit per lane) out of element 0 and keep the low 4 bits
	const vuint32m1_t v = __riscv_vle32_v_u32m1(reinterpret_cast<const uint32 *>(mF32), 4);
	const vuint32m1_t shifted = __riscv_vsrl_vx_u32m1(v, 31, 4);
	const vbool32_t mask = __riscv_vmsne_vx_u32m1_b32(shifted, 0x0, 4);
	const vuint32m1_t as_int = __riscv_vreinterpret_v_b32_u32m1(mask);
	const uint32 result = __riscv_vmv_x_s_u32m1_u32(as_int) & 0xF;
	return result;
#else
	return (std::signbit(mF32[0])? 1 : 0) | (std::signbit(mF32[1])? 2 : 0) | (std::signbit(mF32[2])? 4 : 0) | (std::signbit(mF32[3])? 8 : 0);
#endif
}
1208
1215
1222
1229
// Compute sine and cosine of each component at once (input in radians), writing the results to outSin and outCos
void Vec4::SinCos(Vec4 &outSin, Vec4 &outCos) const
{
	// Implementation based on sinf.c from the cephes library, combines sinf and cosf in a single function, changes octants to quadrants and vectorizes it
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive and remember sign for sin only since cos is symmetric around x (highest bit of a float is the sign bit)
	UVec4 sin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 x = Vec4::sXor(*this, sin_sign.ReinterpretAsFloat());

	// x / (PI / 2) rounded to nearest int gives us the quadrant closest to x (0.6366... = 2 / PI)
	UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();

	// Make x relative to the closest quadrant.
	// This does x = x - quadrant * PI / 2 using a two step Cody-Waite argument reduction.
	// This improves the accuracy of the result by avoiding loss of significant bits in the subtraction.
	// We start with x = x - quadrant * PI / 2, PI / 2 in hexadecimal notation is 0x3fc90fdb, we remove the lowest 16 bits to
	// get 0x3fc90000 (= 1.5703125) this means we can now multiply with a number of up to 2^16 without losing any bits.
	// This leaves us with: x = (x - quadrant * 1.5703125) - quadrant * (PI / 2 - 1.5703125).
	// PI / 2 - 1.5703125 in hexadecimal is 0x39fdaa22, stripping the lowest 12 bits we get 0x39fda000 (= 0.0004837512969970703125)
	// This leaves us with: x = ((x - quadrant * 1.5703125) - quadrant * 0.0004837512969970703125) - quadrant * (PI / 2 - 1.5703125 - 0.0004837512969970703125)
	// See: https://stackoverflow.com/questions/42455143/sine-cosine-modular-extended-precision-arithmetic
	// After this we have x in the range [-PI / 4, PI / 4].
	Vec4 float_quadrant = quadrant.ToFloat();
	x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;

	// Calculate x2 = x^2
	Vec4 x2 = x * x;

	// Taylor expansion:
	// Cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + x^8/8! + ... = (((x2/8!- 1/6!) * x2 + 1/4!) * x2 - 1/2!) * x2 + 1
	Vec4 taylor_cos = ((2.443315711809948e-5f * x2 - Vec4::sReplicate(1.388731625493765e-3f)) * x2 + Vec4::sReplicate(4.166664568298827e-2f)) * x2 * x2 - 0.5f * x2 + Vec4::sOne();
	// Sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ... = ((-x2/7! + 1/5!) * x2 - 1/3!) * x2 * x + x
	Vec4 taylor_sin = ((-1.9515295891e-4f * x2 + Vec4::sReplicate(8.3321608736e-3f)) * x2 - Vec4::sReplicate(1.6666654611e-1f)) * x2 * x + x;

	// The lowest 2 bits of quadrant indicate the quadrant that we are in.
	// Let x be the original input value and x' our value that has been mapped to the range [-PI / 4, PI / 4].
	// since cos(x) = sin(x - PI / 2) and since we want to use the Taylor expansion as close as possible to 0,
	// we can alternate between using the Taylor expansion for sin and cos according to the following table:
	//
	// quadrant	 sin(x)		 cos(x)
	// XXX00b	 sin(x')	 cos(x')
	// XXX01b	 cos(x')	-sin(x')
	// XXX10b	-sin(x')	-cos(x')
	// XXX11b	-cos(x')	 sin(x')
	//
	// So: sin_sign = bit2, cos_sign = bit1 ^ bit2, bit1 determines if we use sin or cos Taylor expansion
	UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
	UVec4 bit2 = UVec4::sAnd(quadrant.LogicalShiftLeft<30>(), UVec4::sReplicate(0x80000000U));

	// Select which one of the results is sin and which one is cos
	Vec4 s = Vec4::sSelect(taylor_sin, taylor_cos, bit1);
	Vec4 c = Vec4::sSelect(taylor_cos, taylor_sin, bit1);

	// Update the signs
	sin_sign = UVec4::sXor(sin_sign, bit2);
	UVec4 cos_sign = UVec4::sXor(bit1, bit2);

	// Correct the signs
	outSin = Vec4::sXor(s, sin_sign.ReinterpretAsFloat());
	outCos = Vec4::sXor(c, cos_sign.ReinterpretAsFloat());
}
1291
{
	// Implementation based on tanf.c from the cephes library, see Vec4::SinCos for further details
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive (tan is odd, so the sign is reapplied at the end)
	UVec4 tan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 x = Vec4::sXor(*this, tan_sign.ReinterpretAsFloat());

	// x / (PI / 2) rounded to nearest int gives us the quadrant closest to x (0.6366... = 2 / PI)
	UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();

	// Remap x to range [-PI / 4, PI / 4], see Vec4::SinCos
	Vec4 float_quadrant = quadrant.ToFloat();
	x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;

	// Calculate x2 = x^2
	Vec4 x2 = x * x;

	// Roughly equivalent to the Taylor expansion:
	// Tan(x) = x + x^3/3 + 2*x^5/15 + 17*x^7/315 + 62*x^9/2835 + ...
	Vec4 tan =
		(((((9.38540185543e-3f * x2 + Vec4::sReplicate(3.11992232697e-3f)) * x2 + Vec4::sReplicate(2.44301354525e-2f)) * x2
		+ Vec4::sReplicate(5.34112807005e-2f)) * x2 + Vec4::sReplicate(1.33387994085e-1f)) * x2 + Vec4::sReplicate(3.33331568548e-1f)) * x2 * x + x;

	// For the 2nd and 4th quadrant we need to invert the value (tan(x) = -1 / tan(x'))
	UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
	tan = Vec4::sSelect(tan, Vec4::sReplicate(-1.0f) / (tan JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))), bit1); // Add small epsilon to prevent div by zero, works because tan is always positive

	// Put the sign back
	return Vec4::sXor(tan, tan_sign.ReinterpretAsFloat());
}
1324
{
	// Per component arc sine; results lie in [-PI / 2, PI / 2]
	// Implementation based on asinf.c from the cephes library
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive (asin is odd, the sign is reapplied at the end)
	UVec4 asin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 a = Vec4::sXor(*this, asin_sign.ReinterpretAsFloat());

	// ASin is not defined outside the range [-1, 1] but it often happens that a value is slightly above 1 so we just clamp here
	a = Vec4::sMin(a, Vec4::sOne());

	// When |x| <= 0.5 we use the asin approximation as is
	Vec4 z1 = a * a;
	Vec4 x1 = a;

	// When |x| > 0.5 we use the identity asin(x) = PI / 2 - 2 * asin(sqrt((1 - x) / 2))
	Vec4 z2 = 0.5f * (Vec4::sOne() - a);
	Vec4 x2 = z2.Sqrt();

	// Select which of the two situations we have
	UVec4 greater = Vec4::sGreater(a, Vec4::sReplicate(0.5f));
	Vec4 z = Vec4::sSelect(z1, z2, greater);
	Vec4 x = Vec4::sSelect(x1, x2, greater);

	// Polynomial approximation of asin
	z = ((((4.2163199048e-2f * z + Vec4::sReplicate(2.4181311049e-2f)) * z + Vec4::sReplicate(4.5470025998e-2f)) * z + Vec4::sReplicate(7.4953002686e-2f)) * z + Vec4::sReplicate(1.6666752422e-1f)) * z * x + x;

	// If |x| > 0.5 we need to apply the remainder of the identity above
	z = Vec4::sSelect(z, Vec4::sReplicate(0.5f * JPH_PI) - (z + z), greater);

	// Put the sign back
	return Vec4::sXor(z, asin_sign.ReinterpretAsFloat());
}
1359
{
	// Per component arc cosine via the identity acos(x) = PI / 2 - asin(x)
	// Not the most accurate, but simple
	return Vec4::sReplicate(0.5f * JPH_PI) - ASin();
}
1365
{
	// Per component arc tangent; results lie in [-PI / 2, PI / 2]
	// Implementation based on atanf.c from the cephes library
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive (atan is odd, the sign is reapplied at the end)
	UVec4 atan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 x = Vec4::sXor(*this, atan_sign.ReinterpretAsFloat());
	Vec4 y = Vec4::sZero();

	// If x > Tan(PI / 8)
	UVec4 greater1 = Vec4::sGreater(x, Vec4::sReplicate(0.4142135623730950f));
	Vec4 x1 = (x - Vec4::sOne()) / (x + Vec4::sOne());

	// If x > Tan(3 * PI / 8)
	UVec4 greater2 = Vec4::sGreater(x, Vec4::sReplicate(2.414213562373095f));
	Vec4 x2 = Vec4::sReplicate(-1.0f) / (x JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))); // Add small epsilon to prevent div by zero, works because x is always positive

	// Apply first if: atan(x) = PI / 4 + atan((x - 1) / (x + 1))
	x = Vec4::sSelect(x, x1, greater1);
	y = Vec4::sSelect(y, Vec4::sReplicate(0.25f * JPH_PI), greater1);

	// Apply second if: atan(x) = PI / 2 + atan(-1 / x), note it overrides the first when both tests pass
	x = Vec4::sSelect(x, x2, greater2);
	y = Vec4::sSelect(y, Vec4::sReplicate(0.5f * JPH_PI), greater2);

	// Polynomial approximation
	Vec4 z = x * x;
	y += (((8.05374449538e-2f * z - Vec4::sReplicate(1.38776856032e-1f)) * z + Vec4::sReplicate(1.99777106478e-1f)) * z - Vec4::sReplicate(3.33329491539e-1f)) * z * x + x;

	// Put the sign back
	return Vec4::sXor(y, atan_sign.ReinterpretAsFloat());
}
1399
{
	// Per component atan2(inY, inX): uses the signs of both arguments to pick the correct quadrant
	UVec4 sign_mask = UVec4::sReplicate(0x80000000U);

	// Determine absolute value and sign of y
	UVec4 y_sign = UVec4::sAnd(inY.ReinterpretAsInt(), sign_mask);
	Vec4 y_abs = Vec4::sXor(inY, y_sign.ReinterpretAsFloat());

	// Determine absolute value and sign of x
	UVec4 x_sign = UVec4::sAnd(inX.ReinterpretAsInt(), sign_mask);
	Vec4 x_abs = Vec4::sXor(inX, x_sign.ReinterpretAsFloat());

	// Always divide smallest / largest to avoid dividing by zero
	UVec4 x_is_numerator = Vec4::sLess(x_abs, y_abs);
	Vec4 numerator = Vec4::sSelect(y_abs, x_abs, x_is_numerator);
	Vec4 denominator = Vec4::sSelect(x_abs, y_abs, x_is_numerator);
	Vec4 atan = (numerator / denominator).ATan();

	// If we calculated x / y instead of y / x the result is PI / 2 - result (note that this is true because we know the result is positive because the input was positive)
	atan = Vec4::sSelect(atan, Vec4::sReplicate(0.5f * JPH_PI) - atan, x_is_numerator);

	// Now we need to map to the correct quadrant
	// x_sign	y_sign	result
	// +1		+1		atan
	// -1		+1		-atan + PI
	// -1		-1		atan - PI
	// +1		-1		-atan
	// This can be written as: x_sign * y_sign * (atan - (x_sign < 0? PI : 0))
	// ArithmeticShiftRight<31> turns the sign bit into an all-ones / all-zeros mask for the PI subtraction
	atan -= Vec4::sAnd(x_sign.ArithmeticShiftRight<31>().ReinterpretAsFloat(), Vec4::sReplicate(JPH_PI));
	atan = Vec4::sXor(atan, UVec4::sXor(x_sign, y_sign).ReinterpretAsFloat());
	return atan;
}
1432
{
	// Compress a unit vector into 32 bits, layout:
	// bit 31:      sign of the highest-magnitude component
	// bits 29..30: index (0..3) of the highest-magnitude component
	// bits 0..26:  the three remaining components, each quantized to cNumBits (9) bits
	// The dropped component can be reconstructed from the unit-length constraint, see sDecompressUnitVector.
	constexpr float cOneOverSqrt2 = 0.70710678f;
	constexpr uint cNumBits = 9;
	constexpr uint cMask = (1 << cNumBits) - 1;
	constexpr uint cMaxValue = cMask - 1; // Need odd number of buckets to quantize to or else we can't encode 0
	constexpr float cScale = float(cMaxValue) / (2.0f * cOneOverSqrt2);

	// Store sign bit
	Vec4 v = *this;
	uint32 max_element = v.Abs().GetHighestComponentIndex();
	uint32 value = 0;
	if (v[max_element] < 0.0f)
	{
		value = 0x80000000u;
		v = -v;
	}

	// Store highest component
	value |= max_element << 29;

	// Store the other three components in a compressed format
	// The remaining components lie in [-1/sqrt(2), 1/sqrt(2)], map that range to [0, cMaxValue] and round to nearest
	UVec4 compressed = Vec4::sClamp((v + Vec4::sReplicate(cOneOverSqrt2)) * cScale + Vec4::sReplicate(0.5f), Vec4::sZero(), Vec4::sReplicate(cMaxValue)).ToInt();

	// Shift the components after the highest one down so the lowest 3 lanes hold the values to store
	switch (max_element)
	{
	case 0:
		compressed = compressed.Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_UNUSED>();
		break;

	case 1:
		compressed = compressed.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_UNUSED>();
		break;

	case 2:
		compressed = compressed.Swizzle<SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_UNUSED>();
		break;
	}

	// Pack the three quantized components into the low 27 bits
	value |= compressed.GetX();
	value |= compressed.GetY() << cNumBits;
	value |= compressed.GetZ() << 2 * cNumBits;
	return value;
}
1476
1478{
1479 constexpr float cOneOverSqrt2 = 0.70710678f;
1480 constexpr uint cNumBits = 9;
1481 constexpr uint cMask = (1u << cNumBits) - 1;
1482 constexpr uint cMaxValue = cMask - 1; // Need odd number of buckets to quantize to or else we can't encode 0
1483 constexpr float cScale = 2.0f * cOneOverSqrt2 / float(cMaxValue);
1484
1485 // Restore three components
1486 Vec4 v = Vec4(UVec4(inValue & cMask, (inValue >> cNumBits) & cMask, (inValue >> (2 * cNumBits)) & cMask, 0).ToFloat()) * cScale - Vec4(cOneOverSqrt2, cOneOverSqrt2, cOneOverSqrt2, 0.0f);
1487 JPH_ASSERT(v.GetW() == 0.0f);
1488
1489 // Restore the highest component
1490 v.SetW(JPH::Sqrt(max(1.0f - v.LengthSq(), 0.0f)));
1491
1492 // Extract sign
1493 if ((inValue & 0x80000000u) != 0)
1494 v = -v;
1495
1496 // Swizzle the components in place
1497 switch ((inValue >> 29) & 3)
1498 {
1499 case 0:
1501 break;
1502
1503 case 1:
1505 break;
1506
1507 case 2:
1509 break;
1510 }
1511
1512 return v;
1513}
1514
std::uint8_t uint8
Definition Core.h:510
std::uint64_t uint64
Definition Core.h:514
unsigned int uint
Definition Core.h:509
#define JPH_NAMESPACE_END
Definition Core.h:433
std::uint32_t uint32
Definition Core.h:512
#define JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(...)
Definition Core.h:584
#define JPH_NAMESPACE_BEGIN
Definition Core.h:427
#define xy
Definition HLSLToCPP.h:511
#define JPH_ASSERT(...)
Definition IssueReporting.h:33
uint CountTrailingZeros(uint32 inValue)
Compute number of trailing zero bits (how many low bits are zero)
Definition Math.h:145
JPH_INLINE To BitCast(const From &inValue)
Definition Math.h:239
@ SWIZZLE_Z
Use the Z component.
Definition Swizzle.h:14
@ SWIZZLE_W
Use the W component.
Definition Swizzle.h:15
@ SWIZZLE_X
Use the X component.
Definition Swizzle.h:12
@ SWIZZLE_UNUSED
We always use the Z component when we don't specifically want to initialize a value,...
Definition Swizzle.h:16
@ SWIZZLE_Y
Use the Y component.
Definition Swizzle.h:13
Vec4 operator*(float inV1, Vec4Arg inV2)
Multiply vector with float.
Definition Vec4.inl:583
Class that holds 4 float values. Convert to Vec4 to perform calculations.
Definition Float4.h:11
float x
Definition Float4.h:36
float y
Definition Float4.h:37
float z
Definition Float4.h:38
float w
Definition Float4.h:39
Definition UVec4.h:12
JPH_INLINE UVec4 Swizzle() const
Swizzle the elements in inV.
JPH_INLINE uint32 GetZ() const
Definition UVec4.h:104
JPH_INLINE UVec4 LogicalShiftLeft() const
Shift all components by Count bits to the left (filling with zeros from the left)
static JPH_INLINE UVec4 sSelect(UVec4Arg inNotSet, UVec4Arg inSet, UVec4Arg inControl)
Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit ...
Definition UVec4.inl:221
JPH_INLINE uint32 GetY() const
Definition UVec4.h:103
static JPH_INLINE UVec4 sReplicate(uint32 inV)
Replicate int inV across all components.
Definition UVec4.inl:75
JPH_INLINE bool TestAllTrue() const
Test if all components are true (true is when highest bit of component is set)
Definition UVec4.inl:658
static JPH_INLINE UVec4 sAnd(UVec4Arg inV1, UVec4Arg inV2)
Logical and (component wise)
Definition UVec4.inl:292
static JPH_INLINE UVec4 sOr(UVec4Arg inV1, UVec4Arg inV2)
Logical or (component wise)
Definition UVec4.inl:250
JPH_INLINE uint32 GetW() const
Definition UVec4.h:105
Type mValue
Definition UVec4.h:223
JPH_INLINE uint32 GetX() const
Get individual components.
Definition UVec4.h:102
static JPH_INLINE UVec4 sXor(UVec4Arg inV1, UVec4Arg inV2)
Logical xor (component wise)
Definition UVec4.inl:271
JPH_INLINE UVec4 ArithmeticShiftRight() const
Shift all components by Count bits to the right (shifting in the value of the highest bit)
JPH_INLINE Vec4 ToFloat() const
Convert each component from an int to a float.
Definition UVec4.inl:510
JPH_INLINE Vec4 ReinterpretAsFloat() const
Reinterpret UVec4 as a Vec4 (doesn't change the bits)
Definition UVec4.inl:527
uint32 mU32[4]
Definition UVec4.h:224
Definition Vec3.h:17
Type mValue
Definition Vec3.h:305
float mF32[4]
Definition Vec3.h:306
Definition Vec4.h:14
JPH_INLINE bool IsNearZero(float inMaxDistSq=1.0e-12f) const
Test if vector is near zero.
Definition Vec4.inl:515
JPH_INLINE Vec4 SplatX() const
Replicate the X component to all components.
Definition Vec4.inl:805
static JPH_INLINE void sSort4(Vec4 &ioValue, UVec4 &ioIndex)
Definition Vec4.inl:457
Vec4 ATan() const
Calculate the arc tangent for each element of this vector (returns value in the range [-PI / 2,...
Definition Vec4.inl:1366
static JPH_INLINE UVec4 sGreater(Vec4Arg inV1, Vec4Arg inV2)
Greater than (component wise)
Definition Vec4.inl:304
float mF32[4]
Definition Vec4.h:318
JPH_INLINE Vec3 SplatW3() const
Replicate the W component to all components.
Definition Vec4.inl:917
JPH_INLINE Vec4 operator-() const
Negate.
Definition Vec4.inl:711
Vec4()=default
Constructor.
static JPH_INLINE Vec4 sAnd(Vec4Arg inV1, Vec4Arg inV2)
Logical and (component wise)
Definition Vec4.inl:439
static JPH_INLINE Vec4 sLoadFloat4Aligned(const Float4 *inV)
Load 4 floats from memory, 16 bytes aligned.
Definition Vec4.inl:139
static Vec4 sATan2(Vec4Arg inY, Vec4Arg inX)
Calculate the arc tangent of y / x using the signs of the arguments to determine the correct quadrant...
Definition Vec4.inl:1400
JPH_INLINE void SetW(float inW)
Definition Vec4.h:129
JPH_INLINE Vec4 GetSign() const
Get vector that contains the sign of each element (returns 1.0f if positive, -1.0f if negative)
Definition Vec4.inl:1084
static JPH_INLINE Vec4 sDifferenceOfProducts(Vec4Arg inA, Vec4Arg inB, Vec4Arg inC, Vec4Arg inD)
Calculates inA * inB - inC * inD with more precision when FMA instructions are available....
Definition Vec4.inl:977
Vec4 ASin() const
Definition Vec4.inl:1325
JPH_INLINE Vec4 FlipSign() const
Flips the signs of the components, e.g. FlipSign<-1, 1, -1, 1>() will flip the signs of the X and Z c...
Definition Vec4.inl:1113
static JPH_INLINE Vec4 sXor(Vec4Arg inV1, Vec4Arg inV2)
Logical xor (component wise)
Definition Vec4.inl:421
JPH_INLINE Vec4 Abs() const
Return the absolute value of each of the components.
Definition Vec4.inl:953
JPH_INLINE Vec4 operator/(float inV2) const
Divide vector by float.
Definition Vec4.inl:603
Vec4 Tan() const
Calculate the tangent for each element of this vector (input in radians)
Definition Vec4.inl:1292
JPH_INLINE float GetW() const
Definition Vec4.h:122
JPH_INLINE UVec4 ToInt() const
Convert each component from a float to an int.
Definition Vec4.inl:1162
JPH_INLINE Vec4 & operator+=(Vec4Arg inV2)
Add two float vectors (component wise)
Definition Vec4.inl:693
static JPH_INLINE UVec4 sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2)
Less than or equal (component wise)
Definition Vec4.inl:281
static JPH_INLINE UVec4 sLess(Vec4Arg inV1, Vec4Arg inV2)
Less than (component wise)
Definition Vec4.inl:258
JPH_INLINE int GetLowestComponentIndex() const
Get index of component with lowest value.
Definition Vec4.inl:933
JPH_INLINE float Length() const
Length of vector.
Definition Vec4.inl:1046
static JPH_INLINE void sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex)
Definition Vec4.inl:481
static JPH_INLINE Vec4 sOne()
Vector with all ones.
Definition Vec4.inl:113
static JPH_INLINE Vec4 sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
Calculates inMul1 * inMul2 + inAdd.
Definition Vec4.inl:350
JPH_INLINE Vec4 Normalized() const
Normalize vector.
Definition Vec4.inl:1122
static JPH_INLINE UVec4 sEquals(Vec4Arg inV1, Vec4Arg inV2)
Equals (component wise)
Definition Vec4.inl:235
JPH_INLINE float ReduceMax() const
Get the maximum of X, Y, Z and W.
Definition Vec4.inl:1216
JPH_INLINE Vec4 Reciprocal() const
Reciprocal vector (1 / value) for each of the components.
Definition Vec4.inl:972
JPH_INLINE Vec4 SplatY() const
Replicate the Y component to all components.
Definition Vec4.inl:821
JPH_INLINE UVec4 ReinterpretAsInt() const
Reinterpret Vec4 as a UVec4 (doesn't change the bits)
Definition Vec4.inl:1179
static JPH_INLINE UVec4 sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2)
Greater than or equal (component wise)
Definition Vec4.inl:327
JPH_INLINE float ReduceSum() const
Sum X, Y, Z and W.
Definition Vec4.inl:1223
static JPH_INLINE Vec4 sMin(Vec4Arg inV1, Vec4Arg inV2)
Return the minimum value of each of the components.
Definition Vec4.inl:188
JPH_INLINE Vec4 SplatZ() const
Replicate the Z component to all components.
Definition Vec4.inl:837
JPH_INLINE Vec4 Sqrt() const
Component wise square root.
Definition Vec4.inl:1066
JPH_INLINE Vec4 & operator*=(float inV2)
Multiply vector with float.
Definition Vec4.inl:620
static JPH_INLINE Vec4 sGatherFloat4(const float *inBase, UVec4Arg inOffsets)
Gather 4 floats from memory at inBase + inOffsets[i] * Scale.
JPH_INLINE Vec4 operator+(Vec4Arg inV2) const
Add two float vectors (component wise)
Definition Vec4.inl:672
JPH_INLINE Vec4 & operator/=(float inV2)
Divide vector by float.
Definition Vec4.inl:655
JPH_INLINE bool IsNormalized(float inTolerance=1.0e-6f) const
Test if vector is normalized.
Definition Vec4.inl:520
JPH_INLINE bool operator==(Vec4Arg inV2) const
Comparison.
Definition Vec4.inl:505
JPH_INLINE Vec4 SplatW() const
Replicate the W component to all components.
Definition Vec4.inl:853
JPH_INLINE Vec4 DotV(Vec4Arg inV2) const
Dot product, returns the dot product in X, Y, Z and W components.
Definition Vec4.inl:989
JPH_INLINE bool IsClose(Vec4Arg inV2, float inMaxDistSq=1.0e-12f) const
Test if two vectors are close.
Definition Vec4.inl:510
JPH_INLINE float GetX() const
Get individual components.
Definition Vec4.h:119
static JPH_INLINE Vec4 sLoadFloat4(const Float4 *inV)
Load 4 floats from memory.
Definition Vec4.inl:123
static JPH_INLINE Vec4 sZero()
Vector with all zeros.
Definition Vec4.inl:81
JPH_INLINE Vec4 Swizzle() const
Swizzle the elements in inV.
struct { float mData[4];} Type
Definition Vec4.h:24
static JPH_INLINE Vec4 sOr(Vec4Arg inV1, Vec4Arg inV2)
Logical or (component wise)
Definition Vec4.inl:403
JPH_INLINE float ReduceMin() const
Get the minimum of X, Y, Z and W.
Definition Vec4.inl:1209
Type mValue
Definition Vec4.h:317
static JPH_INLINE Vec4 sDecompressUnitVector(uint32 inValue)
Decompress a unit vector from a 32 bit value.
Definition Vec4.inl:1477
JPH_INLINE uint32 CompressUnitVector() const
Compress a unit vector to a 32 bit value, precision is around 0.5 * 10^-3.
Definition Vec4.inl:1433
JPH_INLINE Vec4 & operator-=(Vec4Arg inV2)
Subtract two float vectors (component wise)
Definition Vec4.inl:766
JPH_INLINE float LengthSq() const
Squared length of vector.
Definition Vec4.inl:1029
static JPH_INLINE Vec4 sMax(Vec4Arg inV1, Vec4Arg inV2)
Return the maximum of each of the components.
Definition Vec4.inl:209
JPH_INLINE float Dot(Vec4Arg inV2) const
Dot product.
Definition Vec4.inl:1011
JPH_INLINE Vec3 SplatZ3() const
Replicate the Z component to all components.
Definition Vec4.inl:901
JPH_INLINE bool IsNaN() const
Test if vector contains NaN elements.
Definition Vec4.inl:525
JPH_INLINE Vec3 SplatX3() const
Replicate the X component to all components.
Definition Vec4.inl:869
static JPH_INLINE Vec4 sNaN()
Vector with all NaN's.
Definition Vec4.inl:118
Vec4 ACos() const
Definition Vec4.inl:1360
static JPH_INLINE Vec4 sSelect(Vec4Arg inNotSet, Vec4Arg inSet, UVec4Arg inControl)
Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit ...
Definition Vec4.inl:374
JPH_INLINE int GetSignBits() const
Store if X is negative in bit 0, Y in bit 1, Z in bit 2 and W in bit 3.
Definition Vec4.inl:1190
JPH_INLINE int GetHighestComponentIndex() const
Get index of component with highest value.
Definition Vec4.inl:943
static JPH_INLINE Vec4 sReplicate(float inV)
Replicate inV across all components.
Definition Vec4.inl:97
JPH_INLINE Vec3 SplatY3() const
Replicate the Y component to all components.
Definition Vec4.inl:885
void SinCos(Vec4 &outSin, Vec4 &outCos) const
Calculate the sine and cosine for each element of this vector (input in radians)
Definition Vec4.inl:1230
JPH_INLINE void StoreFloat4(Float4 *outV) const
Store 4 floats to memory.
Definition Vec4.inl:1147
static JPH_INLINE Vec4 sClamp(Vec4Arg inV, Vec4Arg inMin, Vec4Arg inMax)
Clamp a vector between min and max (component wise)
Definition Vec4.inl:230
friend JPH_INLINE Vec4 operator*(float inV1, Vec4Arg inV2)
Multiply vector with float.
Definition Vec4.inl:583