11template <
unsigned IndexX,
unsigned IndexY,
unsigned IndexZ,
unsigned IndexW>
12JPH_INLINE vfloat32m1_t RVVShuffleFloat32x4(vfloat32m1_t inV0, vfloat32m1_t inV1)
14 vfloat32m2_t combined = __riscv_vlmul_ext_v_f32m1_f32m2(inV0);
15 combined = __riscv_vslideup_vx_f32m2(combined, __riscv_vlmul_ext_v_f32m1_f32m2(inV1), 4, 8);
17 const uint32 indices_raw[4] = { IndexX, IndexY, IndexZ, IndexW };
18 const vuint32m1_t v_indices_m1 = __riscv_vle32_v_u32m1(indices_raw, 4);
19 const vuint32m2_t v_indices_m2 = __riscv_vlmul_ext_v_u32m1_u32m2(v_indices_m1);
21 const vfloat32m2_t gathered_m2 = __riscv_vrgather_vv_f32m2(combined, v_indices_m2, 4);
22 return __riscv_vlmul_trunc_v_f32m2_f32m1(gathered_m2);
26JPH_INLINE vfloat32m1_t RVVShuffleFloat32x4<0, 1, 2, 3>(vfloat32m1_t inV0, vfloat32m1_t inV1)
32JPH_INLINE vfloat32m1_t RVVShuffleFloat32x4<0, 1, 4, 5>(vfloat32m1_t inV0, vfloat32m1_t inV1)
34 vfloat32m1_t result = inV0;
35 return __riscv_vslideup_vx_f32m1(result, inV1, 2, 4);
39JPH_INLINE vfloat32m1_t RVVShuffleFloat32x4<1, 0, 3, 2>(vfloat32m1_t inV0, vfloat32m1_t inV1)
42 const uint32 indices_raw[4] = { 1, 0, 3, 2 };
43 const vuint32m1_t indices = __riscv_vle32_v_u32m1(indices_raw, 4);
44 return __riscv_vrgather_vv_f32m1(inV0, indices, 4);
48JPH_INLINE vfloat32m1_t RVVShuffleFloat32x4<2, 3, 0, 1>(vfloat32m1_t inV0, vfloat32m1_t inV1)
50 vfloat32m1_t upper = __riscv_vslidedown_vx_f32m1(inV0, 2, 4);
51 return __riscv_vslideup_vx_f32m1(upper, inV0, 2, 4);
55JPH_INLINE vfloat32m1_t RVVShuffleFloat32x4<2, 3, 6, 7>(vfloat32m1_t inV0, vfloat32m1_t inV1)
57 return __riscv_vslidedown_vx_f32m1_tu(inV1, inV0, 2, 2);
61JPH_INLINE vfloat32m1_t RVVShuffleFloat32x4<4, 5, 6, 7>(vfloat32m1_t inV0, vfloat32m1_t inV1)
67JPH_INLINE vfloat32m1_t RVVShuffleFloat32x4<0, 2, 4, 6>(vfloat32m1_t inV0, vfloat32m1_t inV1)
69 vfloat32m2_t combined = __riscv_vlmul_ext_v_f32m1_f32m2(inV0);
70 combined = __riscv_vslideup_vx_f32m2(combined, __riscv_vlmul_ext_v_f32m1_f32m2(inV1), 4, 8);
72 vuint64m2_t combined_u64 = __riscv_vreinterpret_v_u32m2_u64m2(__riscv_vreinterpret_v_f32m2_u32m2(combined));
75 vuint32m1_t result = __riscv_vnsrl_wx_u32m1(combined_u64, 0, 4);
77 return __riscv_vreinterpret_v_u32m1_f32m1(result);
81JPH_INLINE vfloat32m1_t RVVShuffleFloat32x4<1, 3, 5, 7>(vfloat32m1_t inV0, vfloat32m1_t inV1)
83 vfloat32m2_t combined = __riscv_vlmul_ext_v_f32m1_f32m2(inV0);
84 combined = __riscv_vslideup_vx_f32m2(combined, __riscv_vlmul_ext_v_f32m1_f32m2(inV1), 4, 8);
86 vuint64m2_t combined_u64 = __riscv_vreinterpret_v_u32m2_u64m2(__riscv_vreinterpret_v_f32m2_u32m2(combined));
89 vuint32m1_t result = __riscv_vnsrl_wx_u32m1(combined_u64, 32, 4);
91 return __riscv_vreinterpret_v_u32m1_f32m1(result);
95JPH_INLINE
float RVVSumElementsFloat32x4(vfloat32m1_t inV)
97#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
98 const vfloat32m1_t shift1 = __riscv_vslidedown_vx_f32m1(inV, 1, 4);
99 const vfloat32m1_t sum_pairs = __riscv_vfadd_vv_f32m1(inV, shift1, 4);
100 const vfloat32m1_t shift2 = __riscv_vslidedown_vx_f32m1(sum_pairs, 2, 4);
101 const vfloat32m1_t sum = __riscv_vfadd_vv_f32m1(sum_pairs, shift2, 4);
103 const vfloat32m1_t zeros = __riscv_vfmv_v_f_f32m1(0.0f, 4);
104 const vfloat32m1_t sum = __riscv_vfredusum_vs_f32m1_f32m1(inV, zeros, 4);
106 return __riscv_vfmv_f_s_f32m1_f32(sum);
#define JPH_NAMESPACE_END
Definition Core.h:428
std::uint32_t uint32
Definition Core.h:508
#define JPH_NAMESPACE_BEGIN
Definition Core.h:422