// Helpers to construct compile-time NEON vector constants.
// On MSVC the NEON vector types are defined over two 64-bit lanes, so the
// 32-bit / 8-bit elements must be packed into those two 64-bit halves by hand.
// On other compilers the vector types accept a plain brace initializer.
// (Restored the '#else'/'#endif' that separate the two variants; the macro
// bodies themselves are unchanged.)
#ifdef JPH_COMPILER_MSVC
    // NOTE(review): left-shifting a negative signed value is UB before C++20;
    // presumably the INT variants are only instantiated with constants whose
    // packed result is well defined — confirm at call sites.
    #define JPH_NEON_INT32x4(v1, v2, v3, v4) { int64_t(v1) + (int64_t(v2) << 32), int64_t(v3) + (int64_t(v4) << 32) }
    #define JPH_NEON_UINT32x4(v1, v2, v3, v4) { uint64_t(v1) + (uint64_t(v2) << 32), uint64_t(v3) + (uint64_t(v4) << 32) }
    #define JPH_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { int64_t(v1) + (int64_t(v2) << 8) + (int64_t(v3) << 16) + (int64_t(v4) << 24) + (int64_t(v5) << 32) + (int64_t(v6) << 40) + (int64_t(v7) << 48) + (int64_t(v8) << 56), int64_t(v9) + (int64_t(v10) << 8) + (int64_t(v11) << 16) + (int64_t(v12) << 24) + (int64_t(v13) << 32) + (int64_t(v14) << 40) + (int64_t(v15) << 48) + (int64_t(v16) << 56) }
    #define JPH_NEON_UINT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { uint64_t(v1) + (uint64_t(v2) << 8) + (uint64_t(v3) << 16) + (uint64_t(v4) << 24) + (uint64_t(v5) << 32) + (uint64_t(v6) << 40) + (uint64_t(v7) << 48) + (uint64_t(v8) << 56), uint64_t(v9) + (uint64_t(v10) << 8) + (uint64_t(v11) << 16) + (uint64_t(v12) << 24) + (uint64_t(v13) << 32) + (uint64_t(v14) << 40) + (uint64_t(v15) << 48) + (uint64_t(v16) << 56) }
#else
    #define JPH_NEON_INT32x4(v1, v2, v3, v4) { v1, v2, v3, v4 }
    #define JPH_NEON_UINT32x4(v1, v2, v3, v4) { v1, v2, v3, v4 }
    #define JPH_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16 }
    #define JPH_NEON_UINT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16 }
#endif // JPH_COMPILER_MSVC
#if defined(JPH_COMPILER_MSVC) || (defined(JPH_COMPILER_GCC) && __GNUC__ < 12)

/// Generic shuffle fallback for compilers without a usable
/// __builtin_shufflevector: builds the result one lane at a time.
/// Index semantics match the builtin: I1..I4 in [0,3] select a lane of inV1,
/// [4,7] select a lane of inV2.
/// (Restored the function body braces lost in extraction; statements unchanged.)
template <unsigned I1, unsigned I2, unsigned I3, unsigned I4>
JPH_INLINE float32x4_t NeonShuffleFloat32x4(float32x4_t inV1, float32x4_t inV2)
{
    // Low 64 bits: pick source vector by testing index >= 4, lane within the
    // source is index & 0b11.
    float32x2_t lo = vcopy_laneq_f32(vdup_n_f32(0), 0, I1 >= 4? inV2 : inV1, I1 & 0b11);
    lo = vcopy_laneq_f32(lo, 1, I2 >= 4? inV2 : inV1, I2 & 0b11);

    // High 64 bits, same scheme.
    float32x2_t hi = vcopy_laneq_f32(vdup_n_f32(0), 0, I3 >= 4? inV2 : inV1, I3 & 0b11);
    hi = vcopy_laneq_f32(hi, 1, I4 >= 4? inV2 : inV1, I4 & 0b11);

    return vcombine_f32(lo, hi);
}
// Specializations of NeonShuffleFloat32x4 that map frequently-used index
// patterns onto shorter NEON instruction sequences than the generic
// lane-by-lane fallback. 'template <>' markers and braces restored
// (lost in extraction); the returned expressions are unchanged.

/// <0, 0, 0, 0>: broadcast lane 0 of inV1
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 0, 0, 0>(float32x4_t inV1, float32x4_t inV2)
{
    return vdupq_laneq_f32(inV1, 0);
}

/// <0, 1, 0, 0>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 0, 0>(float32x4_t inV1, float32x4_t inV2)
{
    return vcombine_f32(vget_low_f32(inV1), vdup_lane_f32(vget_low_f32(inV1), 0));
}

/// <0, 1, 2, 2>: duplicate lane 2 into lane 3
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 2, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vcopyq_laneq_f32(inV1, 3, inV1, 2);
}

/// <0, 1, 2, 3>: identity shuffle of inV1.
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 2, 3>(float32x4_t inV1, float32x4_t inV2)
{
    // NOTE(review): the original body was lost in extraction; <0,1,2,3>
    // selects every lane of inV1 in order, so returning inV1 unchanged is
    // the only behavior consistent with the generic template — confirm
    // against upstream.
    return inV1;
}

/// <0, 1, 3, 2>: swap the two high lanes
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 3, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vcombine_f32(vget_low_f32(inV1), vrev64_f32(vget_high_f32(inV1)));
}

/// <0, 1, 3, 3>: duplicate lane 3 into lane 2
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 3, 3>(float32x4_t inV1, float32x4_t inV2)
{
    return vcopyq_laneq_f32(inV1, 2, inV1, 3);
}

/// <0, 1, 4, 5>: low half of inV1 + low half of inV2
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 4, 5>(float32x4_t inV1, float32x4_t inV2)
{
    return vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(inV1), vreinterpretq_f64_f32(inV2)));
}

/// <0, 2, 1, 1>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 1, 1>(float32x4_t inV1, float32x4_t inV2)
{
    return vuzp1q_f32(inV1, vdupq_laneq_f32(inV1, 1));
}

/// <0, 2, 1, 3>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 1, 3>(float32x4_t inV1, float32x4_t inV2)
{
    return vuzp1q_f32(inV1, vrev64q_f32(inV1));
}

/// <0, 2, 2, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 2, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vuzp1q_f32(inV1, vdupq_laneq_f32(inV1, 2));
}

/// <0, 2, 2, 3>: duplicate lane 2 into lane 1
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 2, 3>(float32x4_t inV1, float32x4_t inV2)
{
    return vcopyq_laneq_f32(inV1, 1, inV1, 2);
}

/// <0, 2, 3, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 3, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vcopyq_laneq_f32(vuzp1q_f32(inV1, inV1), 2, inV1, 3);
}

/// <0, 2, 3, 3>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 3, 3>(float32x4_t inV1, float32x4_t inV2)
{
    return vuzp1q_f32(inV1, vdupq_laneq_f32(inV1, 3));
}

/// <0, 2, 4, 6>: even lanes of inV1 then even lanes of inV2 (de-interleave)
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 4, 6>(float32x4_t inV1, float32x4_t inV2)
{
    return vuzp1q_f32(inV1, inV2);
}

/// <0, 3, 1, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 3, 1, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vzip1q_f32(inV1, vextq_f32(inV1, vdupq_laneq_f32(inV1, 2), 3));
}
// Specializations with first index 1. 'template <>' markers and braces
// restored (lost in extraction); the returned expressions are unchanged.

/// <1, 0, 0, 0>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 0, 0, 0>(float32x4_t inV1, float32x4_t inV2)
{
    return vcombine_f32(vrev64_f32(vget_low_f32(inV1)), vdup_lane_f32(vget_low_f32(inV1), 0));
}

/// <1, 0, 0, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 0, 0, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vcombine_f32(vrev64_f32(vget_low_f32(inV1)), vzip1_f32(vget_low_f32(inV1), vget_high_f32(inV1)));
}

/// <1, 0, 3, 2>: swap lanes within each 64-bit half
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 0, 3, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vrev64q_f32(inV1);
}

/// <1, 1, 1, 1>: broadcast lane 1
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 1, 1, 1>(float32x4_t inV1, float32x4_t inV2)
{
    return vdupq_laneq_f32(inV1, 1);
}

/// <1, 1, 2, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 1, 2, 2>(float32x4_t inV1, float32x4_t inV2)
{
    // Rotate left by one lane, then zip the result with itself to duplicate
    // its two low lanes.
    float32x4_t t = vextq_f32(inV1, inV1, 1);
    return vzip1q_f32(t, t);
}

/// <1, 1, 3, 3>: duplicate the odd lanes
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 1, 3, 3>(float32x4_t inV1, float32x4_t inV2)
{
    return vtrn2q_f32(inV1, inV1);
}

/// <1, 2, 0, 0>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 0, 0>(float32x4_t inV1, float32x4_t inV2)
{
    return vcopyq_laneq_f32(vextq_f32(inV1, inV1, 1), 2, inV1, 0);
}

/// <1, 2, 0, 1>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 0, 1>(float32x4_t inV1, float32x4_t inV2)
{
    return vextq_f32(vextq_f32(inV1, inV1, 3), inV1, 2);
}

/// <1, 2, 0, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 0, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vcopyq_laneq_f32(vuzp1q_f32(inV1, inV1), 0, inV1, 1);
}

/// <1, 2, 2, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 2, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vcombine_f32(vext_f32(vget_low_f32(inV1), vget_high_f32(inV1), 1), vdup_lane_f32(vget_high_f32(inV1), 0));
}

/// <1, 2, 3, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 3, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vextq_f32(inV1, vdupq_laneq_f32(inV1, 2), 1);
}

/// <1, 2, 3, 3>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 3, 3>(float32x4_t inV1, float32x4_t inV2)
{
    return vextq_f32(inV1, vdupq_laneq_f32(inV1, 3), 1);
}

/// <1, 3, 0, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 3, 0, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vuzp2q_f32(inV1, vrev64q_f32(inV1));
}

/// <1, 3, 5, 7>: odd lanes of inV1 then odd lanes of inV2 (de-interleave)
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 3, 5, 7>(float32x4_t inV1, float32x4_t inV2)
{
    return vuzp2q_f32(inV1, inV2);
}
// Specializations with first index 2 or 3. 'template <>' markers and braces
// restored (lost in extraction); the returned expressions are unchanged.

/// <2, 0, 1, 1>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 0, 1, 1>(float32x4_t inV1, float32x4_t inV2)
{
    return vcopyq_laneq_f32(vzip1q_f32(inV1, inV1), 0, inV1, 2);
}

/// <2, 0, 1, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 0, 1, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vextq_f32(vuzp1q_f32(inV1, inV1), inV1, 3);
}

/// <2, 1, 0, 0>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 1, 0, 0>(float32x4_t inV1, float32x4_t inV2)
{
    float32x4_t t = vextq_f32(vuzp1q_f32(inV1, inV1), inV1, 3);
    return vuzp1q_f32(t, vuzp1q_f32(inV1, inV1));
}

/// <2, 1, 0, 3>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 1, 0, 3>(float32x4_t inV1, float32x4_t inV2)
{
    float32x4_t t = vrev64q_f32(inV1);
    return vextq_f32(t, t, 3);
}

/// <2, 2, 1, 0>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 2, 1, 0>(float32x4_t inV1, float32x4_t inV2)
{
    return vextq_f32(vtrn1q_f32(inV1, inV1), vrev64q_f32(inV1), 2);
}

/// <2, 2, 1, 1>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 2, 1, 1>(float32x4_t inV1, float32x4_t inV2)
{
    float32x4_t t = vcopyq_laneq_f32(inV1, 3, inV1, 1);
    return vzip2q_f32(t, t);
}

/// <2, 2, 1, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 2, 1, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vcopyq_laneq_f32(vdupq_laneq_f32(inV1, 2), 2, inV1, 1);
}

/// <2, 2, 2, 2>: broadcast lane 2
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 2, 2, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vdupq_laneq_f32(inV1, 2);
}

/// <2, 3, 0, 1>: swap the two 64-bit halves
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 0, 1>(float32x4_t inV1, float32x4_t inV2)
{
    return vcombine_f32(vget_high_f32(inV1), vget_low_f32(inV1));
}

/// <2, 3, 1, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 1, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vextq_f32(inV1, vextq_f32(inV1, inV1, 1), 2);
}

/// <2, 3, 2, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 2, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vextq_f32(inV1, vdupq_laneq_f32(inV1, 2), 2);
}

/// <2, 3, 2, 3>: broadcast the high 64-bit half
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 2, 3>(float32x4_t inV1, float32x4_t inV2)
{
    return vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(inV1), 1));
}

/// <2, 3, 6, 7>: high half of inV1 + high half of inV2
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 6, 7>(float32x4_t inV1, float32x4_t inV2)
{
    return vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(inV1), vreinterpretq_f64_f32(inV2)));
}

/// <3, 0, 1, 2>: rotate right by one lane
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<3, 0, 1, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vextq_f32(inV1, inV1, 3);
}

/// <3, 0, 3, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<3, 0, 3, 2>(float32x4_t inV1, float32x4_t inV2)
{
    return vtrn1q_f32(vdupq_laneq_f32(inV1, 3), inV1);
}

/// <3, 2, 1, 0>: full lane reversal
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<3, 2, 1, 0>(float32x4_t inV1, float32x4_t inV2)
{
    float32x4_t t = vrev64q_f32(inV1);
    return vextq_f32(t, t, 2);
}

/// <3, 2, 3, 2>
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<3, 2, 3, 2>(float32x4_t inV1, float32x4_t inV2)
{
    float32x2_t zy = vrev64_f32(vget_high_f32(inV1));
    return vcombine_f32(zy, zy);
}

/// <3, 3, 3, 3>: broadcast lane 3
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<3, 3, 3, 3>(float32x4_t inV1, float32x4_t inV2)
{
    return vdupq_laneq_f32(inV1, 3);
}
// Shuffle entry-point macros. On MSVC / GCC < 12 they dispatch to the
// NeonShuffleFloat32x4 template (with a reinterpret round-trip for the
// unsigned variant); otherwise they use __builtin_shufflevector directly.
// (Restored the '#else'/'#endif' separating the two variants and closing the
// '#if defined(JPH_COMPILER_MSVC) || ...' conditional; macro bodies unchanged.)
#define JPH_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) NeonShuffleFloat32x4<index1, index2, index3, index4>(vec1, vec2)
#define JPH_NEON_SHUFFLE_U32x4(vec1, vec2, index1, index2, index3, index4) vreinterpretq_u32_f32((NeonShuffleFloat32x4<index1, index2, index3, index4>(vreinterpretq_f32_u32(vec1), vreinterpretq_f32_u32(vec2))))

#else

#define JPH_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) __builtin_shufflevector(vec1, vec2, index1, index2, index3, index4)
#define JPH_NEON_SHUFFLE_U32x4(vec1, vec2, index1, index2, index3, index4) __builtin_shufflevector(vec1, vec2, index1, index2, index3, index4)

#endif // JPH_COMPILER_MSVC || GCC < 12
#define JPH_NAMESPACE_END
Definition Core.h:434
#define JPH_NAMESPACE_BEGIN
Definition Core.h:428