Jolt Physics
A multi core friendly Game Physics Engine
Loading...
Searching...
No Matches
ARMNeon.h
Go to the documentation of this file.
1// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
2// SPDX-FileCopyrightText: 2022 Jorrit Rouwe
3// SPDX-License-Identifier: MIT
4
5#pragma once
6
7#ifdef JPH_USE_NEON
8
9// Constructing NEON values
10#ifdef JPH_COMPILER_MSVC
	// MSVC initializes its NEON vector types as two 64-bit integers, so pack pairs of
	// 32-bit values (or groups of eight 8-bit values) into each 64-bit half, low lane first.
	// NOTE(review): assumes each vN fits its element width; a wider value would carry into the next lane — confirm callers pass in-range constants.
11	#define JPH_NEON_INT32x4(v1, v2, v3, v4) { int64_t(v1) + (int64_t(v2) << 32), int64_t(v3) + (int64_t(v4) << 32) }
12	#define JPH_NEON_UINT32x4(v1, v2, v3, v4) { uint64_t(v1) + (uint64_t(v2) << 32), uint64_t(v3) + (uint64_t(v4) << 32) }
13	#define JPH_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { int64_t(v1) + (int64_t(v2) << 8) + (int64_t(v3) << 16) + (int64_t(v4) << 24) + (int64_t(v5) << 32) + (int64_t(v6) << 40) + (int64_t(v7) << 48) + (int64_t(v8) << 56), int64_t(v9) + (int64_t(v10) << 8) + (int64_t(v11) << 16) + (int64_t(v12) << 24) + (int64_t(v13) << 32) + (int64_t(v14) << 40) + (int64_t(v15) << 48) + (int64_t(v16) << 56) }
14	#define JPH_NEON_UINT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { uint64_t(v1) + (uint64_t(v2) << 8) + (uint64_t(v3) << 16) + (uint64_t(v4) << 24) + (uint64_t(v5) << 32) + (uint64_t(v6) << 40) + (uint64_t(v7) << 48) + (uint64_t(v8) << 56), uint64_t(v9) + (uint64_t(v10) << 8) + (uint64_t(v11) << 16) + (uint64_t(v12) << 24) + (uint64_t(v13) << 32) + (uint64_t(v14) << 40) + (uint64_t(v15) << 48) + (uint64_t(v16) << 56) }
15#else
	// Other compilers accept a plain per-lane initializer list for NEON vector types
16	#define JPH_NEON_INT32x4(v1, v2, v3, v4) { v1, v2, v3, v4 }
17	#define JPH_NEON_UINT32x4(v1, v2, v3, v4) { v1, v2, v3, v4 }
18	#define JPH_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16 }
19	#define JPH_NEON_UINT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16 }
20#endif
21
22// MSVC and GCC prior to version 12 don't define __builtin_shufflevector
23#if defined(JPH_COMPILER_MSVC) || (defined(JPH_COMPILER_GCC) && __GNUC__ < 12)
25
	// NOTE(review): original line 24 (likely JPH_NAMESPACE_BEGIN, per the macro referenced in this page's footer) appears to have been dropped by the extraction — confirm against the upstream header.
26	// Generic shuffle vector template
	// Emulates __builtin_shufflevector for float32x4_t: indices 0-3 select a lane from inV1,
	// indices 4-7 select a lane from inV2; `I & 0b11` maps the index to the source lane.
27	template <unsigned I1, unsigned I2, unsigned I3, unsigned I4>
28	JPH_INLINE float32x4_t NeonShuffleFloat32x4(float32x4_t inV1, float32x4_t inV2)
29	{
		// Build the low 64-bit half one lane at a time
30		float32x2_t lo = vcopy_laneq_f32(vdup_n_f32(0), 0, I1 >= 4? inV2 : inV1, I1 & 0b11);
31		lo = vcopy_laneq_f32(lo, 1, I2 >= 4? inV2 : inV1, I2 & 0b11);
32
		// Build the high 64-bit half one lane at a time
33		float32x2_t hi = vcopy_laneq_f32(vdup_n_f32(0), 0, I3 >= 4? inV2 : inV1, I3 & 0b11);
34		hi = vcopy_laneq_f32(hi, 1, I4 >= 4? inV2 : inV1, I4 & 0b11);
35
36		return vcombine_f32(lo, hi);
37	}
38
39	// Specializations
	// Each specialization below produces the lane pattern named by its template arguments
	// in fewer instructions than the generic template. In the comments, result lanes are
	// written as indices into the concatenation inV1[0..3] : inV2[4..7].
40	template <>
41	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 0, 0>(float32x4_t inV1, float32x4_t inV2)
42	{
		// Low half {0,1} combined with lane 0 broadcast into the high half -> {0,1,0,0}
43		return vcombine_f32(vget_low_f32(inV1), vdup_lane_f32(vget_low_f32(inV1), 0));
44	}
45
46	template <>
47	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 2, 2>(float32x4_t inV1, float32x4_t inV2)
48	{
		// Copy lane 2 over lane 3 -> {0,1,2,2}
49		return vcopyq_laneq_f32(inV1, 3, inV1, 2);
50	}
51
52	template <>
53	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 2, 3>(float32x4_t inV1, float32x4_t inV2)
54	{
		// Identity shuffle
55		return inV1;
56	}
57
58	template <>
59	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 3, 2>(float32x4_t inV1, float32x4_t inV2)
60	{
		// Keep low half, swap the two lanes of the high half -> {0,1,3,2}
61		return vcombine_f32(vget_low_f32(inV1), vrev64_f32(vget_high_f32(inV1)));
62	}
63
64	template <>
65	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 3, 3>(float32x4_t inV1, float32x4_t inV2)
66	{
		// Copy lane 3 over lane 2 -> {0,1,3,3}
67		return vcopyq_laneq_f32(inV1, 2, inV1, 3);
68	}
69
70	template <>
71	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 4, 5>(float32x4_t inV1, float32x4_t inV2)
72	{
		// Reinterpret both as 2 x f64 and interleave the low doubles -> {0,1,4,5}
73		return vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(inV1), vreinterpretq_f64_f32(inV2)));
74	}
75
76	template <>
77	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 1, 1>(float32x4_t inV1, float32x4_t inV2)
78	{
		// vuzp1q takes the even lanes of both args: {0,2} from inV1, {1,1} from the lane-1 broadcast
79		return vuzp1q_f32(inV1, vdupq_laneq_f32(inV1, 1));
80	}
81
82	template <>
83	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 1, 3>(float32x4_t inV1, float32x4_t inV2)
84	{
		// Even lanes of inV1 ({0,2}) and of pairwise-reversed inV1 ({1,3}) -> {0,2,1,3}
85		return vuzp1q_f32(inV1, vrev64q_f32(inV1));
86	}
87
88	template <>
89	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 2, 2>(float32x4_t inV1, float32x4_t inV2)
90	{
		// Even lanes of inV1 ({0,2}) and of the lane-2 broadcast ({2,2}) -> {0,2,2,2}
91		return vuzp1q_f32(inV1, vdupq_laneq_f32(inV1, 2));
92	}
93
94	template <>
95	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 2, 3>(float32x4_t inV1, float32x4_t inV2)
96	{
		// Copy lane 2 over lane 1 -> {0,2,2,3}
97		return vcopyq_laneq_f32(inV1, 1, inV1, 2);
98	}
99
100	template <>
101	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 3, 2>(float32x4_t inV1, float32x4_t inV2)
102	{
		// Even lanes {0,2,0,2}, then overwrite lane 2 with lane 3 -> {0,2,3,2}
103		return vcopyq_laneq_f32(vuzp1q_f32(inV1, inV1), 2, inV1, 3);
104	}
105
106	template <>
107	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 3, 3>(float32x4_t inV1, float32x4_t inV2)
108	{
		// Even lanes of inV1 ({0,2}) and of the lane-3 broadcast ({3,3}) -> {0,2,3,3}
109		return vuzp1q_f32(inV1, vdupq_laneq_f32(inV1, 3));
110	}
111
112	template <>
113	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 2, 4, 6>(float32x4_t inV1, float32x4_t inV2)
114	{
		// Even lanes of both vectors -> {0,2,4,6}
115		return vuzp1q_f32(inV1, inV2);
116	}
117
118	template <>
119	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 3, 1, 2>(float32x4_t inV1, float32x4_t inV2)
120	{
		// vextq builds {3,2,2,2}; interleaving with inV1 gives {0,3,1,2}
121		return vzip1q_f32(inV1, vextq_f32(inV1, vdupq_laneq_f32(inV1, 2), 3));
122	}
123
124	template <>
125	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 0, 0, 0>(float32x4_t inV1, float32x4_t inV2)
126	{
		// Reversed low half {1,0} combined with lane 0 broadcast -> {1,0,0,0}
127		return vcombine_f32(vrev64_f32(vget_low_f32(inV1)), vdup_lane_f32(vget_low_f32(inV1), 0));
128	}
129
130	template <>
131	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 0, 0, 2>(float32x4_t inV1, float32x4_t inV2)
132	{
		// Reversed low half {1,0}; vzip1_f32(low, high) interleaves lane 0 of each -> {0,2}
133		return vcombine_f32(vrev64_f32(vget_low_f32(inV1)), vzip1_f32(vget_low_f32(inV1), vget_high_f32(inV1)));
134	}
135
136	template <>
137	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 0, 3, 2>(float32x4_t inV1, float32x4_t inV2)
138	{
		// rev64 swaps the two lanes within each 64-bit half -> {1,0,3,2}
139		return vrev64q_f32(inV1);
140	}
141
142	template <>
143	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 1, 2, 2>(float32x4_t inV1, float32x4_t inV2)
144	{
		// t = rotate by one lane = {1,2,3,0}; zip1 of t with itself duplicates t's low lanes -> {1,1,2,2}
145		float32x4_t t = vextq_f32(inV1, inV1, 1);
146		return vzip1q_f32(t, t);
147	}
148
149	template <>
150	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 1, 3, 3>(float32x4_t inV1, float32x4_t inV2)
151	{
		// trn2 duplicates the odd lanes -> {1,1,3,3}
152		return vtrn2q_f32(inV1, inV1);
153	}
154
155	// Used extensively by cross product
156	template <>
157	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 0, 0>(float32x4_t inV1, float32x4_t inV2)
158	{
		// Rotate to {1,2,3,0}, then overwrite lane 2 with lane 0 -> {1,2,0,0}
159		return vcopyq_laneq_f32(vextq_f32(inV1, inV1, 1), 2, inV1, 0);
160	}
161
162	template <>
163	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 0, 1>(float32x4_t inV1, float32x4_t inV2)
164	{
		// Inner ext gives {3,0,1,2}; taking 4 lanes from offset 2 of {3,0,1,2}:{0,1,2,3} -> {1,2,0,1}
165		return vextq_f32(vextq_f32(inV1, inV1, 3), inV1, 2);
166	}
167
168	template <>
169	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 0, 2>(float32x4_t inV1, float32x4_t inV2)
170	{
		// Even lanes {0,2,0,2}, then overwrite lane 0 with lane 1 -> {1,2,0,2}
171		return vcopyq_laneq_f32(vuzp1q_f32(inV1, inV1), 0, inV1, 1);
172	}
173
174	template <>
175	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 2, 2>(float32x4_t inV1, float32x4_t inV2)
176	{
		// vext_f32(low, high, 1) = {1,2}; lane 2 broadcast from the high half = {2,2}
177		return vcombine_f32(vext_f32(vget_low_f32(inV1), vget_high_f32(inV1), 1), vdup_lane_f32(vget_high_f32(inV1), 0));
178	}
179
180	template <>
181	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 3, 2>(float32x4_t inV1, float32x4_t inV2)
182	{
		// Shift out lane 0 and append a lane-2 value -> {1,2,3,2}
183		return vextq_f32(inV1, vdupq_laneq_f32(inV1, 2), 1);
184	}
185
186	template <>
187	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 3, 3>(float32x4_t inV1, float32x4_t inV2)
188	{
		// Shift out lane 0 and append a lane-3 value -> {1,2,3,3}
189		return vextq_f32(inV1, vdupq_laneq_f32(inV1, 3), 1);
190	}
191
192	template <>
193	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 3, 0, 2>(float32x4_t inV1, float32x4_t inV2)
194	{
		// vuzp2q takes odd lanes: {1,3} from inV1, {0,2} from pairwise-reversed inV1
195		return vuzp2q_f32(inV1, vrev64q_f32(inV1));
196	}
197
198	template <>
199	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 3, 5, 7>(float32x4_t inV1, float32x4_t inV2)
200	{
		// Odd lanes of both vectors -> {1,3,5,7}
201		return vuzp2q_f32(inV1, inV2);
202	}
203
204	template <>
205	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 0, 1, 1>(float32x4_t inV1, float32x4_t inV2)
206	{
		// zip1 gives {0,0,1,1}; overwrite lane 0 with lane 2 -> {2,0,1,1}
207		return vcopyq_laneq_f32(vzip1q_f32(inV1, inV1), 0, inV1, 2);
208	}
209
210	template <>
211	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 0, 1, 2>(float32x4_t inV1, float32x4_t inV2)
212	{
		// Taking 4 lanes from offset 3 of {0,2,0,2}:{0,1,2,3} -> {2,0,1,2}
213		return vextq_f32(vuzp1q_f32(inV1, inV1), inV1, 3);
214	}
215
216	template <>
217	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 1, 0, 0>(float32x4_t inV1, float32x4_t inV2)
218	{
		// t = {2,0,1,2} (see <2,0,1,2> above); uzp1(t, {0,2,0,2}) = {t0,t2,0,0} -> {2,1,0,0}
219		float32x4_t t = vextq_f32(vuzp1q_f32(inV1, inV1), inV1, 3);
220		return vuzp1q_f32(t, vuzp1q_f32(inV1, inV1));
221	}
222
223	template <>
224	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 1, 0, 3>(float32x4_t inV1, float32x4_t inV2)
225	{
		// t = {1,0,3,2}; rotating t by 3 lanes -> {2,1,0,3}
226		float32x4_t t = vrev64q_f32(inV1);
227		return vextq_f32(t, t, 3);
228	}
229
230	template <>
231	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 2, 1, 0>(float32x4_t inV1, float32x4_t inV2)
232	{
		// Taking 4 lanes from offset 2 of {0,0,2,2}:{1,0,3,2} -> {2,2,1,0}
233		return vextq_f32(vtrn1q_f32(inV1, inV1), vrev64q_f32(inV1), 2);
234	}
235
236	template <>
237	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 2, 1, 1>(float32x4_t inV1, float32x4_t inV2)
238	{
		// t = {0,1,2,1}; zip2 duplicates t's high lanes -> {2,2,1,1}
239		float32x4_t t = vcopyq_laneq_f32(inV1, 3, inV1, 1);
240		return vzip2q_f32(t, t);
241	}
242
243	template <>
244	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 2, 1, 2>(float32x4_t inV1, float32x4_t inV2)
245	{
		// Broadcast lane 2, then overwrite lane 2 with lane 1 -> {2,2,1,2}
246		return vcopyq_laneq_f32(vdupq_laneq_f32(inV1, 2), 2, inV1, 1);
247	}
248
249	template <>
250	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 2, 2, 2>(float32x4_t inV1, float32x4_t inV2)
251	{
		// Broadcast lane 2 to all lanes
252		return vdupq_laneq_f32(inV1, 2);
253	}
254
255	template <>
256	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 0, 1>(float32x4_t inV1, float32x4_t inV2)
257	{
		// Swap the two 64-bit halves -> {2,3,0,1}
258		return vcombine_f32(vget_high_f32(inV1), vget_low_f32(inV1));
259	}
260
261	template <>
262	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 1, 2>(float32x4_t inV1, float32x4_t inV2)
263	{
		// Taking 4 lanes from offset 2 of {0,1,2,3}:{1,2,3,0} -> {2,3,1,2}
264		return vextq_f32(inV1, vextq_f32(inV1, inV1, 1), 2);
265	}
266
267	template <>
268	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 2, 2>(float32x4_t inV1, float32x4_t inV2)
269	{
		// Taking 4 lanes from offset 2 of {0,1,2,3}:{2,2,2,2} -> {2,3,2,2}
270		return vextq_f32(inV1, vdupq_laneq_f32(inV1, 2), 2);
271	}
272
273	template <>
274	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 2, 3>(float32x4_t inV1, float32x4_t inV2)
275	{
		// Broadcast the high 64-bit half (as one f64) to both halves -> {2,3,2,3}
276		return vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(inV1), 1));
277	}
278
279	template <>
280	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 6, 7>(float32x4_t inV1, float32x4_t inV2)
281	{
		// Reinterpret both as 2 x f64 and interleave the high doubles -> {2,3,6,7}
282		return vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(inV1), vreinterpretq_f64_f32(inV2)));
283	}
284
285	template <>
286	JPH_INLINE float32x4_t NeonShuffleFloat32x4<3, 0, 1, 2>(float32x4_t inV1, float32x4_t inV2)
287	{
		// Rotate by 3 lanes -> {3,0,1,2}
288		return vextq_f32(inV1, inV1, 3);
289	}
290
291	template <>
292	JPH_INLINE float32x4_t NeonShuffleFloat32x4<3, 0, 3, 2>(float32x4_t inV1, float32x4_t inV2)
293	{
		// trn1 interleaves even lanes of {3,3,3,3} and inV1 -> {3,0,3,2}
294		return vtrn1q_f32(vdupq_laneq_f32(inV1, 3), inV1);
295	}
296
297	template <>
298	JPH_INLINE float32x4_t NeonShuffleFloat32x4<3, 2, 1, 0>(float32x4_t inV1, float32x4_t inV2)
299	{
		// t = {1,0,3,2}; rotating t by 2 lanes gives the full reverse {3,2,1,0}
300		float32x4_t t = vrev64q_f32(inV1);
301		return vextq_f32(t, t, 2);
302	}
303
304	template <>
305	JPH_INLINE float32x4_t NeonShuffleFloat32x4<3, 2, 3, 2>(float32x4_t inV1, float32x4_t inV2)
306	{
		// Reversed high half {3,2} duplicated into both halves -> {3,2,3,2}
307		float32x2_t zy = vrev64_f32(vget_high_f32(inV1));
308		return vcombine_f32(zy, zy);
309	}
310
311	// Shuffle a vector
	// Dispatch to the (possibly specialized) template above; the U32x4 variant
	// reinterprets to float for the shuffle and back to uint afterwards.
312	#define JPH_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) NeonShuffleFloat32x4<index1, index2, index3, index4>(vec1, vec2)
313	#define JPH_NEON_SHUFFLE_U32x4(vec1, vec2, index1, index2, index3, index4) vreinterpretq_u32_f32((NeonShuffleFloat32x4<index1, index2, index3, index4>(vreinterpretq_f32_u32(vec1), vreinterpretq_f32_u32(vec2))))
314
	// NOTE(review): original line 315 (likely JPH_NAMESPACE_END, per the macro referenced in this page's footer) appears to have been dropped by the extraction — confirm against the upstream header.
316#else
317	// Shuffle a vector
	// Compilers that provide __builtin_shufflevector can shuffle directly
318	#define JPH_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) __builtin_shufflevector(vec1, vec2, index1, index2, index3, index4)
319	#define JPH_NEON_SHUFFLE_U32x4(vec1, vec2, index1, index2, index3, index4) __builtin_shufflevector(vec1, vec2, index1, index2, index3, index4)
320#endif
321
322#endif // JPH_USE_NEON
#define JPH_NAMESPACE_END
Definition Core.h:428
#define JPH_NAMESPACE_BEGIN
Definition Core.h:422