#if defined(JPH_USE_AVX)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	return DVec3::sFixW(_mm256_add_pd(mCol3.mValue, _mm256_cvtps_pd(t))); // Inferred: widen to double, add the translation column (the excerpt dropped this return)
#elif defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	__m128d low = _mm_add_pd(mCol3.mValue.mLow, _mm_cvtps_pd(t));
	__m128d high = _mm_add_pd(mCol3.mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(t, t, _MM_SHUFFLE(2, 2, 2, 2))));
	return DVec3({ low, high });
#elif defined(JPH_USE_NEON)
	float32x4_t t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	float64x2_t low = vaddq_f64(mCol3.mValue.val[0], vcvt_f64_f32(vget_low_f32(t)));
	float64x2_t high = vaddq_f64(mCol3.mValue.val[1], vcvt_high_f64_f32(t));
	return DVec3({ low, high }); // Inferred: the excerpt dropped this return
#elif defined(JPH_USE_RVV)
	// Broadcast each component of inV across a 4-element float vector (vl = 4)
	const vfloat32m1_t v0 = __riscv_vfmv_v_f_f32m1(inV.mF32[0], 4);
	const vfloat32m1_t v1 = __riscv_vfmv_v_f_f32m1(inV.mF32[1], 4);
	const vfloat32m1_t v2 = __riscv_vfmv_v_f_f32m1(inV.mF32[2], 4);

	const vfloat32m1_t col0 = __riscv_vle32_v_f32m1(mCol[0].mF32, 4);
	const vfloat32m1_t col1 = __riscv_vle32_v_f32m1(mCol[1].mF32, 4);
	const vfloat32m1_t col2 = __riscv_vle32_v_f32m1(mCol[2].mF32, 4);
	const vfloat64m2_t col3 = __riscv_vle64_v_f64m2(mCol3.mF64, 4);

	// Accumulate the 3x3 product in single precision
	vfloat32m1_t t = __riscv_vfmul_vv_f32m1(col0, v0, 4);
	t = __riscv_vfmacc_vv_f32m1(t, col1, v1, 4);
	t = __riscv_vfmacc_vv_f32m1(t, col2, v2, 4);

	// Widen to double and add the translation column
	vfloat64m2_t t_f64 = __riscv_vfwcvt_f_f_v_f64m2(t, 4);
	t_f64 = __riscv_vfadd_vv_f64m2(t_f64, col3, 4);

	// Store the widened result (declaration and return inferred)
	DVec3 v;
	__riscv_vse64_v_f64m2(v.mF64, t_f64, 4);
	return v;
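
// The paths below appear to implement DVec3 DMat44::operator * (DVec3Arg inV) const
// (signature inferred, it is not part of the excerpt): the same per-column accumulation
// as above, but the input vector is already double precision, so each float column is
// widened to double before the multiply.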
#if defined(JPH_USE_AVX)
	__m256d t = _mm256_add_pd(mCol3.mValue, _mm256_mul_pd(_mm256_cvtps_pd(mCol[0].mValue), _mm256_set1_pd(inV.mF64[0])));
	t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[1].mValue), _mm256_set1_pd(inV.mF64[1])));
	t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[2].mValue), _mm256_set1_pd(inV.mF64[2])));
	return DVec3::sFixW(t); // Inferred: the excerpt dropped this return
#elif defined(JPH_USE_SSE)
	__m128d xxxx = _mm_set1_pd(inV.mF64[0]);
	__m128d yyyy = _mm_set1_pd(inV.mF64[1]);
	__m128d zzzz = _mm_set1_pd(inV.mF64[2]);
	__m128 col0 = mCol[0].mValue;
	__m128 col1 = mCol[1].mValue;
	__m128 col2 = mCol[2].mValue;
	__m128d t_low = _mm_add_pd(mCol3.mValue.mLow, _mm_mul_pd(_mm_cvtps_pd(col0), xxxx));
	t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col1), yyyy));
	t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col2), zzzz));
	__m128d t_high = _mm_add_pd(mCol3.mValue.mHigh, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col0, col0, _MM_SHUFFLE(2, 2, 2, 2))), xxxx));
	t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col1, col1, _MM_SHUFFLE(2, 2, 2, 2))), yyyy));
	t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col2, col2, _MM_SHUFFLE(2, 2, 2, 2))), zzzz));
	return DVec3({ t_low, t_high });
#elif defined(JPH_USE_NEON)
	float64x2_t xxxx = vdupq_laneq_f64(inV.mValue.val[0], 0);
	float64x2_t yyyy = vdupq_laneq_f64(inV.mValue.val[0], 1);
	float64x2_t zzzz = vdupq_laneq_f64(inV.mValue.val[1], 0);
	float32x4_t col0 = mCol[0].mValue;
	float32x4_t col1 = mCol[1].mValue;
	float32x4_t col2 = mCol[2].mValue;
	float64x2_t t_low = vaddq_f64(mCol3.mValue.val[0], vmulq_f64(vcvt_f64_f32(vget_low_f32(col0)), xxxx));
	t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col1)), yyyy));
	t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col2)), zzzz));
	float64x2_t t_high = vaddq_f64(mCol3.mValue.val[1], vmulq_f64(vcvt_high_f64_f32(col0), xxxx));
	t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col1), yyyy));
	t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col2), zzzz));
	return DVec3({ t_low, t_high }); // Inferred: the excerpt dropped this return
#elif defined(JPH_USE_RVV)
	const vfloat64m2_t xxxx = __riscv_vfmv_v_f_f64m2(inV.mF64[0], 4);
	const vfloat64m2_t yyyy = __riscv_vfmv_v_f_f64m2(inV.mF64[1], 4);
	const vfloat64m2_t zzzz = __riscv_vfmv_v_f_f64m2(inV.mF64[2], 4);

	const vfloat32m1_t col0_f32 = __riscv_vle32_v_f32m1(mCol[0].mF32, 4);
	const vfloat32m1_t col1_f32 = __riscv_vle32_v_f32m1(mCol[1].mF32, 4);
	const vfloat32m1_t col2_f32 = __riscv_vle32_v_f32m1(mCol[2].mF32, 4);

	// Widen the float columns to double before multiplying
	const vfloat64m2_t col0 = __riscv_vfwcvt_f_f_v_f64m2(col0_f32, 4);
	const vfloat64m2_t col1 = __riscv_vfwcvt_f_f_v_f64m2(col1_f32, 4);
	const vfloat64m2_t col2 = __riscv_vfwcvt_f_f_v_f64m2(col2_f32, 4);

	const vfloat64m2_t col3 = __riscv_vle64_v_f64m2(mCol3.mF64, 4);

	vfloat64m2_t t = __riscv_vfmul_vv_f64m2(col0, xxxx, 4);
	t = __riscv_vfmacc_vv_f64m2(t, col1, yyyy, 4);
	t = __riscv_vfmacc_vv_f64m2(t, col2, zzzz, 4);
	t = __riscv_vfadd_vv_f64m2(t, col3, 4);

	// Store the result (declaration and return inferred)
	DVec3 v;
	__riscv_vse64_v_f64m2(v.mF64, t, 4);
	return v;
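
// The next paths appear to be DVec3 DMat44::Multiply3x3(DVec3Arg inV) const
// (signature inferred): the same double-precision transform as above, except
// that the translation column mCol3 is not added.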
#if defined(JPH_USE_AVX)
	__m256d t = _mm256_mul_pd(_mm256_cvtps_pd(mCol[0].mValue), _mm256_set1_pd(inV.mF64[0]));
	t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[1].mValue), _mm256_set1_pd(inV.mF64[1])));
	t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[2].mValue), _mm256_set1_pd(inV.mF64[2])));
	return DVec3::sFixW(t); // Inferred: the excerpt dropped this return
#elif defined(JPH_USE_SSE)
	__m128d xxxx = _mm_set1_pd(inV.mF64[0]);
	__m128d yyyy = _mm_set1_pd(inV.mF64[1]);
	__m128d zzzz = _mm_set1_pd(inV.mF64[2]);
	__m128 col0 = mCol[0].mValue;
	__m128 col1 = mCol[1].mValue;
	__m128 col2 = mCol[2].mValue;
	__m128d t_low = _mm_mul_pd(_mm_cvtps_pd(col0), xxxx);
	t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col1), yyyy));
	t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col2), zzzz));
	__m128d t_high = _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col0, col0, _MM_SHUFFLE(2, 2, 2, 2))), xxxx);
	t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col1, col1, _MM_SHUFFLE(2, 2, 2, 2))), yyyy));
	t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col2, col2, _MM_SHUFFLE(2, 2, 2, 2))), zzzz));
	return DVec3({ t_low, t_high });
#elif defined(JPH_USE_NEON)
	float64x2_t xxxx = vdupq_laneq_f64(inV.mValue.val[0], 0);
	float64x2_t yyyy = vdupq_laneq_f64(inV.mValue.val[0], 1);
	float64x2_t zzzz = vdupq_laneq_f64(inV.mValue.val[1], 0);
	float32x4_t col0 = mCol[0].mValue;
	float32x4_t col1 = mCol[1].mValue;
	float32x4_t col2 = mCol[2].mValue;
	float64x2_t t_low = vmulq_f64(vcvt_f64_f32(vget_low_f32(col0)), xxxx);
	t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col1)), yyyy));
	t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col2)), zzzz));
	float64x2_t t_high = vmulq_f64(vcvt_high_f64_f32(col0), xxxx);
	t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col1), yyyy));
	t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col2), zzzz));
	return DVec3({ t_low, t_high }); // Inferred: the excerpt dropped this return
#elif defined(JPH_USE_RVV)
	const vfloat64m2_t xxxx = __riscv_vfmv_v_f_f64m2(inV.mF64[0], 4);
	const vfloat64m2_t yyyy = __riscv_vfmv_v_f_f64m2(inV.mF64[1], 4);
	const vfloat64m2_t zzzz = __riscv_vfmv_v_f_f64m2(inV.mF64[2], 4);

	const vfloat32m1_t col0 = __riscv_vle32_v_f32m1(mCol[0].mF32, 4);
	const vfloat32m1_t col1 = __riscv_vle32_v_f32m1(mCol[1].mF32, 4);
	const vfloat32m1_t col2 = __riscv_vle32_v_f32m1(mCol[2].mF32, 4);

	const vfloat64m2_t col0_f64 = __riscv_vfwcvt_f_f_v_f64m2(col0, 4);
	const vfloat64m2_t col1_f64 = __riscv_vfwcvt_f_f_v_f64m2(col1, 4);
	const vfloat64m2_t col2_f64 = __riscv_vfwcvt_f_f_v_f64m2(col2, 4);

	vfloat64m2_t t = __riscv_vfmul_vv_f64m2(col0_f64, xxxx, 4);
	t = __riscv_vfmacc_vv_f64m2(t, col1_f64, yyyy, 4);
	t = __riscv_vfmacc_vv_f64m2(t, col2_f64, zzzz, 4);

	// Store the result (declaration and return inferred; the store target is mF64,
	// matching the other RVV paths, where the excerpt had "v.mData")
	DVec3 v;
	__riscv_vse64_v_f64m2(v.mF64, t, 4);
	return v;
#else
	// Scalar fallback (the "#else", "return DVec3(" and "#endif" lines are inferred)
	return DVec3(
		double(mCol[0].mF32[0]) * inV.mF64[0] + double(mCol[1].mF32[0]) * inV.mF64[1] + double(mCol[2].mF32[0]) * inV.mF64[2],
		double(mCol[0].mF32[1]) * inV.mF64[0] + double(mCol[1].mF32[1]) * inV.mF64[1] + double(mCol[2].mF32[1]) * inV.mF64[2],
		double(mCol[0].mF32[2]) * inV.mF64[0] + double(mCol[1].mF32[2]) * inV.mF64[1] + double(mCol[2].mF32[2]) * inV.mF64[2]);
#endif
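
// The loops below appear to form the rotation part of DMat44::operator * (Mat44Arg inM) const
// (signature inferred): each column i of inM is multiplied through the three float columns
// of this matrix and written to result.mCol[i].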
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 3; ++i)
	{
		__m128 c = inM.GetColumn4(i).mValue; // Inferred: the definition of c was dropped by the excerpt
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 3; ++i)
	{
		Type c = inM.GetColumn4(i).mValue; // Inferred: the definition of c was dropped by the excerpt
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_RVV)
	for (int i = 0; i < 3; ++i)
	{
		const Vec4 v = inM.GetColumn4(i); // Inferred: the definition of v was dropped by the excerpt
		const vfloat32m1_t v0 = __riscv_vfmv_v_f_f32m1(v.mF32[0], 4);
		const vfloat32m1_t v1 = __riscv_vfmv_v_f_f32m1(v.mF32[1], 4);
		const vfloat32m1_t v2 = __riscv_vfmv_v_f_f32m1(v.mF32[2], 4);

		const vfloat32m1_t col0 = __riscv_vle32_v_f32m1(mCol[0].mF32, 4);
		const vfloat32m1_t col1 = __riscv_vle32_v_f32m1(mCol[1].mF32, 4);
		const vfloat32m1_t col2 = __riscv_vle32_v_f32m1(mCol[2].mF32, 4);

		vfloat32m1_t t = __riscv_vfmul_vv_f32m1(v0, col0, 4);
		t = __riscv_vfmacc_vv_f32m1(t, col1, v1, 4);
		t = __riscv_vfmacc_vv_f32m1(t, col2, v2, 4);
		__riscv_vse32_v_f32m1(result.mCol[i].mF32, t, 4);
	}
#else
	for (int i = 0; i < 3; ++i)
	{
		Vec4 coli = inM.GetColumn4(i); // Inferred: the definition of coli was dropped by the excerpt
		result.mCol[i] = mCol[0] * coli.mF32[0] + mCol[1] * coli.mF32[1] + mCol[2] * coli.mF32[2];
	}
#endif
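
// The same rotation-part loop again, apparently for DMat44::operator * (DMat44Arg inM) const
// (signature inferred); here the columns of inM are Vec4 members that are read directly.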
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 3; ++i)
	{
		__m128 c = inM.mCol[i].mValue;
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 3; ++i)
	{
		Type c = inM.mCol[i].mValue; // Inferred: the definition of c was dropped by the excerpt
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_RVV)
	for (int i = 0; i < 3; ++i)
	{
		const float *col_i = inM.mCol[i].mF32;
		const vfloat32m1_t v0 = __riscv_vfmv_v_f_f32m1(col_i[0], 4);
		const vfloat32m1_t v1 = __riscv_vfmv_v_f_f32m1(col_i[1], 4);
		const vfloat32m1_t v2 = __riscv_vfmv_v_f_f32m1(col_i[2], 4);

		const vfloat32m1_t col0 = __riscv_vle32_v_f32m1(mCol[0].mF32, 4);
		const vfloat32m1_t col1 = __riscv_vle32_v_f32m1(mCol[1].mF32, 4);
		const vfloat32m1_t col2 = __riscv_vle32_v_f32m1(mCol[2].mF32, 4);

		vfloat32m1_t t = __riscv_vfmul_vv_f32m1(v0, col0, 4);
		t = __riscv_vfmacc_vv_f32m1(t, col1, v1, 4);
		t = __riscv_vfmacc_vv_f32m1(t, col2, v2, 4);
		__riscv_vse32_v_f32m1(result.mCol[i].mF32, t, 4);
	}
#else
	for (int i = 0; i < 3; ++i)
	{
		Vec4 coli = inM.mCol[i];
		result.mCol[i] = mCol[0] * coli.mF32[0] + mCol[1] * coli.mF32[1] + mCol[2] * coli.mF32[2];
	}
#endif
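
// The excerpt ends here; these matrix products presumably finish by filling in the
// translation column (result.mCol3) and returning result, as the rotation loops above
// only write mCol[0..2].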