// NOTE(review): this chunk is the interior of a 4x4 matrix inverse routine (presumably
// Mat44::Inversed) - the enclosing signature, the 'result' declaration, the per-branch
// 'return result;' statements and the scalar-path #else / m00..m33 loads are outside this
// view; confirm against the full file before editing.
// Every branch computes the inverse via Cramer's rule: transpose the column-major storage
// into rows, accumulate the four columns of signed 2x2-cofactor products ("minors"),
// then scale by the reciprocal of the determinant.
// NOTE(review): none of the branches guards against a singular matrix - a zero
// determinant produces inf/NaN components. Callers presumably ensure invertibility.
675#if defined(JPH_USE_SSE)
// SSE path. The shuffle/accumulate sequence matches Intel's classic
// "Streaming SIMD Extensions - Inverse of 4x4 Matrix" reference implementation.
// First transpose the four columns into rows row0..row3.
681 __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
682 __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
683 __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
684 row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
685 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
686 __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
687 __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
688 row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
// Cofactor accumulation: each stanza below multiplies two rows element-wise,
// swizzles the products to line up the 2x2 sub-determinant terms, and adds /
// subtracts them into the minorN accumulators. Statement order matters - tmp1
// is reused and re-swizzled in place between accumulations.
690 tmp1 = _mm_mul_ps(row2, row3);
691 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
692 __m128 minor0 = _mm_mul_ps(row1, tmp1);
693 __m128 minor1 = _mm_mul_ps(row0, tmp1);
694 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
695 minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
696 minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
697 minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
699 tmp1 = _mm_mul_ps(row1, row2);
700 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
701 minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
702 __m128 minor3 = _mm_mul_ps(row0, tmp1);
703 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
704 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
705 minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
706 minor3 = _mm_shuffle_ps(minor3, minor3, _MM_SHUFFLE(1, 0, 3, 2));
// Note: row2 itself is permanently re-swizzled here (half-swap) so the remaining
// stanzas use the rotated form.
708 tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
709 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
710 row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
711 minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
712 __m128 minor2 = _mm_mul_ps(row0, tmp1);
713 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
714 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
715 minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
716 minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
718 tmp1 = _mm_mul_ps(row0, row1);
719 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
720 minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
721 minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
722 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
723 minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
724 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
726 tmp1 = _mm_mul_ps(row0, row3);
727 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
728 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
729 minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
730 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
731 minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
732 minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
734 tmp1 = _mm_mul_ps(row0, row2);
735 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
736 minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
737 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
738 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
739 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
740 minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
// Determinant = dot(row0, minor0): horizontal add across the 4 lanes, then take
// the reciprocal (plain divide, full precision - no rcpps approximation) and
// broadcast it to all lanes.
742 __m128 det = _mm_mul_ps(row0, minor0);
743 det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det);
744 det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
745 det = _mm_div_ss(_mm_set_ss(1.0f), det);
746 det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
// Scale the four cofactor columns by 1/det to form the inverse.
749 result.mCol[0].
mValue = _mm_mul_ps(det, minor0);
750 result.mCol[1].
mValue = _mm_mul_ps(det, minor1);
751 result.mCol[2].
mValue = _mm_mul_ps(det, minor2);
752 result.mCol[3].
mValue = _mm_mul_ps(det, minor3);
754#elif defined(JPH_USE_NEON)
// NEON path: same Cramer's-rule sequence as the SSE path above, expressed with
// JPH_NEON_SHUFFLE_F32x4. NOTE(review): the lane indices here appear to list lanes
// in natural order (0..3 = first arg, 4..7 = second arg), whereas _MM_SHUFFLE lists
// them high-to-low - hence the mirrored-looking index patterns; verify against the
// macro's definition.
756 Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
757 Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
758 Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
759 row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
760 tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
761 Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
762 Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
763 row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
// Cofactor accumulation (mirrors SSE lines 690-740).
765 tmp1 = vmulq_f32(row2, row3);
766 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
767 Type minor0 = vmulq_f32(row1, tmp1);
768 Type minor1 = vmulq_f32(row0, tmp1);
769 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
770 minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
771 minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
772 minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
774 tmp1 = vmulq_f32(row1, row2);
775 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
776 minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
777 Type minor3 = vmulq_f32(row0, tmp1);
778 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
779 minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
780 minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
781 minor3 = JPH_NEON_SHUFFLE_F32x4(minor3, minor3, 2, 3, 0, 1);
// row2 is permanently half-swapped here, as in the SSE path.
783 tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
784 tmp1 = vmulq_f32(tmp1, row3);
785 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
786 row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
787 minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
788 Type minor2 = vmulq_f32(row0, tmp1);
789 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
790 minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
791 minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
792 minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
794 tmp1 = vmulq_f32(row0, row1);
795 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
796 minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
797 minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
798 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
799 minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
800 minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
802 tmp1 = vmulq_f32(row0, row3);
803 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
804 minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
805 minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
806 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
807 minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
808 minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
810 tmp1 = vmulq_f32(row0, row2);
811 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
812 minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
813 minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
814 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
815 minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
816 minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);
// Determinant = dot(row0, minor0) via vaddvq_f32 horizontal add; reciprocal by
// full-precision divide, broadcast to all lanes.
818 Type det = vmulq_f32(row0, minor0);
819 det = vdupq_n_f32(vaddvq_f32(det));
820 det = vdivq_f32(vdupq_n_f32(1.0f), det);
// Scale the cofactor columns by 1/det.
823 result.mCol[0].
mValue = vmulq_f32(det, minor0);
824 result.mCol[1].
mValue = vmulq_f32(det, minor1);
825 result.mCol[2].
mValue = vmulq_f32(det, minor2);
826 result.mCol[3].
mValue = vmulq_f32(det, minor3);
828#elif defined(JPH_USE_RVV)
// RISC-V Vector path: same algorithm with explicit vl = 4 on every op.
// 'zeros' only seeds element 0 of the reduction below (vfredusum reads just the
// scalar element of its seed operand), hence it is built with vl = 1.
830 const vfloat32m1_t zeros = __riscv_vfmv_v_f_f32m1(0.0f, 1);
// Load the four columns from their float arrays.
832 const vfloat32m1_t c0 = __riscv_vle32_v_f32m1(mCol[0].mF32, 4);
833 const vfloat32m1_t c1 = __riscv_vle32_v_f32m1(mCol[1].mF32, 4);
834 const vfloat32m1_t c2 = __riscv_vle32_v_f32m1(mCol[2].mF32, 4);
835 const vfloat32m1_t c3 = __riscv_vle32_v_f32m1(mCol[3].mF32, 4);
837 vfloat32m1_t minor0, minor1, minor2, minor3;
839 vfloat32m1_t row0, row1, row2, row3;
// Transpose columns into rows (same lane-index convention as the NEON macro).
841 tmp1 = RVVShuffleFloat32x4<0, 1, 4, 5>(c0, c1);
842 row1 = RVVShuffleFloat32x4<0, 1, 4, 5>(c2, c3);
843 row0 = RVVShuffleFloat32x4<0, 2, 4, 6>(tmp1, row1);
844 row1 = RVVShuffleFloat32x4<1, 3, 5, 7>(row1, tmp1);
845 tmp1 = RVVShuffleFloat32x4<2, 3, 6, 7>(c0, c1);
846 row3 = RVVShuffleFloat32x4<2, 3, 6, 7>(c2, c3);
847 row2 = RVVShuffleFloat32x4<0, 2, 4, 6>(tmp1, row3);
848 row3 = RVVShuffleFloat32x4<1, 3, 5, 7>(row3, tmp1);
// Cofactor accumulation (mirrors the SSE/NEON stanzas above).
850 tmp1 = __riscv_vfmul_vv_f32m1(row2, row3, 4);
851 tmp1 = RVVShuffleFloat32x4<1, 0, 3, 2>(tmp1, tmp1);
852 minor0 = __riscv_vfmul_vv_f32m1(row1, tmp1, 4);
853 minor1 = __riscv_vfmul_vv_f32m1(row0, tmp1, 4);
854 tmp1 = RVVShuffleFloat32x4<2, 3, 0, 1>(tmp1, tmp1);
855 minor0 = __riscv_vfsub_vv_f32m1(__riscv_vfmul_vv_f32m1(row1, tmp1, 4), minor0, 4);
856 minor1 = __riscv_vfsub_vv_f32m1(__riscv_vfmul_vv_f32m1(row0, tmp1, 4), minor1, 4);
857 minor1 = RVVShuffleFloat32x4<2, 3, 0, 1>(minor1, minor1);
859 tmp1 = __riscv_vfmul_vv_f32m1(row1, row2, 4);
860 tmp1 = RVVShuffleFloat32x4<1, 0, 3, 2>(tmp1, tmp1);
861 minor0 = __riscv_vfadd_vv_f32m1(__riscv_vfmul_vv_f32m1(row3, tmp1, 4), minor0, 4);
862 minor3 = __riscv_vfmul_vv_f32m1(row0, tmp1, 4);
863 tmp1 = RVVShuffleFloat32x4<2, 3, 0, 1>(tmp1, tmp1);
864 minor0 = __riscv_vfsub_vv_f32m1(minor0, __riscv_vfmul_vv_f32m1(row3, tmp1, 4), 4);
865 minor3 = __riscv_vfsub_vv_f32m1(__riscv_vfmul_vv_f32m1(row0, tmp1, 4), minor3, 4);
866 minor3 = RVVShuffleFloat32x4<2, 3, 0, 1>(minor3, minor3);
// row2 is permanently half-swapped here, as in the other paths.
868 tmp1 = RVVShuffleFloat32x4<2, 3, 0, 1>(row1, row1);
869 tmp1 = __riscv_vfmul_vv_f32m1(tmp1, row3, 4);
870 tmp1 = RVVShuffleFloat32x4<1, 0, 3, 2>(tmp1, tmp1);
871 row2 = RVVShuffleFloat32x4<2, 3, 0, 1>(row2, row2);
872 minor0 = __riscv_vfadd_vv_f32m1(__riscv_vfmul_vv_f32m1(row2, tmp1, 4), minor0, 4);
873 minor2 = __riscv_vfmul_vv_f32m1(row0, tmp1, 4);
874 tmp1 = RVVShuffleFloat32x4<2, 3, 0, 1>(tmp1, tmp1);
875 minor0 = __riscv_vfsub_vv_f32m1(minor0, __riscv_vfmul_vv_f32m1(row2, tmp1, 4), 4);
876 minor2 = __riscv_vfsub_vv_f32m1(__riscv_vfmul_vv_f32m1(row0, tmp1, 4), minor2, 4);
877 minor2 = RVVShuffleFloat32x4<2, 3, 0, 1>(minor2, minor2);
879 tmp1 = __riscv_vfmul_vv_f32m1(row0, row1, 4);
880 tmp1 = RVVShuffleFloat32x4<1, 0, 3, 2>(tmp1, tmp1);
881 minor2 = __riscv_vfadd_vv_f32m1(__riscv_vfmul_vv_f32m1(row3, tmp1, 4), minor2, 4);
882 minor3 = __riscv_vfsub_vv_f32m1(__riscv_vfmul_vv_f32m1(row2, tmp1, 4), minor3, 4);
883 tmp1 = RVVShuffleFloat32x4<2, 3, 0, 1>(tmp1, tmp1);
884 minor2 = __riscv_vfsub_vv_f32m1(__riscv_vfmul_vv_f32m1(row3, tmp1, 4), minor2, 4);
885 minor3 = __riscv_vfsub_vv_f32m1(minor3, __riscv_vfmul_vv_f32m1(row2, tmp1, 4), 4);
887 tmp1 = __riscv_vfmul_vv_f32m1(row0, row3, 4);
888 tmp1 = RVVShuffleFloat32x4<1, 0, 3, 2>(tmp1, tmp1);
889 minor1 = __riscv_vfsub_vv_f32m1(minor1, __riscv_vfmul_vv_f32m1(row2, tmp1, 4), 4);
890 minor2 = __riscv_vfadd_vv_f32m1(__riscv_vfmul_vv_f32m1(row1, tmp1, 4), minor2, 4);
891 tmp1 = RVVShuffleFloat32x4<2, 3, 0, 1>(tmp1, tmp1);
892 minor1 = __riscv_vfadd_vv_f32m1(__riscv_vfmul_vv_f32m1(row2, tmp1, 4), minor1, 4);
893 minor2 = __riscv_vfsub_vv_f32m1(minor2, __riscv_vfmul_vv_f32m1(row1, tmp1, 4), 4);
895 tmp1 = __riscv_vfmul_vv_f32m1(row0, row2, 4);
896 tmp1 = RVVShuffleFloat32x4<1, 0, 3, 2>(tmp1, tmp1);
897 minor1 = __riscv_vfadd_vv_f32m1(__riscv_vfmul_vv_f32m1(row3, tmp1, 4), minor1, 4);
898 minor3 = __riscv_vfsub_vv_f32m1(minor3, __riscv_vfmul_vv_f32m1(row1, tmp1, 4), 4);
899 tmp1 = RVVShuffleFloat32x4<2, 3, 0, 1>(tmp1, tmp1);
900 minor1 = __riscv_vfsub_vv_f32m1(minor1, __riscv_vfmul_vv_f32m1(row3, tmp1, 4), 4);
901 minor3 = __riscv_vfadd_vv_f32m1(__riscv_vfmul_vv_f32m1(row1, tmp1, 4), minor3, 4);
// Determinant = dot(row0, minor0): unordered reduction sum, then compute 1/det in
// scalar code and broadcast.
903 const vfloat32m1_t v_det = __riscv_vfmul_vv_f32m1(row0, minor0, 4);
904 const vfloat32m1_t sum_vec = __riscv_vfredusum_vs_f32m1_f32m1(v_det, zeros, 4);
905 const float s_det = __riscv_vfmv_f_s_f32m1_f32(sum_vec);
906 const vfloat32m1_t det_inv = __riscv_vfmv_v_f_f32m1(1.0f / s_det, 4);
// Scale the cofactor columns by 1/det.
908 minor0 = __riscv_vfmul_vv_f32m1(det_inv, minor0, 4);
909 minor1 = __riscv_vfmul_vv_f32m1(det_inv, minor1, 4);
910 minor2 = __riscv_vfmul_vv_f32m1(det_inv, minor2, 4);
911 minor3 = __riscv_vfmul_vv_f32m1(det_inv, minor3, 4);
// Store the result columns back to their float arrays.
914 __riscv_vse32_v_f32m1(result.mCol[0].
mF32, minor0, 4);
915 __riscv_vse32_v_f32m1(result.mCol[1].
mF32, minor1, 4);
916 __riscv_vse32_v_f32m1(result.mCol[2].
mF32, minor2, 4);
917 __riscv_vse32_v_f32m1(result.mCol[3].
mF32, minor3, 4);
// Scalar fallback (the #else and the m00..m33 element loads are above this view).
// Precompute the 18 distinct 2x2 sub-determinants of the lower three rows; each
// name concatenates its four factor indices: mABCDEFGH == mAB * mCD - mEF * mGH,
// e.g. m10211120 == m10 * m21 - m11 * m20.
925 float m10211120 = m10 * m21 - m11 * m20;
926 float m10221220 = m10 * m22 - m12 * m20;
927 float m10231320 = m10 * m23 - m13 * m20;
928 float m10311130 = m10 * m31 - m11 * m30;
929 float m10321230 = m10 * m32 - m12 * m30;
930 float m10331330 = m10 * m33 - m13 * m30;
931 float m11221221 = m11 * m22 - m12 * m21;
932 float m11231321 = m11 * m23 - m13 * m21;
933 float m11321231 = m11 * m32 - m12 * m31;
934 float m11331331 = m11 * m33 - m13 * m31;
935 float m12231322 = m12 * m23 - m13 * m22;
936 float m12331332 = m12 * m33 - m13 * m32;
937 float m20312130 = m20 * m31 - m21 * m30;
938 float m20322230 = m20 * m32 - m22 * m30;
939 float m20332330 = m20 * m33 - m23 * m30;
940 float m21322231 = m21 * m32 - m22 * m31;
941 float m21332331 = m21 * m33 - m23 * m31;
942 float m22332332 = m22 * m33 - m23 * m32;
// Columns of the adjugate (transposed cofactor matrix), built by 3x3 cofactor
// expansion using the sub-determinants above.
944 Vec4 col0(m11 * m22332332 - m12 * m21332331 + m13 * m21322231, -m10 * m22332332 + m12 * m20332330 - m13 * m20322230, m10 * m21332331 - m11 * m20332330 + m13 * m20312130, -m10 * m21322231 + m11 * m20322230 - m12 * m20312130);
945 Vec4 col1(-m01 * m22332332 + m02 * m21332331 - m03 * m21322231, m00 * m22332332 - m02 * m20332330 + m03 * m20322230, -m00 * m21332331 + m01 * m20332330 - m03 * m20312130, m00 * m21322231 - m01 * m20322230 + m02 * m20312130);
946 Vec4 col2(m01 * m12331332 - m02 * m11331331 + m03 * m11321231, -m00 * m12331332 + m02 * m10331330 - m03 * m10321230, m00 * m11331331 - m01 * m10331330 + m03 * m10311130, -m00 * m11321231 + m01 * m10321230 - m02 * m10311130);
947 Vec4 col3(-m01 * m12231322 + m02 * m11231321 - m03 * m11221221, m00 * m12231322 - m02 * m10231320 + m03 * m10221220, -m00 * m11231321 + m01 * m10231320 - m03 * m10211120, m00 * m11221221 - m01 * m10221220 + m02 * m10211120);
// Determinant by cofactor expansion along the first column of the adjugate
// (equivalently the first row of the matrix).
949 float det = m00 * col0.
mF32[0] + m01 * col0.
mF32[1] + m02 * col0.
mF32[2] + m03 * col0.
mF32[3];
// Divide the adjugate by the determinant to get the inverse (no det == 0 check).
951 return Mat44(col0 / det, col1 / det, col2 / det, col3 / det);