540#if defined(JPH_USE_SSE)
546 __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
547 __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
548 __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
549 row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
550 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
551 __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
552 __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
553 row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
555 tmp1 = _mm_mul_ps(row2, row3);
556 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
557 __m128 minor0 = _mm_mul_ps(row1, tmp1);
558 __m128 minor1 = _mm_mul_ps(row0, tmp1);
559 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
560 minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
561 minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
562 minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
564 tmp1 = _mm_mul_ps(row1, row2);
565 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
566 minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
567 __m128 minor3 = _mm_mul_ps(row0, tmp1);
568 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
569 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
570 minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
571 minor3 = _mm_shuffle_ps(minor3, minor3, _MM_SHUFFLE(1, 0, 3, 2));
573 tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
574 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
575 row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
576 minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
577 __m128 minor2 = _mm_mul_ps(row0, tmp1);
578 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
579 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
580 minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
581 minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
583 tmp1 = _mm_mul_ps(row0, row1);
584 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
585 minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
586 minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
587 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
588 minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
589 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
591 tmp1 = _mm_mul_ps(row0, row3);
592 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
593 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
594 minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
595 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
596 minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
597 minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
599 tmp1 = _mm_mul_ps(row0, row2);
600 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
601 minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
602 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
603 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
604 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
605 minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
607 __m128 det = _mm_mul_ps(row0, minor0);
608 det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det);
609 det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
610 det = _mm_div_ss(_mm_set_ss(1.0f), det);
611 det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
614 result.mCol[0].
mValue = _mm_mul_ps(det, minor0);
615 result.mCol[1].
mValue = _mm_mul_ps(det, minor1);
616 result.mCol[2].
mValue = _mm_mul_ps(det, minor2);
617 result.mCol[3].
mValue = _mm_mul_ps(det, minor3);
619#elif defined(JPH_USE_NEON)
621 Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
622 Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
623 Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
624 row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
625 tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
626 Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
627 Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
628 row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
630 tmp1 = vmulq_f32(row2, row3);
631 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
632 Type minor0 = vmulq_f32(row1, tmp1);
633 Type minor1 = vmulq_f32(row0, tmp1);
634 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
635 minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
636 minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
637 minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
639 tmp1 = vmulq_f32(row1, row2);
640 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
641 minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
642 Type minor3 = vmulq_f32(row0, tmp1);
643 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
644 minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
645 minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
646 minor3 = JPH_NEON_SHUFFLE_F32x4(minor3, minor3, 2, 3, 0, 1);
648 tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
649 tmp1 = vmulq_f32(tmp1, row3);
650 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
651 row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
652 minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
653 Type minor2 = vmulq_f32(row0, tmp1);
654 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
655 minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
656 minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
657 minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
659 tmp1 = vmulq_f32(row0, row1);
660 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
661 minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
662 minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
663 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
664 minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
665 minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
667 tmp1 = vmulq_f32(row0, row3);
668 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
669 minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
670 minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
671 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
672 minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
673 minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
675 tmp1 = vmulq_f32(row0, row2);
676 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
677 minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
678 minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
679 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
680 minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
681 minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);
683 Type det = vmulq_f32(row0, minor0);
684 det = vdupq_n_f32(vaddvq_f32(det));
685 det = vdivq_f32(vdupq_n_f32(1.0f), det);
688 result.mCol[0].
mValue = vmulq_f32(det, minor0);
689 result.mCol[1].
mValue = vmulq_f32(det, minor1);
690 result.mCol[2].
mValue = vmulq_f32(det, minor2);
691 result.mCol[3].
mValue = vmulq_f32(det, minor3);
699 float m10211120 = m10 * m21 - m11 * m20;
700 float m10221220 = m10 * m22 - m12 * m20;
701 float m10231320 = m10 * m23 - m13 * m20;
702 float m10311130 = m10 * m31 - m11 * m30;
703 float m10321230 = m10 * m32 - m12 * m30;
704 float m10331330 = m10 * m33 - m13 * m30;
705 float m11221221 = m11 * m22 - m12 * m21;
706 float m11231321 = m11 * m23 - m13 * m21;
707 float m11321231 = m11 * m32 - m12 * m31;
708 float m11331331 = m11 * m33 - m13 * m31;
709 float m12231322 = m12 * m23 - m13 * m22;
710 float m12331332 = m12 * m33 - m13 * m32;
711 float m20312130 = m20 * m31 - m21 * m30;
712 float m20322230 = m20 * m32 - m22 * m30;
713 float m20332330 = m20 * m33 - m23 * m30;
714 float m21322231 = m21 * m32 - m22 * m31;
715 float m21332331 = m21 * m33 - m23 * m31;
716 float m22332332 = m22 * m33 - m23 * m32;
718 Vec4 col0(m11 * m22332332 - m12 * m21332331 + m13 * m21322231, -m10 * m22332332 + m12 * m20332330 - m13 * m20322230, m10 * m21332331 - m11 * m20332330 + m13 * m20312130, -m10 * m21322231 + m11 * m20322230 - m12 * m20312130);
719 Vec4 col1(-m01 * m22332332 + m02 * m21332331 - m03 * m21322231, m00 * m22332332 - m02 * m20332330 + m03 * m20322230, -m00 * m21332331 + m01 * m20332330 - m03 * m20312130, m00 * m21322231 - m01 * m20322230 + m02 * m20312130);
720 Vec4 col2(m01 * m12331332 - m02 * m11331331 + m03 * m11321231, -m00 * m12331332 + m02 * m10331330 - m03 * m10321230, m00 * m11331331 - m01 * m10331330 + m03 * m10311130, -m00 * m11321231 + m01 * m10321230 - m02 * m10311130);
721 Vec4 col3(-m01 * m12231322 + m02 * m11231321 - m03 * m11221221, m00 * m12231322 - m02 * m10231320 + m03 * m10221220, -m00 * m11231321 + m01 * m10231320 - m03 * m10211120, m00 * m11221221 - m01 * m10221220 + m02 * m10211120);
723 float det = m00 * col0.
mF32[0] + m01 * col0.
mF32[1] + m02 * col0.
mF32[2] + m03 * col0.
mF32[3];
725 return Mat44(col0 / det, col1 / det, col2 / det, col3 / det);