btScalar d = q.length2();
btFullAssert(d != btScalar(0.0));
btScalar s = btScalar(2.0) / d;

#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    __m128 vs, Q = q.get128();
    __m128i Qi = btCastfTo128i(Q);
    __m128 Y, Z;
    __m128 V1, V2, V3;
    __m128 V11, V21, V31;
    __m128 NQ = _mm_xor_ps(Q, btvMzeroMask);
    __m128i NQi = btCastfTo128i(NQ);

    V1 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1,0,2,3)));   // Y X Z W
    V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0,0,1,3));                  // -X -X  Y  W
    V3 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(2,1,0,3)));   // Z Y X W
    V1 = _mm_xor_ps(V1, vMPPP);                                       // change the sign of the first element

    V11 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1,1,0,3)));  // Y Y X W
    V21 = _mm_unpackhi_ps(Q, Q);                                      //  Z  Z  W  W
    V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0,2,0,3));                 //  X  Z -X -W

    V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2,3,1,3));                 // -Z -W  Y  W
    V21 = _mm_xor_ps(V21, vMPPP);                                     // change the sign of the first element
    V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3,3,1,3));                 //  W  W -Y -W
    V31 = _mm_xor_ps(V31, vMPPP);                                     // change the sign of the first element
    Y = btCastiTo128f(_mm_shuffle_epi32(NQi, BT_SHUFFLE(3,2,0,3)));   // -W -Z -X -W
    Z = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1,0,1,3)));    //  Y  X  Y  W

    vs = _mm_load_ss(&s);
    vs = bt_splat3_ps(vs, 0);
    // (the rest of the SSE path is omitted in this excerpt)
#else
    btScalar xs = q.x() * s, ys = q.y() * s, zs = q.z() * s;
    btScalar wx = q.w() * xs, wy = q.w() * ys, wz = q.w() * zs;
    btScalar xx = q.x() * xs, xy = q.x() * ys, xz = q.x() * zs;
    btScalar yy = q.y() * ys, yz = q.y() * zs, zz = q.z() * zs;
    setValue(
        btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
        xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
        xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
#endif
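// The scalar branch above is the standard quaternion-to-matrix expansion: with
// s = 2 / |q|^2 (exactly 2 for a unit quaternion), each diagonal entry is
// 1 - (q_j^2 + q_k^2) * s and each off-diagonal entry is q_i*q_j*s +/- q_w*q_k*s.
// Illustrative use only (not part of this header; it assumes the public
// btQuaternion/btMatrix3x3 API declared elsewhere in Bullet):
//
//     btQuaternion q(btVector3(0, 0, 1), SIMD_HALF_PI);  // 90 degrees about Z
//     btMatrix3x3 R;
//     R.setRotation(q);
//     btVector3 v = R * btVector3(1, 0, 0);               // approximately (0, 1, 0)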
/** @brief Set the matrix from Euler angles: yaw, pitch and roll applied about the Y, X and Z axes respectively. */

/** @brief Fill the rotational part of an OpenGL (column-major) matrix
 * @param m The array to be filled */
void getOpenGLSubMatrix(btScalar *m) const
{
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    __m128 v0 = m_el[0].mVec128;
    __m128 v1 = m_el[1].mVec128;
    __m128 v2 = m_el[2].mVec128;  // x2 y2 z2 w2
    __m128 *vm = (__m128 *)m;
    __m128 vT;

    v2 = _mm_and_ps(v2, btvFFF0fMask);  // x2 y2 z2 0

    vT = _mm_unpackhi_ps(v0, v1);  // z0 z1 * *
    v0 = _mm_unpacklo_ps(v0, v1);  // x0 x1 y0 y1

    v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));  // y0 y1 y2 0
    v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));  // x0 x1 x2 0
    v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // z0 z1 z2 0

    vm[0] = v0;
    vm[1] = v1;
    vm[2] = v2;
#elif defined(BT_USE_NEON)
    // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
    static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
    float32x4_t *vm = (float32x4_t *)m;
    float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
    float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );  // {x2 0}, {y2 0}
    float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
    float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
    float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
    float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );  // z0 z1 z2 0

    vm[0] = v0;
    vm[1] = v1;
    vm[2] = v2;
#else
    m[0]  = btScalar(m_el[0].x());
    m[1]  = btScalar(m_el[1].x());
    m[2]  = btScalar(m_el[2].x());
    m[3]  = btScalar(0.0);
    m[4]  = btScalar(m_el[0].y());
    m[5]  = btScalar(m_el[1].y());
    m[6]  = btScalar(m_el[2].y());
    m[7]  = btScalar(0.0);
    m[8]  = btScalar(m_el[0].z());
    m[9]  = btScalar(m_el[1].z());
    m[10] = btScalar(m_el[2].z());
    m[11] = btScalar(0.0);
#endif
}
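// getOpenGLSubMatrix() emits the basis in column-major order, as OpenGL expects:
// m[0..2] receive the first column, m[4..6] the second, m[8..10] the third, and
// m[3], m[7], m[11] are cleared. Note that the SSE path stores whole __m128
// vectors through the pointer, so the destination array is assumed to be
// suitably aligned. Illustrative caller code (not part of this header; 'basis'
// is a hypothetical btMatrix3x3):
//
//     btScalar gl[16];                            // 16-byte aligned for the SSE path
//     basis.getOpenGLSubMatrix(gl);               // upper-left 3x3, column-major
//     gl[12] = gl[13] = gl[14] = btScalar(0.0);   // translation
//     gl[15] = btScalar(1.0);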
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator*=(const btMatrix3x3& m)
{
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    __m128 rv00, rv01, rv02;
    __m128 rv10, rv11, rv12;
    __m128 rv20, rv21, rv22;
    __m128 mv0, mv1, mv2;

    rv02 = m_el[0].mVec128;
    rv12 = m_el[1].mVec128;
    rv22 = m_el[2].mVec128;

    mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask);
    mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask);
    mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask);

    rv00 = bt_splat_ps(rv02, 0);
    rv01 = bt_splat_ps(rv02, 1);
    rv02 = bt_splat_ps(rv02, 2);

    rv00 = _mm_mul_ps(rv00, mv0);
    rv01 = _mm_mul_ps(rv01, mv1);
    rv02 = _mm_mul_ps(rv02, mv2);

    rv10 = bt_splat_ps(rv12, 0);
    rv11 = bt_splat_ps(rv12, 1);
    rv12 = bt_splat_ps(rv12, 2);

    rv10 = _mm_mul_ps(rv10, mv0);
    rv11 = _mm_mul_ps(rv11, mv1);
    rv12 = _mm_mul_ps(rv12, mv2);

    rv20 = bt_splat_ps(rv22, 0);
    rv21 = bt_splat_ps(rv22, 1);
    rv22 = bt_splat_ps(rv22, 2);

    rv20 = _mm_mul_ps(rv20, mv0);
    rv21 = _mm_mul_ps(rv21, mv1);
    rv22 = _mm_mul_ps(rv22, mv2);

    rv00 = _mm_add_ps(rv00, rv01);
    rv10 = _mm_add_ps(rv10, rv11);
    rv20 = _mm_add_ps(rv20, rv21);

    m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
    m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
    m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
#elif defined(BT_USE_NEON)
    float32x4_t rv0, rv1, rv2;
    float32x4_t v0, v1, v2;
    float32x4_t mv0, mv1, mv2;

    v0 = m_el[0].mVec128;
    v1 = m_el[1].mVec128;
    v2 = m_el[2].mVec128;

    mv0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
    mv1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
    mv2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

    rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
    rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
    rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

    rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
    rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
    rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

    rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
    rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
    rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);

    m_el[0].mVec128 = rv0;
    m_el[1].mVec128 = rv1;
    m_el[2].mVec128 = rv2;
#else
    setValue(
        m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
        m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
        m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
#endif
    return *this;
}
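// Both SIMD branches of operator*= build each destination row the same way:
// broadcast one element of row i of *this across a register (bt_splat_ps /
// vmulq_lane_f32) and multiply-accumulate it against a whole row of m, so that
// row_i = a_i0*m[0] + a_i1*m[1] + a_i2*m[2]. The scalar fallback states the
// same sums through tdotx/tdoty/tdotz, which dot a row of *this with a column of m.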
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator+=(const btMatrix3x3& m)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) || defined (BT_USE_NEON)
    m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
    m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
    m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
#else
    setValue(
        m_el[0][0]+m.m_el[0][0],
        m_el[0][1]+m.m_el[0][1],
        m_el[0][2]+m.m_el[0][2],
        m_el[1][0]+m.m_el[1][0],
        m_el[1][1]+m.m_el[1][1],
        m_el[1][2]+m.m_el[1][2],
        m_el[2][0]+m.m_el[2][0],
        m_el[2][1]+m.m_el[2][1],
        m_el[2][2]+m.m_el[2][2]);
#endif
    return *this;
}
SIMD_FORCE_INLINE btMatrix3x3
operator*(const btMatrix3x3& m, const btScalar& k)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
    __m128 vk = bt_splat_ps(_mm_load_ss((float *)&k), 0x80);
    return btMatrix3x3(
        _mm_mul_ps(m[0].mVec128, vk),
        _mm_mul_ps(m[1].mVec128, vk),
        _mm_mul_ps(m[2].mVec128, vk));
#elif defined(BT_USE_NEON)
    return btMatrix3x3(
        vmulq_n_f32(m[0].mVec128, k),
        vmulq_n_f32(m[1].mVec128, k),
        vmulq_n_f32(m[2].mVec128, k));
#else
    return btMatrix3x3(
        m[0].x()*k, m[0].y()*k, m[0].z()*k,
        m[1].x()*k, m[1].y()*k, m[1].z()*k,
        m[2].x()*k, m[2].y()*k, m[2].z()*k);
#endif
}
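// In the SSE branch, bt_splat_ps(_mm_load_ss(&k), 0x80) broadcasts k into the
// x, y and z lanes while leaving the fourth lane zero, so the unused w
// component of each row is not polluted by the multiply; vmulq_n_f32 is the
// corresponding NEON broadcast-multiply.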
SIMD_FORCE_INLINE btMatrix3x3
operator+(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) || defined (BT_USE_NEON)
    return btMatrix3x3(
        m1[0].mVec128 + m2[0].mVec128,
        m1[1].mVec128 + m2[1].mVec128,
        m1[2].mVec128 + m2[2].mVec128);
#else
    return btMatrix3x3(
        m1[0][0]+m2[0][0], m1[0][1]+m2[0][1], m1[0][2]+m2[0][2],
        m1[1][0]+m2[1][0], m1[1][1]+m2[1][1], m1[1][2]+m2[1][2],
        m1[2][0]+m2[2][0], m1[2][1]+m2[2][1], m1[2][2]+m2[2][2]);
#endif
}
SIMD_FORCE_INLINE btMatrix3x3
operator-(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) || defined (BT_USE_NEON)
    return btMatrix3x3(
        m1[0].mVec128 - m2[0].mVec128,
        m1[1].mVec128 - m2[1].mVec128,
        m1[2].mVec128 - m2[2].mVec128);
#else
    return btMatrix3x3(
        m1[0][0]-m2[0][0], m1[0][1]-m2[0][1], m1[0][2]-m2[0][2],
        m1[1][0]-m2[1][0], m1[1][1]-m2[1][1], m1[1][2]-m2[1][2],
        m1[2][0]-m2[2][0], m1[2][1]-m2[2][1], m1[2][2]-m2[2][2]);
#endif
}
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator-=(const btMatrix3x3& m)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) || defined (BT_USE_NEON)
    m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
    m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
    m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
#else
    setValue(
        m_el[0][0]-m.m_el[0][0],
        m_el[0][1]-m.m_el[0][1],
        m_el[0][2]-m.m_el[0][2],
        m_el[1][0]-m.m_el[1][0],
        m_el[1][1]-m.m_el[1][1],
        m_el[1][2]-m.m_el[1][2],
        m_el[2][0]-m.m_el[2][0],
        m_el[2][1]-m.m_el[2][1],
        m_el[2][2]-m.m_el[2][2]);
#endif
    return *this;
}
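// The element-wise operators (+, -, +=, -=) work row by row: the SSE/NEON
// branches add or subtract whole four-lane rows in three vector instructions,
// while the scalar fallbacks spell out the nine per-element sums explicitly.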
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::absolute() const
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
    return btMatrix3x3(
        _mm_and_ps(m_el[0].mVec128, btvAbsfMask),
        _mm_and_ps(m_el[1].mVec128, btvAbsfMask),
        _mm_and_ps(m_el[2].mVec128, btvAbsfMask));
#elif defined(BT_USE_NEON)
    return btMatrix3x3(
        (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
        (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
        (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
#else
    return btMatrix3x3(
        btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()),
        btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()),
        btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z()));
#endif
}
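// absolute() takes the component-wise magnitude by clearing the IEEE-754 sign
// bit with a bitwise AND against a constant mask (btvAbsfMask on SSE,
// btv3AbsMask on NEON), the branch-free SIMD counterpart of calling btFabs()
// on every element as the fallback does.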
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::transpose() const
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
    __m128 v0 = m_el[0].mVec128;
    __m128 v1 = m_el[1].mVec128;
    __m128 v2 = m_el[2].mVec128;  // x2 y2 z2 w2
    __m128 vT;

    v2 = _mm_and_ps(v2, btvFFF0fMask);  // x2 y2 z2 0

    vT = _mm_unpackhi_ps(v0, v1);  // z0 z1 * *
    v0 = _mm_unpacklo_ps(v0, v1);  // x0 x1 y0 y1

    v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));  // y0 y1 y2 0
    v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));  // x0 x1 x2 0
    v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // z0 z1 z2 0

    return btMatrix3x3( v0, v1, v2 );
#elif defined(BT_USE_NEON)
    // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
    static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
    float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
    float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );  // {x2 0}, {y2 0}
    float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
    float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
    float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
    float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );  // z0 z1 z2 0
    return btMatrix3x3( v0, v1, v2 );
#else
    return btMatrix3x3( m_el[0].x(), m_el[1].x(), m_el[2].x(),
                        m_el[0].y(), m_el[1].y(), m_el[2].y(),
                        m_el[0].z(), m_el[1].z(), m_el[2].z());
#endif
}
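// The SSE transpose is pure shuffling: unpacklo/unpackhi interleave rows 0 and
// 1, two shuffles against the masked row 2 assemble the new x and y rows, and
// _mm_move_sd combines z0 z1 from the unpacked pair with z2 0 from the masked
// third row to finish the z row. Both SIMD paths return rows whose w lane is
// zero; timesTranspose() below relies on that.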
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
    // static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
    __m128 row = m_el[0].mVec128;
    __m128 m0 = _mm_and_ps( m.getRow(0).mVec128, btvFFF0fMask );
    __m128 m1 = _mm_and_ps( m.getRow(1).mVec128, btvFFF0fMask );
    __m128 m2 = _mm_and_ps( m.getRow(2).mVec128, btvFFF0fMask );
    __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
    __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
    __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
    row = m_el[1].mVec128;
    r0 = _mm_add_ps( r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
    r1 = _mm_add_ps( r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
    r2 = _mm_add_ps( r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
    row = m_el[2].mVec128;
    r0 = _mm_add_ps( r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
    r1 = _mm_add_ps( r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
    r2 = _mm_add_ps( r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
    return btMatrix3x3( r0, r1, r2 );
#elif defined BT_USE_NEON
    static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 };
    float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask );
    float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask );
    float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask );
    float32x4_t row = m_el[0].mVec128;
    float32x4_t r0 = vmulq_lane_f32( m0, vget_low_f32(row), 0);
    float32x4_t r1 = vmulq_lane_f32( m0, vget_low_f32(row), 1);
    float32x4_t r2 = vmulq_lane_f32( m0, vget_high_f32(row), 0);
    row = m_el[1].mVec128;
    r0 = vmlaq_lane_f32( r0, m1, vget_low_f32(row), 0);
    r1 = vmlaq_lane_f32( r1, m1, vget_low_f32(row), 1);
    r2 = vmlaq_lane_f32( r2, m1, vget_high_f32(row), 0);
    row = m_el[2].mVec128;
    r0 = vmlaq_lane_f32( r0, m2, vget_low_f32(row), 0);
    r1 = vmlaq_lane_f32( r1, m2, vget_low_f32(row), 1);
    r2 = vmlaq_lane_f32( r2, m2, vget_high_f32(row), 0);
    return btMatrix3x3( r0, r1, r2 );
#else
    return btMatrix3x3(
        m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
        m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
        m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
        m_el[0].y() * m[0].x() + m_el[1].y() * m[1].x() + m_el[2].y() * m[2].x(),
        m_el[0].y() * m[0].y() + m_el[1].y() * m[1].y() + m_el[2].y() * m[2].y(),
        m_el[0].y() * m[0].z() + m_el[1].y() * m[1].z() + m_el[2].y() * m[2].z(),
        m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
        m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
        m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
#endif
}
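// transposeTimes(m) computes (*this)^T * m without materialising the transpose:
// result row i is a combination of m's rows weighted by column i of *this,
// row_i = this[0][i]*m[0] + this[1][i]*m[1] + this[2][i]*m[2], which is exactly
// what the splat/multiply-accumulate sequences above evaluate. For an
// orthonormal basis A, A.transposeTimes(B) therefore equals A.inverse() * B.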
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::timesTranspose(const btMatrix3x3& m) const
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
    __m128 a0 = m_el[0].mVec128;
    __m128 a1 = m_el[1].mVec128;
    __m128 a2 = m_el[2].mVec128;

    btMatrix3x3 mT = m.transpose();  // we rely on transpose() zeroing the w channel so that we don't have to do it here
    __m128 mx = mT[0].mVec128;
    __m128 my = mT[1].mVec128;
    __m128 mz = mT[2].mVec128;

    __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
    __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
    __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
    r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
    r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
    r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
    r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
    r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
    r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
    return btMatrix3x3( r0, r1, r2 );
#elif defined BT_USE_NEON
    float32x4_t a0 = m_el[0].mVec128;
    float32x4_t a1 = m_el[1].mVec128;
    float32x4_t a2 = m_el[2].mVec128;

    btMatrix3x3 mT = m.transpose();  // we rely on transpose() zeroing the w channel so that we don't have to do it here
    float32x4_t mx = mT[0].mVec128;
    float32x4_t my = mT[1].mVec128;
    float32x4_t mz = mT[2].mVec128;

    float32x4_t r0 = vmulq_lane_f32( mx, vget_low_f32(a0), 0);
    float32x4_t r1 = vmulq_lane_f32( mx, vget_low_f32(a1), 0);
    float32x4_t r2 = vmulq_lane_f32( mx, vget_low_f32(a2), 0);
    r0 = vmlaq_lane_f32( r0, my, vget_low_f32(a0), 1);
    r1 = vmlaq_lane_f32( r1, my, vget_low_f32(a1), 1);
    r2 = vmlaq_lane_f32( r2, my, vget_low_f32(a2), 1);
    r0 = vmlaq_lane_f32( r0, mz, vget_high_f32(a0), 0);
    r1 = vmlaq_lane_f32( r1, mz, vget_high_f32(a1), 0);
    r2 = vmlaq_lane_f32( r2, mz, vget_high_f32(a2), 0);
    return btMatrix3x3( r0, r1, r2 );
#else
    return btMatrix3x3(
        m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
        m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
        m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
#endif
}
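// timesTranspose(m) computes (*this) * m^T, so every element is the dot product
// of a row of *this with a row of m, as the scalar fallback shows. The SIMD
// branches first take m.transpose() (which already zeroes the w lanes, see the
// note above) and then apply the same broadcast/multiply-accumulate pattern as
// operator*=.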
SIMD_FORCE_INLINE btVector3
operator*(const btMatrix3x3& m, const btVector3& v)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) || defined (BT_USE_NEON)
    return v.dot3(m[0], m[1], m[2]);
#else
    return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
#endif
}
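// m * v treats v as a column vector: each component of the result is a row of
// m dotted with v. btVector3::dot3() evaluates the three dot products in a
// single SIMD routine, so the vector branches reduce to one call.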
SIMD_FORCE_INLINE btVector3
operator*(const btVector3& v, const btMatrix3x3& m)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
    const __m128 vv = v.mVec128;

    __m128 c0 = bt_splat_ps( vv, 0);
    __m128 c1 = bt_splat_ps( vv, 1);
    __m128 c2 = bt_splat_ps( vv, 2);

    c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask) );
    c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask) );
    c0 = _mm_add_ps(c0, c1);
    c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask) );

    return btVector3(_mm_add_ps(c0, c2));
#elif defined(BT_USE_NEON)
    const float32x4_t vv = v.mVec128;
    const float32x2_t vlo = vget_low_f32(vv);
    const float32x2_t vhi = vget_high_f32(vv);

    float32x4_t c0, c1, c2;

    c0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
    c1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
    c2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

    c0 = vmulq_lane_f32(c0, vlo, 0);
    c1 = vmulq_lane_f32(c1, vlo, 1);
    c2 = vmulq_lane_f32(c2, vhi, 0);
    c0 = vaddq_f32(c0, c1);
    c0 = vaddq_f32(c0, c2);

    return btVector3(c0);
#else
    return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
#endif
}
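// v * m treats v as a row vector, i.e. it dots v with the columns of m
// (tdotx/tdoty/tdotz in the fallback). The SIMD branches reach the same result
// by scaling each masked row of m by the matching component of v and summing;
// algebraically, (v * m) equals (m.transpose() * v).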
SIMD_FORCE_INLINE btMatrix3x3
operator*(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
    __m128 m10 = m1[0].mVec128;
    __m128 m11 = m1[1].mVec128;
    __m128 m12 = m1[2].mVec128;

    __m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);

    __m128 c0 = bt_splat_ps( m10, 0);
    __m128 c1 = bt_splat_ps( m11, 0);
    __m128 c2 = bt_splat_ps( m12, 0);

    c0 = _mm_mul_ps(c0, m2v);
    c1 = _mm_mul_ps(c1, m2v);
    c2 = _mm_mul_ps(c2, m2v);

    m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);

    __m128 c0_1 = bt_splat_ps( m10, 1);
    __m128 c1_1 = bt_splat_ps( m11, 1);
    __m128 c2_1 = bt_splat_ps( m12, 1);

    c0_1 = _mm_mul_ps(c0_1, m2v);
    c1_1 = _mm_mul_ps(c1_1, m2v);
    c2_1 = _mm_mul_ps(c2_1, m2v);

    m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);

    c0 = _mm_add_ps(c0, c0_1);
    c1 = _mm_add_ps(c1, c1_1);
    c2 = _mm_add_ps(c2, c2_1);

    m10 = bt_splat_ps( m10, 2);
    m11 = bt_splat_ps( m11, 2);
    m12 = bt_splat_ps( m12, 2);

    m10 = _mm_mul_ps(m10, m2v);
    m11 = _mm_mul_ps(m11, m2v);
    m12 = _mm_mul_ps(m12, m2v);

    c0 = _mm_add_ps(c0, m10);
    c1 = _mm_add_ps(c1, m11);
    c2 = _mm_add_ps(c2, m12);

    return btMatrix3x3(c0, c1, c2);
#elif defined(BT_USE_NEON)
    float32x4_t rv0, rv1, rv2;
    float32x4_t v0, v1, v2;
    float32x4_t mv0, mv1, mv2;

    v0 = m1[0].mVec128;
    v1 = m1[1].mVec128;
    v2 = m1[2].mVec128;

    mv0 = (float32x4_t) vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask);
    mv1 = (float32x4_t) vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask);
    mv2 = (float32x4_t) vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask);

    rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
    rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
    rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

    rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
    rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
    rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

    rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
    rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
    rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);

    return btMatrix3x3(rv0, rv1, rv2);
#else
    return btMatrix3x3(
        m2.tdotx( m1[0]), m2.tdoty( m1[0]), m2.tdotz( m1[0]),
        m2.tdotx( m1[1]), m2.tdoty( m1[1]), m2.tdotz( m1[1]),
        m2.tdotx( m1[2]), m2.tdoty( m1[2]), m2.tdotz( m1[2]));
#endif
}
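// The free operator* uses the same row-combination scheme with no horizontal
// adds: row i of m1*m2 is m1[i].x*m2[0] + m1[i].y*m2[1] + m1[i].z*m2[2], built
// by splatting elements of m1 and accumulating the masked rows of m2
// (btvFFF0fMask / btvFFF0Mask keep x, y, z and zero the w lane, so the result
// rows stay clean). The scalar fallback writes the same sums as dot products
// with m2's columns via tdotx/tdoty/tdotz.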