MathUtilSSE.inl 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. NS_CC_MATH_BEGIN
  2. #ifdef __SSE__
  3. void MathUtil::addMatrix(const __m128 m[4], float scalar, __m128 dst[4])
  4. {
  5. __m128 s = _mm_set1_ps(scalar);
  6. dst[0] = _mm_add_ps(m[0], s);
  7. dst[1] = _mm_add_ps(m[1], s);
  8. dst[2] = _mm_add_ps(m[2], s);
  9. dst[3] = _mm_add_ps(m[3], s);
  10. }
  11. void MathUtil::addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
  12. {
  13. dst[0] = _mm_add_ps(m1[0], m2[0]);
  14. dst[1] = _mm_add_ps(m1[1], m2[1]);
  15. dst[2] = _mm_add_ps(m1[2], m2[2]);
  16. dst[3] = _mm_add_ps(m1[3], m2[3]);
  17. }
  18. void MathUtil::subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
  19. {
  20. dst[0] = _mm_sub_ps(m1[0], m2[0]);
  21. dst[1] = _mm_sub_ps(m1[1], m2[1]);
  22. dst[2] = _mm_sub_ps(m1[2], m2[2]);
  23. dst[3] = _mm_sub_ps(m1[3], m2[3]);
  24. }
  25. void MathUtil::multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4])
  26. {
  27. __m128 s = _mm_set1_ps(scalar);
  28. dst[0] = _mm_mul_ps(m[0], s);
  29. dst[1] = _mm_mul_ps(m[1], s);
  30. dst[2] = _mm_mul_ps(m[2], s);
  31. dst[3] = _mm_mul_ps(m[3], s);
  32. }
  33. void MathUtil::multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
  34. {
  35. __m128 dst0, dst1, dst2, dst3;
  36. {
  37. __m128 e0 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(0, 0, 0, 0));
  38. __m128 e1 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(1, 1, 1, 1));
  39. __m128 e2 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(2, 2, 2, 2));
  40. __m128 e3 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(3, 3, 3, 3));
  41. __m128 v0 = _mm_mul_ps(m1[0], e0);
  42. __m128 v1 = _mm_mul_ps(m1[1], e1);
  43. __m128 v2 = _mm_mul_ps(m1[2], e2);
  44. __m128 v3 = _mm_mul_ps(m1[3], e3);
  45. __m128 a0 = _mm_add_ps(v0, v1);
  46. __m128 a1 = _mm_add_ps(v2, v3);
  47. __m128 a2 = _mm_add_ps(a0, a1);
  48. dst0 = a2;
  49. }
  50. {
  51. __m128 e0 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(0, 0, 0, 0));
  52. __m128 e1 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(1, 1, 1, 1));
  53. __m128 e2 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(2, 2, 2, 2));
  54. __m128 e3 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(3, 3, 3, 3));
  55. __m128 v0 = _mm_mul_ps(m1[0], e0);
  56. __m128 v1 = _mm_mul_ps(m1[1], e1);
  57. __m128 v2 = _mm_mul_ps(m1[2], e2);
  58. __m128 v3 = _mm_mul_ps(m1[3], e3);
  59. __m128 a0 = _mm_add_ps(v0, v1);
  60. __m128 a1 = _mm_add_ps(v2, v3);
  61. __m128 a2 = _mm_add_ps(a0, a1);
  62. dst1 = a2;
  63. }
  64. {
  65. __m128 e0 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(0, 0, 0, 0));
  66. __m128 e1 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(1, 1, 1, 1));
  67. __m128 e2 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(2, 2, 2, 2));
  68. __m128 e3 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(3, 3, 3, 3));
  69. __m128 v0 = _mm_mul_ps(m1[0], e0);
  70. __m128 v1 = _mm_mul_ps(m1[1], e1);
  71. __m128 v2 = _mm_mul_ps(m1[2], e2);
  72. __m128 v3 = _mm_mul_ps(m1[3], e3);
  73. __m128 a0 = _mm_add_ps(v0, v1);
  74. __m128 a1 = _mm_add_ps(v2, v3);
  75. __m128 a2 = _mm_add_ps(a0, a1);
  76. dst2 = a2;
  77. }
  78. {
  79. __m128 e0 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(0, 0, 0, 0));
  80. __m128 e1 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(1, 1, 1, 1));
  81. __m128 e2 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(2, 2, 2, 2));
  82. __m128 e3 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(3, 3, 3, 3));
  83. __m128 v0 = _mm_mul_ps(m1[0], e0);
  84. __m128 v1 = _mm_mul_ps(m1[1], e1);
  85. __m128 v2 = _mm_mul_ps(m1[2], e2);
  86. __m128 v3 = _mm_mul_ps(m1[3], e3);
  87. __m128 a0 = _mm_add_ps(v0, v1);
  88. __m128 a1 = _mm_add_ps(v2, v3);
  89. __m128 a2 = _mm_add_ps(a0, a1);
  90. dst3 = a2;
  91. }
  92. dst[0] = dst0;
  93. dst[1] = dst1;
  94. dst[2] = dst2;
  95. dst[3] = dst3;
  96. }
  97. void MathUtil::negateMatrix(const __m128 m[4], __m128 dst[4])
  98. {
  99. __m128 z = _mm_setzero_ps();
  100. dst[0] = _mm_sub_ps(z, m[0]);
  101. dst[1] = _mm_sub_ps(z, m[1]);
  102. dst[2] = _mm_sub_ps(z, m[2]);
  103. dst[3] = _mm_sub_ps(z, m[3]);
  104. }
  105. void MathUtil::transposeMatrix(const __m128 m[4], __m128 dst[4])
  106. {
  107. __m128 tmp0 = _mm_shuffle_ps(m[0], m[1], 0x44);
  108. __m128 tmp2 = _mm_shuffle_ps(m[0], m[1], 0xEE);
  109. __m128 tmp1 = _mm_shuffle_ps(m[2], m[3], 0x44);
  110. __m128 tmp3 = _mm_shuffle_ps(m[2], m[3], 0xEE);
  111. dst[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88);
  112. dst[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
  113. dst[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88);
  114. dst[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
  115. }
  116. void MathUtil::transformVec4(const __m128 m[4], const __m128& v, __m128& dst)
  117. {
  118. __m128 col1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
  119. __m128 col2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
  120. __m128 col3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
  121. __m128 col4 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
  122. dst = _mm_add_ps(
  123. _mm_add_ps(_mm_mul_ps(m[0], col1), _mm_mul_ps(m[1], col2)),
  124. _mm_add_ps(_mm_mul_ps(m[2], col3), _mm_mul_ps(m[3], col4))
  125. );
  126. }
  127. #endif
  128. NS_CC_MATH_END