// MathUtilNeon64.inl
  1. /**
  2. Copyright 2013 BlackBerry Inc.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. Original file from GamePlay3D: http://gameplay3d.org
  13. This file was modified to fit the cocos2d-x project
  14. */
  15. NS_CC_MATH_BEGIN
  16. class MathUtilNeon64
  17. {
  18. public:
  19. inline static void addMatrix(const float* m, float scalar, float* dst);
  20. inline static void addMatrix(const float* m1, const float* m2, float* dst);
  21. inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
  22. inline static void multiplyMatrix(const float* m, float scalar, float* dst);
  23. inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
  24. inline static void negateMatrix(const float* m, float* dst);
  25. inline static void transposeMatrix(const float* m, float* dst);
  26. inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
  27. inline static void transformVec4(const float* m, const float* v, float* dst);
  28. inline static void crossVec3(const float* v1, const float* v2, float* dst);
  29. };
  30. inline void MathUtilNeon64::addMatrix(const float* m, float scalar, float* dst)
  31. {
  32. asm volatile(
  33. "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // M[m0-m7] M[m8-m15]
  34. "ld1r {v4.4s}, [%2] \n\t" //ssss
  35. "fadd v8.4s, v0.4s, v4.4s \n\t" // DST->M[m0-m3] = M[m0-m3] + s
  36. "fadd v9.4s, v1.4s, v4.4s \n\t" // DST->M[m4-m7] = M[m4-m7] + s
  37. "fadd v10.4s, v2.4s, v4.4s \n\t" // DST->M[m8-m11] = M[m8-m11] + s
  38. "fadd v11.4s, v3.4s, v4.4s \n\t" // DST->M[m12-m15] = M[m12-m15] + s
  39. "st4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n\t" // Result in V9
  40. :
  41. : "r"(dst), "r"(m), "r"(&scalar)
  42. : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "memory"
  43. );
  44. }
  45. inline void MathUtilNeon64::addMatrix(const float* m1, const float* m2, float* dst)
  46. {
  47. asm volatile(
  48. "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // M1[m0-m7] M1[m8-m15]
  49. "ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%2] \n\t" // M2[m0-m7] M2[m8-m15]
  50. "fadd v12.4s, v0.4s, v8.4s \n\t" // DST->M[m0-m3] = M1[m0-m3] + M2[m0-m3]
  51. "fadd v13.4s, v1.4s, v9.4s \n\t" // DST->M[m4-m7] = M1[m4-m7] + M2[m4-m7]
  52. "fadd v14.4s, v2.4s, v10.4s \n\t" // DST->M[m8-m11] = M1[m8-m11] + M2[m8-m11]
  53. "fadd v15.4s, v3.4s, v11.4s \n\t" // DST->M[m12-m15] = M1[m12-m15] + M2[m12-m15]
  54. "st4 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t" // DST->M[m0-m7] DST->M[m8-m15]
  55. :
  56. : "r"(dst), "r"(m1), "r"(m2)
  57. : "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
  58. );
  59. }
  60. inline void MathUtilNeon64::subtractMatrix(const float* m1, const float* m2, float* dst)
  61. {
  62. asm volatile(
  63. "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // M1[m0-m7] M1[m8-m15]
  64. "ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%2] \n\t" // M2[m0-m7] M2[m8-m15]
  65. "fsub v12.4s, v0.4s, v8.4s \n\t" // DST->M[m0-m3] = M1[m0-m3] - M2[m0-m3]
  66. "fsub v13.4s, v1.4s, v9.4s \n\t" // DST->M[m4-m7] = M1[m4-m7] - M2[m4-m7]
  67. "fsub v14.4s, v2.4s, v10.4s \n\t" // DST->M[m8-m11] = M1[m8-m11] - M2[m8-m11]
  68. "fsub v15.4s, v3.4s, v11.4s \n\t" // DST->M[m12-m15] = M1[m12-m15] - M2[m12-m15]
  69. "st4 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t" // DST->M[m0-m7] DST->M[m8-m15]
  70. :
  71. : "r"(dst), "r"(m1), "r"(m2)
  72. : "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
  73. );
  74. }
  75. inline void MathUtilNeon64::multiplyMatrix(const float* m, float scalar, float* dst)
  76. {
  77. asm volatile(
  78. "ld1 {v0.s}[0], [%2] \n\t" //s
  79. "ld4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%1] \n\t" //M[m0-m7] M[m8-m15]
  80. "fmul v8.4s, v4.4s, v0.s[0] \n\t" // DST->M[m0-m3] = M[m0-m3] * s
  81. "fmul v9.4s, v5.4s, v0.s[0] \n\t" // DST->M[m4-m7] = M[m4-m7] * s
  82. "fmul v10.4s, v6.4s, v0.s[0] \n\t" // DST->M[m8-m11] = M[m8-m11] * s
  83. "fmul v11.4s, v7.4s, v0.s[0] \n\t" // DST->M[m12-m15] = M[m12-m15] * s
  84. "st4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n\t" // DST->M[m0-m7] DST->M[m8-m15]
  85. :
  86. : "r"(dst), "r"(m), "r"(&scalar)
  87. : "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
  88. );
  89. }
  90. inline void MathUtilNeon64::multiplyMatrix(const float* m1, const float* m2, float* dst)
  91. {
  92. asm volatile(
  93. "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%1] \n\t" // M1[m0-m7] M1[m8-m15] M2[m0-m7] M2[m8-m15]
  94. "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2] \n\t" // M2[m0-m15]
  95. "fmul v12.4s, v8.4s, v0.s[0] \n\t" // DST->M[m0-m3] = M1[m0-m3] * M2[m0]
  96. "fmul v13.4s, v8.4s, v0.s[1] \n\t" // DST->M[m4-m7] = M1[m4-m7] * M2[m4]
  97. "fmul v14.4s, v8.4s, v0.s[2] \n\t" // DST->M[m8-m11] = M1[m8-m11] * M2[m8]
  98. "fmul v15.4s, v8.4s, v0.s[3] \n\t" // DST->M[m12-m15] = M1[m12-m15] * M2[m12]
  99. "fmla v12.4s, v9.4s, v1.s[0] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m1]
  100. "fmla v13.4s, v9.4s, v1.s[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m5]
  101. "fmla v14.4s, v9.4s, v1.s[2] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m9]
  102. "fmla v15.4s, v9.4s, v1.s[3] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m13]
  103. "fmla v12.4s, v10.4s, v2.s[0] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m2]
  104. "fmla v13.4s, v10.4s, v2.s[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m6]
  105. "fmla v14.4s, v10.4s, v2.s[2] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m10]
  106. "fmla v15.4s, v10.4s, v2.s[3] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m14]
  107. "fmla v12.4s, v11.4s, v3.s[0] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m3]
  108. "fmla v13.4s, v11.4s, v3.s[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m7]
  109. "fmla v14.4s, v11.4s, v3.s[2] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m11]
  110. "fmla v15.4s, v11.4s, v3.s[3] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m15]
  111. "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t" // DST->M[m0-m7]// DST->M[m8-m15]
  112. : // output
  113. : "r"(dst), "r"(m1), "r"(m2) // input - note *value* of pointer doesn't change.
  114. : "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
  115. );
  116. }
  117. inline void MathUtilNeon64::negateMatrix(const float* m, float* dst)
  118. {
  119. asm volatile(
  120. "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // load m0-m7 load m8-m15
  121. "fneg v4.4s, v0.4s \n\t" // negate m0-m3
  122. "fneg v5.4s, v1.4s \n\t" // negate m4-m7
  123. "fneg v6.4s, v2.4s \n\t" // negate m8-m15
  124. "fneg v7.4s, v3.4s \n\t" // negate m8-m15
  125. "st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n\t" // store m0-m7 store m8-m15
  126. :
  127. : "r"(dst), "r"(m)
  128. : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory"
  129. );
  130. }
  131. inline void MathUtilNeon64::transposeMatrix(const float* m, float* dst)
  132. {
  133. asm volatile(
  134. "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // DST->M[m0, m4, m8, m12] = M[m0-m3]
  135. //DST->M[m1, m5, m9, m12] = M[m4-m7]
  136. "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n\t"
  137. :
  138. : "r"(dst), "r"(m)
  139. : "v0", "v1", "v2", "v3", "memory"
  140. );
  141. }
  142. inline void MathUtilNeon64::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
  143. {
  144. asm volatile(
  145. "ld1 {v0.s}[0], [%1] \n\t" // V[x]
  146. "ld1 {v0.s}[1], [%2] \n\t" // V[y]
  147. "ld1 {v0.s}[2], [%3] \n\t" // V[z]
  148. "ld1 {v0.s}[3], [%4] \n\t" // V[w]
  149. "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%5] \n\t" // M[m0-m7] M[m8-m15]
  150. "fmul v13.4s, v9.4s, v0.s[0] \n\t" // DST->V = M[m0-m3] * V[x]
  151. "fmla v13.4s, v10.4s, v0.s[1] \n\t" // DST->V += M[m4-m7] * V[y]
  152. "fmla v13.4s, v11.4s, v0.s[2] \n\t" // DST->V += M[m8-m11] * V[z]
  153. "fmla v13.4s, v12.4s, v0.s[3] \n\t" // DST->V += M[m12-m15] * V[w]
  154. //"st1 {v13.4s}, [%0] \n\t" // DST->V[x, y] // DST->V[z]
  155. "st1 {v13.2s}, [%0], 8 \n\t"
  156. "st1 {v13.s}[2], [%0] \n\t"
  157. :
  158. : "r"(dst), "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m)
  159. : "v0", "v9", "v10","v11", "v12", "v13", "memory"
  160. );
  161. }
  162. inline void MathUtilNeon64::transformVec4(const float* m, const float* v, float* dst)
  163. {
  164. asm volatile
  165. (
  166. "ld1 {v0.4s}, [%1] \n\t" // V[x, y, z, w]
  167. "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%2] \n\t" // M[m0-m7] M[m8-m15]
  168. "fmul v13.4s, v9.4s, v0.s[0] \n\t" // DST->V = M[m0-m3] * V[x]
  169. "fmla v13.4s, v10.4s, v0.s[1] \n\t" // DST->V = M[m4-m7] * V[y]
  170. "fmla v13.4s, v11.4s, v0.s[2] \n\t" // DST->V = M[m8-m11] * V[z]
  171. "fmla v13.4s, v12.4s, v0.s[3] \n\t" // DST->V = M[m12-m15] * V[w]
  172. "st1 {v13.4s}, [%0] \n\t" // DST->V
  173. :
  174. : "r"(dst), "r"(v), "r"(m)
  175. : "v0", "v9", "v10","v11", "v12", "v13", "memory"
  176. );
  177. }
  178. inline void MathUtilNeon64::crossVec3(const float* v1, const float* v2, float* dst)
  179. {
  180. asm volatile(
  181. "ld1 {v0.2s}, [%2] \n\t"
  182. "ld1 {v0.s}[2], [%1] \n\t"
  183. "mov v0.s[3], v0.s[0] \n\t" // q0 = (v1y, v1z, v1x, v1x)
  184. "ld1 {v1.4s}, [%3] \n\t"
  185. "mov v1.s[3], v1.s[0] \n\t" // q1 = (v2x, v2y, v2z, v2x)
  186. "fmul v2.4s, v0.4s, v1.4s \n\t" // x = v1y * v2z, y = v1z * v2x
  187. "mov v0.s[0], v0.s[1] \n\t"
  188. "mov v0.s[1], v0.s[2] \n\t"
  189. "mov v0.s[2], v0.s[3] \n\t"
  190. "mov v1.s[3], v1.s[2] \n\t"
  191. "fmul v0.4s, v0.4s, v1.4s \n\t"
  192. "mov v0.s[3], v0.s[1] \n\t"
  193. "mov v0.s[1], v0.s[2] \n\t"
  194. "mov v0.s[2], v0.s[0] \n\t"
  195. "fsub v2.4s, v0.4s, v2.4s \n\t"
  196. "mov v2.s[0], v2.s[1] \n\t"
  197. "mov v2.s[1], v2.s[2] \n\t"
  198. "mov v2.s[2], v2.s[3] \n\t"
  199. "st1 {v2.2s}, [%0], 8 \n\t" // V[x, y]
  200. "st1 {v2.s}[2], [%0] \n\t" // V[z]
  201. :
  202. : "r"(dst), "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1))
  203. : "v0", "v1", "v2", "memory"
  204. );
  205. }
  206. NS_CC_MATH_END