float_dsp_neon.S 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. /*
  2. * ARM NEON optimised Float DSP functions
  3. * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  4. * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  5. *
  6. * This file is part of FFmpeg.
  7. *
  8. * FFmpeg is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * FFmpeg is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with FFmpeg; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. #include "config.h"
  23. #include "asm.S"
  // ff_vector_fmul_neon
  //
  // Element-wise single-precision product: out[i] = a[i] * b[i].
  //
  //   x0  out pointer (written, advanced by 64 bytes per pass)
  //   x1  a pointer   (read,    advanced by 64 bytes per pass)
  //   x2  b pointer   (read,    advanced by 64 bytes per pass)
  //   w3  element count; 16 floats are handled per pass and the loop only
  //       stops when the count reaches exactly zero, so it has to be a
  //       positive multiple of 16
  //
  // Touches v0-v7, v16-v19 and the condition flags.
  function ff_vector_fmul_neon, export=1
  1:      subs            w3,  w3,  #16           // flags are consumed by b.ne below
          ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
          ld1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], #64
          fmul            v16.4s, v0.4s,  v4.4s
          fmul            v17.4s, v1.4s,  v5.4s
          fmul            v18.4s, v2.4s,  v6.4s
          fmul            v19.4s, v3.4s,  v7.4s
          st1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
          b.ne            1b
          ret
  endfunc
  // ff_vector_fmac_scalar_neon
  //
  // Single-precision multiply-accumulate with a scalar, done in place:
  // acc[i] += in[i] * s.
  //
  //   x0  acc pointer (read and written, advanced by 64 bytes per pass)
  //   x1  in pointer  (read, advanced by 64 bytes per pass)
  //   s0  scalar multiplier (lane 0 of v0)
  //   w2  element count; 16 floats are handled per pass and the loop only
  //       stops when the count reaches exactly zero, so it has to be a
  //       positive multiple of 16
  //
  // Touches v4-v7, v16-v19 and the condition flags.
  function ff_vector_fmac_scalar_neon, export=1
  1:      subs            w2,  w2,  #16           // flags are consumed by b.ne below
          // Fetch the 16 accumulators without writeback so that x0 still
          // points at them for the store at the end of the pass.
          ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x0]
          ld1             {v4.4s,  v5.4s,  v6.4s,  v7.4s},  [x1], #64
          fmla            v16.4s, v4.4s,  v0.s[0]
          fmla            v17.4s, v5.4s,  v0.s[0]
          fmla            v18.4s, v6.4s,  v0.s[0]
          fmla            v19.4s, v7.4s,  v0.s[0]
          st1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
          b.ne            1b
          ret
  endfunc
  // ff_vector_fmul_scalar_neon
  //
  // Scale a single-precision vector: out[i] = in[i] * s.
  //
  //   x0  out pointer (written)
  //   x1  in pointer  (read)
  //   s0  scalar multiplier (lane 0 of v0)
  //   w2  element count
  //
  // The part of the count that is a multiple of 16 goes through a software
  // pipelined loop handling 16 floats per pass; whatever is left (count & 15)
  // is finished four floats at a time.
  //
  // Touches v0-v3, v16, w3 and the condition flags.
  function ff_vector_fmul_scalar_neon, export=1
          dup             v16.4s, v0.s[0]         // broadcast s before v0 is reused for data
          ands            w3,  w2,  #0xfffffff0   // w3 = count rounded down to a multiple of 16
          b.eq            3f                      // fewer than 16 elements: tail loop only
          ld1             {v0.4s, v1.4s}, [x1], #32       // prime the pipeline
  1:      subs            w3,  w3,  #16
          ld1             {v2.4s, v3.4s}, [x1], #32
          fmul            v0.4s,  v0.4s,  v16.4s
          fmul            v1.4s,  v1.4s,  v16.4s
          st1             {v0.4s, v1.4s}, [x0], #32
          fmul            v2.4s,  v2.4s,  v16.4s
          fmul            v3.4s,  v3.4s,  v16.4s
          b.eq            2f                      // last full pass: do not fetch any further
          ld1             {v0.4s, v1.4s}, [x1], #32       // first half of the next pass
          st1             {v2.4s, v3.4s}, [x0], #32
          b               1b
  2:      st1             {v2.4s, v3.4s}, [x0], #32
          ands            w2,  w2,  #15           // elements left for the tail loop
          b.eq            4f
  3:      subs            w2,  w2,  #4            // flags are consumed by b.gt below
          ld1             {v0.4s}, [x1], #16
          fmul            v0.4s,  v0.4s,  v16.4s
          st1             {v0.4s}, [x0], #16
          b.gt            3b
  4:      ret
  endfunc
  // ff_vector_dmul_scalar_neon
  //
  // Scale a double-precision vector: out[i] = in[i] * s.
  //
  //   x0  out pointer (written, 64 bytes per pass)
  //   x1  in pointer  (read,    64 bytes per pass)
  //   d0  scalar multiplier (lane 0 of v0, 64-bit)
  //   w2  element count; eight doubles are handled per pass and the loop
  //       runs while the remaining count is positive
  //
  // The loop is software pipelined: the first half of the next pass is
  // fetched before the second half of the current pass is stored.  That
  // fetch is skipped on the final pass, so no bytes past the last processed
  // group of eight doubles are ever read.
  //
  // Touches v0-v3, v16 and the condition flags.
  function ff_vector_dmul_scalar_neon, export=1
          dup             v16.2d, v0.d[0]         // broadcast s before v0 is reused for data
          ld1             {v0.2d, v1.2d}, [x1], #32       // prime the pipeline
  1:      subs            w2,  w2,  #8
          fmul            v0.2d,  v0.2d,  v16.2d
          ld1             {v2.2d, v3.2d}, [x1], #32
          fmul            v1.2d,  v1.2d,  v16.2d
          fmul            v2.2d,  v2.2d,  v16.2d
          st1             {v0.2d, v1.2d}, [x0], #32
          fmul            v3.2d,  v3.2d,  v16.2d
          b.le            2f                      // final pass: nothing more to fetch
          ld1             {v0.2d, v1.2d}, [x1], #32       // first half of the next pass
          st1             {v2.2d, v3.2d}, [x0], #32
          b               1b
  2:      st1             {v2.2d, v3.2d}, [x0], #32
          ret
  endfunc
  // ff_vector_fmul_window_neon
  //
  // Windowed overlap of two single-precision inputs.  With s0 walking
  // forwards, s1 walking backwards, wi the forward window half and wj the
  // backward window half, each pass of four lanes produces
  //
  //     low  half of dst (ascending):   s0 * wj_r - s1_r * wi
  //     high half of dst (descending):  (s0 * wi)_rev + s1 * wj
  //
  // where the _r / _rev suffix means the four lanes are in reversed order.
  //
  //   x0  dst   (2 * len floats are written)
  //   x1  s0 source, read forwards
  //   x2  src1, read backwards starting at src1 + 4 * (len - 4)
  //   x3  win,  read forwards from the start and backwards from
  //             win + 8 * (len - 2)
  //   w4  len;  four elements per pass, the loop only stops when the count
  //             reaches exactly zero
  //
  // Touches x5-x7, v0-v5, v16, v17 and the condition flags.
  function ff_vector_fmul_window_neon, export=1
          sxtw            x4,  w4                 // len
          sub             x5,  x4,  #2            // len - 2
          add             x6,  x3,  x5,  lsl #3   // win + 8 * (len - 2)
          sub             x2,  x2,  #8
          add             x2,  x2,  x5,  lsl #2   // src1 + 4 * (len - 4)
          add             x5,  x0,  x5,  lsl #3   // dst + 8 * (len - 2)
          mov             x7,  #-16               // stride for the backward streams
          ld1             {v0.4s},  [x1], #16     // s0
          ld1             {v2.4s},  [x3], #16     // wi
          ld1             {v1.4s},  [x2], x7      // s1
  1:      ld1             {v3.4s},  [x6], x7      // wj
          subs            x4,  x4,  #4            // flags are consumed by b.eq below
          rev64           v4.4s,  v1.4s
          rev64           v5.4s,  v3.4s
          fmul            v17.4s, v0.4s,  v2.4s   // s0 * wi
          ext             v4.16b, v4.16b, v4.16b, #8      // s1_r
          ext             v5.16b, v5.16b, v5.16b, #8      // wj_r
          rev64           v17.4s, v17.4s
          fmul            v16.4s, v0.4s,  v5.4s   // s0 * wj_r
          ext             v17.16b, v17.16b, v17.16b, #8   // (s0 * wi)_rev
          fmls            v16.4s, v4.4s,  v2.4s   // s0 * wj_r - s1_r * wi
          fmla            v17.4s, v1.4s,  v3.4s   // (s0 * wi)_rev + s1 * wj
          b.eq            2f                      // last pass: store only, no further loads
          ld1             {v0.4s},  [x1], #16
          st1             {v17.4s}, [x5], x7
          ld1             {v2.4s},  [x3], #16
          ld1             {v1.4s},  [x2], x7
          st1             {v16.4s}, [x0], #16
          b               1b
  2:      st1             {v17.4s}, [x5], x7
          st1             {v16.4s}, [x0], #16
          ret
  endfunc
  // ff_vector_fmul_add_neon
  //
  // Single-precision multiply-add: out[i] = a[i] * b[i] + c[i].
  //
  //   x0  out pointer (written)
  //   x1  a pointer   (read)
  //   x2  b pointer   (read)
  //   x3  c pointer   (read)
  //   w4  element count; eight floats are handled per pass and the loop
  //       only stops when the count reaches exactly zero
  //
  // The inputs for the next pass are fetched before the current result is
  // stored; the final pass stores without fetching.
  //
  // Touches v0-v3, v16, v17 and the condition flags.
  function ff_vector_fmul_add_neon, export=1
          ld1             {v0.4s,  v1.4s},  [x1], #32
          ld1             {v2.4s,  v3.4s},  [x2], #32
          ld1             {v16.4s, v17.4s}, [x3], #32     // addend doubles as accumulator
  1:      fmla            v16.4s, v0.4s,  v2.4s
          fmla            v17.4s, v1.4s,  v3.4s
          subs            w4,  w4,  #8
          b.eq            2f
          ld1             {v0.4s,  v1.4s},  [x1], #32
          ld1             {v2.4s,  v3.4s},  [x2], #32
          st1             {v16.4s, v17.4s}, [x0], #32
          ld1             {v16.4s, v17.4s}, [x3], #32
          b               1b
  2:      st1             {v16.4s, v17.4s}, [x0], #32
          ret
  endfunc
  // ff_vector_fmul_reverse_neon
  //
  // Single-precision product of one vector with another one read back to
  // front: out[i] = a[i] * b[len - 1 - i].
  //
  //   x0  out pointer (written forwards)
  //   x1  a pointer   (read forwards)
  //   x2  b pointer   (read backwards, starting 32 bytes before b + 4 * len)
  //   w3  len; eight floats are handled per pass and the loop only stops
  //       when the count reaches exactly zero
  //
  // Touches x4, v0-v3, v16, v17 and the condition flags.
  function ff_vector_fmul_reverse_neon, export=1
          sxtw            x3,  w3
          sub             x2,  x2,  #32
          add             x2,  x2,  x3,  lsl #2   // last group of eight floats in b
          mov             x4,  #-32               // stride for the backward stream
          ld1             {v2.4s, v3.4s}, [x2], x4
          ld1             {v0.4s, v1.4s}, [x1], #32
  1:      subs            x3,  x3,  #8            // flags are consumed by b.eq below
          // rev64 swaps the floats inside each 64-bit half and ext #8 swaps
          // the two halves, which together reverse all four lanes.
          rev64           v2.4s,  v2.4s
          rev64           v3.4s,  v3.4s
          ext             v2.16b, v2.16b, v2.16b, #8
          ext             v3.16b, v3.16b, v3.16b, #8
          fmul            v17.4s, v1.4s,  v2.4s
          fmul            v16.4s, v0.4s,  v3.4s
          b.eq            2f
          ld1             {v2.4s, v3.4s}, [x2], x4
          ld1             {v0.4s, v1.4s}, [x1], #32
          st1             {v16.4s, v17.4s}, [x0], #32
          b               1b
  2:      st1             {v16.4s, v17.4s}, [x0], #32
          ret
  endfunc
  // ff_butterflies_float_neon
  //
  // In-place single-precision butterfly on two vectors:
  //     a[i], b[i]  ->  a[i] + b[i], a[i] - b[i]
  //
  //   x0  a pointer (read, then overwritten with the sums)
  //   x1  b pointer (read, then overwritten with the differences)
  //   w2  element count; four floats are handled per pass and the loop runs
  //       while the remaining count is positive
  //
  // Touches v0, v1, v16, v17 and the condition flags.
  function ff_butterflies_float_neon, export=1
  1:      subs            w2,  w2,  #4            // flags are consumed by b.gt below
          ld1             {v0.4s},  [x0]
          ld1             {v1.4s},  [x1]
          fadd            v16.4s, v0.4s,  v1.4s   // sum
          fsub            v17.4s, v0.4s,  v1.4s   // difference
          st1             {v17.4s}, [x1], #16
          st1             {v16.4s}, [x0], #16
          b.gt            1b
          ret
  endfunc
  // ff_scalarproduct_float_neon
  //
  // Single-precision dot product: returns the sum over i of a[i] * b[i].
  //
  //   x0  a pointer (read)
  //   x1  b pointer (read)
  //   w2  element count; four floats are handled per pass and the loop runs
  //       while the remaining count is positive
  //
  // Result is left in s0.  Four partial sums are kept, one per lane, and
  // are folded pairwise after the loop.
  //
  // Touches v0, v1, v16, v17 and the condition flags.
  function ff_scalarproduct_float_neon, export=1
          movi            v16.4s, #0              // four running partial sums
  1:      subs            w2,  w2,  #4            // flags are consumed by b.gt below
          ld1             {v0.4s},  [x0], #16
          ld1             {v1.4s},  [x1], #16
          fmla            v16.4s, v0.4s,  v1.4s
          b.gt            1b
          faddp           v17.4s, v16.4s, v16.4s  // lane0 + lane1, lane2 + lane3
          faddp           s0,  v17.2s             // fold the two pair sums
          ret
  endfunc