;*****************************************************************************
;* x86-optimized functions for pullup filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************
  ; Shared x86 assembly helpers. The cglobal / INIT_MMX / RET macros and the
  ; mN / rN register aliases used below are not defined in this file, so they
  ; presumably come from this include (NOTE(review): confirm).
  %include "libavutil/x86/x86util.asm"

  ; Everything that follows is assembled into the .text section.
  SECTION .text
  ;-----------------------------------------------------------------------------
  ; pullup_filter_diff(first, second, size)
  ;
  ; Sums |first[x] - second[x]| over a block of 8 bytes x 4 rows. `size` is
  ; added to both pointers after every row, i.e. it acts as the row stride in
  ; bytes. The 32-bit total is left in eax.
  ;
  ; Register roles:
  ;   m7    all-zero; widens bytes -> words and words -> dwords
  ;   m6    four 16-bit running sums (each lane gains at most 2*255 per row,
  ;         so at most 2040 after 4 rows - no 16-bit overflow)
  ;   rows  rows still to process
  ;   tmp   scratch for the final horizontal add
  ;
  ; No emms is executed before returning.
  ; NOTE(review): presumably the caller clears the MMX state - confirm.
  ;-----------------------------------------------------------------------------
  INIT_MMX mmx
  cglobal pullup_filter_diff, 3, 5, 8, first, second, size, rows, tmp
      pxor        m7, m7
      pxor        m6, m6
      mov         rowsd, 4

  .loop:
      movq        m0, [firstq]            ; a = 8 bytes of the first block
      movq        m1, [secondq]           ; b = 8 bytes of the second block
      movq        m2, m0                  ; keep a copy of a

      ; Unsigned saturating subtraction in both directions: for every byte one
      ; result is |a - b| and the other is 0, so their sum is |a - b|.
      psubusb     m0, m1                  ; max(a - b, 0)
      psubusb     m1, m2                  ; max(b - a, 0)

      ; Widen both results to 16 bit (low and high halves separately).
      movq        m2, m0
      movq        m3, m1
      punpcklbw   m0, m7
      punpckhbw   m2, m7
      punpcklbw   m1, m7
      punpckhbw   m3, m7

      paddw       m0, m2
      paddw       m1, m3
      paddw       m6, m0
      paddw       m6, m1

      add         firstq,  sizeq          ; next row
      add         secondq, sizeq
      dec         rowsd
      jnz         .loop

      ; Horizontal add of the four word lanes into eax.
      movq        m0, m6
      punpcklwd   m6, m7                  ; lanes 0,1 -> dwords
      punpckhwd   m0, m7                  ; lanes 2,3 -> dwords
      paddd       m0, m6                  ; two 32-bit partial sums
      movd        eax, m0
      psrlq       m0, 32
      movd        tmpd, m0
      add         eax, tmpd
      RET
  ;-----------------------------------------------------------------------------
  ; pullup_filter_comb(first, second, size)
  ;
  ; For 4 rows of 8 bytes, accumulates two terms per byte:
  ;   |2*first[row]  - (second[row - 1] + second[row])|
  ;   |2*second[row] - (first[row]      + first[row + 1])|
  ; where consecutive rows are `size` bytes apart. The 32-bit total is left
  ; in eax.
  ;
  ; Memory touched beyond the 4 rows themselves: the row at second - size and
  ; the row at first + 4*size are read, so both must be readable.
  ;
  ; Register roles:
  ;   m7    all-zero; widens bytes -> words and words -> dwords
  ;   m6    four 16-bit running sums (each term is at most 510, 4 terms per
  ;         lane per row, so at most 8160 after 4 rows - no 16-bit overflow)
  ;   rows  rows still to process
  ;   tmp   scratch for the final horizontal add
  ;
  ; No emms is executed before returning.
  ; NOTE(review): presumably the caller clears the MMX state - confirm.
  ;-----------------------------------------------------------------------------
  INIT_MMX mmx

  ; PULLUP_COMB_TERM unpack_insn, centre, neighbour_a, neighbour_b
  ;
  ; Adds |2*centre - (neighbour_a + neighbour_b)| to m6 for the four bytes
  ; selected by unpack_insn (punpcklbw = low half, punpckhbw = high half).
  ; The three operands are 8-byte memory references.
  ; Expects m7 == 0. Clobbers m0-m2.
  %macro PULLUP_COMB_TERM 4
      movq        m0, %2
      movq        m1, %3
      movq        m2, %4
      %1          m0, m7                  ; bytes -> words
      %1          m1, m7
      %1          m2, m7
      paddw       m0, m0                  ; 2 * centre
      paddw       m1, m2                  ; neighbour_a + neighbour_b
      movq        m2, m0
      ; Saturating subtraction both ways; one side is 0, the other the
      ; absolute difference.
      psubusw     m0, m1
      psubusw     m1, m2
      paddw       m6, m0
      paddw       m6, m1
  %endmacro

  cglobal pullup_filter_comb, 3, 5, 8, first, second, size, rows, tmp
      ; From here on [secondq] is the row before the current one and
      ; [secondq+sizeq] is the current row of the second block.
      sub         secondq, sizeq
      pxor        m7, m7
      pxor        m6, m6
      mov         rowsd, 4

  .loop:
      ; first row against the second block's rows before/at it
      PULLUP_COMB_TERM punpcklbw, [firstq], [secondq], [secondq+sizeq]
      PULLUP_COMB_TERM punpckhbw, [firstq], [secondq], [secondq+sizeq]
      ; second row against the first block's rows at/after it
      PULLUP_COMB_TERM punpcklbw, [secondq+sizeq], [firstq], [firstq+sizeq]
      PULLUP_COMB_TERM punpckhbw, [secondq+sizeq], [firstq], [firstq+sizeq]

      add         firstq,  sizeq          ; next row
      add         secondq, sizeq
      dec         rowsd
      jnz         .loop

      ; Horizontal add of the four word lanes into eax.
      movq        m0, m6
      punpcklwd   m6, m7                  ; lanes 0,1 -> dwords
      punpckhwd   m0, m7                  ; lanes 2,3 -> dwords
      paddd       m0, m6                  ; two 32-bit partial sums
      movd        eax, m0
      psrlq       m0, 32
      movd        tmpd, m0
      add         eax, tmpd
      RET
  ;-----------------------------------------------------------------------------
  ; pullup_filter_var(first, second, size)
  ;
  ; Sums |first[row][x] - first[row + 1][x]| for 8 bytes per row over 3 row
  ; pairs (rows 0..3, `size` bytes apart), multiplies the total by 4 and
  ; leaves it in eax. The `second` argument is never read.
  ;
  ; Register roles:
  ;   m7    all-zero; widens bytes -> words and words -> dwords
  ;   m6    four 16-bit running sums (each lane gains at most 2*255 per row
  ;         pair, so at most 1530 after 3 pairs - no 16-bit overflow)
  ;   rows  row pairs still to process
  ;   tmp   scratch for the final horizontal add
  ;
  ; No emms is executed before returning.
  ; NOTE(review): presumably the caller clears the MMX state - confirm.
  ;-----------------------------------------------------------------------------
  INIT_MMX mmx
  cglobal pullup_filter_var, 3, 5, 8, first, second, size, rows, tmp
      pxor        m7, m7
      pxor        m6, m6
      mov         rowsd, 3

  .loop:
      movq        m0, [firstq]            ; current row
      movq        m1, [firstq+sizeq]      ; row below it
      add         firstq, sizeq           ; step down one row
      movq        m2, m0                  ; keep a copy of the current row

      ; Unsigned saturating subtraction in both directions: for every byte one
      ; result is the absolute difference and the other is 0.
      psubusb     m0, m1
      psubusb     m1, m2

      ; Widen both results to 16 bit (low and high halves separately).
      movq        m2, m0
      movq        m3, m1
      punpcklbw   m0, m7
      punpckhbw   m2, m7
      punpcklbw   m1, m7
      punpckhbw   m3, m7

      paddw       m0, m2
      paddw       m1, m3
      paddw       m6, m0
      paddw       m6, m1

      dec         rowsd
      jnz         .loop

      ; Horizontal add of the four word lanes into eax.
      movq        m0, m6
      punpcklwd   m6, m7                  ; lanes 0,1 -> dwords
      punpckhwd   m0, m7                  ; lanes 2,3 -> dwords
      paddd       m0, m6                  ; two 32-bit partial sums
      movd        eax, m0
      psrlq       m0, 32
      movd        tmpd, m0
      add         eax, tmpd
      shl         eax, 2                  ; result is 4 * sum
      RET