; vf_overlay.asm (3.8 KB)

;*****************************************************************************
;* x86-optimized functions for overlay filter
;*
;* Copyright (C) 2018 Paul B Mahol
;* Copyright (C) 2018 Henrik Gramner
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
  %include "libavutil/x86/x86util.asm"

  SECTION_RODATA

  ; Vector constants used by the row-blending routines in this file.
  pb_1:   times 16 db 1       ; sixteen bytes of 1: pmaddubsw multiplier that sums adjacent byte pairs
  pw_128: times  8 dw 128     ; rounding bias added before the multiply by pw_257
  pw_255: times  8 dw 255     ; XOR mask turning a weight a (0..255) into 255 - a; also the pandn mask
  pw_257: times  8 dw 257     ; pmulhuw multiplier: (v * 257) >> 16, roughly v / 255

  SECTION .text
  ;------------------------------------------------------------------------------
  ; overlay_row_44 (SSE4.1)
  ;
  ; Blends a row of s into d using one weight byte per pixel.
  ;
  ; Named registers (see the cglobal line):
  ;   d  - row that is read, blended and written back in place
  ;   da - not referenced by this routine
  ;   s  - row blended into d
  ;   a  - per-pixel weights applied to s; d is weighted by (a ^ 255)
  ;   w  - pixel count, a 32-bit int sign-extended to pointer width
  ;   r  - scratch: w modulo the pixels-per-iteration count
  ;   x  - running pixel index, also the return value
  ;
  ; Per pixel:  d = ((s*a + d*(a ^ 255) + 128) * 257) >> 16
  ;
  ; Returns (eax): number of pixels processed, i.e. w rounded down to a
  ; multiple of mmsize/2, or 0 when w < mmsize/2. Pixels past that count are
  ; left untouched by this routine.
  ;------------------------------------------------------------------------------
  INIT_XMM sse4
  cglobal overlay_row_44, 5, 7, 6, 0, d, da, s, a, w, r, x
      movsxdifnidn wq, wd
      xor          xd, xd                 ; x = 0 (32-bit write clears the full register)

      ; Split w into a vectorisable part and a remainder.
      mov          rq, wq
      and          rq, mmsize/2 - 1       ; r = w mod (mmsize/2)
      cmp          wq, mmsize/2
      jl           .done                  ; fewer than one full vector: return 0
      sub          wq, rq                 ; w = loop bound, multiple of mmsize/2

      mova         m5, [pw_257]
      mova         m4, [pw_128]
      mova         m3, [pw_255]

  .blend8:
      pmovzxbw     m2, [aq+xq]            ; m2 = a, bytes zero-extended to words
      pmovzxbw     m0, [sq+xq]            ; m0 = s
      pmullw       m0, m2                 ; m0 = s * a
      pxor         m2, m3                 ; m2 = a ^ 255 = 255 - a
      pmovzxbw     m1, [dq+xq]            ; m1 = d
      pmullw       m1, m2                 ; m1 = d * (255 - a)
      paddw        m0, m1
      paddw        m0, m4                 ; + 128 rounding bias
      pmulhuw      m0, m5                 ; high word of (sum * 257)
      packuswb     m0, m0                 ; words -> bytes
      movq         [dq+xq], m0            ; store the low 8 result bytes back to d
      add          xq, mmsize/2
      cmp          xq, wq
      jl           .blend8

  .done:
      mov          eax, xd                ; return the number of pixels written
      RET
  ;------------------------------------------------------------------------------
  ; overlay_row_22 (SSE4.1)
  ;
  ; Blends a row of s into d where the weight row a holds two bytes per output
  ; pixel (it is indexed as a[2*x] and a[2*x + 1]).
  ;
  ; Named registers (see the cglobal line):
  ;   d  - row that is read, blended and written back in place
  ;   da - not referenced by this routine
  ;   s  - row blended into d
  ;   a  - weights, two bytes per output pixel
  ;   w  - pixel count, a 32-bit int sign-extended to pointer width
  ;   r  - scratch: (w - 1) modulo the pixels-per-iteration count
  ;   x  - running pixel index, also the return value
  ;
  ; Per pixel:
  ;   wt = (3*a[2x] + a[2x+1]) >> 2     (two chained pavgw on values scaled by 256)
  ;   d  = ((s*wt + d*(wt ^ 255) + 128) * 257) >> 16
  ;
  ; Returns (eax): number of pixels processed, i.e. (w - 1) rounded down to a
  ; multiple of mmsize/2, or 0 when (w - 1) < mmsize/2.
  ; NOTE(review): the count is based on w - 1, so at least the last pixel is
  ; never handled here; presumably the caller finishes the row - confirm.
  ;------------------------------------------------------------------------------
  INIT_XMM sse4
  cglobal overlay_row_22, 5, 7, 6, 0, d, da, s, a, w, r, x
      movsxdifnidn wq, wd
      xor          xd, xd                 ; x = 0 (32-bit write clears the full register)

      ; Work on w - 1 pixels and split that into vector part + remainder.
      sub          wq, 1
      mov          rq, wq
      and          rq, mmsize/2 - 1       ; r = (w - 1) mod (mmsize/2)
      cmp          wq, mmsize/2
      jl           .done                  ; fewer than one full vector: return 0
      sub          wq, rq                 ; w = loop bound, multiple of mmsize/2

      mova         m5, [pw_257]
      mova         m4, [pw_128]
      mova         m3, [pw_255]

  .blend8:
      ; Build one weight per output pixel from each pair of weight bytes.
      movu         m1, [aq+2*xq]          ; 16 weight bytes = 8 (even, odd) pairs
      pandn        m2, m3, m1             ; m2 = odd byte of each pair, kept in the high byte
      psllw        m1, 8                  ; m1 = even byte of each pair, moved to the high byte
      pavgw        m2, m1                 ; average of the pair (scaled by 256)
      pavgw        m2, m1                 ; averaged once more with the even byte
      psrlw        m2, 8                  ; m2 = wt = (3*even + odd) >> 2

      pmovzxbw     m0, [sq+xq]            ; m0 = s, bytes zero-extended to words
      pmullw       m0, m2                 ; m0 = s * wt
      pxor         m2, m3                 ; m2 = wt ^ 255 = 255 - wt
      pmovzxbw     m1, [dq+xq]            ; m1 = d
      pmullw       m1, m2                 ; m1 = d * (255 - wt)
      paddw        m0, m1
      paddw        m0, m4                 ; + 128 rounding bias
      pmulhuw      m0, m5                 ; high word of (sum * 257)
      packuswb     m0, m0                 ; words -> bytes
      movq         [dq+xq], m0            ; store the low 8 result bytes back to d
      add          xq, mmsize/2
      cmp          xq, wq
      jl           .blend8

  .done:
      mov          eax, xd                ; return the number of pixels written
      RET
  ;------------------------------------------------------------------------------
  ; overlay_row_20 (SSE4.1, uses the SSSE3 pmaddubsw instruction)
  ;
  ; Blends a row of s into d where each output pixel takes its weight from a
  ; 2x2 block: two bytes from row a and two bytes from the row at a + <6th arg>.
  ;
  ; Named registers (see the cglobal line):
  ;   d  - row that is read, blended and written back in place
  ;   da - incoming value is discarded; reused as pointer to the second weight
  ;        row (a + 6th argument)
  ;   s  - row blended into d
  ;   a  - first weight row, two bytes per output pixel
  ;   w  - pixel count, a 32-bit int sign-extended to pointer width
  ;   r  - 6th argument on entry (byte offset from a to the second weight row),
  ;        then scratch: (w - 1) modulo the pixels-per-iteration count
  ;   x  - running pixel index, also the return value
  ;
  ; Per pixel:
  ;   wt = (a[2x] + a[2x+1] + a2[2x] + a2[2x+1]) >> 2      with a2 = a + 6th arg
  ;   d  = ((s*wt + d*(wt ^ 255) + 128) * 257) >> 16
  ;
  ; Returns (eax): number of pixels processed, i.e. (w - 1) rounded down to a
  ; multiple of mmsize/2, or 0 when (w - 1) < mmsize/2.
  ; NOTE(review): the count is based on w - 1, so at least the last pixel is
  ; never handled here; presumably the caller finishes the row - confirm.
  ;------------------------------------------------------------------------------
  INIT_XMM sse4
  cglobal overlay_row_20, 6, 7, 7, 0, d, da, s, a, w, r, x
      ; Second weight row pointer; must be formed before r is reused below.
      mov          daq, aq
      add          daq, rmp               ; da = a + 6th argument

      movsxdifnidn wq, wd
      xor          xd, xd                 ; x = 0 (32-bit write clears the full register)

      ; Work on w - 1 pixels and split that into vector part + remainder.
      sub          wq, 1
      mov          rq, wq
      and          rq, mmsize/2 - 1       ; r = (w - 1) mod (mmsize/2)
      cmp          wq, mmsize/2
      jl           .done                  ; fewer than one full vector: return 0
      sub          wq, rq                 ; w = loop bound, multiple of mmsize/2

      mova         m6, [pb_1]
      mova         m5, [pw_257]
      mova         m4, [pw_128]
      mova         m3, [pw_255]

  .blend8:
      ; Sum each 2x2 block of weight bytes and divide by four.
      movu         m2, [aq+2*xq]          ; first weight row, 16 bytes
      movu         m1, [daq+2*xq]         ; second weight row, 16 bytes
      pmaddubsw    m2, m6                 ; adjacent byte pairs summed into words
      pmaddubsw    m1, m6
      paddw        m2, m1                 ; m2 = sum of the four weights
      psrlw        m2, 2                  ; m2 = wt = sum >> 2

      pmovzxbw     m0, [sq+xq]            ; m0 = s, bytes zero-extended to words
      pmullw       m0, m2                 ; m0 = s * wt
      pxor         m2, m3                 ; m2 = wt ^ 255 = 255 - wt
      pmovzxbw     m1, [dq+xq]            ; m1 = d
      pmullw       m1, m2                 ; m1 = d * (255 - wt)
      paddw        m0, m1
      paddw        m0, m4                 ; + 128 rounding bias
      pmulhuw      m0, m5                 ; high word of (sum * 257)
      packuswb     m0, m0                 ; words -> bytes
      movq         [dq+xq], m0            ; store the low 8 result bytes back to d
      add          xq, mmsize/2
      cmp          xq, wq
      jl           .blend8

  .done:
      mov          eax, xd                ; return the number of pixels written
      RET