;*****************************************************************************
;* x86-optimized functions for gblur filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
;                          float nu, float bscale)
;
; One horizontal slice of the gblur IIR filter: for each of `height` rows,
; repeat `steps` times a rightward causal pass (ptr[x] += nu * ptr[x-1]),
; then a leftward anti-causal pass (ptr[x-1] += nu * ptr[x]), with the
; boundary samples scaled by bscale.  Rows are assumed contiguous:
; stride = width * 4 bytes (computed below from width).
;
; ABI: on UNIX64, nu and bscale arrive in xmm0/xmm1 and are already m0/m1;
; on WIN64 they are 5th/6th args, so they are loaded from their home
; locations (num/bscalem) and the GPR arg names are then redefined.
;
; Register roles after setup:
;   m0 = nu (broadcast)      m1 = bscale (scalar, lane 0 only)
;   m2 = nu^2  m3 = nu^3  m4 = nu^4  (broadcast)
;   m5 = carried boundary sample p0, m6 = current 4-sample vector,
;   m7 = shifted copy for the recurrence, m8 = FMULADD_PS scratch
;   xq = sample index, yd = row counter, stepd = pass counter,
;   strideq = row stride in bytes, remainq = (width-1) & 3 scalar tail
;-----------------------------------------------------------------------------
%macro HORIZ_SLICE 0
%if UNIX64
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, x, y, step, stride, remain
%else
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, nu, bscale, x, y, step, stride, remain
%endif
%if WIN64
    movss m0, num                       ; load nu from its home slot
    movss m1, bscalem                   ; load bscale from its home slot
    DEFINE_ARGS ptr, width, height, steps, x, y, step, stride, remain
%endif
    movsxdifnidn widthq, widthd         ; sign-extend width for 64-bit addressing
    mulss m2, m0, m0                    ; nu ^ 2
    mulss m3, m2, m0                    ; nu ^ 3
    mulss m4, m3, m0                    ; nu ^ 4
    xor xq, xq
    xor yd, yd
    mov strideq, widthq
    ; stride = width * 4
    shl strideq, 2
    ; w = w - ((w - 1) & 3)
    ; split the row into a 4-aligned vector part and a 0..3 sample
    ; scalar tail (remain); the "-1" accounts for the p0 boundary sample
    ; handled before the vector loop.
    mov remainq, widthq
    sub remainq, 1
    and remainq, 3
    sub widthq, remainq
    ; broadcast the nu powers to all four lanes for the SIMD recurrence
    shufps m0, m0, 0
    shufps m2, m2, 0
    shufps m3, m3, 0
    shufps m4, m4, 0
.loop_y:
    xor stepd, stepd
.loop_step:
    ; p0 *= bscale                      (left boundary condition)
    mulss m5, m1, [ptrq + xq * 4]
    movss [ptrq + xq * 4], m5
    inc xq
    ; filter rightwards
    ; Here we are vectorizing the c version by 4
    ;    for (x = 1; x < width; x++)
    ;        ptr[x] += nu * ptr[x - 1];
    ; let p0 stand for ptr[x-1], the data from the last loop,
    ; and [p1,p2,p3,p4] be the vector data for this loop.
    ; Unrolling the recurrence, we get:
    ;    p1' = p1 + p0*nu
    ;    p2' = p2 + p1*nu + p0*nu^2
    ;    p3' = p3 + p2*nu + p1*nu^2 + p0*nu^3
    ;    p4' = p4 + p3*nu + p2*nu^2 + p1*nu^3 + p0*nu^4
    ; so we can do it in simd:
    ;    [p1',p2',p3',p4'] = [p1,p2,p3,p4] + [p0,p1,p2,p3]*nu +
    ;                        [0,p0,p1,p2]*nu^2 + [0,0,p0,p1]*nu^3 +
    ;                        [0,0,0,p0]*nu^4
.loop_x:
    movu m6, [ptrq + xq * 4]            ; s = [p1,p2,p3,p4]
    pslldq m7, m6, 4                    ;     [0, p1,p2,p3]
    movss m7, m5                        ;     [p0,p1,p2,p3] (insert carry into lane 0)
    FMULADD_PS m6, m7, m0, m6, m8       ; s += [p0,p1,p2,p3] * nu
    pslldq m7, 4                        ;     [0,p0,p1,p2]
    FMULADD_PS m6, m7, m2, m6, m8       ; s += [0,p0,p1,p2] * nu^2
    pslldq m7, 4
    FMULADD_PS m6, m7, m3, m6, m8       ; s += [0,0,p0,p1] * nu^3
    pslldq m7, 4
    FMULADD_PS m6, m7, m4, m6, m8       ; s += [0,0,0,p0] * nu^4
    movu [ptrq + xq * 4], m6
    shufps m5, m6, m6, q3333            ; carry p4' as next iteration's p0
    add xq, 4
    cmp xq, widthq
    jl .loop_x
    ; restore full width and finish the 0..3 leftover samples serially
    add widthq, remainq
    cmp xq, widthq
    je .end_scalar                      ; no remainder: skip scalar tail
.loop_scalar:
    ; ptr[x] += nu * ptr[x-1]
    movss m5, [ptrq + 4*xq - 4]
    mulss m5, m0
    addss m5, [ptrq + 4*xq]
    movss [ptrq + 4*xq], m5
    inc xq
    cmp xq, widthq
    jl .loop_scalar
.end_scalar:
    ; ptr[width - 1] *= bscale          (right boundary condition)
    dec xq
    mulss m5, m1, [ptrq + 4*xq]
    movss [ptrq + 4*xq], m5
    shufps m5, m5, 0                    ; broadcast as p0 for the backward pass
    ; filter leftwards
    ;    for (; x > 0; x--)
    ;        ptr[x - 1] += nu * ptr[x];
    ; The idea here is basically the same as filtering rightwards.
    ; But we need to take care as the data layout is different.
    ; Let p0 stand for ptr[x], which is the data from the last loop.
    ; The way we do it in simd is as below:
    ;    [p-4', p-3', p-2', p-1'] = [p-4, p-3, p-2, p-1]
    ;                             + [p-3, p-2, p-1, p0] * nu
    ;                             + [p-2, p-1, p0,  0 ] * nu^2
    ;                             + [p-1, p0,  0,   0 ] * nu^3
    ;                             + [p0,  0,   0,   0 ] * nu^4
.loop_x_back:
    sub xq, 4
    movu m6, [ptrq + xq * 4]            ; s = [p-4, p-3, p-2, p-1]
    psrldq m7, m6, 4                    ;     [p-3, p-2, p-1, 0 ]
    blendps m7, m5, 0x8                 ;     [p-3, p-2, p-1, p0] (insert carry into lane 3)
    FMULADD_PS m6, m7, m0, m6, m8       ; s += [p-3, p-2, p-1, p0] * nu
    psrldq m7, 4                        ;
    FMULADD_PS m6, m7, m2, m6, m8       ; s += [p-2, p-1, p0, 0] * nu^2
    psrldq m7, 4
    FMULADD_PS m6, m7, m3, m6, m8       ; s += [p-1, p0, 0, 0] * nu^3
    psrldq m7, 4
    FMULADD_PS m6, m7, m4, m6, m8       ; s += [p0, 0, 0, 0] * nu^4
    movu [ptrq + xq * 4], m6
    shufps m5, m6, m6, 0                ; m5 = [p-4', p-4', p-4', p-4'] (carry leftmost)
    cmp xq, remainq
    jg .loop_x_back
    ; 0..remain samples left of xq are finished serially
    cmp xq, 0
    je .end_scalar_back
.loop_scalar_back:
    ; ptr[x-1] += nu * ptr[x]
    movss m5, [ptrq + 4*xq]
    mulss m5, m0
    addss m5, [ptrq + 4*xq - 4]
    movss [ptrq + 4*xq - 4], m5
    dec xq
    cmp xq, 0
    jg .loop_scalar_back
.end_scalar_back:
    ; reset aligned width for next line
    sub widthq, remainq
    inc stepd
    cmp stepd, stepsd
    jl .loop_step
    add ptrq, strideq                   ; advance to next row
    inc yd
    cmp yd, heightd
    jl .loop_y
    RET
%endmacro
; Instantiate ff_horiz_slice_sse4 and ff_horiz_slice_avx2 (64-bit only:
; the macro uses 9 GPRs and 9 XMM registers).  Both builds stay on XMM
; width; the avx2 variant presumably differs only via x86util macros such
; as FMULADD_PS emitting VEX/FMA forms — defined in x86util.asm, confirm there.
%if ARCH_X86_64
INIT_XMM sse4
HORIZ_SLICE

INIT_XMM avx2
HORIZ_SLICE
%endif