  ; af_anlmdn.asm
  ;*****************************************************************************
  ;* x86-optimized functions for anlmdn filter
  ;* Copyright (c) 2017 Paul B Mahol
  ;*
  ;* This file is part of FFmpeg.
  ;*
  ;* FFmpeg is free software; you can redistribute it and/or
  ;* modify it under the terms of the GNU Lesser General Public
  ;* License as published by the Free Software Foundation; either
  ;* version 2.1 of the License, or (at your option) any later version.
  ;*
  ;* FFmpeg is distributed in the hope that it will be useful,
  ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  ;* Lesser General Public License for more details.
  ;*
  ;* You should have received a copy of the GNU Lesser General Public
  ;* License along with FFmpeg; if not, write to the Free Software
  ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  ;******************************************************************************
  ; Shared x86 assembly helpers. The macros and symbols used below that are not
  ; defined in this file (INIT_XMM, cglobal, RET, mmsize, r0m, ARCH_X86_64, ...)
  ; presumably come from this include -- verify against the header.
  %include "libavutil/x86/x86util.asm"

  SECTION .text
  ;------------------------------------------------------------------------------
  ; float ff_compute_distance_ssd(float *f1, const float *f2, ptrdiff_t len)
  ;
  ; Sum of squared differences over a window centred on f1 / f2:
  ;
  ;     result = sum for i in [-len, +len] of (f1[i] - f2[i])^2
  ;
  ; In:   f1, f2  pointers to the centre sample of each window. Both buffers are
  ;               only read; len floats on either side of the centre must be
  ;               readable.
  ;       len     half-width of the window in floats (2*len + 1 floats are read).
  ; Out:  the sum in the low float of xmm0; when ARCH_X86_64 == 0 it is also
  ;       loaded onto the x87 stack (see .end).
  ;
  ; Register roles once setup is done:
  ;       f1q/f2q  address of element -len of each window
  ;       xq       byte offset of the next float to process
  ;       lenq     byte offset at which the current loop stops
  ;       rq       bytes left over after all whole mmsize-byte vectors
  ;       m0       four partial sums in the vector loop, scalar sum afterwards
  ;
  ; The vector loop accumulates four interleaved partial sums and the scalar
  ; tail is added last, so rounding can differ from a plain left-to-right sum.
  ;------------------------------------------------------------------------------
  INIT_XMM sse
  cglobal compute_distance_ssd, 3,5,3, f1, f2, len, r, x
      ; Rewind both pointers by len floats so that offset 0 is element -len.
      mov       xq, lenq
      shl       xq, 2                   ; floats -> bytes
      neg       xq
      add       f1q, xq
      add       f2q, xq
      xor       xq, xq                  ; x = 0

      ; lenq = (2*len + 1) * 4 = size of the whole window in bytes.
      shl       lenq, 1
      add       lenq, 1
      shl       lenq, 2

      mov       rq, lenq
      and       rq, mmsize - 1          ; r = window bytes modulo vector size
      xorps     m0, m0                  ; clear the accumulator
      cmp       lenq, mmsize
      jl        .loop1                  ; under one full vector: scalar loop only
      sub       lenq, rq                ; lenq = bytes covered by whole vectors

  ALIGN 16
  .loop0:
      ; Vector loop: mmsize bytes of each input per iteration.
      movups    m1, [f1q + xq]
      movups    m2, [f2q + xq]
      subps     m1, m2
      mulps     m1, m1
      addps     m0, m1
      add       xq, mmsize
      cmp       xq, lenq
      jl        .loop0

      ; Horizontal add: fold the four partial sums s0..s3 into lane 0 of xm0.
      movhlps   xm1, xm0                ; xm1[0], xm1[1] = s2, s3
      addps     xm0, xm1                ; xm0[0] = s0 + s2, xm0[1] = s1 + s3
      movss     xm1, xm0                ; xm1[0] = s0 + s2
      shufps    xm0, xm0, 1             ; xm0[0] = s1 + s3
      addss     xm0, xm1

      ; The window holds an odd number of floats, so with a 16-byte vector r
      ; is 4 or 12 (barring overflow of the size computation); the zero check
      ; is kept as a guard.
      test      rq, rq
      jz        .end
      add       lenq, rq                ; restore the full window size

  .loop1:
      ; Scalar loop: one float per iteration until the window is exhausted.
      movss     xm1, [f1q + xq]
      subss     xm1, [f2q + xq]
      mulss     xm1, xm1
      addss     xm0, xm1
      add       xq, 4
      cmp       xq, lenq
      jl        .loop1

  .end:
  %if ARCH_X86_64 == 0
      ; Non-x86-64 build: store the result to r0m and load it onto the x87
      ; stack.
      movss     r0m, xm0
      fld dword r0m
  %endif
      RET