vf_hflip.asm

;*****************************************************************************
;* x86-optimized functions for hflip filter
;*
;* Copyright (C) 2017 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

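; pshufb control masks: pb_flip_byte reverses the 16 bytes of a lane,
; pb_flip_short reverses the eight 16-bit words of a lane while keeping the
; two bytes inside each word in their original order.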
pb_flip_byte:  db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1

SECTION .text

;%1 byte or short, %2 b or w, %3 element size in bytes (1 for byte, 2 for short)
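; Each instantiation mirrors one row: srcq is read backwards from its initial
; position (the caller effectively passes a pointer to the last element of the
; row), dstq is written forwards, and wq is the row width in elements.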
%macro HFLIP 3
cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
    VBROADCASTI128    m0, [pb_flip_%1]
    xor               xq, xq
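    ; Convert the element count in wd to a byte count: bytes only need the
    ; sign extension, 16-bit elements are doubled.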
%if %3 == 1
    movsxdifnidn wq, wd
%else ; short
    add               wd, wd
%endif
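    ; rq holds the tail bytes that do not fill a whole 2 * mmsize block; rows
    ; shorter than one block go straight to the scalar loop.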
    mov               rq, wq
    and               rq, 2 * mmsize - 1
    cmp               wq, 2 * mmsize
    jl .loop1
    sub               wq, rq

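    ; Vector loop: each iteration loads 2 * mmsize bytes from the tail end of
    ; the source row, reverses them with pshufb and stores them forwards into
    ; the destination.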
    .loop0:
        neg               xq
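        ; With ymm registers pshufb only shuffles within each 128-bit lane, so
        ; the AVX2 path swaps the two lanes with vpermq 0x4e at load time; the
        ; SSE path can load directly.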
%if mmsize == 32
        vpermq            m1, [srcq + xq -     mmsize + %3], 0x4e ; flip each lane at load
        vpermq            m2, [srcq + xq - 2 * mmsize + %3], 0x4e ; flip each lane at load
%else
        movu              m1, [srcq + xq -     mmsize + %3]
        movu              m2, [srcq + xq - 2 * mmsize + %3]
%endif
        pshufb            m1, m0
        pshufb            m2, m0
        neg               xq
        movu              [dstq + xq         ], m1
        movu              [dstq + xq + mmsize], m2
        add               xq, mmsize * 2
        cmp               xq, wq
        jl .loop0

    cmp               rq, 0
    je .end
    add               wq, rq

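    ; Scalar tail: copy the remaining elements one at a time, reading the
    ; source backwards and writing the destination forwards.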
    .loop1:
        neg               xq
        mov               r%2, [srcq + xq]
        neg               xq
        mov               [dstq + xq], r%2
        add               xq, %3
        cmp               xq, wq
        jl .loop1
    .end:
    RET
%endmacro

INIT_XMM ssse3
HFLIP byte, b, 1
HFLIP short, w, 2

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
HFLIP byte, b, 1
HFLIP short, w, 2
%endif
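
; The instantiations above emit SSSE3 and (when HAVE_AVX2_EXTERNAL is set)
; AVX2 versions of both routines. Assuming FFmpeg's usual ff_ symbol prefix,
; the C side would declare them roughly as (a sketch, not part of this file):
;
;   void ff_hflip_byte_ssse3 (const uint8_t *src, uint8_t *dst, int w);
;   void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
;
; with src pointing at the last element of the input row, dst at the first
; element of the output row, and w the width in elements.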