; af_volume.asm
  ;*****************************************************************************
  ;* x86-optimized functions for volume filter
  ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  ;*
  ;* This file is part of FFmpeg.
  ;*
  ;* FFmpeg is free software; you can redistribute it and/or
  ;* modify it under the terms of the GNU Lesser General Public
  ;* License as published by the Free Software Foundation; either
  ;* version 2.1 of the License, or (at your option) any later version.
  ;*
  ;* FFmpeg is distributed in the hope that it will be useful,
  ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  ;* Lesser General Public License for more details.
  ;*
  ;* You should have received a copy of the GNU Lesser General Public
  ;* License along with FFmpeg; if not, write to the Free Software
  ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  ;******************************************************************************
  %include "libavutil/x86/x86util.asm"

  ; Read-only constant tables used by the scaling routines in this file.
  SECTION_RODATA 32

  ; IEEE-754 doubles written as raw bit patterns, four lanes each.
  pd_1_256:     times 4 dq 0x3F70000000000000   ; 1.0 / 256
  pd_int32_max: times 4 dq 0x41DFFFFFFFC00000   ; 2147483647.0 (INT32_MAX)

  ; Integer lanes.
  pw_1:         times 8 dw 1                    ; 1 in each 16-bit word
  pw_128:       times 8 dw 128                  ; 128 in each 16-bit word
  pq_128:       times 2 dq 128                  ; 128 in each 64-bit qword

  SECTION .text
  ;------------------------------------------------------------------------------
  ; void ff_scale_samples_s16(uint8_t *dst, const uint8_t *src, int len,
  ;                           int volume)
  ;
  ; Per sample:
  ;     dst[i] = av_clip_int16((src[i] * volume + 128) >> 8)
  ;
  ; Register roles:
  ;     m0 = {volume, 1} word pairs       m1 = 128 in every word
  ;     m2 = samples / high products      m3 = low products / packed result
  ;
  ; Buffers are walked from the last mmsize-byte block down to offset 0 with
  ; aligned loads and stores; the loop body runs once before the first test.
  ; volume is consumed as a 16-bit word by pmaddwd.
  ; NOTE(review): presumably the caller guarantees mmsize-aligned buffers, a len
  ; that is a non-zero multiple of 8 samples, and a volume that fits in int16 --
  ; confirm against the C caller.
  ;------------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal scale_samples_s16, 4,4,4, dst, src, len, volume
      movd        m0, volumem             ; volume into the low dword
      pshuflw     m0, m0, 0               ; replicate its low word over words 0..3
      punpcklwd   m0, [pw_1]              ; interleave with 1 -> {volume, 1} x4
      mova        m1, [pw_128]            ; rounding term
      lea       lenq, [lend*2-mmsize]     ; byte offset of last block (2 bytes/sample)
  .next_block:
      mova        m2, [srcq+lenq]         ; 8 input samples
      punpcklwd   m3, m2, m1              ; low 4 samples as {sample, 128} pairs
      punpckhwd   m2, m1                  ; high 4 samples as {sample, 128} pairs
      pmaddwd     m3, m0                  ; sample*volume + 128*1 as 32-bit sums
      pmaddwd     m2, m0
      psrad       m3, 8                   ; arithmetic >> 8
      psrad       m2, 8
      packssdw    m3, m2                  ; narrow to int16 with signed saturation
      mova  [dstq+lenq], m3
      sub       lenq, mmsize              ; step to the previous block
      jge .next_block                     ; continue while the offset is >= 0
      REP_RET
  ;------------------------------------------------------------------------------
  ; void ff_scale_samples_s32(uint8_t *dst, const uint8_t *src, int len,
  ;                           int volume)
  ;
  ; Double-precision implementation, generated by expanding SCALE_SAMPLES_S32
  ; under the desired INIT_* mode. Before expansion, CVTDQ2PD must be %define'd
  ; to the int32 -> double conversion mnemonic to use.
  ;
  ; Per sample:
  ;     dst[i] = (int32) min(src[i] * (volume * (1.0 / 256)), INT32_MAX)
  ; Only the upper bound is clamped explicitly (minpd against pd_int32_max).
  ;
  ; Register roles:
  ;     m2 = scale factor (volume / 256) in every double lane
  ;     m3 = INT32_MAX as double in every lane
  ;     m0, m1 = the two halves of the current mmsize-byte block
  ;
  ; Buffers are walked from the last mmsize-byte block down to offset 0; the
  ; loop body runs once before the first test.
  ; NOTE(review): presumably the caller guarantees suitably aligned buffers and
  ; a len that is a non-zero multiple of mmsize/4 samples -- confirm.
  ;------------------------------------------------------------------------------
  %macro SCALE_SAMPLES_S32 0
  cglobal scale_samples_s32, 4,4,4, dst, src, len, volume
  ; Broadcast volume to all four dwords of xmm2.
  ; NOTE(review): the vbroadcastss form is presumably restricted to x86-32
  ; because volumem is a memory operand there -- confirm against x86inc.
  %if ARCH_X86_32 && cpuflag(avx)
      vbroadcastss   xmm2, volumem
  %else
      movd           xmm2, volumed
      pshufd         xmm2, xmm2, 0
  %endif
      CVTDQ2PD         m2, xmm2           ; volume as double in every lane
      mulpd            m2, m2, [pd_1_256] ; scale factor = volume / 256
      mova             m3, [pd_int32_max] ; upper clamp
      lea            lenq, [lend*4-mmsize] ; byte offset of last block (4 bytes/sample)
  .next_block:
      CVTDQ2PD         m0, [srcq+lenq         ] ; first half of the block -> doubles
      CVTDQ2PD         m1, [srcq+lenq+mmsize/2] ; second half of the block -> doubles
      mulpd            m0, m0, m2         ; apply the scale factor
      mulpd            m1, m1, m2
      minpd            m0, m0, m3         ; clamp to INT32_MAX
      minpd            m1, m1, m3
      cvtpd2dq       xmm0, m0             ; back to int32
      cvtpd2dq       xmm1, m1
  ; Each converted half is mmsize/2 bytes: a full xmm store for the AVX build,
  ; the low 8 bytes of the xmm register otherwise.
  %if cpuflag(avx)
      vmovdqa [dstq+lenq         ], xmm0
      vmovdqa [dstq+lenq+mmsize/2], xmm1
  %else
      movq    [dstq+lenq         ], xmm0
      movq    [dstq+lenq+mmsize/2], xmm1
  %endif
      sub            lenq, mmsize         ; step to the previous block
      jge .next_block                     ; continue while the offset is >= 0
      REP_RET
  %endmacro
  ; Expand SCALE_SAMPLES_S32 once per supported instruction set. CVTDQ2PD names
  ; the int32 -> double conversion mnemonic each expansion should use.

  ; SSE2 variant, XMM registers.
  %define CVTDQ2PD cvtdq2pd
  INIT_XMM sse2
  SCALE_SAMPLES_S32

  ; AVX variant, YMM registers; emitted only when HAVE_AVX_EXTERNAL is set.
  %if HAVE_AVX_EXTERNAL
  %define CVTDQ2PD vcvtdq2pd
  INIT_YMM avx
  SCALE_SAMPLES_S32
  %endif

  ; CVTDQ2PD is only meaningful for the expansions above.
  %undef CVTDQ2PD
  ;------------------------------------------------------------------------------
  ; void ff_scale_samples_s32(uint8_t *dst, const uint8_t *src, int len,
  ;                           int volume)
  ;
  ; Integer-only variant, declared with "INIT_XMM ssse3, atom".
  ;
  ; NOTE: This is not bit-identical with the C version because it clips to
  ;       [-INT_MAX, INT_MAX] instead of [INT_MIN, INT_MAX].
  ;
  ; The multiply is unsigned, so each sample is scaled as a magnitude, rounded
  ; and clipped, and only then given back the sign of the source sample:
  ;     dst[i] = sign(src[i]) * min((|src[i]| * volume + 128) >> 8, INT_MAX)
  ; NOTE(review): using volume as an unsigned multiplier presumably assumes the
  ; caller never passes a negative volume -- confirm.
  ;
  ; Register roles:
  ;     m4 = volume in every dword        m5 = 128 in every qword
  ;     m6 = zero                         m7 = original (signed) samples
  ;     m0, m1 = 64-bit products          m2 = overflow mask, m3 = magnitudes
  ;
  ; Buffers are walked from the last mmsize-byte block down to offset 0 with
  ; aligned loads and stores; the loop body runs once before the first test.
  ;------------------------------------------------------------------------------
  INIT_XMM ssse3, atom
  cglobal scale_samples_s32, 4,4,8, dst, src, len, volume
      movd        m4, volumem             ; volume into the low dword
      pshufd      m4, m4, 0               ; replicate it into every dword
      mova        m5, [pq_128]            ; rounding term
      pxor        m6, m6                  ; zero, reference for the overflow test
      lea       lenq, [lend*4-mmsize]     ; byte offset of last block (4 bytes/sample)
  .next_block:
      mova        m7, [srcq+lenq]         ; 4 signed samples, kept for psignd
      pabsd       m3, m7                  ; magnitudes
      pshufd      m0, m3, q0100           ; samples 0,1 into dwords 0 and 2
      pshufd      m1, m3, q0302           ; samples 2,3 into dwords 0 and 2
      pmuludq     m0, m4                  ; 64-bit products |src| * volume
      pmuludq     m1, m4
      paddq       m0, m5                  ; + 128
      paddq       m1, m5
      psrlq       m0, 7                   ; 7 of the 8 shift bits; the last follows the clip
      psrlq       m1, 7
      shufps      m2, m0, m1, q3131       ; upper dword of each 64-bit result
      shufps      m0, m0, m1, q2020       ; lower dword of each 64-bit result
      pcmpgtd     m2, m6                  ; all-ones where the upper dword is > 0
      por         m0, m2                  ; overflowed lanes become 0xFFFFFFFF
      psrld       m0, 1                   ; completes >> 8; overflowed lanes -> 0x7FFFFFFF
      psignd      m0, m7                  ; apply the sign of the source sample
      mova  [dstq+lenq], m0
      sub       lenq, mmsize              ; step to the previous block
      jge .next_block                     ; continue while the offset is >= 0
      REP_RET