mpegvideoencdsp_init.c 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. /*
  2. * This file is part of FFmpeg.
  3. *
  4. * FFmpeg is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU Lesser General Public
  6. * License as published by the Free Software Foundation; either
  7. * version 2.1 of the License, or (at your option) any later version.
  8. *
  9. * FFmpeg is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * Lesser General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Lesser General Public
  15. * License along with FFmpeg; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17. */
  18. #include "libavutil/attributes.h"
  19. #include "libavutil/avassert.h"
  20. #include "libavutil/cpu.h"
  21. #include "libavutil/x86/cpu.h"
  22. #include "libavcodec/avcodec.h"
  23. #include "libavcodec/mpegvideoencdsp.h"
  /* pix_sum / pix_norm1 implementations that are only declared in this file
   * and defined elsewhere (presumably hand-written external assembly, since
   * ff_mpegvideoencdsp_init_x86() selects them under EXTERNAL_* CPU checks --
   * TODO confirm). Each takes a pixel pointer and a line size and returns
   * an int. */
  24. int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
  25. int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
  26. int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
  27. int ff_pix_sum16_xop(uint8_t *pix, int line_size);
  28. int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
  29. int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
  30. #if HAVE_INLINE_ASM
  /* PHADDD(a, t): asm fragment that adds the two 32-bit halves of MMX
   * register a, using t as scratch. a is copied to t, a is shifted right by
   * 32 bits, and the copy is added back as packed dwords, so the low dword
   * of a ends up holding (low + high). This is the default definition used
   * by the _mmx and _3dnow template instantiations below. */
  31. #define PHADDD(a, t) \
  32. "movq " #a ", " #t " \n\t" \
  33. "psrlq $32, " #a " \n\t" \
  34. "paddd " #t ", " #a " \n\t"
  35. /*
  36. * pmulhw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
  37. * pmulhrw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
  38. * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
   *
   * The three instructions above differ in rounding and in which product
   * bits they keep; each template instantiation below supplies its own
   * PMULHRW() and a matching SCALE_OFFSET.
  39. */
  /* Plain-MMX PMULHRW(x, y, s, o): for both x and y, keep the high 16 bits
   * of the product with s (pmulhw), add the offset register o, then
   * arithmetic-shift right by one. */
  40. #define PMULHRW(x, y, s, o) \
  41. "pmulhw " #s ", " #x " \n\t" \
  42. "pmulhw " #s ", " #y " \n\t" \
  43. "paddw " #o ", " #x " \n\t" \
  44. "paddw " #o ", " #y " \n\t" \
  45. "psraw $1, " #x " \n\t" \
  46. "psraw $1, " #y " \n\t"
  /* First instantiation of the template: DEF() appends the _mmx suffix.
   * ff_mpegvideoencdsp_init_x86() refers to try_8x8basis_mmx and
   * add_8x8basis_mmx, which are presumably what the template defines via
   * DEF() (template not visible here -- TODO confirm). MOVQ_WONE is not
   * defined in this file. */
  47. #define DEF(x) x ## _mmx
  48. #define SET_RND MOVQ_WONE
  49. #define SCALE_OFFSET 1
  50. #include "mpegvideoenc_qns_template.c"
  /* Drop the per-instantiation parameters so the next variant can redefine
   * them; PHADDD is intentionally left defined. */
  51. #undef DEF
  52. #undef SET_RND
  53. #undef SCALE_OFFSET
  54. #undef PMULHRW
  /* Second instantiation of the template, with the _3dnow suffix.
   * SET_RND(x) expands to nothing and SCALE_OFFSET is 0. PMULHRW() maps
   * straight onto the pmulhrw instruction for both x and y; its 'o'
   * argument is unused in this variant. The default PHADDD definition is
   * still in effect here. */
  55. #define DEF(x) x ## _3dnow
  56. #define SET_RND(x)
  57. #define SCALE_OFFSET 0
  58. #define PMULHRW(x, y, s, o) \
  59. "pmulhrw " #s ", " #x " \n\t" \
  60. "pmulhrw " #s ", " #y " \n\t"
  61. #include "mpegvideoenc_qns_template.c"
  /* Clean up the per-instantiation parameters again. */
  62. #undef DEF
  63. #undef SET_RND
  64. #undef SCALE_OFFSET
  65. #undef PMULHRW
  /* Third instantiation of the template, only when SSSE3 inline assembly is
   * available. It uses the _ssse3 suffix, an empty SET_RND(x), a
   * SCALE_OFFSET of -1, and PMULHRW() built on pmulhrsw ('o' is unused).
   * PHADDD is replaced for this variant and is left undefined afterwards. */
  66. #if HAVE_SSSE3_INLINE
  67. #undef PHADDD
  68. #define DEF(x) x ## _ssse3
  69. #define SET_RND(x)
  70. #define SCALE_OFFSET -1
  /* pshufw with immediate 0x0E places the upper dword of a in the lower
   * dword of t; the following paddd leaves (low + high) in the low dword
   * of a. */
  71. #define PHADDD(a, t) \
  72. "pshufw $0x0E, " #a ", " #t " \n\t" \
  73. /* faster than phaddd on core2 */ \
  74. "paddd " #t ", " #a " \n\t"
  75. #define PMULHRW(x, y, s, o) \
  76. "pmulhrsw " #s ", " #x " \n\t" \
  77. "pmulhrsw " #s ", " #y " \n\t"
  78. #include "mpegvideoenc_qns_template.c"
  79. #undef DEF
  80. #undef SET_RND
  81. #undef SCALE_OFFSET
  82. #undef PMULHRW
  83. #undef PHADDD
  84. #endif /* HAVE_SSSE3_INLINE */
  85. /* Draw the edges of width 'w' of an image of size width, height.
  86. * This MMX version handles w == 16, w == 8 and (asserted) w == 4.
   *
   * buf    points at the first byte of the first image row
   * wrap   is the byte distance between the starts of consecutive rows
   * width  is the number of bytes per row inside the image
   * height is the number of image rows
   * w      is the width in bytes of the left/right border to fill
   * h      is the number of border rows above/below; rows are written four
   *        at a time, so a value that is not a multiple of 4 is effectively
   *        rounded up
   * sides  is a bitmask tested against EDGE_TOP and EDGE_BOTTOM
   *
   * The left and right borders are always filled, whatever 'sides' says.
   * Every asm loop tests its condition at the bottom, so each one runs at
   * least once.
   *
   * NOTE(review): none of the asm statements lists a "memory" clobber or
   * MMX register clobbers although they store through the pointer operand
   * and use mm0/mm1 -- confirm this is intentional. */
  87. static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
  88. int w, int h, int sides)
  89. {
  90. uint8_t *ptr, *last_line;
  91. int i;
  /* Start of the last image row; source for the bottom border. */
  92. last_line = buf + (height - 1) * wrap;
  93. /* left and right */
  94. ptr = buf;
  /* Operands shared by the three left/right loops:
   *   %0 = current row pointer (advanced by wrap each iteration)
   *   %1 = wrap, %2 = width, %3 = buf + wrap * height (end of loop) */
  95. if (w == 8) {
  96. __asm__ volatile (
  97. "1: \n\t"
  /* Broadcast the first byte of the row to all 8 bytes of mm0 and store
   * it in the 8 bytes just before the row. */
  98. "movd (%0), %%mm0 \n\t"
  99. "punpcklbw %%mm0, %%mm0 \n\t"
  100. "punpcklwd %%mm0, %%mm0 \n\t"
  101. "punpckldq %%mm0, %%mm0 \n\t"
  102. "movq %%mm0, -8(%0) \n\t"
  /* Load the last 8 bytes of the row, broadcast the final byte to all of
   * mm1 and store it in the 8 bytes just after the row. */
  103. "movq -8(%0, %2), %%mm1 \n\t"
  104. "punpckhbw %%mm1, %%mm1 \n\t"
  105. "punpckhwd %%mm1, %%mm1 \n\t"
  106. "punpckhdq %%mm1, %%mm1 \n\t"
  107. "movq %%mm1, (%0, %2) \n\t"
  /* Next row; loop while the row pointer is below the end pointer. */
  108. "add %1, %0 \n\t"
  109. "cmp %3, %0 \n\t"
  110. "jb 1b \n\t"
  111. : "+r" (ptr)
  112. : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
  113. "r" (ptr + wrap * height));
  114. } else if (w == 16) {
  /* Same as the w == 8 case, but each broadcast value is stored twice so
   * that 16 bytes are filled on either side of the row. */
  115. __asm__ volatile (
  116. "1: \n\t"
  117. "movd (%0), %%mm0 \n\t"
  118. "punpcklbw %%mm0, %%mm0 \n\t"
  119. "punpcklwd %%mm0, %%mm0 \n\t"
  120. "punpckldq %%mm0, %%mm0 \n\t"
  121. "movq %%mm0, -8(%0) \n\t"
  122. "movq %%mm0, -16(%0) \n\t"
  123. "movq -8(%0, %2), %%mm1 \n\t"
  124. "punpckhbw %%mm1, %%mm1 \n\t"
  125. "punpckhwd %%mm1, %%mm1 \n\t"
  126. "punpckhdq %%mm1, %%mm1 \n\t"
  127. "movq %%mm1, (%0, %2) \n\t"
  128. "movq %%mm1, 8(%0, %2) \n\t"
  129. "add %1, %0 \n\t"
  130. "cmp %3, %0 \n\t"
  131. "jb 1b \n\t"
  132. : "+r"(ptr)
  133. : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
  134. );
  135. } else {
  /* Any other border width is expected to be 4. */
  136. av_assert1(w == 4);
  137. __asm__ volatile (
  138. "1: \n\t"
  /* Broadcast the first byte of the row to the low 4 bytes of mm0 and
   * store them just before the row. */
  139. "movd (%0), %%mm0 \n\t"
  140. "punpcklbw %%mm0, %%mm0 \n\t"
  141. "punpcklwd %%mm0, %%mm0 \n\t"
  142. "movd %%mm0, -4(%0) \n\t"
  /* Load the last 4 bytes of the row, widen them, then replicate the
   * final byte through the high-half unpacks and store 4 copies just
   * after the row. */
  143. "movd -4(%0, %2), %%mm1 \n\t"
  144. "punpcklbw %%mm1, %%mm1 \n\t"
  145. "punpckhwd %%mm1, %%mm1 \n\t"
  146. "punpckhdq %%mm1, %%mm1 \n\t"
  147. "movd %%mm1, (%0, %2) \n\t"
  148. "add %1, %0 \n\t"
  149. "cmp %3, %0 \n\t"
  150. "jb 1b \n\t"
  151. : "+r" (ptr)
  152. : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
  153. "r" (ptr + wrap * height));
  154. }
  155. /* top and bottom (and hopefully also the corners) */
  156. if (sides & EDGE_TOP) {
  /* Each pass of the C loop fills four border rows above the image. */
  157. for (i = 0; i < h; i += 4) {
  /* Destination: row -(i + 1), starting w bytes left of the image so the
   * corner is covered as well. */
  158. ptr = buf - (i + 1) * wrap - w;
  /* Operands:
   *   %0 = destination pointer, advanced 8 bytes per iteration
   *   %1 = byte offset from the destination to the same column of the
   *        first image row (which already includes the side borders)
   *   %2 = -wrap, %3 = -3 * wrap: offsets to the rows further up
   *   %4 = destination start + width + 2 * w (end of loop) */
  159. __asm__ volatile (
  160. "1: \n\t"
  161. "movq (%1, %0), %%mm0 \n\t"
  162. "movq %%mm0, (%0) \n\t"
  163. "movq %%mm0, (%0, %2) \n\t"
  164. "movq %%mm0, (%0, %2, 2) \n\t"
  165. "movq %%mm0, (%0, %3) \n\t"
  166. "add $8, %0 \n\t"
  167. "cmp %4, %0 \n\t"
  168. "jb 1b \n\t"
  169. : "+r" (ptr)
  170. : "r" ((x86_reg) buf - (x86_reg) ptr - w),
  171. "r" ((x86_reg) - wrap), "r" ((x86_reg) - wrap * 3),
  172. "r" (ptr + width + 2 * w));
  173. }
  174. }
  175. if (sides & EDGE_BOTTOM) {
  /* Mirror image of the top case: four rows below the image per pass,
   * copied from the last image row with positive row offsets. */
  176. for (i = 0; i < h; i += 4) {
  177. ptr = last_line + (i + 1) * wrap - w;
  178. __asm__ volatile (
  179. "1: \n\t"
  180. "movq (%1, %0), %%mm0 \n\t"
  181. "movq %%mm0, (%0) \n\t"
  182. "movq %%mm0, (%0, %2) \n\t"
  183. "movq %%mm0, (%0, %2, 2) \n\t"
  184. "movq %%mm0, (%0, %3) \n\t"
  185. "add $8, %0 \n\t"
  186. "cmp %4, %0 \n\t"
  187. "jb 1b \n\t"
  188. : "+r" (ptr)
  189. : "r" ((x86_reg) last_line - (x86_reg) ptr - w),
  190. "r" ((x86_reg) wrap), "r" ((x86_reg) wrap * 3),
  191. "r" (ptr + width + 2 * w));
  192. }
  193. }
  194. }
  195. #endif /* HAVE_INLINE_ASM */
  196. av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
  197. AVCodecContext *avctx)
  198. {
  199. int cpu_flags = av_get_cpu_flags();
  200. #if ARCH_X86_32
  201. if (EXTERNAL_MMX(cpu_flags)) {
  202. c->pix_sum = ff_pix_sum16_mmx;
  203. c->pix_norm1 = ff_pix_norm1_mmx;
  204. }
  205. if (EXTERNAL_MMXEXT(cpu_flags)) {
  206. c->pix_sum = ff_pix_sum16_mmxext;
  207. }
  208. #endif
  209. if (EXTERNAL_SSE2(cpu_flags)) {
  210. c->pix_sum = ff_pix_sum16_sse2;
  211. c->pix_norm1 = ff_pix_norm1_sse2;
  212. }
  213. if (EXTERNAL_XOP(cpu_flags)) {
  214. c->pix_sum = ff_pix_sum16_xop;
  215. }
  216. #if HAVE_INLINE_ASM
  217. if (INLINE_MMX(cpu_flags)) {
  218. if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
  219. c->try_8x8basis = try_8x8basis_mmx;
  220. }
  221. c->add_8x8basis = add_8x8basis_mmx;
  222. if (avctx->bits_per_raw_sample <= 8) {
  223. c->draw_edges = draw_edges_mmx;
  224. }
  225. }
  226. if (INLINE_AMD3DNOW(cpu_flags)) {
  227. if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
  228. c->try_8x8basis = try_8x8basis_3dnow;
  229. }
  230. c->add_8x8basis = add_8x8basis_3dnow;
  231. }
  232. #if HAVE_SSSE3_INLINE
  233. if (INLINE_SSSE3(cpu_flags)) {
  234. if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
  235. c->try_8x8basis = try_8x8basis_ssse3;
  236. }
  237. c->add_8x8basis = add_8x8basis_ssse3;
  238. }
  239. #endif /* HAVE_SSSE3_INLINE */
  240. #endif /* HAVE_INLINE_ASM */
  241. }