me_cmp_init.c

/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"
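
/* Prototypes for the x86-specific implementations defined in external assembly. */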
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                ptrdiff_t stride, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 ptrdiff_t stride, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    ptrdiff_t stride, int h);
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);
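
/* Declare ff_hadamard8_diff_<cpu>() and ff_hadamard8_diff16_<cpu>() for each
 * instruction-set suffix. */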
#define hadamard_func(cpu)                                              \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
                                  uint8_t *src2, ptrdiff_t stride, int h); \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
                                    uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_X86ASM
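
/*
 * Noise-preserving SSE (NSSE): the plain SSE plus the absolute difference in
 * high-frequency noise between the two blocks, weighted by nsse_weight
 * (a weight of 8 is used when no context is available).
 */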
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
    score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
           - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
                 ff_hf_noise8_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
#endif /* HAVE_X86ASM */

#if HAVE_INLINE_ASM
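
/*
 * Intra vertical SAD: sum of absolute differences between each 16-pixel row
 * and the row above it, accumulated as 16-bit words in %mm6.
 */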
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq 8(%0), %%mm3\n" \
    "add %2,%0\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM
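
/*
 * Vertical SAD of the residual pix1 - pix2: measures how much the per-row
 * difference signal changes from one row to the next. Only selected when
 * AV_CODEC_FLAG_BITEXACT is not set (see ff_me_cmp_init_x86() below).
 */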
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq (%1), " #out0 "\n" \
    "movq 8(%0), %%mm3\n" \
    "movq 8(%1), " #out1 "\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb " #out0 ", %%mm2\n" \
    "psubb " #out1 ", %%mm3\n" \
    "pxor %%mm7, %%mm2\n" \
    "pxor %%mm7, %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM
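
/* Per-word rounding bias used by the averaging (half-pel) SAD variants below. */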
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};
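
/*
 * Accumulate the SAD of an 8-pixel-wide block into %mm6, two rows per
 * iteration; the caller must have cleared %mm6 (accumulator) and %mm7 (zero).
 */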
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
        "add %3, %%"FF_REG_a" \n\t"
        "psubusb %%mm0, %%mm2 \n\t"
        "psubusb %%mm4, %%mm0 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm5, %%mm1 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %3, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}
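
/*
 * SAD of blk2 against the rounded average of blk1a and blk1b (horizontal or
 * vertical half-pel); %mm5 must hold round_tab[1] and %mm7 must be zero.
 */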
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "psrlw $1, %%mm1 \n\t"
        "psrlw $1, %%mm3 \n\t"
        "packuswb %%mm3, %%mm1 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm2, %%mm1 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "movq %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %4, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" (stride));
}
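
/*
 * SAD of blk2 against the four-pixel (diagonal half-pel) average of blk1,
 * rounded with round_tab[2]; %mm7 must be zero.
 */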
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
        "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2\n\t"
        "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddw %%mm4, %%mm2 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "movq %5, %%mm5 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        "paddw %%mm5, %%mm0 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "psubusb %%mm0, %%mm4 \n\t"
        "psubusb %%mm5, %%mm0 \n\t"
        "por %%mm4, %%mm0 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm4 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm4, %%mm6 \n\t"
        "movq %%mm2, %%mm0 \n\t"
        "movq %%mm3, %%mm1 \n\t"
        "add %4, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" (stride), "m" (round_tab[2]));
}
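
/* Fold the four 16-bit partial sums accumulated in %mm6 into a single value. */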
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}
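
/* Half-pel wrappers: x2 averages each pixel with its right neighbour,
 * y2 with the pixel one row below. */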
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}
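
/*
 * Instantiate the full set of 8x8 and 16x16 SAD functions (full-pel and
 * x2/y2/xy2 half-pel) for a given suffix: each clears the MMX accumulator,
 * runs the 8-wide workers above and folds the result with sum_<suf>().
 */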
#define PIX_SAD(suf) \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                        uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
 \
    sad8_1_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        ::); \
 \
    sad8_4_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                         uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
 \
    sad8_1_ ## suf(blk1, blk2, stride, h); \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_x2a_ ## suf(blk1, blk2, stride, h); \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_y2a_ ## suf(blk1, blk2, stride, h); \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                             uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        ::); \
 \
    sad8_4_ ## suf(blk1, blk2, stride, h); \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \

PIX_SAD(mmx)

#endif /* HAVE_INLINE_ASM */
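
/*
 * Select implementations at run time from the detected CPU flags; later
 * branches (MMXEXT, SSE2, SSSE3) override the slower versions set earlier.
 */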
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
        c->sse[0] = ff_sse16_mmx;
        c->sse[1] = ff_sse8_mmx;
#if HAVE_X86ASM
        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;

        c->sad[0] = ff_sad16_mmxext;
        c->sad[1] = ff_sad8_mmxext;

        c->pix_abs[0][0] = ff_sad16_mmxext;
        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
        c->pix_abs[1][0] = ff_sad8_mmxext;
        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
        c->pix_abs[1][2] = ff_sad8_y2_mmxext;

        c->vsad[4] = ff_vsad_intra16_mmxext;
        c->vsad[5] = ff_vsad_intra8_mmxext;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;

            c->vsad[0] = ff_vsad16_approx_mmxext;
            c->vsad[1] = ff_vsad8_approx_mmxext;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
            c->sad[0] = ff_sad16_sse2;
            c->pix_abs[0][0] = ff_sad16_sse2;
            c->pix_abs[0][1] = ff_sad16_x2_sse2;
            c->pix_abs[0][2] = ff_sad16_y2_sse2;

            c->vsad[4] = ff_vsad_intra16_sse2;

            if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
                c->vsad[0] = ff_vsad16_approx_sse2;
            }
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
}