/*
 * Copyright (c) 2013 Seppo Tomperi
 * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/get_bits.h" /* required for hevcdsp.h GetBitContext */
#include "libavcodec/hevcdsp.h"
#include "libavcodec/x86/hevcdsp.h"

#define LFC_FUNC(DIR, DEPTH, OPT) \
void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q);

#define LFL_FUNC(DIR, DEPTH, OPT) \
void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);

#define LFC_FUNCS(type, depth, opt) \
    LFC_FUNC(h, depth, opt) \
    LFC_FUNC(v, depth, opt)

#define LFL_FUNCS(type, depth, opt) \
    LFL_FUNC(h, depth, opt) \
    LFL_FUNC(v, depth, opt)

LFC_FUNCS(uint8_t, 8, sse2)
LFC_FUNCS(uint8_t, 10, sse2)
LFC_FUNCS(uint8_t, 12, sse2)
LFC_FUNCS(uint8_t, 8, avx)
LFC_FUNCS(uint8_t, 10, avx)
LFC_FUNCS(uint8_t, 12, avx)
LFL_FUNCS(uint8_t, 8, sse2)
LFL_FUNCS(uint8_t, 10, sse2)
LFL_FUNCS(uint8_t, 12, sse2)
LFL_FUNCS(uint8_t, 8, ssse3)
LFL_FUNCS(uint8_t, 10, ssse3)
LFL_FUNCS(uint8_t, 12, ssse3)
LFL_FUNCS(uint8_t, 8, avx)
LFL_FUNCS(uint8_t, 10, avx)
LFL_FUNCS(uint8_t, 12, avx)
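
/*
 * The LFC_FUNCS/LFL_FUNCS lines above only declare prototypes for the
 * assembly deblocking routines; for example, LFC_FUNCS(uint8_t, 8, sse2)
 * expands to declarations of ff_hevc_h_loop_filter_chroma_8_sse2() and
 * ff_hevc_v_loop_filter_chroma_8_sse2(). The implementations live in the
 * x86 assembly sources.
 */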

#define IDCT_DC_FUNCS(W, opt) \
void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)

IDCT_DC_FUNCS(4x4, mmxext);
IDCT_DC_FUNCS(8x8, mmxext);
IDCT_DC_FUNCS(8x8, sse2);
IDCT_DC_FUNCS(16x16, sse2);
IDCT_DC_FUNCS(32x32, sse2);
IDCT_DC_FUNCS(16x16, avx2);
IDCT_DC_FUNCS(32x32, avx2);

#define IDCT_FUNCS(opt) \
void ff_hevc_idct_4x4_8_ ## opt(int16_t *coeffs, int col_limit); \
void ff_hevc_idct_4x4_10_ ## opt(int16_t *coeffs, int col_limit); \
void ff_hevc_idct_8x8_8_ ## opt(int16_t *coeffs, int col_limit); \
void ff_hevc_idct_8x8_10_ ## opt(int16_t *coeffs, int col_limit); \
void ff_hevc_idct_16x16_8_ ## opt(int16_t *coeffs, int col_limit); \
void ff_hevc_idct_16x16_10_ ## opt(int16_t *coeffs, int col_limit); \
void ff_hevc_idct_32x32_8_ ## opt(int16_t *coeffs, int col_limit); \
void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);

IDCT_FUNCS(sse2)
IDCT_FUNCS(avx)

#define mc_rep_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, \
        uint8_t *_src, ptrdiff_t _srcstride, int height, \
        intptr_t mx, intptr_t my, int width) \
{ \
    int i; \
    uint8_t *src; \
    int16_t *dst; \
    for (i = 0; i < W; i += step) { \
        src = _src + (i * ((bitd + 7) / 8)); \
        dst = _dst + i; \
        ff_hevc_put_hevc_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
    } \
}

#define mc_rep_uni_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
        uint8_t *_src, ptrdiff_t _srcstride, int height, \
        intptr_t mx, intptr_t my, int width) \
{ \
    int i; \
    uint8_t *src; \
    uint8_t *dst; \
    for (i = 0; i < W; i += step) { \
        src = _src + (i * ((bitd + 7) / 8)); \
        dst = _dst + (i * ((bitd + 7) / 8)); \
        ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, \
                height, mx, my, width); \
    } \
}

#define mc_rep_bi_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, uint8_t *_src, \
        ptrdiff_t _srcstride, int16_t *_src2, \
        int height, intptr_t mx, intptr_t my, int width) \
{ \
    int i; \
    uint8_t *src; \
    uint8_t *dst; \
    int16_t *src2; \
    for (i = 0; i < W; i += step) { \
        src = _src + (i * ((bitd + 7) / 8)); \
        dst = _dst + (i * ((bitd + 7) / 8)); \
        src2 = _src2 + i; \
        ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, \
                height, mx, my, width); \
    } \
}

#define mc_rep_funcs(name, bitd, step, W, opt) \
    mc_rep_func(name, bitd, step, W, opt) \
    mc_rep_uni_func(name, bitd, step, W, opt) \
    mc_rep_bi_func(name, bitd, step, W, opt)

#define mc_rep_func2(name, bitd, step1, step2, W, opt) \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst, \
        uint8_t *src, ptrdiff_t _srcstride, int height, \
        intptr_t mx, intptr_t my, int width) \
{ \
    ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
    ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)), \
            _srcstride, height, mx, my, width); \
}

#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \
        uint8_t *src, ptrdiff_t _srcstride, int height, \
        intptr_t mx, intptr_t my, int width) \
{ \
    ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width); \
    ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
            src + (step1 * ((bitd + 7) / 8)), _srcstride, \
            height, mx, my, width); \
}

#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
        ptrdiff_t _srcstride, int16_t *src2, \
        int height, intptr_t mx, intptr_t my, int width) \
{ \
    ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width); \
    ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
            src + (step1 * ((bitd + 7) / 8)), _srcstride, \
            src2 + step1, height, mx, my, width); \
}

#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
    mc_rep_func2(name, bitd, step1, step2, W, opt) \
    mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
    mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
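
/*
 * The mc_rep_* wrappers above build wide motion-compensation functions out of
 * narrower SIMD kernels: for example, mc_rep_funcs(pel_pixels, 8, 16, 64, sse4)
 * defines ff_hevc_put_hevc_pel_pixels64_8_sse4() (plus its uni/bi variants) as
 * a loop that calls the 16-pixel-wide kernel four times, advancing the source
 * by step * bytes-per-sample and the int16_t destination by step samples each
 * iteration. The *2 variants stitch two kernels of different widths
 * (step1 + step2) together instead of looping.
 */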

#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL

#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
        int height, intptr_t mx, intptr_t my, int width) \
{ \
    ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width); \
    ff_hevc_put_hevc_##name##width3##_10_##opt2(dst + width2, src + width4, _srcstride, height, mx, my, width); \
}

#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
        ptrdiff_t _srcstride, int16_t *src2, \
        int height, intptr_t mx, intptr_t my, int width) \
{ \
    ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2, \
            height, mx, my, width); \
    ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst + width4, dststride, src + width4, _srcstride, src2 + width2, \
            height, mx, my, width); \
}

#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \
        uint8_t *src, ptrdiff_t _srcstride, int height, \
        intptr_t mx, intptr_t my, int width) \
{ \
    ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, \
            height, mx, my, width); \
    ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst + width4, dststride, src + width4, _srcstride, \
            height, mx, my, width); \
}

#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4) \
    mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
    mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
    mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)

#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
        int height, intptr_t mx, intptr_t my, int width) \
{ \
    ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width); \
    ff_hevc_put_hevc_##name##width3##_8_##opt2(dst + width2, src + width2, _srcstride, height, mx, my, width); \
}

#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
        ptrdiff_t _srcstride, int16_t *src2, \
        int height, intptr_t mx, intptr_t my, int width) \
{ \
    ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
            src2, height, mx, my, width); \
    ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst + width2, dststride, src + width2, _srcstride, \
            src2 + width2, height, mx, my, width); \
}

#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \
        uint8_t *src, ptrdiff_t _srcstride, int height, \
        intptr_t mx, intptr_t my, int width) \
{ \
    ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
            height, mx, my, width); \
    ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst + width2, dststride, src + width2, _srcstride, \
            height, mx, my, width); \
}

#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2) \
    mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
    mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
    mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
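
/*
 * The *_mix_* wrappers combine kernels from two instruction sets to cover one
 * block width: e.g. mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4) handles a
 * 48-pixel-wide block by running the 32-wide AVX2 kernel and then the 16-wide
 * SSE4 kernel on the remaining columns. For the 10-bit versions, the extra
 * width4 parameter is the byte offset of that second part (width2 samples at
 * two bytes per sample).
 */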

#if HAVE_AVX2_EXTERNAL
mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4)
mc_rep_mixs_8(epel_hv, 48, 32, 16, avx2, sse4)
mc_rep_mixs_8(epel_h, 48, 32, 16, avx2, sse4)
mc_rep_mixs_8(epel_v, 48, 32, 16, avx2, sse4)

mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32)
mc_bi_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32)
mc_rep_mixs_10(epel_hv, 24, 16, 8, avx2, sse4, 32)
mc_rep_mixs_10(epel_h, 24, 16, 8, avx2, sse4, 32)
mc_rep_mixs_10(epel_v, 24, 16, 8, avx2, sse4, 32)
mc_rep_mixs_10(qpel_h, 24, 16, 8, avx2, sse4, 32)
mc_rep_mixs_10(qpel_v, 24, 16, 8, avx2, sse4, 32)
mc_rep_mixs_10(qpel_hv, 24, 16, 8, avx2, sse4, 32)

mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2) //used for 10bit
mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2)  //used for 10bit

mc_rep_funcs(pel_pixels, 8, 32, 64, avx2)

mc_rep_func(pel_pixels, 10, 16, 32, avx2)
mc_rep_func(pel_pixels, 10, 16, 48, avx2)
mc_rep_func(pel_pixels, 10, 32, 64, avx2)

mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2)
mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2)
mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2)

mc_rep_funcs(epel_h, 8, 32, 64, avx2)
mc_rep_funcs(epel_v, 8, 32, 64, avx2)
mc_rep_funcs(epel_h, 10, 16, 32, avx2)
mc_rep_funcs(epel_h, 10, 16, 48, avx2)
mc_rep_funcs(epel_h, 10, 32, 64, avx2)
mc_rep_funcs(epel_v, 10, 16, 32, avx2)
mc_rep_funcs(epel_v, 10, 16, 48, avx2)
mc_rep_funcs(epel_v, 10, 32, 64, avx2)
mc_rep_funcs(epel_hv, 8, 32, 64, avx2)
mc_rep_funcs(epel_hv, 10, 16, 32, avx2)
mc_rep_funcs(epel_hv, 10, 16, 48, avx2)
mc_rep_funcs(epel_hv, 10, 32, 64, avx2)

mc_rep_funcs(qpel_h, 8, 32, 64, avx2)
mc_rep_mixs_8(qpel_h, 48, 32, 16, avx2, sse4)
mc_rep_funcs(qpel_v, 8, 32, 64, avx2)
mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4)
mc_rep_funcs(qpel_h, 10, 16, 32, avx2)
mc_rep_funcs(qpel_h, 10, 16, 48, avx2)
mc_rep_funcs(qpel_h, 10, 32, 64, avx2)
mc_rep_funcs(qpel_v, 10, 16, 32, avx2)
mc_rep_funcs(qpel_v, 10, 16, 48, avx2)
mc_rep_funcs(qpel_v, 10, 32, 64, avx2)
mc_rep_funcs(qpel_hv, 10, 16, 32, avx2)
mc_rep_funcs(qpel_hv, 10, 16, 48, avx2)
mc_rep_funcs(qpel_hv, 10, 32, 64, avx2)
#endif //AVX2

mc_rep_funcs(pel_pixels, 8, 16, 64, sse4)
mc_rep_funcs(pel_pixels, 8, 16, 48, sse4)
mc_rep_funcs(pel_pixels, 8, 16, 32, sse4)
mc_rep_funcs(pel_pixels, 8, 8, 24, sse4)
mc_rep_funcs(pel_pixels, 10, 8, 64, sse4)
mc_rep_funcs(pel_pixels, 10, 8, 48, sse4)
mc_rep_funcs(pel_pixels, 10, 8, 32, sse4)
mc_rep_funcs(pel_pixels, 10, 8, 24, sse4)
mc_rep_funcs(pel_pixels, 10, 8, 16, sse4)
mc_rep_funcs(pel_pixels, 10, 4, 12, sse4)
mc_rep_funcs(pel_pixels, 12, 8, 64, sse4)
mc_rep_funcs(pel_pixels, 12, 8, 48, sse4)
mc_rep_funcs(pel_pixels, 12, 8, 32, sse4)
mc_rep_funcs(pel_pixels, 12, 8, 24, sse4)
mc_rep_funcs(pel_pixels, 12, 8, 16, sse4)
mc_rep_funcs(pel_pixels, 12, 4, 12, sse4)
mc_rep_funcs(epel_h, 8, 16, 64, sse4)
mc_rep_funcs(epel_h, 8, 16, 48, sse4)
mc_rep_funcs(epel_h, 8, 16, 32, sse4)
mc_rep_funcs(epel_h, 8, 8, 24, sse4)
mc_rep_funcs(epel_h, 10, 8, 64, sse4)
mc_rep_funcs(epel_h, 10, 8, 48, sse4)
mc_rep_funcs(epel_h, 10, 8, 32, sse4)
mc_rep_funcs(epel_h, 10, 8, 24, sse4)
mc_rep_funcs(epel_h, 10, 8, 16, sse4)
mc_rep_funcs(epel_h, 10, 4, 12, sse4)
mc_rep_funcs(epel_h, 12, 8, 64, sse4)
mc_rep_funcs(epel_h, 12, 8, 48, sse4)
mc_rep_funcs(epel_h, 12, 8, 32, sse4)
mc_rep_funcs(epel_h, 12, 8, 24, sse4)
mc_rep_funcs(epel_h, 12, 8, 16, sse4)
mc_rep_funcs(epel_h, 12, 4, 12, sse4)
mc_rep_funcs(epel_v, 8, 16, 64, sse4)
mc_rep_funcs(epel_v, 8, 16, 48, sse4)
mc_rep_funcs(epel_v, 8, 16, 32, sse4)
mc_rep_funcs(epel_v, 8, 8, 24, sse4)
mc_rep_funcs(epel_v, 10, 8, 64, sse4)
mc_rep_funcs(epel_v, 10, 8, 48, sse4)
mc_rep_funcs(epel_v, 10, 8, 32, sse4)
mc_rep_funcs(epel_v, 10, 8, 24, sse4)
mc_rep_funcs(epel_v, 10, 8, 16, sse4)
mc_rep_funcs(epel_v, 10, 4, 12, sse4)
mc_rep_funcs(epel_v, 12, 8, 64, sse4)
mc_rep_funcs(epel_v, 12, 8, 48, sse4)
mc_rep_funcs(epel_v, 12, 8, 32, sse4)
mc_rep_funcs(epel_v, 12, 8, 24, sse4)
mc_rep_funcs(epel_v, 12, 8, 16, sse4)
mc_rep_funcs(epel_v, 12, 4, 12, sse4)
mc_rep_funcs(epel_hv, 8, 16, 64, sse4)
mc_rep_funcs(epel_hv, 8, 16, 48, sse4)
mc_rep_funcs(epel_hv, 8, 16, 32, sse4)
mc_rep_funcs(epel_hv, 8, 8, 24, sse4)
mc_rep_funcs2(epel_hv, 8, 8, 4, 12, sse4)
mc_rep_funcs(epel_hv, 10, 8, 64, sse4)
mc_rep_funcs(epel_hv, 10, 8, 48, sse4)
mc_rep_funcs(epel_hv, 10, 8, 32, sse4)
mc_rep_funcs(epel_hv, 10, 8, 24, sse4)
mc_rep_funcs(epel_hv, 10, 8, 16, sse4)
mc_rep_funcs(epel_hv, 10, 4, 12, sse4)
mc_rep_funcs(epel_hv, 12, 8, 64, sse4)
mc_rep_funcs(epel_hv, 12, 8, 48, sse4)
mc_rep_funcs(epel_hv, 12, 8, 32, sse4)
mc_rep_funcs(epel_hv, 12, 8, 24, sse4)
mc_rep_funcs(epel_hv, 12, 8, 16, sse4)
mc_rep_funcs(epel_hv, 12, 4, 12, sse4)
mc_rep_funcs(qpel_h, 8, 16, 64, sse4)
mc_rep_funcs(qpel_h, 8, 16, 48, sse4)
mc_rep_funcs(qpel_h, 8, 16, 32, sse4)
mc_rep_funcs(qpel_h, 8, 8, 24, sse4)
mc_rep_funcs(qpel_h, 10, 8, 64, sse4)
mc_rep_funcs(qpel_h, 10, 8, 48, sse4)
mc_rep_funcs(qpel_h, 10, 8, 32, sse4)
mc_rep_funcs(qpel_h, 10, 8, 24, sse4)
mc_rep_funcs(qpel_h, 10, 8, 16, sse4)
mc_rep_funcs(qpel_h, 10, 4, 12, sse4)
mc_rep_funcs(qpel_h, 12, 8, 64, sse4)
mc_rep_funcs(qpel_h, 12, 8, 48, sse4)
mc_rep_funcs(qpel_h, 12, 8, 32, sse4)
mc_rep_funcs(qpel_h, 12, 8, 24, sse4)
mc_rep_funcs(qpel_h, 12, 8, 16, sse4)
mc_rep_funcs(qpel_h, 12, 4, 12, sse4)
mc_rep_funcs(qpel_v, 8, 16, 64, sse4)
mc_rep_funcs(qpel_v, 8, 16, 48, sse4)
mc_rep_funcs(qpel_v, 8, 16, 32, sse4)
mc_rep_funcs(qpel_v, 8, 8, 24, sse4)
mc_rep_funcs(qpel_v, 10, 8, 64, sse4)
mc_rep_funcs(qpel_v, 10, 8, 48, sse4)
mc_rep_funcs(qpel_v, 10, 8, 32, sse4)
mc_rep_funcs(qpel_v, 10, 8, 24, sse4)
mc_rep_funcs(qpel_v, 10, 8, 16, sse4)
mc_rep_funcs(qpel_v, 10, 4, 12, sse4)
mc_rep_funcs(qpel_v, 12, 8, 64, sse4)
mc_rep_funcs(qpel_v, 12, 8, 48, sse4)
mc_rep_funcs(qpel_v, 12, 8, 32, sse4)
mc_rep_funcs(qpel_v, 12, 8, 24, sse4)
mc_rep_funcs(qpel_v, 12, 8, 16, sse4)
mc_rep_funcs(qpel_v, 12, 4, 12, sse4)
mc_rep_funcs(qpel_hv, 8, 8, 64, sse4)
mc_rep_funcs(qpel_hv, 8, 8, 48, sse4)
mc_rep_funcs(qpel_hv, 8, 8, 32, sse4)
mc_rep_funcs(qpel_hv, 8, 8, 24, sse4)
mc_rep_funcs(qpel_hv, 8, 8, 16, sse4)
mc_rep_funcs2(qpel_hv, 8, 8, 4, 12, sse4)
mc_rep_funcs(qpel_hv, 10, 8, 64, sse4)
mc_rep_funcs(qpel_hv, 10, 8, 48, sse4)
mc_rep_funcs(qpel_hv, 10, 8, 32, sse4)
mc_rep_funcs(qpel_hv, 10, 8, 24, sse4)
mc_rep_funcs(qpel_hv, 10, 8, 16, sse4)
mc_rep_funcs(qpel_hv, 10, 4, 12, sse4)
mc_rep_funcs(qpel_hv, 12, 8, 64, sse4)
mc_rep_funcs(qpel_hv, 12, 8, 48, sse4)
mc_rep_funcs(qpel_hv, 12, 8, 32, sse4)
mc_rep_funcs(qpel_hv, 12, 8, 24, sse4)
mc_rep_funcs(qpel_hv, 12, 8, 16, sse4)
mc_rep_funcs(qpel_hv, 12, 4, 12, sse4)

#define mc_rep_uni_w(bitd, step, W, opt) \
void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
        int height, int denom, int _wx, int _ox) \
{ \
    int i; \
    int16_t *src; \
    uint8_t *dst; \
    for (i = 0; i < W; i += step) { \
        src = _src + i; \
        dst = _dst + (i * ((bitd + 7) / 8)); \
        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, \
                height, denom, _wx, _ox); \
    } \
}

mc_rep_uni_w(8, 6, 12, sse4)
mc_rep_uni_w(8, 8, 16, sse4)
mc_rep_uni_w(8, 8, 24, sse4)
mc_rep_uni_w(8, 8, 32, sse4)
mc_rep_uni_w(8, 8, 48, sse4)
mc_rep_uni_w(8, 8, 64, sse4)
mc_rep_uni_w(10, 6, 12, sse4)
mc_rep_uni_w(10, 8, 16, sse4)
mc_rep_uni_w(10, 8, 24, sse4)
mc_rep_uni_w(10, 8, 32, sse4)
mc_rep_uni_w(10, 8, 48, sse4)
mc_rep_uni_w(10, 8, 64, sse4)
mc_rep_uni_w(12, 6, 12, sse4)
mc_rep_uni_w(12, 8, 16, sse4)
mc_rep_uni_w(12, 8, 24, sse4)
mc_rep_uni_w(12, 8, 32, sse4)
mc_rep_uni_w(12, 8, 48, sse4)
mc_rep_uni_w(12, 8, 64, sse4)

#define mc_rep_bi_w(bitd, step, W, opt) \
void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
        int16_t *_src2, int height, \
        int denom, int _wx0, int _wx1, int _ox0, int _ox1) \
{ \
    int i; \
    int16_t *src; \
    int16_t *src2; \
    uint8_t *dst; \
    for (i = 0; i < W; i += step) { \
        src = _src + i; \
        src2 = _src2 + i; \
        dst = _dst + (i * ((bitd + 7) / 8)); \
        ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, src2, \
                height, denom, _wx0, _wx1, _ox0, _ox1); \
    } \
}

mc_rep_bi_w(8, 6, 12, sse4)
mc_rep_bi_w(8, 8, 16, sse4)
mc_rep_bi_w(8, 8, 24, sse4)
mc_rep_bi_w(8, 8, 32, sse4)
mc_rep_bi_w(8, 8, 48, sse4)
mc_rep_bi_w(8, 8, 64, sse4)
mc_rep_bi_w(10, 6, 12, sse4)
mc_rep_bi_w(10, 8, 16, sse4)
mc_rep_bi_w(10, 8, 24, sse4)
mc_rep_bi_w(10, 8, 32, sse4)
mc_rep_bi_w(10, 8, 48, sse4)
mc_rep_bi_w(10, 8, 64, sse4)
mc_rep_bi_w(12, 6, 12, sse4)
mc_rep_bi_w(12, 8, 16, sse4)
mc_rep_bi_w(12, 8, 24, sse4)
mc_rep_bi_w(12, 8, 32, sse4)
mc_rep_bi_w(12, 8, 48, sse4)
mc_rep_bi_w(12, 8, 64, sse4)

#define mc_uni_w_func(name, bitd, W, opt) \
void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
        uint8_t *_src, ptrdiff_t _srcstride, \
        int height, int denom, \
        int _wx, int _ox, \
        intptr_t mx, intptr_t my, int width) \
{ \
    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox); \
}
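
/*
 * Weighted prediction is done in two passes here: the plain interpolation
 * kernel writes its intermediate result into the stack buffer `temp`
 * (16-byte aligned, 71 * MAX_PB_SIZE int16_t values), and the uni_w/bi_w
 * kernel then applies weight, offset and rounding while converting back to
 * pixels.
 */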

#define mc_uni_w_funcs(name, bitd, opt) \
    mc_uni_w_func(name, bitd, 4, opt) \
    mc_uni_w_func(name, bitd, 8, opt) \
    mc_uni_w_func(name, bitd, 12, opt) \
    mc_uni_w_func(name, bitd, 16, opt) \
    mc_uni_w_func(name, bitd, 24, opt) \
    mc_uni_w_func(name, bitd, 32, opt) \
    mc_uni_w_func(name, bitd, 48, opt) \
    mc_uni_w_func(name, bitd, 64, opt)

mc_uni_w_funcs(pel_pixels, 8, sse4)
mc_uni_w_func(pel_pixels, 8, 6, sse4)
mc_uni_w_funcs(epel_h, 8, sse4)
mc_uni_w_func(epel_h, 8, 6, sse4)
mc_uni_w_funcs(epel_v, 8, sse4)
mc_uni_w_func(epel_v, 8, 6, sse4)
mc_uni_w_funcs(epel_hv, 8, sse4)
mc_uni_w_func(epel_hv, 8, 6, sse4)
mc_uni_w_funcs(qpel_h, 8, sse4)
mc_uni_w_funcs(qpel_v, 8, sse4)
mc_uni_w_funcs(qpel_hv, 8, sse4)
mc_uni_w_funcs(pel_pixels, 10, sse4)
mc_uni_w_func(pel_pixels, 10, 6, sse4)
mc_uni_w_funcs(epel_h, 10, sse4)
mc_uni_w_func(epel_h, 10, 6, sse4)
mc_uni_w_funcs(epel_v, 10, sse4)
mc_uni_w_func(epel_v, 10, 6, sse4)
mc_uni_w_funcs(epel_hv, 10, sse4)
mc_uni_w_func(epel_hv, 10, 6, sse4)
mc_uni_w_funcs(qpel_h, 10, sse4)
mc_uni_w_funcs(qpel_v, 10, sse4)
mc_uni_w_funcs(qpel_hv, 10, sse4)
mc_uni_w_funcs(pel_pixels, 12, sse4)
mc_uni_w_func(pel_pixels, 12, 6, sse4)
mc_uni_w_funcs(epel_h, 12, sse4)
mc_uni_w_func(epel_h, 12, 6, sse4)
mc_uni_w_funcs(epel_v, 12, sse4)
mc_uni_w_func(epel_v, 12, 6, sse4)
mc_uni_w_funcs(epel_hv, 12, sse4)
mc_uni_w_func(epel_hv, 12, 6, sse4)
mc_uni_w_funcs(qpel_h, 12, sse4)
mc_uni_w_funcs(qpel_v, 12, sse4)
mc_uni_w_funcs(qpel_hv, 12, sse4)

#define mc_bi_w_func(name, bitd, W, opt) \
void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
        uint8_t *_src, ptrdiff_t _srcstride, \
        int16_t *_src2, \
        int height, int denom, \
        int _wx0, int _wx1, int _ox0, int _ox1, \
        intptr_t mx, intptr_t my, int width) \
{ \
    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
    ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2, \
            height, denom, _wx0, _wx1, _ox0, _ox1); \
}

#define mc_bi_w_funcs(name, bitd, opt) \
    mc_bi_w_func(name, bitd, 4, opt) \
    mc_bi_w_func(name, bitd, 8, opt) \
    mc_bi_w_func(name, bitd, 12, opt) \
    mc_bi_w_func(name, bitd, 16, opt) \
    mc_bi_w_func(name, bitd, 24, opt) \
    mc_bi_w_func(name, bitd, 32, opt) \
    mc_bi_w_func(name, bitd, 48, opt) \
    mc_bi_w_func(name, bitd, 64, opt)

mc_bi_w_funcs(pel_pixels, 8, sse4)
mc_bi_w_func(pel_pixels, 8, 6, sse4)
mc_bi_w_funcs(epel_h, 8, sse4)
mc_bi_w_func(epel_h, 8, 6, sse4)
mc_bi_w_funcs(epel_v, 8, sse4)
mc_bi_w_func(epel_v, 8, 6, sse4)
mc_bi_w_funcs(epel_hv, 8, sse4)
mc_bi_w_func(epel_hv, 8, 6, sse4)
mc_bi_w_funcs(qpel_h, 8, sse4)
mc_bi_w_funcs(qpel_v, 8, sse4)
mc_bi_w_funcs(qpel_hv, 8, sse4)
mc_bi_w_funcs(pel_pixels, 10, sse4)
mc_bi_w_func(pel_pixels, 10, 6, sse4)
mc_bi_w_funcs(epel_h, 10, sse4)
mc_bi_w_func(epel_h, 10, 6, sse4)
mc_bi_w_funcs(epel_v, 10, sse4)
mc_bi_w_func(epel_v, 10, 6, sse4)
mc_bi_w_funcs(epel_hv, 10, sse4)
mc_bi_w_func(epel_hv, 10, 6, sse4)
mc_bi_w_funcs(qpel_h, 10, sse4)
mc_bi_w_funcs(qpel_v, 10, sse4)
mc_bi_w_funcs(qpel_hv, 10, sse4)
mc_bi_w_funcs(pel_pixels, 12, sse4)
mc_bi_w_func(pel_pixels, 12, 6, sse4)
mc_bi_w_funcs(epel_h, 12, sse4)
mc_bi_w_func(epel_h, 12, 6, sse4)
mc_bi_w_funcs(epel_v, 12, sse4)
mc_bi_w_func(epel_v, 12, 6, sse4)
mc_bi_w_funcs(epel_hv, 12, sse4)
mc_bi_w_func(epel_hv, 12, 6, sse4)
mc_bi_w_funcs(qpel_h, 12, sse4)
mc_bi_w_funcs(qpel_v, 12, sse4)
mc_bi_w_funcs(qpel_hv, 12, sse4)

#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL

#define SAO_BAND_FILTER_FUNCS(bitd, opt) \
void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
        int16_t *sao_offset_val, int sao_left_class, int width, int height); \
void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
        int16_t *sao_offset_val, int sao_left_class, int width, int height); \
void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
        int16_t *sao_offset_val, int sao_left_class, int width, int height); \
void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
        int16_t *sao_offset_val, int sao_left_class, int width, int height); \
void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
        int16_t *sao_offset_val, int sao_left_class, int width, int height);

SAO_BAND_FILTER_FUNCS(8, sse2)
SAO_BAND_FILTER_FUNCS(10, sse2)
SAO_BAND_FILTER_FUNCS(12, sse2)
SAO_BAND_FILTER_FUNCS(8, avx)
SAO_BAND_FILTER_FUNCS(10, avx)
SAO_BAND_FILTER_FUNCS(12, avx)
SAO_BAND_FILTER_FUNCS(8, avx2)
SAO_BAND_FILTER_FUNCS(10, avx2)
SAO_BAND_FILTER_FUNCS(12, avx2)

#define SAO_BAND_INIT(bitd, opt) do { \
    c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_##bitd##_##opt; \
    c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
    c->sao_band_filter[2] = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
    c->sao_band_filter[3] = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
    c->sao_band_filter[4] = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
} while (0)

/* Note: the final declaration must not carry a trailing backslash, otherwise
 * the first SAO_EDGE_FILTER_FUNCS() invocation below would be absorbed into
 * the macro body. */
#define SAO_EDGE_FILTER_FUNCS(bitd, opt) \
void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
        int eo, int width, int height); \
void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
        int eo, int width, int height); \
void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
        int eo, int width, int height); \
void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
        int eo, int width, int height); \
void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
        int eo, int width, int height);

SAO_EDGE_FILTER_FUNCS(8, ssse3)
SAO_EDGE_FILTER_FUNCS(8, avx2)
SAO_EDGE_FILTER_FUNCS(10, sse2)
SAO_EDGE_FILTER_FUNCS(10, avx2)
SAO_EDGE_FILTER_FUNCS(12, sse2)
SAO_EDGE_FILTER_FUNCS(12, avx2)

#define SAO_EDGE_INIT(bitd, opt) do { \
    c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_##bitd##_##opt; \
    c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_16_##bitd##_##opt; \
    c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_##bitd##_##opt; \
    c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_##bitd##_##opt; \
    c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_##bitd##_##opt; \
} while (0)
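
/*
 * SAO_BAND_INIT()/SAO_EDGE_INIT() fill the five sao_*_filter slots with the
 * 8/16/32/48/64-pixel-wide variants for the given bit depth and instruction
 * set, so a single macro invocation in ff_hevc_dsp_init_x86() switches the
 * whole table at once.
 */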

#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt) \
    PEL_LINK(pointer, 1, my, mx, fname##4,  bitd, opt); \
    PEL_LINK(pointer, 2, my, mx, fname##6,  bitd, opt); \
    PEL_LINK(pointer, 3, my, mx, fname##8,  bitd, opt); \
    PEL_LINK(pointer, 4, my, mx, fname##12, bitd, opt); \
    PEL_LINK(pointer, 5, my, mx, fname##16, bitd, opt); \
    PEL_LINK(pointer, 6, my, mx, fname##24, bitd, opt); \
    PEL_LINK(pointer, 7, my, mx, fname##32, bitd, opt); \
    PEL_LINK(pointer, 8, my, mx, fname##48, bitd, opt); \
    PEL_LINK(pointer, 9, my, mx, fname##64, bitd, opt)

#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt) \
    PEL_LINK(pointer, 1, my, mx, fname##4,  bitd, opt); \
    PEL_LINK(pointer, 3, my, mx, fname##8,  bitd, opt); \
    PEL_LINK(pointer, 4, my, mx, fname##12, bitd, opt); \
    PEL_LINK(pointer, 5, my, mx, fname##16, bitd, opt); \
    PEL_LINK(pointer, 6, my, mx, fname##24, bitd, opt); \
    PEL_LINK(pointer, 7, my, mx, fname##32, bitd, opt); \
    PEL_LINK(pointer, 8, my, mx, fname##48, bitd, opt); \
    PEL_LINK(pointer, 9, my, mx, fname##64, bitd, opt)
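
/*
 * PEL_LINK (defined in libavcodec/x86/hevcdsp.h) is expected to wire one
 * entry of a put_hevc_* table, indexed by block-size index and by whether a
 * vertical (my) and/or horizontal (mx) fractional offset is present, to the
 * matching put/uni/bi implementations; size indices 1..9 correspond to block
 * widths 4, 6, 8, 12, 16, 24, 32, 48 and 64 as listed above. The init routine
 * below tests CPU flags from weakest to strongest, so later assignments
 * override earlier ones with the fastest usable version.
 */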

void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
{
    int cpu_flags = av_get_cpu_flags();

    if (bit_depth == 8) {
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
            c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
            c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
            if (ARCH_X86_64) {
                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
                c->idct[2] = ff_hevc_idct_16x16_8_sse2;
                c->idct[3] = ff_hevc_idct_32x32_8_sse2;
            }
            SAO_BAND_INIT(8, sse2);

            c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;

            c->idct[0] = ff_hevc_idct_4x4_8_sse2;
            c->idct[1] = ff_hevc_idct_8x8_8_sse2;

            c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
            c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
            c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            if (ARCH_X86_64) {
                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
            }
            SAO_EDGE_INIT(8, ssse3);
        }
        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4);
            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4);
            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4);

            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
            if (ARCH_X86_64) {
                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
                c->idct[2] = ff_hevc_idct_16x16_8_avx;
                c->idct[3] = ff_hevc_idct_32x32_8_avx;
            }
            SAO_BAND_INIT(8, avx);

            c->idct[0] = ff_hevc_idct_4x4_8_avx;
            c->idct[1] = ff_hevc_idct_8x8_8_avx;

            c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
            c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
            c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
        }
        if (EXTERNAL_AVX2(cpu_flags)) {
            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
            c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
        }
        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
            if (ARCH_X86_64) {
                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
            }
            SAO_BAND_INIT(8, avx2);

            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;

            c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
            c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
            c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
            if (ARCH_X86_64) {
                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
                c->idct[2] = ff_hevc_idct_16x16_10_sse2;
                c->idct[3] = ff_hevc_idct_32x32_10_sse2;
            }
            SAO_BAND_INIT(10, sse2);
            SAO_EDGE_INIT(10, sse2);

            c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;

            c->idct[0] = ff_hevc_idct_4x4_10_sse2;
            c->idct[1] = ff_hevc_idct_8x8_10_sse2;

            c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
            c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
            c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
        }
        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4);
            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4);
            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4);

            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4);
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
            if (ARCH_X86_64) {
                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
                c->idct[2] = ff_hevc_idct_16x16_10_avx;
                c->idct[3] = ff_hevc_idct_32x32_10_avx;
            }
            c->idct[0] = ff_hevc_idct_4x4_10_avx;
            c->idct[1] = ff_hevc_idct_8x8_10_avx;
            SAO_BAND_INIT(10, avx);
        }
        if (EXTERNAL_AVX2(cpu_flags)) {
            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
        }
        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
            if (ARCH_X86_64) {
                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
            }
            SAO_BAND_INIT(10, avx2);
            SAO_EDGE_INIT(10, avx2);

            c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
            c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
        }
    } else if (bit_depth == 12) {
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
            if (ARCH_X86_64) {
                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
            }
            SAO_BAND_INIT(12, sse2);
            SAO_EDGE_INIT(12, sse2);

            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
        }
        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4);
            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4);

            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4);
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
            if (ARCH_X86_64) {
                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
            }
            SAO_BAND_INIT(12, avx);
        }
        if (EXTERNAL_AVX2(cpu_flags)) {
            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
        }
        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;

            SAO_BAND_INIT(12, avx2);
            SAO_EDGE_INIT(12, avx2);
        }
    }
}