/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_config.h"
#include "vp8_rtcd.h"
#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
#include <unistd.h>
#endif
#include "onyxd_int.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/common.h"
#include "vp8/common/threading.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/extend.h"
#include "vpx_ports/vpx_timer.h"
#include "decoderthreading.h"
#include "detokenize.h"
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra.h"
#include "vp8/common/setupintrarecon.h"
#if CONFIG_ERROR_CONCEALMENT
#include "error_concealment.h"
#endif
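
/* Allocation helpers: CALLOC_ARRAY zero-allocates n elements of *(p) and
 * reports failure through CHECK_MEM_ERROR; CALLOC_ARRAY_ALIGNED does the same
 * with vpx_memalign() plus an explicit memset, since aligned allocations are
 * not zero-initialized. */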
#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
#define CALLOC_ARRAY_ALIGNED(p, n, algn)                            \
  do {                                                              \
    CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \
    memset((p), 0, (n) * sizeof(*(p)));                             \
  } while (0)
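
/* Copy the frame-level MACROBLOCKD state set up by the main thread into each
 * worker thread's per-row MACROBLOCKD (prediction function pointers, loop
 * filter deltas, dequant tables, ...), and reset every row's progress counter
 * in mt_current_mb_col to -1 so that no column is reported as decoded yet. */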
static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
                                       MB_ROW_DEC *mbrd, int count) {
  VP8_COMMON *const pc = &pbi->common;
  int i;

  for (i = 0; i < count; ++i) {
    MACROBLOCKD *mbd = &mbrd[i].mbd;
    mbd->subpixel_predict = xd->subpixel_predict;
    mbd->subpixel_predict8x4 = xd->subpixel_predict8x4;
    mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
    mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;

    mbd->frame_type = pc->frame_type;
    mbd->pre = xd->pre;
    mbd->dst = xd->dst;

    mbd->segmentation_enabled = xd->segmentation_enabled;
    mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
    memcpy(mbd->segment_feature_data, xd->segment_feature_data,
           sizeof(xd->segment_feature_data));

    /*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
    memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
    /*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
    memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
    /*unsigned char mode_ref_lf_delta_enabled;
    unsigned char mode_ref_lf_delta_update;*/
    mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled;
    mbd->mode_ref_lf_delta_update = xd->mode_ref_lf_delta_update;

    mbd->current_bc = &pbi->mbc[0];

    memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
    memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
    memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
    memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));

    mbd->fullpixel_mask = 0xffffffff;
    if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8;
  }

  for (i = 0; i < pc->mb_rows; ++i)
    vpx_atomic_store_release(&pbi->mt_current_mb_col[i], -1);
}
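
/* Decode and reconstruct a single macroblock on a worker (or the main)
 * thread: read the residual tokens from the current bool decoder partition,
 * build the intra or inter prediction, then add the dequantized,
 * inverse-transformed residual.  With CONFIG_ERROR_CONCEALMENT, corrupt
 * residuals are zeroed so the prediction alone is used as reconstruction. */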
static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                                 unsigned int mb_idx) {
  MB_PREDICTION_MODE mode;
  int i;
#if CONFIG_ERROR_CONCEALMENT
  int corruption_detected = 0;
#else
  (void)mb_idx;
#endif

  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
    vp8_reset_mb_tokens_context(xd);
  } else if (!vp8dx_bool_error(xd->current_bc)) {
    int eobtotal;
    eobtotal = vp8_decode_mb_tokens(pbi, xd);

    /* Special case: Force the loopfilter to skip when eobtotal is zero */
    xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal == 0);
  }

  mode = xd->mode_info_context->mbmi.mode;

  if (xd->segmentation_enabled) vp8_mb_init_dequantizer(pbi, xd);

#if CONFIG_ERROR_CONCEALMENT
  if (pbi->ec_active) {
    int throw_residual;
    /* When we have independent partitions we can apply residual even
     * though other partitions within the frame are corrupt.
     */
    throw_residual =
        (!pbi->independent_partitions && pbi->frame_corrupt_residual);
    throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));

    if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual)) {
      /* MB with corrupt residuals or corrupt mode/motion vectors.
       * Better to use the predictor as reconstruction.
       */
      pbi->frame_corrupt_residual = 1;
      memset(xd->qcoeff, 0, sizeof(xd->qcoeff));

      corruption_detected = 1;

      /* force idct to be skipped for B_PRED and use the
       * prediction only for reconstruction
       */
      memset(xd->eobs, 0, 25);
    }
  }
#endif

  /* do prediction */
  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
    vp8_build_intra_predictors_mbuv_s(
        xd, xd->recon_above[1], xd->recon_above[2], xd->recon_left[1],
        xd->recon_left[2], xd->recon_left_stride[1], xd->dst.u_buffer,
        xd->dst.v_buffer, xd->dst.uv_stride);

    if (mode != B_PRED) {
      vp8_build_intra_predictors_mby_s(
          xd, xd->recon_above[0], xd->recon_left[0], xd->recon_left_stride[0],
          xd->dst.y_buffer, xd->dst.y_stride);
    } else {
      short *DQC = xd->dequant_y1;
      int dst_stride = xd->dst.y_stride;

      /* clear out residual eob info */
      if (xd->mode_info_context->mbmi.mb_skip_coeff) memset(xd->eobs, 0, 25);

      intra_prediction_down_copy(xd, xd->recon_above[0] + 16);

      for (i = 0; i < 16; ++i) {
        BLOCKD *b = &xd->block[i];
        unsigned char *dst = xd->dst.y_buffer + b->offset;
        B_PREDICTION_MODE b_mode = xd->mode_info_context->bmi[i].as_mode;
        unsigned char *Above;
        unsigned char *yleft;
        int left_stride;
        unsigned char top_left;

        /* Caution: some b_modes need 8 pixels (4 above + 4 above-right). */
        if (i < 4 && pbi->common.filter_level) {
          Above = xd->recon_above[0] + b->offset;
        } else {
          Above = dst - dst_stride;
        }

        if (i % 4 == 0 && pbi->common.filter_level) {
          yleft = xd->recon_left[0] + i;
          left_stride = 1;
        } else {
          yleft = dst - 1;
          left_stride = dst_stride;
        }

        if ((i == 4 || i == 8 || i == 12) && pbi->common.filter_level) {
          top_left = *(xd->recon_left[0] + i - 1);
        } else {
          top_left = Above[-1];
        }

        vp8_intra4x4_predict(Above, yleft, left_stride, b_mode, dst,
                             dst_stride, top_left);

        if (xd->eobs[i]) {
          if (xd->eobs[i] > 1) {
            vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
          } else {
            vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0], dst, dst_stride, dst,
                                 dst_stride);
            memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
          }
        }
      }
    }
  } else {
    vp8_build_inter_predictors_mb(xd);
  }

#if CONFIG_ERROR_CONCEALMENT
  if (corruption_detected) {
    return;
  }
#endif

  if (!xd->mode_info_context->mbmi.mb_skip_coeff) {
    /* dequantization and idct */
    if (mode != B_PRED) {
      short *DQC = xd->dequant_y1;

      if (mode != SPLITMV) {
        BLOCKD *b = &xd->block[24];

        /* do 2nd order transform on the dc block */
        if (xd->eobs[24] > 1) {
          vp8_dequantize_b(b, xd->dequant_y2);

          vp8_short_inv_walsh4x4(&b->dqcoeff[0], xd->qcoeff);
          memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
        } else {
          b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
          vp8_short_inv_walsh4x4_1(&b->dqcoeff[0], xd->qcoeff);
          memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
        }

        /* override the dc dequant constant in order to preserve the
         * dc components
         */
        DQC = xd->dequant_y1_dc;
      }

      vp8_dequant_idct_add_y_block(xd->qcoeff, DQC, xd->dst.y_buffer,
                                   xd->dst.y_stride, xd->eobs);
    }

    vp8_dequant_idct_add_uv_block(xd->qcoeff + 16 * 16, xd->dequant_uv,
                                  xd->dst.u_buffer, xd->dst.v_buffer,
                                  xd->dst.uv_stride, xd->eobs + 16);
  }
}
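
/* Decode every (decoding_thread_count + 1)-th macroblock row starting at
 * start_mb_row.  Rows are interleaved across threads, so a thread must stay
 * behind the thread working on the row above it: progress is published per
 * row in pbi->mt_current_mb_col and checked every pbi->sync_range columns.
 * When the loop filter is enabled, reconstructed edge pixels needed by the
 * row below are saved into the mt_*above_row / mt_*left_col buffers before
 * filtering overwrites them. */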
static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
                              int start_mb_row) {
  const vpx_atomic_int *last_row_current_mb_col;
  vpx_atomic_int *current_mb_col;
  int mb_row;
  VP8_COMMON *pc = &pbi->common;
  const int nsync = pbi->sync_range;
  const vpx_atomic_int first_row_no_sync_above =
      VPX_ATOMIC_INIT(pc->mb_cols + nsync);
  int num_part = 1 << pbi->common.multi_token_partition;
  int last_mb_row = start_mb_row;

  YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
  YV12_BUFFER_CONFIG *yv12_fb_lst = pbi->dec_fb_ref[LAST_FRAME];

  int recon_y_stride = yv12_fb_new->y_stride;
  int recon_uv_stride = yv12_fb_new->uv_stride;

  unsigned char *ref_buffer[MAX_REF_FRAMES][3];
  unsigned char *dst_buffer[3];
  int i;
  int ref_fb_corrupted[MAX_REF_FRAMES];

  ref_fb_corrupted[INTRA_FRAME] = 0;

  for (i = 1; i < MAX_REF_FRAMES; ++i) {
    YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i];

    ref_buffer[i][0] = this_fb->y_buffer;
    ref_buffer[i][1] = this_fb->u_buffer;
    ref_buffer[i][2] = this_fb->v_buffer;

    ref_fb_corrupted[i] = this_fb->corrupted;
  }

  dst_buffer[0] = yv12_fb_new->y_buffer;
  dst_buffer[1] = yv12_fb_new->u_buffer;
  dst_buffer[2] = yv12_fb_new->v_buffer;

  xd->up_available = (start_mb_row != 0);

  xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row;
  xd->mode_info_stride = pc->mode_info_stride;

  for (mb_row = start_mb_row; mb_row < pc->mb_rows;
       mb_row += (pbi->decoding_thread_count + 1)) {
    int recon_yoffset, recon_uvoffset;
    int mb_col;
    int filter_level;
    loop_filter_info_n *lfi_n = &pc->lf_info;

    /* save last row processed by this thread */
    last_mb_row = mb_row;
    /* select bool coder for current partition */
    xd->current_bc = &pbi->mbc[mb_row % num_part];

    if (mb_row > 0) {
      last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row - 1];
    } else {
      last_row_current_mb_col = &first_row_no_sync_above;
    }

    current_mb_col = &pbi->mt_current_mb_col[mb_row];

    recon_yoffset = mb_row * recon_y_stride * 16;
    recon_uvoffset = mb_row * recon_uv_stride * 8;

    /* reset contexts */
    xd->above_context = pc->above_context;
    memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));

    xd->left_available = 0;

    xd->mb_to_top_edge = -((mb_row * 16) << 3);
    xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;

    if (pbi->common.filter_level) {
      xd->recon_above[0] = pbi->mt_yabove_row[mb_row] + 0 * 16 + 32;
      xd->recon_above[1] = pbi->mt_uabove_row[mb_row] + 0 * 8 + 16;
      xd->recon_above[2] = pbi->mt_vabove_row[mb_row] + 0 * 8 + 16;

      xd->recon_left[0] = pbi->mt_yleft_col[mb_row];
      xd->recon_left[1] = pbi->mt_uleft_col[mb_row];
      xd->recon_left[2] = pbi->mt_vleft_col[mb_row];

      /* TODO: move to outside row loop */
      xd->recon_left_stride[0] = 1;
      xd->recon_left_stride[1] = 1;
    } else {
      xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
      xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
      xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;

      xd->recon_left[0] = xd->recon_above[0] - 1;
      xd->recon_left[1] = xd->recon_above[1] - 1;
      xd->recon_left[2] = xd->recon_above[2] - 1;

      xd->recon_above[0] -= xd->dst.y_stride;
      xd->recon_above[1] -= xd->dst.uv_stride;
      xd->recon_above[2] -= xd->dst.uv_stride;

      /* TODO: move to outside row loop */
      xd->recon_left_stride[0] = xd->dst.y_stride;
      xd->recon_left_stride[1] = xd->dst.uv_stride;

      setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1],
                             xd->recon_left[2], xd->dst.y_stride,
                             xd->dst.uv_stride);
    }

    for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) {
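      /* Publish this thread's progress every nsync columns and, on all but
       * the first row, spin until the thread decoding the row above is at
       * least nsync columns ahead, so the above-row context and pixels this
       * macroblock reads are already final. */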
      if (((mb_col - 1) % nsync) == 0) {
        vpx_atomic_store_release(current_mb_col, mb_col - 1);
      }

      if (mb_row && !(mb_col & (nsync - 1))) {
        vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync);
      }

      /* Distance of MB to the various image edges.
       * These are specified to 8th pel as they are always
       * compared to values that are in 1/8th pel units.
       */
      xd->mb_to_left_edge = -((mb_col * 16) << 3);
      xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;

#if CONFIG_ERROR_CONCEALMENT
      {
        int corrupt_residual =
            (!pbi->independent_partitions && pbi->frame_corrupt_residual) ||
            vp8dx_bool_error(xd->current_bc);
        if (pbi->ec_active &&
            (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
            corrupt_residual) {
          /* We have an intra block with corrupt
           * coefficients, better to conceal with an inter
           * block.
           * Interpolate MVs from neighboring MBs.
           *
           * Note that for the first mb with corrupt
           * residual in a frame, we might not discover
           * that before decoding the residual. That
           * happens after this check, and therefore no
           * inter concealment will be done.
           */
          vp8_interpolate_motion(xd, mb_row, mb_col, pc->mb_rows, pc->mb_cols);
        }
      }
#endif

      xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
      xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
      xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;

      /* propagate errors from reference frames */
      xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];

      if (xd->corrupted) {
        // Move the current decoding macroblock marker to the end of the row
        // for all rows assigned to this thread, so that other threads won't
        // be left waiting on it.
        for (; mb_row < pc->mb_rows;
             mb_row += (pbi->decoding_thread_count + 1)) {
          current_mb_col = &pbi->mt_current_mb_col[mb_row];
          vpx_atomic_store_release(current_mb_col, pc->mb_cols + nsync);
        }
        vpx_internal_error(&xd->error_info, VPX_CODEC_CORRUPT_FRAME,
                           "Corrupted reference frame");
      }

      if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) {
        const MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame;
        xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset;
        xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset;
        xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset;
      } else {
        // ref_frame is INTRA_FRAME, pre buffer should not be used.
        xd->pre.y_buffer = 0;
        xd->pre.u_buffer = 0;
        xd->pre.v_buffer = 0;
      }
      mt_decode_macroblock(pbi, xd, 0);

      xd->left_available = 1;

      /* check if the boolean decoder has suffered an error */
      xd->corrupted |= vp8dx_bool_error(xd->current_bc);

      xd->recon_above[0] += 16;
      xd->recon_above[1] += 8;
      xd->recon_above[2] += 8;

      if (!pbi->common.filter_level) {
        xd->recon_left[0] += 16;
        xd->recon_left[1] += 8;
        xd->recon_left[2] += 8;
      }

      if (pbi->common.filter_level) {
        int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
                       xd->mode_info_context->mbmi.mode != SPLITMV &&
                       xd->mode_info_context->mbmi.mb_skip_coeff);

        const int mode_index =
            lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
        const int seg = xd->mode_info_context->mbmi.segment_id;
        const int ref_frame = xd->mode_info_context->mbmi.ref_frame;

        filter_level = lfi_n->lvl[seg][ref_frame][mode_index];

        if (mb_row != pc->mb_rows - 1) {
          /* Save decoded MB last row data for next-row decoding */
          memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col * 16),
                 (xd->dst.y_buffer + 15 * recon_y_stride), 16);
          memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col * 8),
                 (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
          memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col * 8),
                 (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
        }

        /* save left_col for next MB decoding */
        if (mb_col != pc->mb_cols - 1) {
          MODE_INFO *next = xd->mode_info_context + 1;

          if (next->mbmi.ref_frame == INTRA_FRAME) {
            for (i = 0; i < 16; ++i) {
              pbi->mt_yleft_col[mb_row][i] =
                  xd->dst.y_buffer[i * recon_y_stride + 15];
            }
            for (i = 0; i < 8; ++i) {
              pbi->mt_uleft_col[mb_row][i] =
                  xd->dst.u_buffer[i * recon_uv_stride + 7];
              pbi->mt_vleft_col[mb_row][i] =
                  xd->dst.v_buffer[i * recon_uv_stride + 7];
            }
          }
        }

        /* loopfilter on this macroblock. */
        if (filter_level) {
          if (pc->filter_type == NORMAL_LOOPFILTER) {
            loop_filter_info lfi;
            FRAME_TYPE frame_type = pc->frame_type;
            const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
            lfi.mblim = lfi_n->mblim[filter_level];
            lfi.blim = lfi_n->blim[filter_level];
            lfi.lim = lfi_n->lim[filter_level];
            lfi.hev_thr = lfi_n->hev_thr[hev_index];

            if (mb_col > 0)
              vp8_loop_filter_mbv(xd->dst.y_buffer, xd->dst.u_buffer,
                                  xd->dst.v_buffer, recon_y_stride,
                                  recon_uv_stride, &lfi);

            if (!skip_lf)
              vp8_loop_filter_bv(xd->dst.y_buffer, xd->dst.u_buffer,
                                 xd->dst.v_buffer, recon_y_stride,
                                 recon_uv_stride, &lfi);

            /* don't apply across umv border */
            if (mb_row > 0)
              vp8_loop_filter_mbh(xd->dst.y_buffer, xd->dst.u_buffer,
                                  xd->dst.v_buffer, recon_y_stride,
                                  recon_uv_stride, &lfi);

            if (!skip_lf)
              vp8_loop_filter_bh(xd->dst.y_buffer, xd->dst.u_buffer,
                                 xd->dst.v_buffer, recon_y_stride,
                                 recon_uv_stride, &lfi);
          } else {
            if (mb_col > 0)
              vp8_loop_filter_simple_mbv(xd->dst.y_buffer, recon_y_stride,
                                         lfi_n->mblim[filter_level]);

            if (!skip_lf)
              vp8_loop_filter_simple_bv(xd->dst.y_buffer, recon_y_stride,
                                        lfi_n->blim[filter_level]);

            /* don't apply across umv border */
            if (mb_row > 0)
              vp8_loop_filter_simple_mbh(xd->dst.y_buffer, recon_y_stride,
                                         lfi_n->mblim[filter_level]);

            if (!skip_lf)
              vp8_loop_filter_simple_bh(xd->dst.y_buffer, recon_y_stride,
                                        lfi_n->blim[filter_level]);
          }
        }
      }

      recon_yoffset += 16;
      recon_uvoffset += 8;

      ++xd->mode_info_context; /* next mb */

      xd->above_context++;
    }

    /* adjust to the next row of mbs */
    if (pbi->common.filter_level) {
      if (mb_row != pc->mb_rows - 1) {
        int lasty = yv12_fb_lst->y_width + VP8BORDERINPIXELS;
        int lastuv = (yv12_fb_lst->y_width >> 1) + (VP8BORDERINPIXELS >> 1);

        for (i = 0; i < 4; ++i) {
          pbi->mt_yabove_row[mb_row + 1][lasty + i] =
              pbi->mt_yabove_row[mb_row + 1][lasty - 1];
          pbi->mt_uabove_row[mb_row + 1][lastuv + i] =
              pbi->mt_uabove_row[mb_row + 1][lastuv - 1];
          pbi->mt_vabove_row[mb_row + 1][lastuv + i] =
              pbi->mt_vabove_row[mb_row + 1][lastuv - 1];
        }
      }
    } else {
      vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16,
                        xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
    }

    /* last MB of row is ready just after extension is done */
    vpx_atomic_store_release(current_mb_col, mb_col + nsync);

    ++xd->mode_info_context; /* skip prediction column */
    xd->up_available = 1;

    /* since we have multithread */
    xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
  }

  /* signal end of decoding of current thread for current frame */
  if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows)
    sem_post(&pbi->h_event_end_decoding);
}
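
/* Worker thread entry point.  Each thread blocks on its start-decoding
 * semaphore, decodes its share of macroblock rows for the current frame, and
 * exits when b_multithreaded_rd is cleared.  Errors raised while decoding
 * longjmp back here via xd->error_info, and the thread still posts the
 * end-of-decoding semaphore so the main thread does not hang. */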
static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
  int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
  VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
  MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
  ENTROPY_CONTEXT_PLANES mb_row_left_context;

  while (1) {
    if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break;

    if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
      if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) {
        break;
      } else {
        MACROBLOCKD *xd = &mbrd->mbd;
        xd->left_context = &mb_row_left_context;
        if (setjmp(xd->error_info.jmp)) {
          xd->error_info.setjmp = 0;
          // Signal the end of decoding for the current thread.
          sem_post(&pbi->h_event_end_decoding);
          continue;
        }
        xd->error_info.setjmp = 1;
        mt_decode_mb_rows(pbi, xd, ithread + 1);
      }
    }
  }

  return 0;
}
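
/* Create the worker threads and their per-thread state.  The thread count is
 * capped at 8 (the maximum number of VP8 token partitions) and at the number
 * of available cores; with only one core the decoder stays single-threaded.
 * One start-decoding semaphore is created per worker plus a shared
 * end-of-decoding semaphore; partial failures are reported here and the
 * remaining cleanup is handled by vp8_decoder_remove_threads(). */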
void vp8_decoder_create_threads(VP8D_COMP *pbi) {
  int core_count = 0;
  unsigned int ithread;

  vpx_atomic_init(&pbi->b_multithreaded_rd, 0);
  pbi->allocated_decoding_thread_count = 0;

  /* limit decoding threads to the max number of token partitions */
  core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;

  /* limit decoding threads to the available cores */
  if (core_count > pbi->common.processor_core_count) {
    core_count = pbi->common.processor_core_count;
  }

  if (core_count > 1) {
    vpx_atomic_init(&pbi->b_multithreaded_rd, 1);
    pbi->decoding_thread_count = core_count - 1;

    CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
    CALLOC_ARRAY(pbi->h_event_start_decoding, pbi->decoding_thread_count);
    CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
    CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);

    if (sem_init(&pbi->h_event_end_decoding, 0, 0)) {
      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
                         "Failed to initialize semaphore");
    }

    for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread) {
      if (sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;

      vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd);

      pbi->de_thread_data[ithread].ithread = ithread;
      pbi->de_thread_data[ithread].ptr1 = (void *)pbi;
      pbi->de_thread_data[ithread].ptr2 = (void *)&pbi->mb_row_di[ithread];

      if (pthread_create(&pbi->h_decoding_thread[ithread], 0,
                         thread_decoding_proc,
                         &pbi->de_thread_data[ithread])) {
        sem_destroy(&pbi->h_event_start_decoding[ithread]);
        break;
      }
    }

    pbi->allocated_decoding_thread_count = ithread;
    if (pbi->allocated_decoding_thread_count !=
        (int)pbi->decoding_thread_count) {
      /* the remainder of cleanup cases will be handled in
       * vp8_decoder_remove_threads(). */
      if (pbi->allocated_decoding_thread_count == 0) {
        sem_destroy(&pbi->h_event_end_decoding);
      }
      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
                         "Failed to create threads");
    }
  }
}
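
/* Free the per-row synchronization counters and the above-row / left-column
 * pixel buffers allocated by vp8mt_alloc_temp_buffers().  mb_rows is the
 * number of rows the buffers were allocated for, which may differ from the
 * current frame's row count after a resolution change. */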
void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) {
  int i;

  vpx_free(pbi->mt_current_mb_col);
  pbi->mt_current_mb_col = NULL;

  /* Free above_row buffers. */
  if (pbi->mt_yabove_row) {
    for (i = 0; i < mb_rows; ++i) {
      vpx_free(pbi->mt_yabove_row[i]);
      pbi->mt_yabove_row[i] = NULL;
    }
    vpx_free(pbi->mt_yabove_row);
    pbi->mt_yabove_row = NULL;
  }

  if (pbi->mt_uabove_row) {
    for (i = 0; i < mb_rows; ++i) {
      vpx_free(pbi->mt_uabove_row[i]);
      pbi->mt_uabove_row[i] = NULL;
    }
    vpx_free(pbi->mt_uabove_row);
    pbi->mt_uabove_row = NULL;
  }

  if (pbi->mt_vabove_row) {
    for (i = 0; i < mb_rows; ++i) {
      vpx_free(pbi->mt_vabove_row[i]);
      pbi->mt_vabove_row[i] = NULL;
    }
    vpx_free(pbi->mt_vabove_row);
    pbi->mt_vabove_row = NULL;
  }

  /* Free left_col buffers. */
  if (pbi->mt_yleft_col) {
    for (i = 0; i < mb_rows; ++i) {
      vpx_free(pbi->mt_yleft_col[i]);
      pbi->mt_yleft_col[i] = NULL;
    }
    vpx_free(pbi->mt_yleft_col);
    pbi->mt_yleft_col = NULL;
  }

  if (pbi->mt_uleft_col) {
    for (i = 0; i < mb_rows; ++i) {
      vpx_free(pbi->mt_uleft_col[i]);
      pbi->mt_uleft_col[i] = NULL;
    }
    vpx_free(pbi->mt_uleft_col);
    pbi->mt_uleft_col = NULL;
  }

  if (pbi->mt_vleft_col) {
    for (i = 0; i < mb_rows; ++i) {
      vpx_free(pbi->mt_vleft_col[i]);
      pbi->mt_vleft_col[i] = NULL;
    }
    vpx_free(pbi->mt_vleft_col);
    pbi->mt_vleft_col = NULL;
  }
}
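
/* (Re)allocate the multithreading scratch buffers for the current frame size:
 * one progress counter per macroblock row, plus per-row buffers that hold the
 * bottom pixel row (above_row) and rightmost pixel column (left_col) of each
 * reconstructed row before the loop filter modifies them. */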
void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
  VP8_COMMON *const pc = &pbi->common;
  int i;
  int uv_width;

  if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
    vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);

    /* our internal buffers are always multiples of 16 */
    if ((width & 0xf) != 0) width += 16 - (width & 0xf);
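
    /* sync_range is the column granularity at which threads publish progress
     * and wait on the row above.  Wider frames use a coarser granularity,
     * presumably to reduce atomic traffic and spin-waiting relative to the
     * work done per macroblock row. */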
    if (width < 640) {
      pbi->sync_range = 1;
    } else if (width <= 1280) {
      pbi->sync_range = 8;
    } else if (width <= 2560) {
      pbi->sync_range = 16;
    } else {
      pbi->sync_range = 32;
    }

    uv_width = width >> 1;

    /* Allocate a vpx_atomic_int for each mb row. */
    CHECK_MEM_ERROR(pbi->mt_current_mb_col,
                    vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows));
    for (i = 0; i < pc->mb_rows; ++i)
      vpx_atomic_init(&pbi->mt_current_mb_col[i], 0);

    /* Allocate memory for above_row buffers. */
    CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
    for (i = 0; i < pc->mb_rows; ++i) {
      CHECK_MEM_ERROR(pbi->mt_yabove_row[i],
                      vpx_memalign(16, sizeof(unsigned char) *
                                           (width + (VP8BORDERINPIXELS << 1))));
      vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1));
    }

    CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
    for (i = 0; i < pc->mb_rows; ++i) {
      CHECK_MEM_ERROR(pbi->mt_uabove_row[i],
                      vpx_memalign(16, sizeof(unsigned char) *
                                           (uv_width + VP8BORDERINPIXELS)));
      vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS);
    }

    CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
    for (i = 0; i < pc->mb_rows; ++i) {
      CHECK_MEM_ERROR(pbi->mt_vabove_row[i],
                      vpx_memalign(16, sizeof(unsigned char) *
                                           (uv_width + VP8BORDERINPIXELS)));
      vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS);
    }

    /* Allocate memory for left_col buffers. */
    CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
    for (i = 0; i < pc->mb_rows; ++i)
      CHECK_MEM_ERROR(pbi->mt_yleft_col[i],
                      vpx_calloc(sizeof(unsigned char) * 16, 1));

    CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
    for (i = 0; i < pc->mb_rows; ++i)
      CHECK_MEM_ERROR(pbi->mt_uleft_col[i],
                      vpx_calloc(sizeof(unsigned char) * 8, 1));

    CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
    for (i = 0; i < pc->mb_rows; ++i)
      CHECK_MEM_ERROR(pbi->mt_vleft_col[i],
                      vpx_calloc(sizeof(unsigned char) * 8, 1));
  }
}
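
/* Shut down the worker threads: clear b_multithreaded_rd, wake every thread
 * so it can observe the flag and exit, join the threads, destroy the
 * semaphores, then release the per-thread data and the multithreading
 * scratch buffers. */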
void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
  /* shut down the MB decoding threads */
  if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
    int i;
    vpx_atomic_store_release(&pbi->b_multithreaded_rd, 0);

    /* allow all threads to exit */
    for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
      sem_post(&pbi->h_event_start_decoding[i]);
      pthread_join(pbi->h_decoding_thread[i], NULL);
    }

    for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
      sem_destroy(&pbi->h_event_start_decoding[i]);
    }

    if (pbi->allocated_decoding_thread_count) {
      sem_destroy(&pbi->h_event_end_decoding);
    }

    vpx_free(pbi->h_decoding_thread);
    pbi->h_decoding_thread = NULL;

    vpx_free(pbi->h_event_start_decoding);
    pbi->h_event_start_decoding = NULL;

    vpx_free(pbi->mb_row_di);
    pbi->mb_row_di = NULL;

    vpx_free(pbi->de_thread_data);
    pbi->de_thread_data = NULL;

    vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
  }
}
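
/* Main-thread entry point for multithreaded row decoding.  When the loop
 * filter is on, the above-row and left-column buffers are seeded with the
 * fixed edge values VP8 intra prediction expects at frame borders (127 above,
 * 129 to the left) and the loop filter is initialized; otherwise the top line
 * of the new frame buffer is prepared directly.  The worker threads are then
 * released, the main thread decodes its own share of rows starting at row 0,
 * and finally it waits for one end-of-decoding post per thread (including its
 * own).  If an error longjmps back here, the frame is marked corrupt and the
 * main thread still waits for the workers so the next frame cannot race with
 * them. */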
int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
  VP8_COMMON *pc = &pbi->common;
  unsigned int i;
  int j;

  int filter_level = pc->filter_level;
  YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];

  if (filter_level) {
    /* Set above_row buffer to 127 for decoding first MB row */
    memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS - 1, 127,
           yv12_fb_new->y_width + 5);
    memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS >> 1) - 1, 127,
           (yv12_fb_new->y_width >> 1) + 5);
    memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS >> 1) - 1, 127,
           (yv12_fb_new->y_width >> 1) + 5);

    for (j = 1; j < pc->mb_rows; ++j) {
      memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS - 1, (unsigned char)129,
             1);
      memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS >> 1) - 1,
             (unsigned char)129, 1);
      memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS >> 1) - 1,
             (unsigned char)129, 1);
    }

    /* Set left_col to 129 initially */
    for (j = 0; j < pc->mb_rows; ++j) {
      memset(pbi->mt_yleft_col[j], (unsigned char)129, 16);
      memset(pbi->mt_uleft_col[j], (unsigned char)129, 8);
      memset(pbi->mt_vleft_col[j], (unsigned char)129, 8);
    }

    /* Initialize the loop filter for this frame. */
    vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level);
  } else {
    vp8_setup_intra_recon_top_line(yv12_fb_new);
  }

  setup_decoding_thread_data(pbi, xd, pbi->mb_row_di,
                             pbi->decoding_thread_count);

  for (i = 0; i < pbi->decoding_thread_count; ++i) {
    sem_post(&pbi->h_event_start_decoding[i]);
  }

  if (setjmp(xd->error_info.jmp)) {
    xd->error_info.setjmp = 0;
    xd->corrupted = 1;
    // Wait for other threads to finish. This prevents other threads decoding
    // the current frame while the main thread starts decoding the next frame,
    // which causes a data race.
    for (i = 0; i < pbi->decoding_thread_count; ++i)
      sem_wait(&pbi->h_event_end_decoding);
    return -1;
  }

  xd->error_info.setjmp = 1;
  mt_decode_mb_rows(pbi, xd, 0);

  for (i = 0; i < pbi->decoding_thread_count + 1; ++i)
    sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */

  return 0;
}