/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <emmintrin.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vpx_ports/emmintrin_compat.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/encoder/vp9_context_tree.h"
#include "vp9/encoder/vp9_denoiser.h"
#include "vpx_mem/vpx_mem.h"
  19. // Compute the sum of all pixel differences of this MB.
  20. static INLINE int sum_diff_16x1(__m128i acc_diff) {
  21. const __m128i k_1 = _mm_set1_epi16(1);
  22. const __m128i acc_diff_lo =
  23. _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  24. const __m128i acc_diff_hi =
  25. _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  26. const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
  27. const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
  28. const __m128i hgfe_dcba =
  29. _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
  30. const __m128i hgfedcba =
  31. _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
  32. return _mm_cvtsi128_si32(hgfedcba);
  33. }
// Denoise one 16-pixel row: move each signal pixel toward its
// motion-compensated running average by an adjustment that shrinks as the
// absolute pixel difference grows (full level l3 below k_4, then l3-l21,
// l3-l21-l32 for larger differences, capped at 16). Writes the filtered row
// to running_avg_y and folds the signed per-pixel adjustments into acc_diff
// so the caller can bound the total change. Thresholds and levels are passed
// by pointer so the hot loop keeps them in registers across calls.
// Returns the updated acc_diff accumulator.
static INLINE __m128i vp9_denoiser_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
    const __m128i *k_16, const __m128i *l3, const __m128i *l32,
    const __m128i *l21, __m128i acc_diff) {
  // Calculate differences.
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  __m128i v_running_avg_y;
  // Saturating subtractions: per lane, exactly one of pdiff/ndiff is nonzero,
  // so their OR below is the absolute difference.
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
  // Clamp absolute difference to 16 to be used to get mask. Doing this
  // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
  const __m128i clamped_absdiff =
      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
  // Get masks for l2 l1 and l0 adjustments.
  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
  // Get adjustments for l2, l1, and l0.
  __m128i adj2 = _mm_and_si128(mask2, *l32);
  const __m128i adj1 = _mm_and_si128(mask1, *l21);
  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
  __m128i adj, padj, nadj;
  // Combine the adjustments and get absolute adjustments.
  adj2 = _mm_add_epi8(adj2, adj1);
  adj = _mm_sub_epi8(*l3, adj2);
  // For lanes below k_4 the adjustment is the clamped difference itself.
  adj = _mm_andnot_si128(mask0, adj);
  adj = _mm_or_si128(adj, adj0);
  // Restore the sign and get positive and negative adjustments.
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);
  // Calculate filtered value.
  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
  // Adjustments <=7, and each element in acc_diff can fit in signed
  // char.
  acc_diff = _mm_adds_epi8(acc_diff, padj);
  acc_diff = _mm_subs_epi8(acc_diff, nadj);
  return acc_diff;
}
// Denoise a 16x1 vector with a weaker filter.
// Second pass: pull the already-filtered running_avg_y back toward the
// signal by at most k_delta per pixel. Used when the first pass changed the
// block by more than its threshold. Note the add/sub directions are the
// opposite of vp9_denoiser_16x1_sse2, and the correction is likewise removed
// from acc_diff rather than added. Updates running_avg_y in place and
// returns the updated accumulator.
static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
  __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
  // Calculate differences.
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
  // Clamp absolute difference to delta to get the adjustment.
  const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
  // Restore the sign and get positive and negative adjustments.
  __m128i padj, nadj;
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);
  // Calculate filtered value (signs inverted vs. the first pass: this moves
  // running_avg_y toward sig).
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
  v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
  // Accumulate the adjustments.
  acc_diff = _mm_subs_epi8(acc_diff, padj);
  acc_diff = _mm_adds_epi8(acc_diff, nadj);
  return acc_diff;
}
// Denoise 8x8 and 8x16 blocks.
// Rows are only 8 pixels wide, so two consecutive rows of each plane are
// packed into one 16-byte buffer and processed with the 16-wide kernel;
// b_height below is therefore half the block height (at most 8 row pairs).
// The packed copies are kept so the weaker second pass can re-run without
// re-reading the strided frame buffers. Returns FILTER_BLOCK when the
// denoised result should be used, COPY_BLOCK otherwise (running_avg_y may
// still have been written in either case).
static int vp9_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride,
                                       const uint8_t *mc_running_avg_y,
                                       int mc_avg_y_stride,
                                       uint8_t *running_avg_y, int avg_y_stride,
                                       int increase_denoising, BLOCK_SIZE bs,
                                       int motion_magnitude, int width) {
  int sum_diff_thresh, r, sum_diff = 0;
  // Strengthen the filter slightly for low-motion blocks when requested.
  const int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  // Working copies: one 16-byte line per row pair for each plane.
  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
  __m128i acc_diff = _mm_setzero_si128();
  const __m128i k_0 = _mm_setzero_si128();
  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
  const __m128i k_8 = _mm_set1_epi8(8);
  const __m128i k_16 = _mm_set1_epi8(16);
  // Modify each level's adjustment according to motion_magnitude.
  const __m128i l3 = _mm_set1_epi8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
  // Difference between level 3 and level 2 is 2.
  const __m128i l32 = _mm_set1_epi8(2);
  // Difference between level 2 and level 1 is 1.
  const __m128i l21 = _mm_set1_epi8(1);
  // Half the block height: each iteration consumes two 8-pixel rows.
  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
  for (r = 0; r < b_height; ++r) {
    // Pack two strided rows into one contiguous 16-byte buffer per plane.
    memcpy(sig_buffer[r], sig, width);
    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
    memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
           width);
    memcpy(running_buffer[r], running_avg_y, width);
    memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
    acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r],
                                      running_buffer[r], &k_0, &k_4, &k_8,
                                      &k_16, &l3, &l32, &l21, acc_diff);
    // Unpack the filtered row pair back into the strided destination.
    memcpy(running_avg_y, running_buffer[r], width);
    memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
    // Update pointers for next iteration.
    sig += (sig_stride << 1);
    mc_running_avg_y += (mc_avg_y_stride << 1);
    running_avg_y += (avg_y_stride << 1);
  }
  {
    sum_diff = sum_diff_16x1(acc_diff);
    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
    if (abs(sum_diff) > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg_y to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.
      // The delta is set by the excess of absolute pixel diff over the
      // threshold.
      const int delta =
          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        // Rewind to the top of the block; each pass-1 iteration advanced by
        // two rows.
        running_avg_y -= avg_y_stride * (b_height << 1);
        for (r = 0; r < b_height; ++r) {
          acc_diff = vp9_denoiser_adj_16x1_sse2(
              sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0,
              k_delta, acc_diff);
          memcpy(running_avg_y, running_buffer[r], width);
          memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width,
                 width);
          // Update pointers for next iteration.
          running_avg_y += (avg_y_stride << 1);
        }
        // Re-check the net change after the weaker pass.
        sum_diff = sum_diff_16x1(acc_diff);
        if (abs(sum_diff) > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }
  return FILTER_BLOCK;
}
// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
// (BLOCK_16X8 is also routed here; see the r == 7 special case below.)
// Works in 16-pixel column strips, keeping one saturating int8 accumulator
// per strip per 16-row band so the per-lane sums cannot saturate before
// being flushed into the scalar sum_diff. Returns FILTER_BLOCK or
// COPY_BLOCK as in the small variant.
static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
                                     const uint8_t *mc_running_avg_y,
                                     int mc_avg_y_stride,
                                     uint8_t *running_avg_y, int avg_y_stride,
                                     int increase_denoising, BLOCK_SIZE bs,
                                     int motion_magnitude) {
  int sum_diff_thresh, r, c, sum_diff = 0;
  // Strengthen the filter slightly for low-motion blocks when requested.
  const int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  // acc_diff[strip][band]: up to 4 column strips x 4 row bands for 64x64.
  __m128i acc_diff[4][4];
  const __m128i k_0 = _mm_setzero_si128();
  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
  const __m128i k_8 = _mm_set1_epi8(8);
  const __m128i k_16 = _mm_set1_epi8(16);
  // Modify each level's adjustment according to motion_magnitude.
  const __m128i l3 = _mm_set1_epi8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
  // Difference between level 3 and level 2 is 2.
  const __m128i l32 = _mm_set1_epi8(2);
  // Difference between level 2 and level 1 is 1.
  const __m128i l21 = _mm_set1_epi8(1);
  const int b_width = (4 << b_width_log2_lookup[bs]);
  const int b_height = (4 << b_height_log2_lookup[bs]);
  // Number of 16-pixel column strips in the block.
  const int b_width_shift4 = b_width >> 4;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < b_width_shift4; ++c) {
      acc_diff[c][r] = _mm_setzero_si128();
    }
  }
  for (r = 0; r < b_height; ++r) {
    for (c = 0; c < b_width_shift4; ++c) {
      acc_diff[c][r >> 4] = vp9_denoiser_16x1_sse2(
          sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3,
          &l32, &l21, acc_diff[c][r >> 4]);
      // Update pointers for next iteration.
      sig += 16;
      mc_running_avg_y += 16;
      running_avg_y += 16;
    }
    // Flush each strip's accumulator into sum_diff at the end of every
    // 16-row band (or after row 7 for the 8-row BLOCK_16X8 case).
    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
      for (c = 0; c < b_width_shift4; ++c) {
        sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
      }
    }
    // Update pointers for next iteration: step back b_width and down a row.
    sig = sig - b_width + sig_stride;
    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
    running_avg_y = running_avg_y - b_width + avg_y_stride;
  }
  {
    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
    if (abs(sum_diff) > sum_diff_thresh) {
      // First pass over-adjusted: try a weaker correction capped by delta,
      // sized from the per-pixel excess over the threshold.
      const int delta =
          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        // Rewind all three planes to the top of the block.
        sig -= sig_stride * b_height;
        mc_running_avg_y -= mc_avg_y_stride * b_height;
        running_avg_y -= avg_y_stride * b_height;
        // acc_diff is intentionally NOT cleared: the adj pass subtracts its
        // correction from the pass-1 accumulators so the re-flushed sum_diff
        // reflects the net change of both passes.
        sum_diff = 0;
        for (r = 0; r < b_height; ++r) {
          for (c = 0; c < b_width_shift4; ++c) {
            acc_diff[c][r >> 4] =
                vp9_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y,
                                           k_0, k_delta, acc_diff[c][r >> 4]);
            // Update pointers for next iteration.
            sig += 16;
            mc_running_avg_y += 16;
            running_avg_y += 16;
          }
          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
            for (c = 0; c < b_width_shift4; ++c) {
              sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
            }
          }
          sig = sig - b_width + sig_stride;
          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
          running_avg_y = running_avg_y - b_width + avg_y_stride;
        }
        if (abs(sum_diff) > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }
  return FILTER_BLOCK;
}
  285. int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
  286. const uint8_t *mc_avg, int mc_avg_stride,
  287. uint8_t *avg, int avg_stride,
  288. int increase_denoising, BLOCK_SIZE bs,
  289. int motion_magnitude) {
  290. // Rank by frequency of the block type to have an early termination.
  291. if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
  292. bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
  293. bs == BLOCK_32X64 || bs == BLOCK_64X32) {
  294. return vp9_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
  295. avg, avg_stride, increase_denoising, bs,
  296. motion_magnitude);
  297. } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
  298. return vp9_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
  299. avg, avg_stride, increase_denoising, bs,
  300. motion_magnitude, 8);
  301. } else {
  302. return COPY_BLOCK;
  303. }
  304. }