// vp9_denoiser_sse2.c
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <emmintrin.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/encoder/vp9_context_tree.h"
#include "vp9/encoder/vp9_denoiser.h"
#include "vpx_mem/vpx_mem.h"
  18. // Compute the sum of all pixel differences of this MB.
  19. static INLINE int sum_diff_16x1(__m128i acc_diff) {
  20. const __m128i k_1 = _mm_set1_epi16(1);
  21. const __m128i acc_diff_lo =
  22. _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  23. const __m128i acc_diff_hi =
  24. _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  25. const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
  26. const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
  27. const __m128i hgfe_dcba =
  28. _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
  29. const __m128i hgfedcba =
  30. _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
  31. return _mm_cvtsi128_si32(hgfedcba);
  32. }
// Denoise a 16x1 vector: move each running-average pixel toward the
// source pixel by an amount that grows with the absolute difference,
// store the filtered row to |running_avg_y|, and return |acc_diff|
// updated with the net signed adjustment applied per lane.  The k_* and
// l* constants are precomputed once by the NxM drivers and passed by
// pointer so this kernel does no per-row setup.
static INLINE __m128i vp9_denoiser_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
    const __m128i *k_16, const __m128i *l3, const __m128i *l32,
    const __m128i *l21, __m128i acc_diff) {
  // Calculate differences.
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  __m128i v_running_avg_y;
  // Saturating subtraction in both directions: per lane, exactly one of
  // pdiff/ndiff holds the absolute difference and the other is zero.
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative (pdiff saturated to zero).
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
  // Clamp absolute difference to 16 to be used to get mask. Doing this
  // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
  const __m128i clamped_absdiff =
      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
  // Get masks for l2 l1 and l0 adjustments: FF where the clamped
  // absolute difference is below 16, 8 and 4 (+shift_inc) respectively.
  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
  // Get adjustments for l2, l1, and l0.
  __m128i adj2 = _mm_and_si128(mask2, *l32);
  const __m128i adj1 = _mm_and_si128(mask1, *l21);
  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
  __m128i adj, padj, nadj;
  // Combine the adjustments and get absolute adjustments.  Per lane the
  // result is: l3 when absdiff >= 16, l3-2 for absdiff in [8,16),
  // l3-3 for absdiff in [4,8), and the raw absdiff itself below the k_4
  // threshold (masked in via adj0).
  adj2 = _mm_add_epi8(adj2, adj1);
  adj = _mm_sub_epi8(*l3, adj2);
  adj = _mm_andnot_si128(mask0, adj);
  adj = _mm_or_si128(adj, adj0);
  // Restore the sign and get positive and negative adjustments.
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);
  // Calculate filtered value (saturating, so lanes stay in [0,255]).
  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
  // Adjustments <=7, and each element in acc_diff can fit in signed
  // char.
  acc_diff = _mm_adds_epi8(acc_diff, padj);
  acc_diff = _mm_subs_epi8(acc_diff, nadj);
  return acc_diff;
}
  79. // Denoise a 16x1 vector with a weaker filter.
  80. static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
  81. const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
  82. const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
  83. __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
  84. // Calculate differences.
  85. const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  86. const __m128i v_mc_running_avg_y =
  87. _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  88. const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  89. const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  90. // Obtain the sign. FF if diff is negative.
  91. const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
  92. // Clamp absolute difference to delta to get the adjustment.
  93. const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
  94. // Restore the sign and get positive and negative adjustments.
  95. __m128i padj, nadj;
  96. padj = _mm_andnot_si128(diff_sign, adj);
  97. nadj = _mm_and_si128(diff_sign, adj);
  98. // Calculate filtered value.
  99. v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
  100. v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
  101. _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
  102. // Accumulate the adjustments.
  103. acc_diff = _mm_subs_epi8(acc_diff, padj);
  104. acc_diff = _mm_adds_epi8(acc_diff, nadj);
  105. return acc_diff;
  106. }
// Denoise 8x8 and 8x16 blocks.  Rows are only 8 pixels wide, so each
// loop iteration packs two consecutive 8-pixel rows side by side into a
// 16-byte stack buffer, letting the full-width 16x1 kernel be reused.
// Returns FILTER_BLOCK if the filtered result was written out, or
// COPY_BLOCK if the caller should copy the source block unfiltered.
static int vp9_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride,
                                       const uint8_t *mc_running_avg_y,
                                       int mc_avg_y_stride,
                                       uint8_t *running_avg_y, int avg_y_stride,
                                       int increase_denoising, BLOCK_SIZE bs,
                                       int motion_magnitude, int width) {
  int sum_diff_thresh, r, sum_diff = 0;
  // Strengthen the filter by one level when denoising is boosted and the
  // block is not moving fast.
  const int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  // Row-pair staging buffers; 8 pairs covers the tallest case (8x16).
  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
  __m128i acc_diff = _mm_setzero_si128();
  const __m128i k_0 = _mm_setzero_si128();
  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
  const __m128i k_8 = _mm_set1_epi8(8);
  const __m128i k_16 = _mm_set1_epi8(16);
  // Modify each level's adjustment according to motion_magnitude.
  const __m128i l3 = _mm_set1_epi8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
  // Difference between level 3 and level 2 is 2.
  const __m128i l32 = _mm_set1_epi8(2);
  // Difference between level 2 and level 1 is 1.
  const __m128i l21 = _mm_set1_epi8(1);
  // Halved because each iteration consumes two rows.
  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
  for (r = 0; r < b_height; ++r) {
    // Pack rows 2r and 2r+1 of each plane into one 16-byte buffer.
    memcpy(sig_buffer[r], sig, width);
    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
    memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
           width);
    memcpy(running_buffer[r], running_avg_y, width);
    memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
    acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r],
                                      running_buffer[r], &k_0, &k_4, &k_8,
                                      &k_16, &l3, &l32, &l21, acc_diff);
    // Unpack the filtered pair back into the destination rows.
    memcpy(running_avg_y, running_buffer[r], width);
    memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
    // Update pointers for next iteration.
    sig += (sig_stride << 1);
    mc_running_avg_y += (mc_avg_y_stride << 1);
    running_avg_y += (avg_y_stride << 1);
  }
  {
    sum_diff = sum_diff_16x1(acc_diff);
    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
    if (abs(sum_diff) > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg_y to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.
      // The delta is set by the excess of absolute pixel diff over the
      // threshold.
      const int delta =
          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        // Rewind to the top of the block (each iteration advanced 2 rows).
        running_avg_y -= avg_y_stride * (b_height << 1);
        for (r = 0; r < b_height; ++r) {
          // The staging buffers still hold the first pass's data, so the
          // weaker filter can run on them directly before copying out.
          acc_diff = vp9_denoiser_adj_16x1_sse2(
              sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0,
              k_delta, acc_diff);
          memcpy(running_avg_y, running_buffer[r], width);
          memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width,
                 width);
          // Update pointers for next iteration.
          running_avg_y += (avg_y_stride << 1);
        }
        sum_diff = sum_diff_16x1(acc_diff);
        if (abs(sum_diff) > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }
  return FILTER_BLOCK;
}
// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks
// (BLOCK_16X8 is also routed here by the dispatcher).  Works directly on
// the frame buffers, 16 pixels at a time.  Returns FILTER_BLOCK if the
// filtered result was written out, COPY_BLOCK otherwise.
static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
                                     const uint8_t *mc_running_avg_y,
                                     int mc_avg_y_stride,
                                     uint8_t *running_avg_y, int avg_y_stride,
                                     int increase_denoising, BLOCK_SIZE bs,
                                     int motion_magnitude) {
  int sum_diff_thresh, r, c, sum_diff = 0;
  // Strengthen the filter by one level when denoising is boosted and the
  // block is not moving fast.
  const int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  // One accumulator per 16-pixel column strip and per 16-row band,
  // indexed acc_diff[c][r >> 4]; 4x4 covers up to 64x64.
  __m128i acc_diff[4][4];
  const __m128i k_0 = _mm_setzero_si128();
  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
  const __m128i k_8 = _mm_set1_epi8(8);
  const __m128i k_16 = _mm_set1_epi8(16);
  // Modify each level's adjustment according to motion_magnitude.
  const __m128i l3 = _mm_set1_epi8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
  // Difference between level 3 and level 2 is 2.
  const __m128i l32 = _mm_set1_epi8(2);
  // Difference between level 2 and level 1 is 1.
  const __m128i l21 = _mm_set1_epi8(1);
  const int b_width = (4 << b_width_log2_lookup[bs]);
  const int b_height = (4 << b_height_log2_lookup[bs]);
  const int b_width_shift4 = b_width >> 4;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < b_width_shift4; ++c) {
      acc_diff[c][r] = _mm_setzero_si128();
    }
  }
  for (r = 0; r < b_height; ++r) {
    for (c = 0; c < b_width_shift4; ++c) {
      acc_diff[c][r >> 4] = vp9_denoiser_16x1_sse2(
          sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3,
          &l32, &l21, acc_diff[c][r >> 4]);
      // Update pointers for next iteration.
      sig += 16;
      mc_running_avg_y += 16;
      running_avg_y += 16;
    }
    // Flush accumulators into sum_diff after each 16-row band (their
    // lanes are saturating signed bytes, so they cannot absorb more than
    // 16 rows safely).  BLOCK_16X8 is only 8 rows tall, so flush at its
    // last row instead.
    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
      for (c = 0; c < b_width_shift4; ++c) {
        sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
      }
    }
    // Update pointers for next iteration: step back to the row start,
    // then down one row.
    sig = sig - b_width + sig_stride;
    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
    running_avg_y = running_avg_y - b_width + avg_y_stride;
  }
  {
    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
    if (abs(sum_diff) > sum_diff_thresh) {
      // The strong filter moved too far from the source; try a weaker,
      // delta-capped second pass that pulls running_avg_y back toward
      // sig.  The delta is the per-pixel excess over the threshold.
      const int delta =
          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        // Rewind all planes to the top of the block and re-walk it.
        sig -= sig_stride * b_height;
        mc_running_avg_y -= mc_avg_y_stride * b_height;
        running_avg_y -= avg_y_stride * b_height;
        sum_diff = 0;
        for (r = 0; r < b_height; ++r) {
          for (c = 0; c < b_width_shift4; ++c) {
            acc_diff[c][r >> 4] =
                vp9_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y,
                                           k_0, k_delta, acc_diff[c][r >> 4]);
            // Update pointers for next iteration.
            sig += 16;
            mc_running_avg_y += 16;
            running_avg_y += 16;
          }
          // Same band flush as in the first pass.
          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
            for (c = 0; c < b_width_shift4; ++c) {
              sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
            }
          }
          sig = sig - b_width + sig_stride;
          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
          running_avg_y = running_avg_y - b_width + avg_y_stride;
        }
        if (abs(sum_diff) > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }
  return FILTER_BLOCK;
}
  284. int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
  285. const uint8_t *mc_avg, int mc_avg_stride,
  286. uint8_t *avg, int avg_stride,
  287. int increase_denoising, BLOCK_SIZE bs,
  288. int motion_magnitude) {
  289. // Rank by frequency of the block type to have an early termination.
  290. if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
  291. bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
  292. bs == BLOCK_32X64 || bs == BLOCK_64X32) {
  293. return vp9_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
  294. avg, avg_stride, increase_denoising, bs,
  295. motion_magnitude);
  296. } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
  297. return vp9_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
  298. avg, avg_stride, increase_denoising, bs,
  299. motion_magnitude, 8);
  300. } else {
  301. return COPY_BLOCK;
  302. }
  303. }