post_proc_sse2.c 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. /*
  2. * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <assert.h>
  11. #include <emmintrin.h>
  12. #include <stdio.h>
  13. #include "./vpx_dsp_rtcd.h"
  14. #include "vpx/vpx_integer.h"
  15. #include "vpx_dsp/x86/mem_sse2.h"
  16. extern const int16_t vpx_rv[];
  17. void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
  18. int cols, int flimit) {
  19. int col;
  20. const __m128i zero = _mm_setzero_si128();
  21. const __m128i f = _mm_set1_epi32(flimit);
  22. DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);
  23. // 8 columns are processed at a time.
  24. // If rows is less than 8 the bottom border extension fails.
  25. assert(cols % 8 == 0);
  26. assert(rows >= 8);
  27. for (col = 0; col < cols; col += 8) {
  28. int row, i;
  29. __m128i s = _mm_loadl_epi64((__m128i *)dst);
  30. __m128i sum, sumsq_0, sumsq_1;
  31. __m128i tmp_0, tmp_1;
  32. __m128i below_context;
  33. s = _mm_unpacklo_epi8(s, zero);
  34. for (i = 0; i < 8; ++i) {
  35. _mm_store_si128((__m128i *)above_context + i, s);
  36. }
  37. // sum *= 9
  38. sum = _mm_slli_epi16(s, 3);
  39. sum = _mm_add_epi16(s, sum);
  40. // sum^2 * 9 == (sum * 9) * sum
  41. tmp_0 = _mm_mullo_epi16(sum, s);
  42. tmp_1 = _mm_mulhi_epi16(sum, s);
  43. sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
  44. sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);
  45. // Prime sum/sumsq
  46. for (i = 1; i <= 6; ++i) {
  47. __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
  48. a = _mm_unpacklo_epi8(a, zero);
  49. sum = _mm_add_epi16(sum, a);
  50. a = _mm_mullo_epi16(a, a);
  51. sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
  52. sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
  53. }
  54. for (row = 0; row < rows + 8; row++) {
  55. const __m128i above =
  56. _mm_load_si128((__m128i *)above_context + (row & 7));
  57. __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
  58. __m128i above_sq, below_sq;
  59. __m128i mask_0, mask_1;
  60. __m128i multmp_0, multmp_1;
  61. __m128i rv;
  62. __m128i out;
  63. this_row = _mm_unpacklo_epi8(this_row, zero);
  64. if (row + 7 < rows) {
  65. // Instead of copying the end context we just stop loading when we get
  66. // to the last one.
  67. below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
  68. below_context = _mm_unpacklo_epi8(below_context, zero);
  69. }
  70. sum = _mm_sub_epi16(sum, above);
  71. sum = _mm_add_epi16(sum, below_context);
  72. // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
  73. // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
  74. // because x86 does not have unpack with sign extension.
  75. above_sq = _mm_mullo_epi16(above, above);
  76. sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
  77. sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));
  78. below_sq = _mm_mullo_epi16(below_context, below_context);
  79. sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
  80. sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));
  81. // sumsq * 16 - sumsq == sumsq * 15
  82. mask_0 = _mm_slli_epi32(sumsq_0, 4);
  83. mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
  84. mask_1 = _mm_slli_epi32(sumsq_1, 4);
  85. mask_1 = _mm_sub_epi32(mask_1, sumsq_1);
  86. multmp_0 = _mm_mullo_epi16(sum, sum);
  87. multmp_1 = _mm_mulhi_epi16(sum, sum);
  88. mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
  89. mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));
  90. // mask - f gives a negative value when mask < f
  91. mask_0 = _mm_sub_epi32(mask_0, f);
  92. mask_1 = _mm_sub_epi32(mask_1, f);
  93. // Shift the sign bit down to create a mask
  94. mask_0 = _mm_srai_epi32(mask_0, 31);
  95. mask_1 = _mm_srai_epi32(mask_1, 31);
  96. mask_0 = _mm_packs_epi32(mask_0, mask_1);
  97. rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));
  98. mask_1 = _mm_add_epi16(rv, sum);
  99. mask_1 = _mm_add_epi16(mask_1, this_row);
  100. mask_1 = _mm_srai_epi16(mask_1, 4);
  101. mask_1 = _mm_and_si128(mask_0, mask_1);
  102. mask_0 = _mm_andnot_si128(mask_0, this_row);
  103. out = _mm_or_si128(mask_1, mask_0);
  104. _mm_storel_epi64((__m128i *)(dst + row * pitch),
  105. _mm_packus_epi16(out, zero));
  106. _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
  107. }
  108. dst += 8;
  109. }
  110. }