convolve_sse2.h 3.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. /*
  2. * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
  11. #define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
  12. #include <emmintrin.h> // SSE2
  13. #include "./vpx_config.h"
  14. // Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns
  15. // values at index 2 and 3 to return 3 2 3 2 3 2 3 2 as 16-bit words
  16. static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) {
  17. __m128i tmp = _mm_unpacklo_epi32(*reg, *reg);
  18. return _mm_unpackhi_epi64(tmp, tmp);
  19. }
  20. // Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns
  21. // values at index 2 and 3 to return 5 4 5 4 5 4 5 4 as 16-bit words.
  22. static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) {
  23. __m128i tmp = _mm_unpackhi_epi32(*reg, *reg);
  24. return _mm_unpacklo_epi64(tmp, tmp);
  25. }
  26. // Interprets src as 8-bit words, zero extends to form 16-bit words, then
  27. // multiplies with ker and add the adjacent results to form 32-bit words.
  28. // Finally adds the result from 1 and 2 together.
  29. static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1,
  30. const __m128i *const src_2,
  31. const __m128i *const ker_1,
  32. const __m128i *const ker_2) {
  33. const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128());
  34. const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128());
  35. const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1);
  36. const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2);
  37. return _mm_add_epi32(madd_1, madd_2);
  38. }
  39. // Interprets src as 16-bit words, then multiplies with ker and add the
  40. // adjacent results to form 32-bit words. Finally adds the result from 1 and 2
  41. // together.
  42. static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1,
  43. const __m128i *const src_2,
  44. const __m128i *const ker_1,
  45. const __m128i *const ker_2) {
  46. const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1);
  47. const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2);
  48. return _mm_add_epi32(madd_1, madd_2);
  49. }
  50. static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0,
  51. const __m128i *const src_1,
  52. const __m128i *const ker) {
  53. const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker);
  54. const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker);
  55. return _mm_packs_epi32(madd_1, madd_2);
  56. }
  57. // Interleaves src_1 and src_2
  58. static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1,
  59. const __m128i *const src_2) {
  60. const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2);
  61. const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2);
  62. return _mm_packs_epi32(tmp_1, tmp_2);
  63. }
  64. static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src,
  65. const __m128i *const half_depth,
  66. const int depth) {
  67. const __m128i nearest_src = _mm_add_epi32(*src, *half_depth);
  68. return _mm_srai_epi32(nearest_src, depth);
  69. }
  70. static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src,
  71. const __m128i *const half_depth,
  72. const int depth) {
  73. const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth);
  74. return _mm_srai_epi16(nearest_src, depth);
  75. }
  76. #endif // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_