mem_sse2.h

/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_X86_MEM_SSE2_H_
#define VPX_VPX_DSP_X86_MEM_SSE2_H_

#include <emmintrin.h>  // SSE2
#include <stdint.h>     // uint32_t
#include <string.h>     // memcpy()

#include "./vpx_config.h"

static INLINE void storeu_uint32(void *dst, uint32_t v) {
  memcpy(dst, &v, sizeof(v));
}

static INLINE uint32_t loadu_uint32(const void *src) {
  uint32_t v;
  memcpy(&v, src, sizeof(v));
  return v;
}

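// Note: a fixed-size memcpy() is the portable idiom for unaligned scalar
// access; compilers reduce it to a single mov rather than a library call,
// and it avoids the undefined behavior of dereferencing a misaligned
// pointer. A hypothetical caller copying four bytes between arbitrarily
// aligned buffers might write:
//   storeu_uint32(dst, loadu_uint32(src));
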
static INLINE __m128i load_unaligned_u32(const void *a) {
  uint32_t val;
  memcpy(&val, a, sizeof(val));
  return _mm_cvtsi32_si128((int)val);
}

static INLINE void store_unaligned_u32(void *const a, const __m128i v) {
  const uint32_t val = (uint32_t)_mm_cvtsi128_si32(v);
  memcpy(a, &val, sizeof(val));
}

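// load_unaligned_u32() places the four bytes in the low 32-bit lane and
// zeroes the upper 96 bits; store_unaligned_u32() writes the low lane back
// out. For example, a 4-byte row could round-trip through a register as:
//   store_unaligned_u32(dst, load_unaligned_u32(src));
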
#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)

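// mm_storelu()/mm_storehu() spill the low/high 8 bytes of a vector through
// memcpy(). Because the macros take the address of (v), the argument must
// be addressable; passing an intrinsic's result directly would not compile.
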
static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) {
  return _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
}

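// loadh_epi64() fills the upper 64 bits of s from src (MOVHPS) while
// preserving the lower 64 bits; SSE2 lacks an integer form of this load,
// hence the float casts. A caller might pack two 8-byte rows into one
// register roughly as follows (row0/row1 are hypothetical pointers):
//   __m128i r = _mm_loadl_epi64((const __m128i *)row0);
//   r = loadh_epi64(r, row1);
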
static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
  d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
  d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
  d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
}

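// load_8bit_4x4() gathers four 4-byte rows into the low lanes of d[0..3].
// The (const int *) casts assume misaligned 32-bit reads are acceptable,
// which holds on x86; loadu_uint32() above is the strictly conforming
// alternative.
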
static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  load_8bit_4x4(s + 0 * stride, stride, &d[0]);
  load_8bit_4x4(s + 4 * stride, stride, &d[4]);
}

static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
  d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
  d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
  d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
}

static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  load_8bit_8x4(s + 0 * stride, stride, &d[0]);
  load_8bit_8x4(s + 4 * stride, stride, &d[4]);
}

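// The 8xN loaders use MOVQ (_mm_loadl_epi64), which carries no alignment
// requirement, so s may point anywhere; load_8bit_8x8() simply issues two
// 8x4 loads four rows apart.
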
static INLINE void load_8bit_16x8(const uint8_t *const s,
                                  const ptrdiff_t stride, __m128i *const d) {
  d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
  d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
  d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
  d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
  d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
  d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
  d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
  d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
}

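// Unlike the helpers above, load_8bit_16x8() uses aligned loads: every
// (s + i * stride) address must be 16-byte aligned or the load faults.
// When alignment is not guaranteed, use the loadu_ variants below.
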
static INLINE void loadu_8bit_16x4(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
  d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
  d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
  d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
}

static INLINE void loadu_8bit_16x8(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
  loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
}

static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
}

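// Despite the _mm_ prefix, _mm_storeh_epi64() is a local helper rather
// than a compiler intrinsic: it stores the upper 64 bits of s (MOVHPS),
// mirroring _mm_storel_epi64(), which stores the lower 64 bits.
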
static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
  *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
  *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
  *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
}

static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
                                       const ptrdiff_t stride) {
  __m128i ss[4];
  ss[0] = s;
  ss[1] = _mm_srli_si128(s, 4);
  ss[2] = _mm_srli_si128(s, 8);
  ss[3] = _mm_srli_si128(s, 12);
  store_8bit_4x4(ss, d, stride);
}

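// store_8bit_4x4_sse2() scatters a single register holding four packed
// 4-byte rows (bytes 0-3 = row 0, ..., bytes 12-15 = row 3) by shifting
// each successive row down to the low lane. For example, a 4x4 block
// packed into one register (here called block) could be written out with:
//   store_8bit_4x4_sse2(block, dst, stride);
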
static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
                                            uint8_t *const d,
                                            const ptrdiff_t stride) {
  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
  _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
  _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
}

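// store_8bit_8x4_from_16x2() writes an 8x4 block held two rows per
// register: the low half of s[0] is row 0 and its high half row 1, and
// likewise rows 2 and 3 come from s[1].
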
static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
  _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
  _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
  _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
  _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
  _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
  _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
}

static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
                                    const ptrdiff_t stride) {
  _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
  _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
  _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
  _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
}

#endif  // VPX_VPX_DSP_X86_MEM_SSE2_H_