avg_intrin_sse2.c

/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_ports/mem.h"
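
// Returns, through *min and *max, the smallest and largest absolute
// difference between corresponding pixels of an 8x8 source block (s, stride
// p) and an 8x8 prediction block (d, stride dp). A scalar sketch of the same
// computation (illustrative only):
//
//   *min = 255; *max = 0;
//   for (r = 0; r < 8; ++r)
//     for (c = 0; c < 8; ++c) {
//       const int a = abs(s[r * p + c] - d[r * dp + c]);
//       if (a < *min) *min = a;
//       if (a > *max) *max = a;
//     }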
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0 = _mm_setzero_si128();
  // Row 0
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff0 = _mm_max_epi16(diff, negdiff);
  // Row 1
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
  // Row 2
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 3
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 4
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 5
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 6
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 7
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);

  // Reduce the eight per-lane maxima to a single value.
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  // Reduce the eight per-lane minima to a single value.
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}
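
// Rounded average of the 64 pixels in an 8x8 block: sum the rows with
// saturating 16-bit adds, reduce across lanes, then divide with rounding.
// Roughly equivalent scalar code (illustrative only):
//
//   unsigned int sum = 0;
//   for (r = 0; r < 8; ++r)
//     for (c = 0; c < 8; ++c) sum += s[r * p + c];
//   return (sum + 32) >> 6;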
unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}

// Rounded average of the 16 pixels in a 4x4 block.
unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 8) >> 4;
}
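
// High bit-depth variants: the uint8_t pointer actually addresses 16-bit
// pixels (see CONVERT_TO_SHORTPTR below), so rows are loaded as whole 16-bit
// vectors instead of being unpacked from bytes.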
#if CONFIG_VP9_HIGHBITDEPTH
unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) {
  __m128i s0, s1;
  unsigned int avg;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  const __m128i zero = _mm_setzero_si128();
  s0 = _mm_loadu_si128((const __m128i *)(s));
  s1 = _mm_loadu_si128((const __m128i *)(s + p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 2 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 3 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 4 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 5 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 6 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 7 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpackhi_epi16(s0, zero);
  s0 = _mm_unpacklo_epi16(s0, zero);
  s0 = _mm_add_epi32(s0, s1);
  s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4));
  avg = _mm_cvtsi128_si32(s0);
  return (avg + 32) >> 6;
}

unsigned int vpx_highbd_avg_4x4_sse2(const uint8_t *s8, int p) {
  __m128i s0, s1;
  unsigned int avg;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  s0 = _mm_loadl_epi64((const __m128i *)(s));
  s1 = _mm_loadl_epi64((const __m128i *)(s + p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadl_epi64((const __m128i *)(s + 2 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadl_epi64((const __m128i *)(s + 3 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 2));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 8) >> 4;
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
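
// One pass of an 8-point Hadamard transform applied across in[0..7], i.e.
// to each of the eight 16-bit lanes in parallel (three butterfly stages).
// When iter == 0 the result is also transposed so that a second call with
// iter == 1 transforms the other dimension of the 8x8 block.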
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}
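
// 8x8 Hadamard transform of a block of residual differences: the eight rows
// are loaded, transformed along one dimension (with a transpose), then along
// the other. With is_final set, the result is written as tran_low_t via
// store_tran_low(); otherwise it is kept as raw int16_t for an intermediate
// stage of the larger transforms below.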
static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
                                     ptrdiff_t src_stride, tran_low_t *coeff,
                                     int is_final) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  (void)src_diff;

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  if (is_final) {
    store_tran_low(src[0], coeff);
    coeff += 8;
    store_tran_low(src[1], coeff);
    coeff += 8;
    store_tran_low(src[2], coeff);
    coeff += 8;
    store_tran_low(src[3], coeff);
    coeff += 8;
    store_tran_low(src[4], coeff);
    coeff += 8;
    store_tran_low(src[5], coeff);
    coeff += 8;
    store_tran_low(src[6], coeff);
    coeff += 8;
    store_tran_low(src[7], coeff);
  } else {
    int16_t *coeff16 = (int16_t *)coeff;
    _mm_store_si128((__m128i *)coeff16, src[0]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[1]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[2]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[3]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[4]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[5]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[6]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[7]);
  }
}

void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
}
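
// 16x16 Hadamard: apply the 8x8 transform to the four quadrants of the
// block, then combine the partial results with one more butterfly stage.
// The >> 1 halves the combined values so they stay within int16_t range.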
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low(coeff0, coeff);
      store_tran_low(coeff1, coeff + 64);
      store_tran_low(coeff2, coeff + 128);
      store_tran_low(coeff3, coeff + 192);
      coeff += 8;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }
    t_coeff += 8;
  }
}

void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
}

// 32x32 Hadamard: four 16x16 transforms combined with one more butterfly
// stage; here the combined values are shifted down by 2.
void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 2);
    b1 = _mm_srai_epi16(b1, 2);
    b2 = _mm_srai_epi16(b2, 2);
    b3 = _mm_srai_epi16(b3, 2);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 512);
    store_tran_low(coeff3, coeff + 768);

    coeff += 8;
    t_coeff += 8;
  }
}
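
// Sum of absolute values of the transform coefficients. Scalar equivalent
// (illustrative only):
//
//   int satd = 0;
//   for (i = 0; i < length; ++i) satd += abs(coeff[i]);
//   return satd;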
int vpx_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 8) {
    const __m128i src_line = load_tran_low(coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}
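
// Integral projection of a width-16 column strip: for each of the 16
// columns, sum the pixels over height rows and scale the sums down by an
// amount that depends on the block height; the 16 scaled sums are written
// to hbuf.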
void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
                          const int ref_stride, const int height) {
  int idx;
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
  __m128i t0, t1;
  int height_1 = height - 1;
  ref += ref_stride;

  for (idx = 1; idx < height_1; idx += 2) {
    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;

    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;
  }

  src_line = _mm_loadu_si128((const __m128i *)ref);
  t0 = _mm_unpacklo_epi8(src_line, zero);
  t1 = _mm_unpackhi_epi8(src_line, zero);
  s0 = _mm_adds_epu16(s0, t0);
  s1 = _mm_adds_epu16(s1, t1);

  if (height == 64) {
    s0 = _mm_srai_epi16(s0, 5);
    s1 = _mm_srai_epi16(s1, 5);
  } else if (height == 32) {
    s0 = _mm_srai_epi16(s0, 4);
    s1 = _mm_srai_epi16(s1, 4);
  } else {
    s0 = _mm_srai_epi16(s0, 3);
    s1 = _mm_srai_epi16(s1, 3);
  }

  _mm_storeu_si128((__m128i *)hbuf, s0);
  hbuf += 8;
  _mm_storeu_si128((__m128i *)hbuf, s1);
}
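
// Integral projection of a single row: sums all width pixels of the row,
// processed 16 at a time with SAD against zero, and returns the total.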
int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width) {
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_load_si128((const __m128i *)ref);
  __m128i s0 = _mm_sad_epu8(src_line, zero);
  __m128i s1;
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    src_line = _mm_load_si128((const __m128i *)ref);
    s1 = _mm_sad_epu8(src_line, zero);
    s0 = _mm_adds_epu16(s0, s1);
  }

  s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_adds_epu16(s0, s1);

  return _mm_extract_epi16(s0, 0);
}
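
// Variance-style measure over two vectors of width = 4 << bwl 16-bit
// elements: returns sum((ref - src)^2) - (sum(ref - src))^2 / width,
// i.e. width times the sample variance of the element-wise difference.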
int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl) {
  int idx;
  int width = 4 << bwl;
  int16_t mean;
  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
  __m128i v1 = _mm_load_si128((const __m128i *)src);
  __m128i diff = _mm_subs_epi16(v0, v1);
  __m128i sum = diff;
  __m128i sse = _mm_madd_epi16(diff, diff);

  ref += 8;
  src += 8;

  for (idx = 8; idx < width; idx += 8) {
    v0 = _mm_loadu_si128((const __m128i *)ref);
    v1 = _mm_load_si128((const __m128i *)src);
    diff = _mm_subs_epi16(v0, v1);

    sum = _mm_add_epi16(sum, diff);
    v0 = _mm_madd_epi16(diff, diff);
    sse = _mm_add_epi32(sse, v0);

    ref += 8;
    src += 8;
  }

  // Reduce the eight 16-bit partial sums of the differences to one lane.
  v0 = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, v0);

  // Reduce the four 32-bit partial sums of squares to one lane.
  v1 = _mm_srli_si128(sse, 8);
  sse = _mm_add_epi32(sse, v1);
  v1 = _mm_srli_epi64(sse, 32);
  sse = _mm_add_epi32(sse, v1);

  mean = _mm_extract_epi16(sum, 0);
  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}