fwd_txfm_sse2.h 16 KB


  1. /*
  2. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #ifndef VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
  11. #define VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
  12. #ifdef __cplusplus
  13. extern "C" {
  14. #endif
  // Builds a vector of four 32-bit lanes that repeats the pair (even, odd):
  // lanes 0 and 2 receive (int)even, lanes 1 and 3 receive (int)odd.
  // _mm_set_epi32 takes its arguments from the highest lane down to lane 0.
  #define pair_set_epi32(even, odd) \
    _mm_set_epi32((int)(odd), (int)(even), (int)(odd), (int)(even))
  17. static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
  18. __m128i buf0, buf1;
  19. buf0 = _mm_mul_epu32(a, b);
  20. a = _mm_srli_epi64(a, 32);
  21. b = _mm_srli_epi64(b, 32);
  22. buf1 = _mm_mul_epu32(a, b);
  23. return _mm_add_epi64(buf0, buf1);
  24. }
  25. static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
  26. __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
  27. __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
  28. return _mm_unpacklo_epi64(buf0, buf1);
  29. }
  30. static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
  31. const __m128i *preg1) {
  32. const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  33. const __m128i min_overflow = _mm_set1_epi16(0x8000);
  34. __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
  35. _mm_cmpeq_epi16(*preg0, min_overflow));
  36. __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
  37. _mm_cmpeq_epi16(*preg1, min_overflow));
  38. cmp0 = _mm_or_si128(cmp0, cmp1);
  39. return _mm_movemask_epi8(cmp0);
  40. }
  41. static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
  42. const __m128i *preg1,
  43. const __m128i *preg2,
  44. const __m128i *preg3) {
  45. const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  46. const __m128i min_overflow = _mm_set1_epi16(0x8000);
  47. __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
  48. _mm_cmpeq_epi16(*preg0, min_overflow));
  49. __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
  50. _mm_cmpeq_epi16(*preg1, min_overflow));
  51. __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
  52. _mm_cmpeq_epi16(*preg2, min_overflow));
  53. __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
  54. _mm_cmpeq_epi16(*preg3, min_overflow));
  55. cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
  56. return _mm_movemask_epi8(cmp0);
  57. }
  58. static INLINE int check_epi16_overflow_x8(
  59. const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
  60. const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
  61. const __m128i *preg6, const __m128i *preg7) {
  62. int res0, res1;
  63. res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  64. res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  65. return res0 + res1;
  66. }
  67. static INLINE int check_epi16_overflow_x12(
  68. const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
  69. const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
  70. const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
  71. const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
  72. int res0, res1;
  73. res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  74. res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  75. if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
  76. return res0 + res1;
  77. }
  78. static INLINE int check_epi16_overflow_x16(
  79. const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
  80. const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
  81. const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
  82. const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
  83. const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
  84. const __m128i *preg15) {
  85. int res0, res1;
  86. res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  87. res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  88. if (!res0) {
  89. res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
  90. if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
  91. }
  92. return res0 + res1;
  93. }
  94. static INLINE int check_epi16_overflow_x32(
  95. const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
  96. const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
  97. const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
  98. const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
  99. const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
  100. const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
  101. const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
  102. const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
  103. const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
  104. const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
  105. const __m128i *preg30, const __m128i *preg31) {
  106. int res0, res1;
  107. res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  108. res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  109. if (!res0) {
  110. res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
  111. if (!res1) {
  112. res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
  113. if (!res0) {
  114. res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
  115. if (!res1) {
  116. res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
  117. if (!res0) {
  118. res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
  119. if (!res1)
  120. res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
  121. }
  122. }
  123. }
  124. }
  125. }
  126. return res0 + res1;
  127. }
  128. static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
  129. const __m128i *preg1,
  130. const __m128i *preg2,
  131. const __m128i *preg3,
  132. const __m128i *zero) {
  133. __m128i minus_one = _mm_set1_epi32(-1);
  134. // Check for overflows
  135. __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
  136. __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
  137. __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
  138. __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
  139. __m128i reg0_top_dwords =
  140. _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  141. __m128i reg1_top_dwords =
  142. _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  143. __m128i reg2_top_dwords =
  144. _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  145. __m128i reg3_top_dwords =
  146. _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  147. __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
  148. __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
  149. __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
  150. __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
  151. __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
  152. __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
  153. int overflow_01 =
  154. _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01));
  155. int overflow_23 =
  156. _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23));
  157. return (overflow_01 + overflow_23);
  158. }
  159. static INLINE int k_check_epi32_overflow_8(
  160. const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
  161. const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
  162. const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
  163. int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  164. if (!overflow) {
  165. overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
  166. }
  167. return overflow;
  168. }
  169. static INLINE int k_check_epi32_overflow_16(
  170. const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
  171. const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
  172. const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
  173. const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
  174. const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
  175. const __m128i *preg15, const __m128i *zero) {
  176. int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  177. if (!overflow) {
  178. overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
  179. if (!overflow) {
  180. overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
  181. if (!overflow) {
  182. overflow =
  183. k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
  184. }
  185. }
  186. }
  187. return overflow;
  188. }
  189. static INLINE int k_check_epi32_overflow_32(
  190. const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
  191. const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
  192. const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
  193. const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
  194. const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
  195. const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
  196. const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
  197. const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
  198. const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
  199. const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
  200. const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
  201. int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  202. if (!overflow) {
  203. overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
  204. if (!overflow) {
  205. overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
  206. if (!overflow) {
  207. overflow =
  208. k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
  209. if (!overflow) {
  210. overflow =
  211. k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
  212. if (!overflow) {
  213. overflow =
  214. k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
  215. if (!overflow) {
  216. overflow = k_check_epi32_overflow_4(preg24, preg25, preg26,
  217. preg27, zero);
  218. if (!overflow) {
  219. overflow = k_check_epi32_overflow_4(preg28, preg29, preg30,
  220. preg31, zero);
  221. }
  222. }
  223. }
  224. }
  225. }
  226. }
  227. }
  228. return overflow;
  229. }
  230. static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
  231. #if CONFIG_VP9_HIGHBITDEPTH
  232. const __m128i zero = _mm_setzero_si128();
  233. const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
  234. __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
  235. __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
  236. _mm_store_si128((__m128i *)(dst_ptr), out0);
  237. _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
  238. #else
  239. _mm_store_si128((__m128i *)(dst_ptr), *poutput);
  240. #endif // CONFIG_VP9_HIGHBITDEPTH
  241. }
  242. static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
  243. #if CONFIG_VP9_HIGHBITDEPTH
  244. const __m128i zero = _mm_setzero_si128();
  245. const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
  246. __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
  247. __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
  248. _mm_storeu_si128((__m128i *)(dst_ptr), out0);
  249. _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
  250. #else
  251. _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
  252. #endif // CONFIG_VP9_HIGHBITDEPTH
  253. }
  254. static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
  255. const __m128i *pmultiplier,
  256. const __m128i *prounding,
  257. const int shift) {
  258. const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
  259. const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
  260. const __m128i v0 = _mm_add_epi32(u0, *prounding);
  261. const __m128i v1 = _mm_add_epi32(u1, *prounding);
  262. const __m128i w0 = _mm_srai_epi32(v0, shift);
  263. const __m128i w1 = _mm_srai_epi32(v1, shift);
  264. return _mm_packs_epi32(w0, w1);
  265. }
  266. static INLINE void transpose_and_output8x8(
  267. const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
  268. const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
  269. const __m128i *pin06, const __m128i *pin07, const int pass,
  270. int16_t *out0_ptr, tran_low_t *out1_ptr) {
  271. // 00 01 02 03 04 05 06 07
  272. // 10 11 12 13 14 15 16 17
  273. // 20 21 22 23 24 25 26 27
  274. // 30 31 32 33 34 35 36 37
  275. // 40 41 42 43 44 45 46 47
  276. // 50 51 52 53 54 55 56 57
  277. // 60 61 62 63 64 65 66 67
  278. // 70 71 72 73 74 75 76 77
  279. const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
  280. const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
  281. const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
  282. const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
  283. const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
  284. const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
  285. const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
  286. const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
  287. // 00 10 01 11 02 12 03 13
  288. // 20 30 21 31 22 32 23 33
  289. // 04 14 05 15 06 16 07 17
  290. // 24 34 25 35 26 36 27 37
  291. // 40 50 41 51 42 52 43 53
  292. // 60 70 61 71 62 72 63 73
  293. // 54 54 55 55 56 56 57 57
  294. // 64 74 65 75 66 76 67 77
  295. const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  296. const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  297. const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  298. const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  299. const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  300. const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  301. const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  302. const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  303. // 00 10 20 30 01 11 21 31
  304. // 40 50 60 70 41 51 61 71
  305. // 02 12 22 32 03 13 23 33
  306. // 42 52 62 72 43 53 63 73
  307. // 04 14 24 34 05 15 21 36
  308. // 44 54 64 74 45 55 61 76
  309. // 06 16 26 36 07 17 27 37
  310. // 46 56 66 76 47 57 67 77
  311. const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
  312. const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
  313. const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
  314. const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
  315. const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
  316. const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
  317. const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
  318. const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
  319. // 00 10 20 30 40 50 60 70
  320. // 01 11 21 31 41 51 61 71
  321. // 02 12 22 32 42 52 62 72
  322. // 03 13 23 33 43 53 63 73
  323. // 04 14 24 34 44 54 64 74
  324. // 05 15 25 35 45 55 65 75
  325. // 06 16 26 36 46 56 66 76
  326. // 07 17 27 37 47 57 67 77
  327. if (pass == 0) {
  328. _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
  329. _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
  330. _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
  331. _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
  332. _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
  333. _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
  334. _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
  335. _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
  336. } else {
  337. storeu_output(&tr2_0, (out1_ptr + 0 * 16));
  338. storeu_output(&tr2_1, (out1_ptr + 1 * 16));
  339. storeu_output(&tr2_2, (out1_ptr + 2 * 16));
  340. storeu_output(&tr2_3, (out1_ptr + 3 * 16));
  341. storeu_output(&tr2_4, (out1_ptr + 4 * 16));
  342. storeu_output(&tr2_5, (out1_ptr + 5 * 16));
  343. storeu_output(&tr2_6, (out1_ptr + 6 * 16));
  344. storeu_output(&tr2_7, (out1_ptr + 7 * 16));
  345. }
  346. }
  347. #ifdef __cplusplus
  348. } // extern "C"
  349. #endif
  350. #endif // VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_