/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
  12. static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
  13. const uint8_t *pred_ptr, int32_t pred_stride,
  14. int16_t *diff_ptr, int32_t diff_stride) {
  15. uint32_t src0, src1, src2, src3;
  16. uint32_t pred0, pred1, pred2, pred3;
  17. v16i8 src = { 0 };
  18. v16i8 pred = { 0 };
  19. v16u8 src_l0, src_l1;
  20. v8i16 diff0, diff1;
  21. LW4(src_ptr, src_stride, src0, src1, src2, src3);
  22. LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
  23. INSERT_W4_SB(src0, src1, src2, src3, src);
  24. INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
  25. ILVRL_B2_UB(src, pred, src_l0, src_l1);
  26. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  27. ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
  28. }
  29. static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
  30. const uint8_t *pred_ptr, int32_t pred_stride,
  31. int16_t *diff_ptr, int32_t diff_stride) {
  32. uint32_t loop_cnt;
  33. uint64_t src0, src1, pred0, pred1;
  34. v16i8 src = { 0 };
  35. v16i8 pred = { 0 };
  36. v16u8 src_l0, src_l1;
  37. v8i16 diff0, diff1;
  38. for (loop_cnt = 4; loop_cnt--;) {
  39. LD2(src_ptr, src_stride, src0, src1);
  40. src_ptr += (2 * src_stride);
  41. LD2(pred_ptr, pred_stride, pred0, pred1);
  42. pred_ptr += (2 * pred_stride);
  43. INSERT_D2_SB(src0, src1, src);
  44. INSERT_D2_SB(pred0, pred1, pred);
  45. ILVRL_B2_UB(src, pred, src_l0, src_l1);
  46. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  47. ST_SH2(diff0, diff1, diff_ptr, diff_stride);
  48. diff_ptr += (2 * diff_stride);
  49. }
  50. }
  51. static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
  52. const uint8_t *pred, int32_t pred_stride,
  53. int16_t *diff, int32_t diff_stride) {
  54. int8_t count;
  55. v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  56. v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  57. v16u8 src_l0, src_l1;
  58. v8i16 diff0, diff1;
  59. for (count = 2; count--;) {
  60. LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  61. src += (8 * src_stride);
  62. LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
  63. pred7);
  64. pred += (8 * pred_stride);
  65. ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
  66. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  67. ST_SH2(diff0, diff1, diff, 8);
  68. diff += diff_stride;
  69. ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
  70. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  71. ST_SH2(diff0, diff1, diff, 8);
  72. diff += diff_stride;
  73. ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
  74. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  75. ST_SH2(diff0, diff1, diff, 8);
  76. diff += diff_stride;
  77. ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
  78. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  79. ST_SH2(diff0, diff1, diff, 8);
  80. diff += diff_stride;
  81. ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
  82. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  83. ST_SH2(diff0, diff1, diff, 8);
  84. diff += diff_stride;
  85. ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
  86. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  87. ST_SH2(diff0, diff1, diff, 8);
  88. diff += diff_stride;
  89. ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
  90. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  91. ST_SH2(diff0, diff1, diff, 8);
  92. diff += diff_stride;
  93. ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
  94. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  95. ST_SH2(diff0, diff1, diff, 8);
  96. diff += diff_stride;
  97. }
  98. }
  99. static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
  100. const uint8_t *pred, int32_t pred_stride,
  101. int16_t *diff, int32_t diff_stride) {
  102. uint32_t loop_cnt;
  103. v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  104. v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  105. v16u8 src_l0, src_l1;
  106. v8i16 diff0, diff1;
  107. for (loop_cnt = 8; loop_cnt--;) {
  108. LD_SB2(src, 16, src0, src1);
  109. src += src_stride;
  110. LD_SB2(src, 16, src2, src3);
  111. src += src_stride;
  112. LD_SB2(src, 16, src4, src5);
  113. src += src_stride;
  114. LD_SB2(src, 16, src6, src7);
  115. src += src_stride;
  116. LD_SB2(pred, 16, pred0, pred1);
  117. pred += pred_stride;
  118. LD_SB2(pred, 16, pred2, pred3);
  119. pred += pred_stride;
  120. LD_SB2(pred, 16, pred4, pred5);
  121. pred += pred_stride;
  122. LD_SB2(pred, 16, pred6, pred7);
  123. pred += pred_stride;
  124. ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
  125. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  126. ST_SH2(diff0, diff1, diff, 8);
  127. ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
  128. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  129. ST_SH2(diff0, diff1, diff + 16, 8);
  130. diff += diff_stride;
  131. ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
  132. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  133. ST_SH2(diff0, diff1, diff, 8);
  134. ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
  135. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  136. ST_SH2(diff0, diff1, diff + 16, 8);
  137. diff += diff_stride;
  138. ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
  139. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  140. ST_SH2(diff0, diff1, diff, 8);
  141. ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
  142. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  143. ST_SH2(diff0, diff1, diff + 16, 8);
  144. diff += diff_stride;
  145. ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
  146. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  147. ST_SH2(diff0, diff1, diff, 8);
  148. ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
  149. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  150. ST_SH2(diff0, diff1, diff + 16, 8);
  151. diff += diff_stride;
  152. }
  153. }
  154. static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
  155. const uint8_t *pred, int32_t pred_stride,
  156. int16_t *diff, int32_t diff_stride) {
  157. uint32_t loop_cnt;
  158. v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  159. v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  160. v16u8 src_l0, src_l1;
  161. v8i16 diff0, diff1;
  162. for (loop_cnt = 32; loop_cnt--;) {
  163. LD_SB4(src, 16, src0, src1, src2, src3);
  164. src += src_stride;
  165. LD_SB4(src, 16, src4, src5, src6, src7);
  166. src += src_stride;
  167. LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
  168. pred += pred_stride;
  169. LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
  170. pred += pred_stride;
  171. ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
  172. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  173. ST_SH2(diff0, diff1, diff, 8);
  174. ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
  175. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  176. ST_SH2(diff0, diff1, diff + 16, 8);
  177. ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
  178. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  179. ST_SH2(diff0, diff1, diff + 32, 8);
  180. ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
  181. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  182. ST_SH2(diff0, diff1, diff + 48, 8);
  183. diff += diff_stride;
  184. ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
  185. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  186. ST_SH2(diff0, diff1, diff, 8);
  187. ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
  188. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  189. ST_SH2(diff0, diff1, diff + 16, 8);
  190. ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
  191. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  192. ST_SH2(diff0, diff1, diff + 32, 8);
  193. ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
  194. HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  195. ST_SH2(diff0, diff1, diff + 48, 8);
  196. diff += diff_stride;
  197. }
  198. }
  199. void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
  200. ptrdiff_t diff_stride, const uint8_t *src_ptr,
  201. ptrdiff_t src_stride, const uint8_t *pred_ptr,
  202. ptrdiff_t pred_stride) {
  203. if (rows == cols) {
  204. switch (rows) {
  205. case 4:
  206. sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
  207. diff_stride);
  208. break;
  209. case 8:
  210. sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
  211. diff_stride);
  212. break;
  213. case 16:
  214. sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
  215. diff_stride);
  216. break;
  217. case 32:
  218. sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
  219. diff_stride);
  220. break;
  221. case 64:
  222. sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
  223. diff_stride);
  224. break;
  225. default:
  226. vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
  227. src_stride, pred_ptr, pred_stride);
  228. break;
  229. }
  230. } else {
  231. vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
  232. pred_ptr, pred_stride);
  233. }
  234. }