sum_squares_msa.c

/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "./macros_msa.h"

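/* Sum of squared int16 samples over a size x size block, accelerated with
 * MIPS MSA intrinsics. Dedicated vector paths cover 4x4, 8x8, and 16x16
 * blocks, larger multiples of 16 are processed as 16x16 tiles, and any
 * other size falls back to plain scalar C. */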
uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride,
                                    int size) {
  int row, col;
  uint64_t ss_res = 0;
  v4i32 mul0, mul1;
  v2i64 res0 = { 0 };

  if (4 == size) {
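    /* Each 4-sample row is one 64-bit load; pack rows 0-1 and 2-3 into two
     * v8i16 vectors, then square every sample via a self dot-product that
     * also sums adjacent pairs into the 32-bit lanes of mul0/mul1. */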
    uint64_t src0, src1, src2, src3;
    v8i16 diff0 = { 0 };
    v8i16 diff1 = { 0 };

    LD4(src, src_stride, src0, src1, src2, src3);
    INSERT_D2_SH(src0, src1, diff0);
    INSERT_D2_SH(src2, src3, diff1);
    DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1);
    mul0 += mul1;
    res0 = __msa_hadd_s_d(mul0, mul0);
    res0 += __msa_splati_d(res0, 1);
    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
  } else if (8 == size) {
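    /* 8x8: load all eight rows, square the first two with a fresh dot
     * product, and fold the remaining six in with dot-product-accumulate. */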
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
    mul0 += mul1;
    res0 = __msa_hadd_s_d(mul0, mul0);
    res0 += __msa_splati_d(res0, 1);
    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
  } else if (16 == size) {
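    /* 16x16: walk the block as four 8x8 quadrants (left and right halves of
     * the top eight rows, then of the bottom eight), accumulating all the
     * squares into mul0/mul1 before the final cross-lane reduction. */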
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
    LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6,
           src7);
    src += 8 * src_stride;
    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
    LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6,
           src7);
    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
    mul0 += mul1;
    res0 += __msa_hadd_s_d(mul0, mul0);
    res0 += __msa_splati_d(res0, 1);
    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
  } else if (0 == (size % 16)) {
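    /* Any larger multiple of 16: tile the block into 16x16 sub-blocks. The
     * outer loop advances 16 rows per iteration, the inner loop 16 columns,
     * and each tile is accumulated exactly like the 16x16 path above; the
     * cross-lane reduction of res0 is deferred until every tile is done. */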
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;

    for (row = 0; row < (size >> 4); row++) {
      for (col = 0; col < size; col += 16) {
        const int16_t *src_ptr = src + col;

        LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
               src7);
        DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
        LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
               src6, src7);
        src_ptr += 8 * src_stride;
        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
        LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
               src7);
        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
        LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
               src6, src7);
        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
        mul0 += mul1;
        res0 += __msa_hadd_s_d(mul0, mul0);
      }

      src += 16 * src_stride;
    }

    res0 += __msa_splati_d(res0, 1);
    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
  } else {
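    /* Scalar fallback for sizes with no dedicated vector path. val * val is
     * computed in int, which safely holds the worst case (-32768)^2. */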
    int16_t val;

    for (row = 0; row < size; row++) {
      for (col = 0; col < size; col++) {
        val = src[col];
        ss_res += val * val;
      }

      src += src_stride;
    }
  }

  return ss_res;
}