123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
- /*
- * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
- #include "./vpx_dsp_rtcd.h"
- #include "./macros_msa.h"
- uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride,
- int size) {
- int row, col;
- uint64_t ss_res = 0;
- v4i32 mul0, mul1;
- v2i64 res0 = { 0 };
- if (4 == size) {
- uint64_t src0, src1, src2, src3;
- v8i16 diff0 = { 0 };
- v8i16 diff1 = { 0 };
- LD4(src, src_stride, src0, src1, src2, src3);
- INSERT_D2_SH(src0, src1, diff0);
- INSERT_D2_SH(src2, src3, diff1);
- DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1);
- mul0 += mul1;
- res0 = __msa_hadd_s_d(mul0, mul0);
- res0 += __msa_splati_d(res0, 1);
- ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
- } else if (8 == size) {
- v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
- LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
- DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
- DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
- DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
- mul0 += mul1;
- res0 = __msa_hadd_s_d(mul0, mul0);
- res0 += __msa_splati_d(res0, 1);
- ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
- } else if (16 == size) {
- v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
- LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
- DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
- DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
- DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
- LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += 8 * src_stride;
- DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
- DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
- DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
- DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
- LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
- DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
- DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
- DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
- LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
- DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
- DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
- DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
- mul0 += mul1;
- res0 += __msa_hadd_s_d(mul0, mul0);
- res0 += __msa_splati_d(res0, 1);
- ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
- } else if (0 == (size % 16)) {
- v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
- for (row = 0; row < (size >> 4); row++) {
- for (col = 0; col < size; col += 16) {
- const int16_t *src_ptr = src + col;
- LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
- src7);
- DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
- DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
- DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
- DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
- LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
- src6, src7);
- src_ptr += 8 * src_stride;
- DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
- DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
- DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
- DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
- LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
- src7);
- DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
- DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
- DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
- DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
- LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
- src6, src7);
- DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
- DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
- DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
- DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
- mul0 += mul1;
- res0 += __msa_hadd_s_d(mul0, mul0);
- }
- src += 16 * src_stride;
- }
- res0 += __msa_splati_d(res0, 1);
- ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
- } else {
- int16_t val;
- for (row = 0; row < size; row++) {
- for (col = 0; col < size; col++) {
- val = src[col];
- ss_res += val * val;
- }
- src += src_stride;
- }
- }
- return ss_res;
- }
|