2
0

subtract_vsx.c 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. /*
  2. * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <assert.h>
  11. #include "./vpx_config.h"
  12. #include "./vpx_dsp_rtcd.h"
  13. #include "vpx/vpx_integer.h"
  14. #include "vpx_dsp/ppc/types_vsx.h"
  15. static VPX_FORCE_INLINE void subtract_block4x4(
  16. int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src,
  17. ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) {
  18. int16_t *diff1 = diff + 2 * diff_stride;
  19. const uint8_t *src1 = src + 2 * src_stride;
  20. const uint8_t *pred1 = pred + 2 * pred_stride;
  21. const int16x8_t d0 = vec_vsx_ld(0, diff);
  22. const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride);
  23. const int16x8_t d2 = vec_vsx_ld(0, diff1);
  24. const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride);
  25. const uint8x16_t s0 = read4x2(src, (int)src_stride);
  26. const uint8x16_t p0 = read4x2(pred, (int)pred_stride);
  27. const uint8x16_t s1 = read4x2(src1, (int)src_stride);
  28. const uint8x16_t p1 = read4x2(pred1, (int)pred_stride);
  29. const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
  30. const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
  31. vec_vsx_st(xxpermdi(da, d0, 1), 0, diff);
  32. vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride);
  33. vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1);
  34. vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride);
  35. }
  36. void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff,
  37. ptrdiff_t diff_stride, const uint8_t *src,
  38. ptrdiff_t src_stride, const uint8_t *pred,
  39. ptrdiff_t pred_stride) {
  40. int r = rows, c;
  41. switch (cols) {
  42. case 64:
  43. case 32:
  44. do {
  45. for (c = 0; c < cols; c += 32) {
  46. const uint8x16_t s0 = vec_vsx_ld(0, src + c);
  47. const uint8x16_t s1 = vec_vsx_ld(16, src + c);
  48. const uint8x16_t p0 = vec_vsx_ld(0, pred + c);
  49. const uint8x16_t p1 = vec_vsx_ld(16, pred + c);
  50. const int16x8_t d0l =
  51. vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
  52. const int16x8_t d0h =
  53. vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
  54. const int16x8_t d1l =
  55. vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1));
  56. const int16x8_t d1h =
  57. vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
  58. vec_vsx_st(d0h, 0, diff + c);
  59. vec_vsx_st(d0l, 16, diff + c);
  60. vec_vsx_st(d1h, 0, diff + c + 16);
  61. vec_vsx_st(d1l, 16, diff + c + 16);
  62. }
  63. diff += diff_stride;
  64. pred += pred_stride;
  65. src += src_stride;
  66. } while (--r);
  67. break;
  68. case 16:
  69. do {
  70. const uint8x16_t s0 = vec_vsx_ld(0, src);
  71. const uint8x16_t p0 = vec_vsx_ld(0, pred);
  72. const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
  73. const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
  74. vec_vsx_st(d0h, 0, diff);
  75. vec_vsx_st(d0l, 16, diff);
  76. diff += diff_stride;
  77. pred += pred_stride;
  78. src += src_stride;
  79. } while (--r);
  80. break;
  81. case 8:
  82. do {
  83. const uint8x16_t s0 = vec_vsx_ld(0, src);
  84. const uint8x16_t p0 = vec_vsx_ld(0, pred);
  85. const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
  86. vec_vsx_st(d0h, 0, diff);
  87. diff += diff_stride;
  88. pred += pred_stride;
  89. src += src_stride;
  90. } while (--r);
  91. break;
  92. case 4:
  93. subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride);
  94. if (r > 4) {
  95. diff += 4 * diff_stride;
  96. pred += 4 * pred_stride;
  97. src += 4 * src_stride;
  98. subtract_block4x4(diff, diff_stride,
  99. src, src_stride,
  100. pred, pred_stride);
  101. }
  102. break;
  103. default: assert(0); // unreachable
  104. }
  105. }