subtract_neon.c 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. /*
  2. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <arm_neon.h>
  11. #include <assert.h>
  12. #include "./vpx_config.h"
  13. #include "./vpx_dsp_rtcd.h"
  14. #include "vpx/vpx_integer.h"
  15. #include "vpx_dsp/arm/mem_neon.h"
  16. void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
  17. ptrdiff_t diff_stride, const uint8_t *src,
  18. ptrdiff_t src_stride, const uint8_t *pred,
  19. ptrdiff_t pred_stride) {
  20. int r = rows, c;
  21. if (cols > 16) {
  22. do {
  23. for (c = 0; c < cols; c += 32) {
  24. const uint8x16_t s0 = vld1q_u8(&src[c + 0]);
  25. const uint8x16_t s1 = vld1q_u8(&src[c + 16]);
  26. const uint8x16_t p0 = vld1q_u8(&pred[c + 0]);
  27. const uint8x16_t p1 = vld1q_u8(&pred[c + 16]);
  28. const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0));
  29. const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0));
  30. const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1));
  31. const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1));
  32. vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0));
  33. vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1));
  34. vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2));
  35. vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3));
  36. }
  37. diff += diff_stride;
  38. pred += pred_stride;
  39. src += src_stride;
  40. } while (--r);
  41. } else if (cols > 8) {
  42. do {
  43. const uint8x16_t s = vld1q_u8(&src[0]);
  44. const uint8x16_t p = vld1q_u8(&pred[0]);
  45. const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p));
  46. const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p));
  47. vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0));
  48. vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1));
  49. diff += diff_stride;
  50. pred += pred_stride;
  51. src += src_stride;
  52. } while (--r);
  53. } else if (cols > 4) {
  54. do {
  55. const uint8x8_t s = vld1_u8(&src[0]);
  56. const uint8x8_t p = vld1_u8(&pred[0]);
  57. const uint16x8_t v_diff = vsubl_u8(s, p);
  58. vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
  59. diff += diff_stride;
  60. pred += pred_stride;
  61. src += src_stride;
  62. } while (--r);
  63. } else {
  64. assert(cols == 4);
  65. do {
  66. const uint8x8_t s = load_unaligned_u8(src, (int)src_stride);
  67. const uint8x8_t p = load_unaligned_u8(pred, (int)pred_stride);
  68. const uint16x8_t d = vsubl_u8(s, p);
  69. vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d)));
  70. vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d)));
  71. diff += 2 * diff_stride;
  72. pred += 2 * pred_stride;
  73. src += 2 * src_stride;
  74. r -= 2;
  75. } while (r);
  76. }
  77. }