vpx_convolve_copy_neon.c 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. /*
  2. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
#include <arm_neon.h>
#include <string.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
  13. void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
  14. uint8_t *dst, ptrdiff_t dst_stride,
  15. const InterpKernel *filter, int x0_q4,
  16. int x_step_q4, int y0_q4, int y_step_q4, int w,
  17. int h) {
  18. (void)filter;
  19. (void)x0_q4;
  20. (void)x_step_q4;
  21. (void)y0_q4;
  22. (void)y_step_q4;
  23. if (w < 8) { // copy4
  24. do {
  25. *(uint32_t *)dst = *(const uint32_t *)src;
  26. src += src_stride;
  27. dst += dst_stride;
  28. *(uint32_t *)dst = *(const uint32_t *)src;
  29. src += src_stride;
  30. dst += dst_stride;
  31. h -= 2;
  32. } while (h > 0);
  33. } else if (w == 8) { // copy8
  34. uint8x8_t s0, s1;
  35. do {
  36. s0 = vld1_u8(src);
  37. src += src_stride;
  38. s1 = vld1_u8(src);
  39. src += src_stride;
  40. vst1_u8(dst, s0);
  41. dst += dst_stride;
  42. vst1_u8(dst, s1);
  43. dst += dst_stride;
  44. h -= 2;
  45. } while (h > 0);
  46. } else if (w < 32) { // copy16
  47. uint8x16_t s0, s1;
  48. do {
  49. s0 = vld1q_u8(src);
  50. src += src_stride;
  51. s1 = vld1q_u8(src);
  52. src += src_stride;
  53. vst1q_u8(dst, s0);
  54. dst += dst_stride;
  55. vst1q_u8(dst, s1);
  56. dst += dst_stride;
  57. h -= 2;
  58. } while (h > 0);
  59. } else if (w == 32) { // copy32
  60. uint8x16_t s0, s1, s2, s3;
  61. do {
  62. s0 = vld1q_u8(src);
  63. s1 = vld1q_u8(src + 16);
  64. src += src_stride;
  65. s2 = vld1q_u8(src);
  66. s3 = vld1q_u8(src + 16);
  67. src += src_stride;
  68. vst1q_u8(dst, s0);
  69. vst1q_u8(dst + 16, s1);
  70. dst += dst_stride;
  71. vst1q_u8(dst, s2);
  72. vst1q_u8(dst + 16, s3);
  73. dst += dst_stride;
  74. h -= 2;
  75. } while (h > 0);
  76. } else { // copy64
  77. uint8x16_t s0, s1, s2, s3;
  78. do {
  79. s0 = vld1q_u8(src);
  80. s1 = vld1q_u8(src + 16);
  81. s2 = vld1q_u8(src + 32);
  82. s3 = vld1q_u8(src + 48);
  83. src += src_stride;
  84. vst1q_u8(dst, s0);
  85. vst1q_u8(dst + 16, s1);
  86. vst1q_u8(dst + 32, s2);
  87. vst1q_u8(dst + 48, s3);
  88. dst += dst_stride;
  89. } while (--h);
  90. }
  91. }