convolve.h 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. /*
  2. * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #ifndef VPX_VPX_DSP_X86_CONVOLVE_H_
  11. #define VPX_VPX_DSP_X86_CONVOLVE_H_
  12. #include <assert.h>
  13. #include "./vpx_config.h"
  14. #include "vpx/vpx_integer.h"
  15. #include "vpx_ports/mem.h"
  // TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty
  // hacky and awful to read. Note that there is a filter_x[3] == 128 check in
  // HIGH_FUN_CONV_2D to avoid a segfault, because the C function assumes the
  // filter is always 8-tap.
  // Function type with the argument list that FUN_CONV_1D passes to the
  // vpx_filter_block1d* kernels: source pointer and pitch, output pointer and
  // pitch, number of output rows, and the filter coefficients. Parameter roles
  // are taken from their names; the kernels themselves are defined elsewhere.
  typedef void filter8_1dfunction(
      const uint8_t *src_ptr,
      ptrdiff_t src_pitch,
      uint8_t *output_ptr,
      ptrdiff_t out_pitch,
      uint32_t output_height,
      const int16_t *filter);
  23. // TODO(chiyotsai@google.com): Remove the is_avg argument to the MACROS once we
  24. // have 4-tap vert avg filter.
  // Expands to the definition of vpx_convolve8_<name>_<opt>(): a convolution
  // along one direction that selects an 8-, 4- or 2-tap kernel family from the
  // non-zero coefficients of filter[offset].
  //
  // Macro arguments:
  //   name      - pasted into the generated function name.
  //   offset    - expression indexing `filter` to pick the coefficient row.
  //   step_q4   - step expression; asserted to equal 16.
  //   dir       - token pasted into the vpx_filter_block1d* kernel names.
  //   src_start - expression passed to the kernels as their source pointer. It
  //               is expanded where `src`, `src_stride` and `num_taps` are in
  //               scope, so it may refer to any of them.
  //   avg, opt  - tokens pasted into the kernel names.
  //   is_avg    - when non-zero, the 4-tap branch sets num_taps to 8.
  //
  // The width is consumed 16 columns at a time. A remainder of exactly 8 or 4
  // columns goes to the narrower kernels; any other remainder is not processed.
  #define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \
    void vpx_convolve8_##name##_##opt( \
        const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
        ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
        int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
      const int16_t *filter_row = filter[offset]; \
      (void)x0_q4; \
      (void)x_step_q4; \
      (void)y0_q4; \
      (void)y_step_q4; \
      assert(filter_row[3] != 128); \
      assert(step_q4 == 16); \
      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
        /* One of the four outermost taps is non-zero: 8-tap kernels. */ \
        const int num_taps = 8; \
        for (; w >= 16; src += 16, dst += 16, w -= 16) { \
          vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                   dst_stride, h, filter_row); \
        } \
        switch (w) { \
          case 8: \
            vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                    dst_stride, h, filter_row); \
            break; \
          case 4: \
            vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                    dst_stride, h, filter_row); \
            break; \
          default: break; \
        } \
        /* num_taps is only read if src_start mentions it. */ \
        (void)num_taps; \
      } else if (filter_row[2] | filter_row[5]) { \
        /* Taps 2 or 5 are non-zero: 4-tap kernels. */ \
        const int num_taps = is_avg ? 8 : 4; \
        for (; w >= 16; src += 16, dst += 16, w -= 16) { \
          vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
                                                   dst_stride, h, filter_row); \
        } \
        switch (w) { \
          case 8: \
            vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
                                                    dst_stride, h, filter_row); \
            break; \
          case 4: \
            vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
                                                    dst_stride, h, filter_row); \
            break; \
          default: break; \
        } \
        (void)num_taps; \
      } else { \
        /* Only the two centre taps can be non-zero: 2-tap kernels. */ \
        const int num_taps = 2; \
        for (; w >= 16; src += 16, dst += 16, w -= 16) { \
          vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
                                                   dst_stride, h, filter_row); \
        } \
        switch (w) { \
          case 8: \
            vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \
                                                    dst_stride, h, filter_row); \
            break; \
          case 4: \
            vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \
                                                    dst_stride, h, filter_row); \
            break; \
          default: break; \
        } \
        (void)num_taps; \
      } \
    }
  // Expands to the definition of vpx_convolve8_<avg><opt>(): a 2-D convolution
  // done as a horizontal pass into a 64-column scratch buffer followed by a
  // vertical pass from that buffer into dst.
  //
  // Macro arguments:
  //   avg, opt - tokens pasted into the generated and called function names.
  //   is_avg   - when non-zero, a 4-tap horizontal kernel is treated as 8-tap
  //              when sizing the intermediate rows.
  //
  // The generated function asserts that both steps are 16, that w and h are at
  // most 64, and that neither selected kernel has filter[3] == 128.
  #define FUN_CONV_2D(avg, opt, is_avg) \
    void vpx_convolve8_##avg##opt( \
        const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
        ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
        int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
      const int16_t *filter_x = filter[x0_q4]; \
      const int16_t *filter_y = filter[y0_q4]; \
      (void)filter_y; \
      assert(filter_x[3] != 128); \
      assert(filter_y[3] != 128); \
      assert(w <= 64); \
      assert(h <= 64); \
      assert(x_step_q4 == 16); \
      assert(y_step_q4 == 16); \
      { \
        /* Tap count chosen from the horizontal kernel's non-zero taps. */ \
        const int num_taps = \
            (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) \
                ? 8 \
                : ((filter_x[2] | filter_x[5]) ? (is_avg ? 8 : 4) : 2); \
        /* Extra rows filtered above the block: 3, 1 or 0. */ \
        const int rows_above = num_taps / 2 - 1; \
        /* 64 columns by up to 64 + 7 intermediate rows. */ \
        DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
        vpx_convolve8_horiz_##opt(src - rows_above * src_stride, src_stride, \
                                  fdata2, 64, filter, x0_q4, x_step_q4, y0_q4, \
                                  y_step_q4, w, h + num_taps - 1); \
        vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * rows_above, 64, dst, \
                                        dst_stride, filter, x0_q4, x_step_q4, \
                                        y0_q4, y_step_q4, w, h); \
      } \
    }
  130. #if CONFIG_VP9_HIGHBITDEPTH
  // High-bit-depth counterpart of filter8_1dfunction: the same argument list
  // with 16-bit samples and a trailing bit-depth argument, matching how
  // HIGH_FUN_CONV_1D calls the vpx_highbd_filter_block1d* kernels. Parameter
  // roles are taken from their names; the kernels are defined elsewhere.
  typedef void highbd_filter8_1dfunction(
      const uint16_t *src_ptr,
      const ptrdiff_t src_pitch,
      uint16_t *output_ptr,
      ptrdiff_t out_pitch,
      unsigned int output_height,
      const int16_t *filter,
      int bd);
  // Expands to the definition of vpx_highbd_convolve8_<name>_<opt>(): the
  // 16-bit-sample version of the one-directional convolution.
  //
  // Macro arguments have the same roles as in FUN_CONV_1D; src_start is again
  // expanded where `src`, `src_stride` and `num_taps` are in scope.
  //
  // The kernels are used only when step_q4 is 16 and filter_row[3] is not 128.
  // They consume the width in chunks of 16, then 8, then 4 columns. Whatever
  // width is left afterwards (all of it when the kernels were skipped) is
  // handed to vpx_highbd_convolve8_<name>_c().
  #define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \
                           is_avg) \
    void vpx_highbd_convolve8_##name##_##opt( \
        const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
        ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \
        int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
      const int16_t *filter_row = filter_kernel[offset]; \
      if (step_q4 == 16 && filter_row[3] != 128) { \
        if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
          /* One of the four outermost taps is non-zero: 8-tap kernels. */ \
          const int num_taps = 8; \
          for (; w >= 16; src += 16, dst += 16, w -= 16) { \
            vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
                src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
          } \
          for (; w >= 8; src += 8, dst += 8, w -= 8) { \
            vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
                src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
          } \
          for (; w >= 4; src += 4, dst += 4, w -= 4) { \
            vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
                src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
          } \
          /* num_taps is only read if src_start mentions it. */ \
          (void)num_taps; \
        } else if (filter_row[2] | filter_row[5]) { \
          /* Taps 2 or 5 are non-zero: 4-tap kernels. */ \
          const int num_taps = is_avg ? 8 : 4; \
          for (; w >= 16; src += 16, dst += 16, w -= 16) { \
            vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
                src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
          } \
          for (; w >= 8; src += 8, dst += 8, w -= 8) { \
            vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
                src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
          } \
          for (; w >= 4; src += 4, dst += 4, w -= 4) { \
            vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
                src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
          } \
          (void)num_taps; \
        } else { \
          /* Only the two centre taps can be non-zero: 2-tap kernels. */ \
          const int num_taps = 2; \
          for (; w >= 16; src += 16, dst += 16, w -= 16) { \
            vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
                src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
          } \
          for (; w >= 8; src += 8, dst += 8, w -= 8) { \
            vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
                src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
          } \
          for (; w >= 4; src += 4, dst += 4, w -= 4) { \
            vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
                src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
          } \
          (void)num_taps; \
        } \
      } \
      /* Columns not handled above are passed to the C function. */ \
      if (w != 0) { \
        vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
                                        filter_kernel, x0_q4, x_step_q4, y0_q4, \
                                        y_step_q4, w, h, bd); \
      } \
    }
  // Expands to the definition of vpx_highbd_convolve8_<avg><opt>(): the
  // 16-bit-sample 2-D convolution.
  //
  // Macro arguments:
  //   avg, opt - tokens pasted into the generated and called function names.
  //   is_avg   - when non-zero, a 4-tap horizontal kernel is treated as 8-tap
  //              when sizing the intermediate rows.
  //
  // The generated function asserts that w and h are at most 64. If either step
  // differs from 16 the whole call is forwarded to vpx_highbd_convolve8_<avg>c().
  // Otherwise it runs a horizontal pass into a 64-column scratch buffer and
  // then a vertical pass from that buffer into dst.
  #define HIGH_FUN_CONV_2D(avg, opt, is_avg) \
    void vpx_highbd_convolve8_##avg##opt( \
        const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
        ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
        int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
      const int16_t *filter_x = filter[x0_q4]; \
      assert(w <= 64); \
      assert(h <= 64); \
      if (x_step_q4 != 16 || y_step_q4 != 16) { \
        vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \
                                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \
                                      bd); \
      } else { \
        /* filter_x[3] == 128 also takes the 8-tap layout. */ \
        const int num_taps = \
            ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \
             (filter_x[3] == 128)) \
                ? 8 \
                : ((filter_x[2] | filter_x[5]) ? (is_avg ? 8 : 4) : 2); \
        /* Extra rows filtered above the block: 3, 1 or 0. */ \
        const int rows_above = num_taps / 2 - 1; \
        /* 64 columns by up to 64 + 7 intermediate rows. */ \
        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
        vpx_highbd_convolve8_horiz_##opt( \
            src - rows_above * src_stride, src_stride, fdata2, 64, filter, \
            x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, bd); \
        vpx_highbd_convolve8_##avg##vert_##opt( \
            fdata2 + 64 * rows_above, 64, dst, dst_stride, filter, x0_q4, \
            x_step_q4, y0_q4, y_step_q4, w, h, bd); \
      } \
    }
  268. #endif // CONFIG_VP9_HIGHBITDEPTH
  269. #endif // VPX_VPX_DSP_X86_CONVOLVE_H_