vpx_convolve_vsx.c

/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/ppc/types_vsx.h"
#include "vpx_dsp/vpx_filter.h"

// TODO(lu_zero): unroll
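// Each copy_w* helper copies h rows of 16/32/64 bytes from src to dst using
// unaligned VSX vector loads and stores (vec_vsx_ld / vec_vsx_st).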
static VPX_FORCE_INLINE void copy_w16(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      int32_t h) {
  int i;
  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void copy_w32(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      int32_t h) {
  int i;
  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void copy_w64(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      int32_t h) {
  int i;
  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
    vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

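// Direct copy (no filtering): use the vector helpers for the 16/32/64-pixel
// block widths and fall back to a per-row memcpy for any other width.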
void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  switch (w) {
    case 16: {
      copy_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      copy_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      copy_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      int i;
      for (i = h; i--;) {
        memcpy(dst, src, w);
        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
  }
}

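// Each avg_w* helper averages h rows of src into dst with vec_avg, the
// rounding average ((a + b + 1) >> 1) of unsigned bytes.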
static VPX_FORCE_INLINE void avg_w16(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     int32_t h) {
  int i;
  for (i = h; i--;) {
    const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    vec_vsx_st(v, 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void avg_w32(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     int32_t h) {
  int i;
  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void avg_w64(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     int32_t h) {
  int i;
  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst));
    const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    vec_vsx_st(v2, 32, dst);
    vec_vsx_st(v3, 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

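// Averaging copy: vector paths for the 16/32/64-pixel widths, scalar
// vpx_convolve_avg_c fallback for everything else.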
void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *filter, int x0_q4, int x_step_q4,
                          int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
  switch (w) {
    case 16: {
      avg_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      avg_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      avg_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                         x_step_q4, y0_q4, y_step_q4, w, h);
      break;
    }
  }
}

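// Filter one pixel: multiply-accumulate the eight 16-bit samples in s against
// the eight taps in f (vec_msum, then vec_sums to reduce across lanes), add
// the rounding bias, shift right by FILTER_BITS, saturate to 8 bits and store
// the single result byte to dst.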
static VPX_FORCE_INLINE void convolve_line(uint8_t *dst, const int16x8_t s,
                                           const int16x8_t f) {
  const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0));
  const int32x4_t bias =
      vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1));
  const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS));
  const uint8x16_t v = vec_splat(
      vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3);
  vec_ste(v, 0, dst);
}

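// Horizontal variant: load eight consecutive source pixels, widen them to
// 16 bits and run them through convolve_line with the given 8-tap kernel.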
static VPX_FORCE_INLINE void convolve_line_h(uint8_t *dst,
                                             const uint8_t *const src_x,
                                             const int16_t *const x_filter) {
  const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x));
  const int16x8_t f = vec_vsx_ld(0, x_filter);
  convolve_line(dst, s, f);
}

// TODO(lu_zero): Implement 8x8 and bigger block special cases
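// Horizontal filtering, one output pixel at a time; the subpel position x_q4
// advances by x_step_q4 across each row.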
static VPX_FORCE_INLINE void convolve_horiz(const uint8_t *src,
                                            ptrdiff_t src_stride, uint8_t *dst,
                                            ptrdiff_t dst_stride,
                                            const InterpKernel *x_filters,
                                            int x0_q4, int x_step_q4, int w,
                                            int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

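// Same as convolve_horiz, but rounds and averages the filtered pixel into the
// existing dst value.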
static VPX_FORCE_INLINE void convolve_avg_horiz(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
    int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      uint8_t v;
      convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

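// Returns the leading rows of the transpose of the 8x8 block held in the
// first eight bytes of a..h: bytes 0-7 of the result are column 0
// (a0, b0, ..., h0) and bytes 8-15 are column 1.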
static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b,
                                        uint8x16_t c, uint8x16_t d,
                                        uint8x16_t e, uint8x16_t f,
                                        uint8x16_t g, uint8x16_t h) {
  uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b);
  uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d);
  uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f);
  uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h);

  uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd);
  uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh);

  return (uint8x16_t)vec_mergeh(abcd, efgh);
}

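// Vertical variant: load a vector from each of eight consecutive rows, gather
// their leading bytes into one vector with the transpose helper, then filter
// that column with the 8-tap kernel.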
static VPX_FORCE_INLINE void convolve_line_v(uint8_t *dst,
                                             const uint8_t *const src_y,
                                             ptrdiff_t src_stride,
                                             const int16_t *const y_filter) {
  uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride);
  uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride);
  uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride);
  uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride);
  uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride);
  uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride);
  uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride);
  uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride);
  const int16x8_t f = vec_vsx_ld(0, y_filter);
  const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7);

  convolve_line(dst, unpack_to_s16_h(s), f);
}

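// Vertical filtering, one output column at a time; the subpel position y_q4
// advances by y_step_q4 down each column.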
static VPX_FORCE_INLINE void convolve_vert(const uint8_t *src,
                                           ptrdiff_t src_stride, uint8_t *dst,
                                           ptrdiff_t dst_stride,
                                           const InterpKernel *y_filters,
                                           int y0_q4, int y_step_q4, int w,
                                           int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      convolve_line_v(dst + y * dst_stride,
                      &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

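// Same as convolve_vert, but rounds and averages the filtered pixel into the
// existing dst value.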
static VPX_FORCE_INLINE void convolve_avg_vert(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
    int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      uint8_t v;
      convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static VPX_FORCE_INLINE void convolve(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *const filter,
                                      int x0_q4, int x_step_q4, int y0_q4,
                                      int y_step_q4, int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
                 filter, x0_q4, x_step_q4, w, intermediate_height);
  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
                y0_q4, y_step_q4, w, h);
}

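// Public entry points for the 8-tap filters: thin wrappers over the helpers
// above; the one-dimensional variants discard the arguments they do not use.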
void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  (void)y0_q4;
  (void)y_step_q4;

  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
                 h);
}

void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  (void)y0_q4;
  (void)y_step_q4;

  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                     w, h);
}

void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  (void)x0_q4;
  (void)x_step_q4;

  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
                h);
}

void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h) {
  (void)x0_q4;
  (void)x_step_q4;

  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
                    w, h);
}

void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                       int w, int h) {
  convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
           y_step_q4, w, h);
}

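// Run the full 2-D filter into a temporary 64x64 block, then average that
// block into dst.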
void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int y_step_q4, int w, int h) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
                    y_step_q4, w, h);
  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
}