highbd_vpx_convolve8_neon.c

/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_ports/mem.h"

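/* The helpers below load or store a small block of 16-bit pixels with a given
 * row stride. load_4x4/load_8x8 read rows as signed lanes so they can feed the
 * signed multiply-accumulate filter directly; load_8x4/store_8x8 keep the
 * unsigned pixel view used for the final clamped output.
 */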
static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p,
                            int16x4_t *const s0, int16x4_t *const s1,
                            int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p,
                            uint16x8_t *const s0, uint16x8_t *const s1,
                            uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
}

static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p,
                            int16x8_t *const s0, int16x8_t *const s1,
                            int16x8_t *const s2, int16x8_t *const s3,
                            int16x8_t *const s4, int16x8_t *const s5,
                            int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p,
                             const uint16x8_t s0, const uint16x8_t s1,
                             const uint16x8_t s2, const uint16x8_t s3,
                             const uint16x8_t s4, const uint16x8_t s5,
                             const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
  s += p;
  vst1q_u16(s, s3);
  s += p;
  vst1q_u16(s, s4);
  s += p;
  vst1q_u16(s, s5);
  s += p;
  vst1q_u16(s, s6);
  s += p;
  vst1q_u16(s, s7);
}

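/* highbd_convolve8_4 computes the 8-tap dot product for four output pixels,
 * accumulating in 32 bits so high-bitdepth inputs cannot overflow. A scalar
 * sketch of what one lane computes (the rounding shift and clamp are applied
 * later by the callers; the shift of 7 matches the filter's fixed-point
 * precision):
 *
 *   int32_t sum = 0;
 *   for (int k = 0; k < 8; ++k) sum += (int32_t)s[k] * filters[k];
 *   // caller: pixel = clamp((sum + 64) >> 7, 0, (1 << bd) - 1)
 */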
static INLINE int32x4_t highbd_convolve8_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int32x4_t sum;

  sum = vmull_lane_s16(s0, filters_lo, 0);
  sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
  sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
  sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
  sum = vmlal_lane_s16(sum, s4, filters_hi, 0);
  sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
  sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
  sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
  return sum;
}

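/* highbd_convolve8_8 is the 8-wide variant: it splits each 8-lane row into
 * low/high halves, runs the same multiply-accumulate chain on both, then
 * narrows with a rounding shift (vqrshrun_n_s32(..., 7)) and clamps to the
 * bit-depth maximum passed in `max`.
 */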
static INLINE uint16x8_t
highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
                   const int16x8_t s6, const int16x8_t s7,
                   const int16x8_t filters, const uint16x8_t max) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int32x4_t sum0, sum1;
  uint16x8_t d;

  sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
  sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
  d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
  d = vminq_u16(d, max);
  return d;
}

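/* Horizontal 8-tap filtering. Only the unit-step case (x_step_q4 == 16) is
 * vectorized; any other step falls back to the C implementation. `src -= 3`
 * re-centers the source window on the first filter tap. An illustrative call,
 * where the kernel table and parameter values are assumptions for the sketch
 * rather than anything defined in this file:
 *
 *   // const InterpKernel *kernel = ...;  // an 8-tap sub-pel kernel table
 *   // vpx_highbd_convolve8_horiz_neon(src, src_stride, dst, dst_stride,
 *   //                                 kernel, x0_q4, 16, 0, 16, w, h, 10);
 */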
void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *filter, int x0_q4,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3;

    if (h == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23;

      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u16_8x4(&t0, &t1, &t2, &t3);
      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      src += 7;

      do {
        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
        transpose_s16_4x4d(&s7, &s8, &s9, &s10);

        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        d01 = vminq_u16(d01, max);
        d23 = vminq_u16(d23, max);
        transpose_u16_4x4q(&d01, &d23);

        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        src += 4;
        dst += 4;
        w -= 4;
      } while (w > 0);
    } else {
      int16x8_t t4, t5, t6, t7;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3;

      if (w == 4) {
        do {
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
                   &t4, &t5, &t6, &t7);
          src += 8 * src_stride;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);
          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);

          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          d3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          transpose_u16_8x4(&d0, &d1, &d2, &d3);
          vst1_u16(dst, vget_low_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d3));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d3));
          dst += dst_stride;
          h -= 8;
        } while (h > 0);
      } else {
        int width;
        const uint16_t *s;
        uint16_t *d;
        int16x8_t s11, s12, s13, s14;
        uint16x8_t d4, d5, d6, d7;

        do {
          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          width = w;
          s = src + 7;
          d = dst;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);

          do {
            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
                     &s12, &s13, &s14);
            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);

            d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
                                    max);
            d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
                                    max);
            d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
                                    max);
            d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
                                    max);
            d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
                                    max);
            d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
                                    max);
            d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
                                    max);
            d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
                                    filters, max);

            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

            s0 = s8;
            s1 = s9;
            s2 = s10;
            s3 = s11;
            s4 = s12;
            s5 = s13;
            s6 = s14;
            s += 8;
            d += 8;
            width -= 8;
          } while (width > 0);
          src += 8 * src_stride;
          dst += 8 * dst_stride;
          h -= 8;
        } while (h > 0);
      }
    }
  }
}

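/* Same horizontal filter as above, but the clamped result is round-averaged
 * with the existing destination pixels (vrhaddq_u16) before being stored,
 * which is the "avg" compound-prediction path.
 */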
void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
                                         ptrdiff_t src_stride, uint16_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                     bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3;

    if (h == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23, t01, t23;

      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u16_8x4(&t0, &t1, &t2, &t3);
      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      src += 7;

      do {
        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
        transpose_s16_4x4d(&s7, &s8, &s9, &s10);

        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        t01 = vminq_u16(t01, max);
        t23 = vminq_u16(t23, max);
        transpose_u16_4x4q(&t01, &t23);

        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                           vld1_u16(dst + 2 * dst_stride));
        d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
                           vld1_u16(dst + 3 * dst_stride));
        d01 = vrhaddq_u16(d01, t01);
        d23 = vrhaddq_u16(d23, t23);

        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        src += 4;
        dst += 4;
        w -= 4;
      } while (w > 0);
    } else {
      int16x8_t t4, t5, t6, t7;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;

      if (w == 4) {
        do {
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
                   &t4, &t5, &t6, &t7);
          src += 8 * src_stride;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);
          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);

          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          t3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          transpose_u16_8x4(&t0, &t1, &t2, &t3);

          d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                            vld1_u16(dst + 4 * dst_stride));
          d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
                            vld1_u16(dst + 5 * dst_stride));
          d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
                            vld1_u16(dst + 6 * dst_stride));
          d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
                            vld1_u16(dst + 7 * dst_stride));
          d0 = vrhaddq_u16(d0, t0);
          d1 = vrhaddq_u16(d1, t1);
          d2 = vrhaddq_u16(d2, t2);
          d3 = vrhaddq_u16(d3, t3);

          vst1_u16(dst, vget_low_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d3));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d3));
          dst += dst_stride;
          h -= 8;
        } while (h > 0);
      } else {
        int width;
        const uint16_t *s;
        uint16_t *d;
        int16x8_t s11, s12, s13, s14;
        uint16x8_t d4, d5, d6, d7;

        do {
          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          width = w;
          s = src + 7;
          d = dst;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);

          do {
            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
                     &s12, &s13, &s14);
            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);

            d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
                                    max);
            d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
                                    max);
            d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
                                    max);
            d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
                                    max);
            d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
                                    max);
            d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
                                    max);
            d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
                                    max);
            d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
                                    filters, max);

            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

            d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
            d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
            d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
            d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
            d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
            d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
            d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
            d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));

            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

            s0 = s8;
            s1 = s9;
            s2 = s10;
            s3 = s11;
            s4 = s12;
            s5 = s13;
            s6 = s14;
            s += 8;
            d += 8;
            width -= 8;
          } while (width > 0);
          src += 8 * src_stride;
          dst += 8 * dst_stride;
          h -= 8;
        } while (h > 0);
      }
    }
  }
}

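/* Vertical 8-tap filtering. Each output column reads vertically adjacent
 * rows, so no transposes are needed: seven rows are preloaded and the window
 * slides down four rows per loop iteration, reusing s4..s10 as the next
 * iteration's s0..s6.
 */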
void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter,
                                x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3 * src_stride;

    if (w == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23;

      s0 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s1 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s2 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s3 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s4 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s5 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s6 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;

      do {
        s7 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s8 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s9 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s10 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;

        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        d01 = vminq_u16(d01, max);
        d23 = vminq_u16(d23, max);
        vst1_u16(dst, vget_low_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_low_u16(d23));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d23));
        dst += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        h -= 4;
      } while (h > 0);
    } else {
      int height;
      const uint16_t *s;
      uint16_t *d;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        s = src;
        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        d = dst;
        height = h;

        do {
          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;

          __builtin_prefetch(d + 0 * dst_stride);
          __builtin_prefetch(d + 1 * dst_stride);
          __builtin_prefetch(d + 2 * dst_stride);
          __builtin_prefetch(d + 3 * dst_stride);
          __builtin_prefetch(s + 0 * src_stride);
          __builtin_prefetch(s + 1 * src_stride);
          __builtin_prefetch(s + 2 * src_stride);
          __builtin_prefetch(s + 3 * src_stride);
          d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          d3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          vst1q_u16(d, d0);
          d += dst_stride;
          vst1q_u16(d, d1);
          d += dst_stride;
          vst1q_u16(d, d2);
          d += dst_stride;
          vst1q_u16(d, d3);
          d += dst_stride;

          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
        } while (height > 0);
        src += 8;
        dst += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}

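/* Averaging variant of the vertical filter: identical filtering, followed by
 * a rounding average (vrhaddq_u16) with the current destination rows before
 * the results are stored.
 */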
void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
                                        ptrdiff_t src_stride, uint16_t *dst,
                                        ptrdiff_t dst_stride,
                                        const InterpKernel *filter, int x0_q4,
                                        int x_step_q4, int y0_q4, int y_step_q4,
                                        int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                    bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3 * src_stride;

    if (w == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23, t01, t23;

      s0 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s1 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s2 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s3 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s4 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s5 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s6 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;

      do {
        s7 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s8 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s9 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s10 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;

        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        t01 = vminq_u16(t01, max);
        t23 = vminq_u16(t23, max);

        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                           vld1_u16(dst + 1 * dst_stride));
        d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
                           vld1_u16(dst + 3 * dst_stride));
        d01 = vrhaddq_u16(d01, t01);
        d23 = vrhaddq_u16(d23, t23);

        vst1_u16(dst, vget_low_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_low_u16(d23));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d23));
        dst += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        h -= 4;
      } while (h > 0);
    } else {
      int height;
      const uint16_t *s;
      uint16_t *d;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        s = src;
        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        d = dst;
        height = h;

        do {
          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;

          __builtin_prefetch(d + 0 * dst_stride);
          __builtin_prefetch(d + 1 * dst_stride);
          __builtin_prefetch(d + 2 * dst_stride);
          __builtin_prefetch(d + 3 * dst_stride);
          __builtin_prefetch(s + 0 * src_stride);
          __builtin_prefetch(s + 1 * src_stride);
          __builtin_prefetch(s + 2 * src_stride);
          __builtin_prefetch(s + 3 * src_stride);
          t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          t3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          d0 = vld1q_u16(d + 0 * dst_stride);
          d1 = vld1q_u16(d + 1 * dst_stride);
          d2 = vld1q_u16(d + 2 * dst_stride);
          d3 = vld1q_u16(d + 3 * dst_stride);
          d0 = vrhaddq_u16(d0, t0);
          d1 = vrhaddq_u16(d1, t1);
          d2 = vrhaddq_u16(d2, t2);
          d3 = vrhaddq_u16(d3, t3);

          vst1q_u16(d, d0);
          d += dst_stride;
          vst1q_u16(d, d1);
          d += dst_stride;
          vst1q_u16(d, d2);
          d += dst_stride;
          vst1q_u16(d, d3);
          d += dst_stride;

          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
        } while (height > 0);
        src += 8;
        dst += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}