vpx_convolve8_neon.c

/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"

// Note:
// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
// 2. After the shared code in the kernel loops was refactored into inline
// functions, decoder speed dropped significantly when built with gcc, so
// those parts have not been refactored for now.
// 3. For the horizontal convolve there is an alternative optimization that
// convolves a single row per loop iteration. For each row, 8 sample banks of
// 4 or 8 samples each are either read from memory at src, (src + 1),
// (src + 2), (src + 3), (src + 4), (src + 5), (src + 6), (src + 7), or
// prepared with vector extract instructions. That version is much faster in
// the speed unit test but slowed the whole decoder down by 5%.
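
// Store eight consecutive 8-byte rows (s0..s7) to s, advancing by the row
// pitch p after each store.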
static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
                                const uint8x8_t s0, const uint8x8_t s1,
                                const uint8x8_t s2, const uint8x8_t s3,
                                const uint8x8_t s4, const uint8x8_t s5,
                                const uint8x8_t s6, const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}
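
// Horizontal 8-tap convolution. The filter phase is fixed (x_step_q4 == 16),
// so every output pixel uses the kernel selected by x0_q4. Source rows are
// transposed so the horizontal taps land in vector lanes, filtered, then
// transposed back before being stored.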
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;
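
  // 4-row blocks: produce a 4x4 output tile per iteration and step across the
  // width, keeping a sliding window of seven transposed source columns
  // (s0..s6).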
  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
        d0, d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);

      vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
                    vreinterpret_u32_u8(d01), 0);
      vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
                    vreinterpret_u32_u8(d23), 0);
      vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
                    vreinterpret_u32_u8(d01), 1);
      vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
                    vreinterpret_u32_u8(d23), 1);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
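    // Taller blocks: process 8 rows per pass. 4-wide blocks use 8x4/4x8
    // transposes; wider blocks also step across the width 8 columns at a time
    // with 8x8 transposes.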
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    if (w == 4) {
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        transpose_u8_8x4(&t0, &t1, &t2, &t3);
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
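      // Wide blocks: for each 8-row strip, walk across the width 8 columns at
      // a time, reusing the last seven transposed columns (s0..s6) between
      // iterations.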
      uint8_t *d;
      int16x8_t s11, s12, s13, s14;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
                           filter3, filter4);
          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}
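
// Like vpx_convolve8_horiz_neon(), but the filtered result is averaged with
// the pixels already in dst using a rounding halving add (vrhaddq_u8).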
void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h) {
  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;
  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
        d0, d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);
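
      // Load dst rows 0-3 into lanes ordered to match the transposed output
      // (rows 0, 2, 1, 3), average, then store back in the same lane order.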
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    if (w == 4) {
      uint32x4_t d0415 = vdupq_n_u32(0);
      uint32x4_t d2637 = vdupq_n_u32(0);

      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        transpose_u8_8x4(&t0, &t1, &t2, &t3);

        d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
        d0415 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
        d2637 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));

        vst1q_lane_u32((uint32_t *)dst, d0415, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 3);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 3);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      int16x8_t s11, s12, s13, s14;
      uint8x16_t d01, d23, d45, d67;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
                           filter3, filter4);
          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

          d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                            vld1_u8(d + 1 * dst_stride));
          d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                            vld1_u8(d + 3 * dst_stride));
          d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
                            vld1_u8(d + 5 * dst_stride));
          d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
                            vld1_u8(d + 7 * dst_stride));
          d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
          d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
          d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
          d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));

          store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
                       vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
                       vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}
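
// Vertical 8-tap convolution with a fixed phase (y_step_q4 == 16). A sliding
// window of seven source rows (s0..s6) is kept in registers; each iteration
// loads four more rows and produces four output rows.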
void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;
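
  // 4-wide blocks are handled with 64-bit vectors, writing four 4-byte rows
  // per iteration; wider blocks are processed in 8-column strips below.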
  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
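    // Wider blocks: 8 columns at a time, sliding the 7-row window down each
    // column strip and emitting 4 rows per inner iteration.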
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0, t1, t2, t3;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        vst1_u8(d, t0);
        d += dst_stride;
        vst1_u8(d, t1);
        d += dst_stride;
        vst1_u8(d, t2);
        d += dst_stride;
        vst1_u8(d, t3);
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}
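
// Like vpx_convolve8_vert_neon(), but the filtered result is averaged with
// the pixels already in dst using a rounding halving add (vrhaddq_u8).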
void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);

      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)dst, d0123, 0);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 1);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 2);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 3);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0, t1, t2, t3;
    uint8x16_t d01, d23, dd01, dd23;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        d01 = vcombine_u8(t0, t1);
        d23 = vcombine_u8(t2, t3);
        dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                           vld1_u8(d + 1 * dst_stride));
        dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                           vld1_u8(d + 3 * dst_stride));
        dd01 = vrhaddq_u8(dd01, d01);
        dd23 = vrhaddq_u8(dd23, d23);

        vst1_u8(d, vget_low_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_low_u8(dd23));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd23));
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}