2
0

scale_neon.cc 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
  10. #include "libyuv/row.h"
  11. #ifdef __cplusplus
  12. namespace libyuv {
  13. extern "C" {
  14. #endif
  15. // This module is for GCC Neon.
  16. #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
  17. !defined(__aarch64__)
  18. // NEON downscalers with interpolation.
  19. // Provided by Fritz Koenig
  20. // Read 32x1 throw away even pixels, and write 16x1.
  21. void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  22. uint8* dst, int dst_width) {
  23. asm volatile (
  24. "1: \n"
  25. // load even pixels into q0, odd into q1
  26. MEMACCESS(0)
  27. "vld2.8 {q0, q1}, [%0]! \n"
  28. "subs %2, %2, #16 \n" // 16 processed per loop
  29. MEMACCESS(1)
  30. "vst1.8 {q1}, [%1]! \n" // store odd pixels
  31. "bgt 1b \n"
  32. : "+r"(src_ptr), // %0
  33. "+r"(dst), // %1
  34. "+r"(dst_width) // %2
  35. :
  36. : "q0", "q1" // Clobber List
  37. );
  38. }
  39. // Read 32x1 average down and write 16x1.
  40. void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  41. uint8* dst, int dst_width) {
  42. asm volatile (
  43. "1: \n"
  44. MEMACCESS(0)
  45. "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
  46. "subs %2, %2, #16 \n" // 16 processed per loop
  47. "vpaddl.u8 q0, q0 \n" // add adjacent
  48. "vpaddl.u8 q1, q1 \n"
  49. "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
  50. "vrshrn.u16 d1, q1, #1 \n"
  51. MEMACCESS(1)
  52. "vst1.8 {q0}, [%1]! \n"
  53. "bgt 1b \n"
  54. : "+r"(src_ptr), // %0
  55. "+r"(dst), // %1
  56. "+r"(dst_width) // %2
  57. :
  58. : "q0", "q1" // Clobber List
  59. );
  60. }
  61. // Read 32x2 average down and write 16x1.
  62. void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  63. uint8* dst, int dst_width) {
  64. asm volatile (
  65. // change the stride to row 2 pointer
  66. "add %1, %0 \n"
  67. "1: \n"
  68. MEMACCESS(0)
  69. "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
  70. MEMACCESS(1)
  71. "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
  72. "subs %3, %3, #16 \n" // 16 processed per loop
  73. "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
  74. "vpaddl.u8 q1, q1 \n"
  75. "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
  76. "vpadal.u8 q1, q3 \n"
  77. "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
  78. "vrshrn.u16 d1, q1, #2 \n"
  79. MEMACCESS(2)
  80. "vst1.8 {q0}, [%2]! \n"
  81. "bgt 1b \n"
  82. : "+r"(src_ptr), // %0
  83. "+r"(src_stride), // %1
  84. "+r"(dst), // %2
  85. "+r"(dst_width) // %3
  86. :
  87. : "q0", "q1", "q2", "q3" // Clobber List
  88. );
  89. }
  90. void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  91. uint8* dst_ptr, int dst_width) {
  92. asm volatile (
  93. "1: \n"
  94. MEMACCESS(0)
  95. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
  96. "subs %2, %2, #8 \n" // 8 processed per loop
  97. MEMACCESS(1)
  98. "vst1.8 {d2}, [%1]! \n"
  99. "bgt 1b \n"
  100. : "+r"(src_ptr), // %0
  101. "+r"(dst_ptr), // %1
  102. "+r"(dst_width) // %2
  103. :
  104. : "q0", "q1", "memory", "cc"
  105. );
  106. }
  107. void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  108. uint8* dst_ptr, int dst_width) {
  109. const uint8* src_ptr1 = src_ptr + src_stride;
  110. const uint8* src_ptr2 = src_ptr + src_stride * 2;
  111. const uint8* src_ptr3 = src_ptr + src_stride * 3;
  112. asm volatile (
  113. "1: \n"
  114. MEMACCESS(0)
  115. "vld1.8 {q0}, [%0]! \n" // load up 16x4
  116. MEMACCESS(3)
  117. "vld1.8 {q1}, [%3]! \n"
  118. MEMACCESS(4)
  119. "vld1.8 {q2}, [%4]! \n"
  120. MEMACCESS(5)
  121. "vld1.8 {q3}, [%5]! \n"
  122. "subs %2, %2, #4 \n"
  123. "vpaddl.u8 q0, q0 \n"
  124. "vpadal.u8 q0, q1 \n"
  125. "vpadal.u8 q0, q2 \n"
  126. "vpadal.u8 q0, q3 \n"
  127. "vpaddl.u16 q0, q0 \n"
  128. "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
  129. "vmovn.u16 d0, q0 \n"
  130. MEMACCESS(1)
  131. "vst1.32 {d0[0]}, [%1]! \n"
  132. "bgt 1b \n"
  133. : "+r"(src_ptr), // %0
  134. "+r"(dst_ptr), // %1
  135. "+r"(dst_width), // %2
  136. "+r"(src_ptr1), // %3
  137. "+r"(src_ptr2), // %4
  138. "+r"(src_ptr3) // %5
  139. :
  140. : "q0", "q1", "q2", "q3", "memory", "cc"
  141. );
  142. }
  143. // Down scale from 4 to 3 pixels. Use the neon multilane read/write
  144. // to load up the every 4th pixel into a 4 different registers.
  145. // Point samples 32 pixels to 24 pixels.
  146. void ScaleRowDown34_NEON(const uint8* src_ptr,
  147. ptrdiff_t src_stride,
  148. uint8* dst_ptr, int dst_width) {
  149. asm volatile (
  150. "1: \n"
  151. MEMACCESS(0)
  152. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
  153. "subs %2, %2, #24 \n"
  154. "vmov d2, d3 \n" // order d0, d1, d2
  155. MEMACCESS(1)
  156. "vst3.8 {d0, d1, d2}, [%1]! \n"
  157. "bgt 1b \n"
  158. : "+r"(src_ptr), // %0
  159. "+r"(dst_ptr), // %1
  160. "+r"(dst_width) // %2
  161. :
  162. : "d0", "d1", "d2", "d3", "memory", "cc"
  163. );
  164. }
  165. void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
  166. ptrdiff_t src_stride,
  167. uint8* dst_ptr, int dst_width) {
  168. asm volatile (
  169. "vmov.u8 d24, #3 \n"
  170. "add %3, %0 \n"
  171. "1: \n"
  172. MEMACCESS(0)
  173. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
  174. MEMACCESS(3)
  175. "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
  176. "subs %2, %2, #24 \n"
  177. // filter src line 0 with src line 1
  178. // expand chars to shorts to allow for room
  179. // when adding lines together
  180. "vmovl.u8 q8, d4 \n"
  181. "vmovl.u8 q9, d5 \n"
  182. "vmovl.u8 q10, d6 \n"
  183. "vmovl.u8 q11, d7 \n"
  184. // 3 * line_0 + line_1
  185. "vmlal.u8 q8, d0, d24 \n"
  186. "vmlal.u8 q9, d1, d24 \n"
  187. "vmlal.u8 q10, d2, d24 \n"
  188. "vmlal.u8 q11, d3, d24 \n"
  189. // (3 * line_0 + line_1) >> 2
  190. "vqrshrn.u16 d0, q8, #2 \n"
  191. "vqrshrn.u16 d1, q9, #2 \n"
  192. "vqrshrn.u16 d2, q10, #2 \n"
  193. "vqrshrn.u16 d3, q11, #2 \n"
  194. // a0 = (src[0] * 3 + s[1] * 1) >> 2
  195. "vmovl.u8 q8, d1 \n"
  196. "vmlal.u8 q8, d0, d24 \n"
  197. "vqrshrn.u16 d0, q8, #2 \n"
  198. // a1 = (src[1] * 1 + s[2] * 1) >> 1
  199. "vrhadd.u8 d1, d1, d2 \n"
  200. // a2 = (src[2] * 1 + s[3] * 3) >> 2
  201. "vmovl.u8 q8, d2 \n"
  202. "vmlal.u8 q8, d3, d24 \n"
  203. "vqrshrn.u16 d2, q8, #2 \n"
  204. MEMACCESS(1)
  205. "vst3.8 {d0, d1, d2}, [%1]! \n"
  206. "bgt 1b \n"
  207. : "+r"(src_ptr), // %0
  208. "+r"(dst_ptr), // %1
  209. "+r"(dst_width), // %2
  210. "+r"(src_stride) // %3
  211. :
  212. : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  213. );
  214. }
  215. void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
  216. ptrdiff_t src_stride,
  217. uint8* dst_ptr, int dst_width) {
  218. asm volatile (
  219. "vmov.u8 d24, #3 \n"
  220. "add %3, %0 \n"
  221. "1: \n"
  222. MEMACCESS(0)
  223. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
  224. MEMACCESS(3)
  225. "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
  226. "subs %2, %2, #24 \n"
  227. // average src line 0 with src line 1
  228. "vrhadd.u8 q0, q0, q2 \n"
  229. "vrhadd.u8 q1, q1, q3 \n"
  230. // a0 = (src[0] * 3 + s[1] * 1) >> 2
  231. "vmovl.u8 q3, d1 \n"
  232. "vmlal.u8 q3, d0, d24 \n"
  233. "vqrshrn.u16 d0, q3, #2 \n"
  234. // a1 = (src[1] * 1 + s[2] * 1) >> 1
  235. "vrhadd.u8 d1, d1, d2 \n"
  236. // a2 = (src[2] * 1 + s[3] * 3) >> 2
  237. "vmovl.u8 q3, d2 \n"
  238. "vmlal.u8 q3, d3, d24 \n"
  239. "vqrshrn.u16 d2, q3, #2 \n"
  240. MEMACCESS(1)
  241. "vst3.8 {d0, d1, d2}, [%1]! \n"
  242. "bgt 1b \n"
  243. : "+r"(src_ptr), // %0
  244. "+r"(dst_ptr), // %1
  245. "+r"(dst_width), // %2
  246. "+r"(src_stride) // %3
  247. :
  248. : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  249. );
  250. }
  251. #define HAS_SCALEROWDOWN38_NEON
  252. static uvec8 kShuf38 =
  253. { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
  254. static uvec8 kShuf38_2 =
  255. { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
  256. static vec16 kMult38_Div6 =
  257. { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
  258. 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
  259. static vec16 kMult38_Div9 =
  260. { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
  261. 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
  262. // 32 -> 12
  263. void ScaleRowDown38_NEON(const uint8* src_ptr,
  264. ptrdiff_t src_stride,
  265. uint8* dst_ptr, int dst_width) {
  266. asm volatile (
  267. MEMACCESS(3)
  268. "vld1.8 {q3}, [%3] \n"
  269. "1: \n"
  270. MEMACCESS(0)
  271. "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
  272. "subs %2, %2, #12 \n"
  273. "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
  274. "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
  275. MEMACCESS(1)
  276. "vst1.8 {d4}, [%1]! \n"
  277. MEMACCESS(1)
  278. "vst1.32 {d5[0]}, [%1]! \n"
  279. "bgt 1b \n"
  280. : "+r"(src_ptr), // %0
  281. "+r"(dst_ptr), // %1
  282. "+r"(dst_width) // %2
  283. : "r"(&kShuf38) // %3
  284. : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  285. );
  286. }
  287. // 32x3 -> 12x1
  288. void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
  289. ptrdiff_t src_stride,
  290. uint8* dst_ptr, int dst_width) {
  291. const uint8* src_ptr1 = src_ptr + src_stride * 2;
  292. asm volatile (
  293. MEMACCESS(5)
  294. "vld1.16 {q13}, [%5] \n"
  295. MEMACCESS(6)
  296. "vld1.8 {q14}, [%6] \n"
  297. MEMACCESS(7)
  298. "vld1.8 {q15}, [%7] \n"
  299. "add %3, %0 \n"
  300. "1: \n"
  301. // d0 = 00 40 01 41 02 42 03 43
  302. // d1 = 10 50 11 51 12 52 13 53
  303. // d2 = 20 60 21 61 22 62 23 63
  304. // d3 = 30 70 31 71 32 72 33 73
  305. MEMACCESS(0)
  306. "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
  307. MEMACCESS(3)
  308. "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
  309. MEMACCESS(4)
  310. "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
  311. "subs %2, %2, #12 \n"
  312. // Shuffle the input data around to get align the data
  313. // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
  314. // d0 = 00 10 01 11 02 12 03 13
  315. // d1 = 40 50 41 51 42 52 43 53
  316. "vtrn.u8 d0, d1 \n"
  317. "vtrn.u8 d4, d5 \n"
  318. "vtrn.u8 d16, d17 \n"
  319. // d2 = 20 30 21 31 22 32 23 33
  320. // d3 = 60 70 61 71 62 72 63 73
  321. "vtrn.u8 d2, d3 \n"
  322. "vtrn.u8 d6, d7 \n"
  323. "vtrn.u8 d18, d19 \n"
  324. // d0 = 00+10 01+11 02+12 03+13
  325. // d2 = 40+50 41+51 42+52 43+53
  326. "vpaddl.u8 q0, q0 \n"
  327. "vpaddl.u8 q2, q2 \n"
  328. "vpaddl.u8 q8, q8 \n"
  329. // d3 = 60+70 61+71 62+72 63+73
  330. "vpaddl.u8 d3, d3 \n"
  331. "vpaddl.u8 d7, d7 \n"
  332. "vpaddl.u8 d19, d19 \n"
  333. // combine source lines
  334. "vadd.u16 q0, q2 \n"
  335. "vadd.u16 q0, q8 \n"
  336. "vadd.u16 d4, d3, d7 \n"
  337. "vadd.u16 d4, d19 \n"
  338. // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
  339. // + s[6 + st * 1] + s[7 + st * 1]
  340. // + s[6 + st * 2] + s[7 + st * 2]) / 6
  341. "vqrdmulh.s16 q2, q2, q13 \n"
  342. "vmovn.u16 d4, q2 \n"
  343. // Shuffle 2,3 reg around so that 2 can be added to the
  344. // 0,1 reg and 3 can be added to the 4,5 reg. This
  345. // requires expanding from u8 to u16 as the 0,1 and 4,5
  346. // registers are already expanded. Then do transposes
  347. // to get aligned.
  348. // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
  349. "vmovl.u8 q1, d2 \n"
  350. "vmovl.u8 q3, d6 \n"
  351. "vmovl.u8 q9, d18 \n"
  352. // combine source lines
  353. "vadd.u16 q1, q3 \n"
  354. "vadd.u16 q1, q9 \n"
  355. // d4 = xx 20 xx 30 xx 22 xx 32
  356. // d5 = xx 21 xx 31 xx 23 xx 33
  357. "vtrn.u32 d2, d3 \n"
  358. // d4 = xx 20 xx 21 xx 22 xx 23
  359. // d5 = xx 30 xx 31 xx 32 xx 33
  360. "vtrn.u16 d2, d3 \n"
  361. // 0+1+2, 3+4+5
  362. "vadd.u16 q0, q1 \n"
  363. // Need to divide, but can't downshift as the the value
  364. // isn't a power of 2. So multiply by 65536 / n
  365. // and take the upper 16 bits.
  366. "vqrdmulh.s16 q0, q0, q15 \n"
  367. // Align for table lookup, vtbl requires registers to
  368. // be adjacent
  369. "vmov.u8 d2, d4 \n"
  370. "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
  371. "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
  372. MEMACCESS(1)
  373. "vst1.8 {d3}, [%1]! \n"
  374. MEMACCESS(1)
  375. "vst1.32 {d4[0]}, [%1]! \n"
  376. "bgt 1b \n"
  377. : "+r"(src_ptr), // %0
  378. "+r"(dst_ptr), // %1
  379. "+r"(dst_width), // %2
  380. "+r"(src_stride), // %3
  381. "+r"(src_ptr1) // %4
  382. : "r"(&kMult38_Div6), // %5
  383. "r"(&kShuf38_2), // %6
  384. "r"(&kMult38_Div9) // %7
  385. : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
  386. );
  387. }
  388. // 32x2 -> 12x1
  389. void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
  390. ptrdiff_t src_stride,
  391. uint8* dst_ptr, int dst_width) {
  392. asm volatile (
  393. MEMACCESS(4)
  394. "vld1.16 {q13}, [%4] \n"
  395. MEMACCESS(5)
  396. "vld1.8 {q14}, [%5] \n"
  397. "add %3, %0 \n"
  398. "1: \n"
  399. // d0 = 00 40 01 41 02 42 03 43
  400. // d1 = 10 50 11 51 12 52 13 53
  401. // d2 = 20 60 21 61 22 62 23 63
  402. // d3 = 30 70 31 71 32 72 33 73
  403. MEMACCESS(0)
  404. "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
  405. MEMACCESS(3)
  406. "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
  407. "subs %2, %2, #12 \n"
  408. // Shuffle the input data around to get align the data
  409. // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
  410. // d0 = 00 10 01 11 02 12 03 13
  411. // d1 = 40 50 41 51 42 52 43 53
  412. "vtrn.u8 d0, d1 \n"
  413. "vtrn.u8 d4, d5 \n"
  414. // d2 = 20 30 21 31 22 32 23 33
  415. // d3 = 60 70 61 71 62 72 63 73
  416. "vtrn.u8 d2, d3 \n"
  417. "vtrn.u8 d6, d7 \n"
  418. // d0 = 00+10 01+11 02+12 03+13
  419. // d2 = 40+50 41+51 42+52 43+53
  420. "vpaddl.u8 q0, q0 \n"
  421. "vpaddl.u8 q2, q2 \n"
  422. // d3 = 60+70 61+71 62+72 63+73
  423. "vpaddl.u8 d3, d3 \n"
  424. "vpaddl.u8 d7, d7 \n"
  425. // combine source lines
  426. "vadd.u16 q0, q2 \n"
  427. "vadd.u16 d4, d3, d7 \n"
  428. // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
  429. "vqrshrn.u16 d4, q2, #2 \n"
  430. // Shuffle 2,3 reg around so that 2 can be added to the
  431. // 0,1 reg and 3 can be added to the 4,5 reg. This
  432. // requires expanding from u8 to u16 as the 0,1 and 4,5
  433. // registers are already expanded. Then do transposes
  434. // to get aligned.
  435. // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
  436. "vmovl.u8 q1, d2 \n"
  437. "vmovl.u8 q3, d6 \n"
  438. // combine source lines
  439. "vadd.u16 q1, q3 \n"
  440. // d4 = xx 20 xx 30 xx 22 xx 32
  441. // d5 = xx 21 xx 31 xx 23 xx 33
  442. "vtrn.u32 d2, d3 \n"
  443. // d4 = xx 20 xx 21 xx 22 xx 23
  444. // d5 = xx 30 xx 31 xx 32 xx 33
  445. "vtrn.u16 d2, d3 \n"
  446. // 0+1+2, 3+4+5
  447. "vadd.u16 q0, q1 \n"
  448. // Need to divide, but can't downshift as the the value
  449. // isn't a power of 2. So multiply by 65536 / n
  450. // and take the upper 16 bits.
  451. "vqrdmulh.s16 q0, q0, q13 \n"
  452. // Align for table lookup, vtbl requires registers to
  453. // be adjacent
  454. "vmov.u8 d2, d4 \n"
  455. "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
  456. "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
  457. MEMACCESS(1)
  458. "vst1.8 {d3}, [%1]! \n"
  459. MEMACCESS(1)
  460. "vst1.32 {d4[0]}, [%1]! \n"
  461. "bgt 1b \n"
  462. : "+r"(src_ptr), // %0
  463. "+r"(dst_ptr), // %1
  464. "+r"(dst_width), // %2
  465. "+r"(src_stride) // %3
  466. : "r"(&kMult38_Div6), // %4
  467. "r"(&kShuf38_2) // %5
  468. : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  469. );
  470. }
  471. void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  472. uint16* dst_ptr, int src_width, int src_height) {
  473. const uint8* src_tmp;
  474. asm volatile (
  475. "1: \n"
  476. "mov %0, %1 \n"
  477. "mov r12, %5 \n"
  478. "veor q2, q2, q2 \n"
  479. "veor q3, q3, q3 \n"
  480. "2: \n"
  481. // load 16 pixels into q0
  482. MEMACCESS(0)
  483. "vld1.8 {q0}, [%0], %3 \n"
  484. "vaddw.u8 q3, q3, d1 \n"
  485. "vaddw.u8 q2, q2, d0 \n"
  486. "subs r12, r12, #1 \n"
  487. "bgt 2b \n"
  488. MEMACCESS(2)
  489. "vst1.16 {q2, q3}, [%2]! \n" // store pixels
  490. "add %1, %1, #16 \n"
  491. "subs %4, %4, #16 \n" // 16 processed per loop
  492. "bgt 1b \n"
  493. : "=&r"(src_tmp), // %0
  494. "+r"(src_ptr), // %1
  495. "+r"(dst_ptr), // %2
  496. "+r"(src_stride), // %3
  497. "+r"(src_width), // %4
  498. "+r"(src_height) // %5
  499. :
  500. : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
  501. );
  502. }
  503. // TODO(Yang Zhang): Investigate less load instructions for
  504. // the x/dx stepping
  505. #define LOAD2_DATA8_LANE(n) \
  506. "lsr %5, %3, #16 \n" \
  507. "add %6, %1, %5 \n" \
  508. "add %3, %3, %4 \n" \
  509. MEMACCESS(6) \
  510. "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
  511. // The NEON version mimics this formula (from row_common.cc):
  512. // #define BLENDER(a, b, f) (uint8)((int)(a) +
  513. // ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
  514. void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
  515. int dst_width, int x, int dx) {
  516. int dx_offset[4] = {0, 1, 2, 3};
  517. int* tmp = dx_offset;
  518. const uint8* src_tmp = src_ptr;
  519. asm volatile (
  520. "vdup.32 q0, %3 \n" // x
  521. "vdup.32 q1, %4 \n" // dx
  522. "vld1.32 {q2}, [%5] \n" // 0 1 2 3
  523. "vshl.i32 q3, q1, #2 \n" // 4 * dx
  524. "vmul.s32 q1, q1, q2 \n"
  525. // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
  526. "vadd.s32 q1, q1, q0 \n"
  527. // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
  528. "vadd.s32 q2, q1, q3 \n"
  529. "vshl.i32 q0, q3, #1 \n" // 8 * dx
  530. "1: \n"
  531. LOAD2_DATA8_LANE(0)
  532. LOAD2_DATA8_LANE(1)
  533. LOAD2_DATA8_LANE(2)
  534. LOAD2_DATA8_LANE(3)
  535. LOAD2_DATA8_LANE(4)
  536. LOAD2_DATA8_LANE(5)
  537. LOAD2_DATA8_LANE(6)
  538. LOAD2_DATA8_LANE(7)
  539. "vmov q10, q1 \n"
  540. "vmov q11, q2 \n"
  541. "vuzp.16 q10, q11 \n"
  542. "vmovl.u8 q8, d6 \n"
  543. "vmovl.u8 q9, d7 \n"
  544. "vsubl.s16 q11, d18, d16 \n"
  545. "vsubl.s16 q12, d19, d17 \n"
  546. "vmovl.u16 q13, d20 \n"
  547. "vmovl.u16 q10, d21 \n"
  548. "vmul.s32 q11, q11, q13 \n"
  549. "vmul.s32 q12, q12, q10 \n"
  550. "vrshrn.s32 d18, q11, #16 \n"
  551. "vrshrn.s32 d19, q12, #16 \n"
  552. "vadd.s16 q8, q8, q9 \n"
  553. "vmovn.s16 d6, q8 \n"
  554. MEMACCESS(0)
  555. "vst1.8 {d6}, [%0]! \n" // store pixels
  556. "vadd.s32 q1, q1, q0 \n"
  557. "vadd.s32 q2, q2, q0 \n"
  558. "subs %2, %2, #8 \n" // 8 processed per loop
  559. "bgt 1b \n"
  560. : "+r"(dst_ptr), // %0
  561. "+r"(src_ptr), // %1
  562. "+r"(dst_width), // %2
  563. "+r"(x), // %3
  564. "+r"(dx), // %4
  565. "+r"(tmp), // %5
  566. "+r"(src_tmp) // %6
  567. :
  568. : "memory", "cc", "q0", "q1", "q2", "q3",
  569. "q8", "q9", "q10", "q11", "q12", "q13"
  570. );
  571. }
  572. #undef LOAD2_DATA8_LANE
  573. // 16x2 -> 16x1
  574. void ScaleFilterRows_NEON(uint8* dst_ptr,
  575. const uint8* src_ptr, ptrdiff_t src_stride,
  576. int dst_width, int source_y_fraction) {
  577. asm volatile (
  578. "cmp %4, #0 \n"
  579. "beq 100f \n"
  580. "add %2, %1 \n"
  581. "cmp %4, #64 \n"
  582. "beq 75f \n"
  583. "cmp %4, #128 \n"
  584. "beq 50f \n"
  585. "cmp %4, #192 \n"
  586. "beq 25f \n"
  587. "vdup.8 d5, %4 \n"
  588. "rsb %4, #256 \n"
  589. "vdup.8 d4, %4 \n"
  590. // General purpose row blend.
  591. "1: \n"
  592. MEMACCESS(1)
  593. "vld1.8 {q0}, [%1]! \n"
  594. MEMACCESS(2)
  595. "vld1.8 {q1}, [%2]! \n"
  596. "subs %3, %3, #16 \n"
  597. "vmull.u8 q13, d0, d4 \n"
  598. "vmull.u8 q14, d1, d4 \n"
  599. "vmlal.u8 q13, d2, d5 \n"
  600. "vmlal.u8 q14, d3, d5 \n"
  601. "vrshrn.u16 d0, q13, #8 \n"
  602. "vrshrn.u16 d1, q14, #8 \n"
  603. MEMACCESS(0)
  604. "vst1.8 {q0}, [%0]! \n"
  605. "bgt 1b \n"
  606. "b 99f \n"
  607. // Blend 25 / 75.
  608. "25: \n"
  609. MEMACCESS(1)
  610. "vld1.8 {q0}, [%1]! \n"
  611. MEMACCESS(2)
  612. "vld1.8 {q1}, [%2]! \n"
  613. "subs %3, %3, #16 \n"
  614. "vrhadd.u8 q0, q1 \n"
  615. "vrhadd.u8 q0, q1 \n"
  616. MEMACCESS(0)
  617. "vst1.8 {q0}, [%0]! \n"
  618. "bgt 25b \n"
  619. "b 99f \n"
  620. // Blend 50 / 50.
  621. "50: \n"
  622. MEMACCESS(1)
  623. "vld1.8 {q0}, [%1]! \n"
  624. MEMACCESS(2)
  625. "vld1.8 {q1}, [%2]! \n"
  626. "subs %3, %3, #16 \n"
  627. "vrhadd.u8 q0, q1 \n"
  628. MEMACCESS(0)
  629. "vst1.8 {q0}, [%0]! \n"
  630. "bgt 50b \n"
  631. "b 99f \n"
  632. // Blend 75 / 25.
  633. "75: \n"
  634. MEMACCESS(1)
  635. "vld1.8 {q1}, [%1]! \n"
  636. MEMACCESS(2)
  637. "vld1.8 {q0}, [%2]! \n"
  638. "subs %3, %3, #16 \n"
  639. "vrhadd.u8 q0, q1 \n"
  640. "vrhadd.u8 q0, q1 \n"
  641. MEMACCESS(0)
  642. "vst1.8 {q0}, [%0]! \n"
  643. "bgt 75b \n"
  644. "b 99f \n"
  645. // Blend 100 / 0 - Copy row unchanged.
  646. "100: \n"
  647. MEMACCESS(1)
  648. "vld1.8 {q0}, [%1]! \n"
  649. "subs %3, %3, #16 \n"
  650. MEMACCESS(0)
  651. "vst1.8 {q0}, [%0]! \n"
  652. "bgt 100b \n"
  653. "99: \n"
  654. MEMACCESS(0)
  655. "vst1.8 {d1[7]}, [%0] \n"
  656. : "+r"(dst_ptr), // %0
  657. "+r"(src_ptr), // %1
  658. "+r"(src_stride), // %2
  659. "+r"(dst_width), // %3
  660. "+r"(source_y_fraction) // %4
  661. :
  662. : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  663. );
  664. }
  665. void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  666. uint8* dst, int dst_width) {
  667. asm volatile (
  668. "1: \n"
  669. // load even pixels into q0, odd into q1
  670. MEMACCESS(0)
  671. "vld2.32 {q0, q1}, [%0]! \n"
  672. MEMACCESS(0)
  673. "vld2.32 {q2, q3}, [%0]! \n"
  674. "subs %2, %2, #8 \n" // 8 processed per loop
  675. MEMACCESS(1)
  676. "vst1.8 {q1}, [%1]! \n" // store odd pixels
  677. MEMACCESS(1)
  678. "vst1.8 {q3}, [%1]! \n"
  679. "bgt 1b \n"
  680. : "+r"(src_ptr), // %0
  681. "+r"(dst), // %1
  682. "+r"(dst_width) // %2
  683. :
  684. : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
  685. );
  686. }
  687. void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
  688. uint8* dst_argb, int dst_width) {
  689. asm volatile (
  690. "1: \n"
  691. MEMACCESS(0)
  692. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
  693. MEMACCESS(0)
  694. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
  695. "subs %2, %2, #8 \n" // 8 processed per loop
  696. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
  697. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
  698. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
  699. "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
  700. "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
  701. "vrshrn.u16 d1, q1, #1 \n"
  702. "vrshrn.u16 d2, q2, #1 \n"
  703. "vrshrn.u16 d3, q3, #1 \n"
  704. MEMACCESS(1)
  705. "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
  706. "bgt 1b \n"
  707. : "+r"(src_argb), // %0
  708. "+r"(dst_argb), // %1
  709. "+r"(dst_width) // %2
  710. :
  711. : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
  712. );
  713. }
  714. void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  715. uint8* dst, int dst_width) {
  716. asm volatile (
  717. // change the stride to row 2 pointer
  718. "add %1, %1, %0 \n"
  719. "1: \n"
  720. MEMACCESS(0)
  721. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
  722. MEMACCESS(0)
  723. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
  724. "subs %3, %3, #8 \n" // 8 processed per loop.
  725. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
  726. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
  727. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
  728. "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
  729. MEMACCESS(1)
  730. "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
  731. MEMACCESS(1)
  732. "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
  733. "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
  734. "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
  735. "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
  736. "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
  737. "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
  738. "vrshrn.u16 d1, q1, #2 \n"
  739. "vrshrn.u16 d2, q2, #2 \n"
  740. "vrshrn.u16 d3, q3, #2 \n"
  741. MEMACCESS(2)
  742. "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
  743. "bgt 1b \n"
  744. : "+r"(src_ptr), // %0
  745. "+r"(src_stride), // %1
  746. "+r"(dst), // %2
  747. "+r"(dst_width) // %3
  748. :
  749. : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  750. );
  751. }
  752. // Reads 4 pixels at a time.
  753. // Alignment requirement: src_argb 4 byte aligned.
  754. void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
  755. int src_stepx, uint8* dst_argb, int dst_width) {
  756. asm volatile (
  757. "mov r12, %3, lsl #2 \n"
  758. "1: \n"
  759. MEMACCESS(0)
  760. "vld1.32 {d0[0]}, [%0], r12 \n"
  761. MEMACCESS(0)
  762. "vld1.32 {d0[1]}, [%0], r12 \n"
  763. MEMACCESS(0)
  764. "vld1.32 {d1[0]}, [%0], r12 \n"
  765. MEMACCESS(0)
  766. "vld1.32 {d1[1]}, [%0], r12 \n"
  767. "subs %2, %2, #4 \n" // 4 pixels per loop.
  768. MEMACCESS(1)
  769. "vst1.8 {q0}, [%1]! \n"
  770. "bgt 1b \n"
  771. : "+r"(src_argb), // %0
  772. "+r"(dst_argb), // %1
  773. "+r"(dst_width) // %2
  774. : "r"(src_stepx) // %3
  775. : "memory", "cc", "r12", "q0"
  776. );
  777. }
  778. // Reads 4 pixels at a time.
  779. // Alignment requirement: src_argb 4 byte aligned.
  780. void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
  781. int src_stepx,
  782. uint8* dst_argb, int dst_width) {
  783. asm volatile (
  784. "mov r12, %4, lsl #2 \n"
  785. "add %1, %1, %0 \n"
  786. "1: \n"
  787. MEMACCESS(0)
  788. "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
  789. MEMACCESS(1)
  790. "vld1.8 {d1}, [%1], r12 \n"
  791. MEMACCESS(0)
  792. "vld1.8 {d2}, [%0], r12 \n"
  793. MEMACCESS(1)
  794. "vld1.8 {d3}, [%1], r12 \n"
  795. MEMACCESS(0)
  796. "vld1.8 {d4}, [%0], r12 \n"
  797. MEMACCESS(1)
  798. "vld1.8 {d5}, [%1], r12 \n"
  799. MEMACCESS(0)
  800. "vld1.8 {d6}, [%0], r12 \n"
  801. MEMACCESS(1)
  802. "vld1.8 {d7}, [%1], r12 \n"
  803. "vaddl.u8 q0, d0, d1 \n"
  804. "vaddl.u8 q1, d2, d3 \n"
  805. "vaddl.u8 q2, d4, d5 \n"
  806. "vaddl.u8 q3, d6, d7 \n"
  807. "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
  808. "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
  809. "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
  810. "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
  811. "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
  812. "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
  813. "subs %3, %3, #4 \n" // 4 pixels per loop.
  814. MEMACCESS(2)
  815. "vst1.8 {q0}, [%2]! \n"
  816. "bgt 1b \n"
  817. : "+r"(src_argb), // %0
  818. "+r"(src_stride), // %1
  819. "+r"(dst_argb), // %2
  820. "+r"(dst_width) // %3
  821. : "r"(src_stepx) // %4
  822. : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  823. );
  824. }
  825. // TODO(Yang Zhang): Investigate less load instructions for
  826. // the x/dx stepping
  827. #define LOAD1_DATA32_LANE(dn, n) \
  828. "lsr %5, %3, #16 \n" \
  829. "add %6, %1, %5, lsl #2 \n" \
  830. "add %3, %3, %4 \n" \
  831. MEMACCESS(6) \
  832. "vld1.32 {"#dn"["#n"]}, [%6] \n"
  833. void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
  834. int dst_width, int x, int dx) {
  835. int tmp;
  836. const uint8* src_tmp = src_argb;
  837. asm volatile (
  838. "1: \n"
  839. LOAD1_DATA32_LANE(d0, 0)
  840. LOAD1_DATA32_LANE(d0, 1)
  841. LOAD1_DATA32_LANE(d1, 0)
  842. LOAD1_DATA32_LANE(d1, 1)
  843. LOAD1_DATA32_LANE(d2, 0)
  844. LOAD1_DATA32_LANE(d2, 1)
  845. LOAD1_DATA32_LANE(d3, 0)
  846. LOAD1_DATA32_LANE(d3, 1)
  847. MEMACCESS(0)
  848. "vst1.32 {q0, q1}, [%0]! \n" // store pixels
  849. "subs %2, %2, #8 \n" // 8 processed per loop
  850. "bgt 1b \n"
  851. : "+r"(dst_argb), // %0
  852. "+r"(src_argb), // %1
  853. "+r"(dst_width), // %2
  854. "+r"(x), // %3
  855. "+r"(dx), // %4
  856. "=&r"(tmp), // %5
  857. "+r"(src_tmp) // %6
  858. :
  859. : "memory", "cc", "q0", "q1"
  860. );
  861. }
  862. #undef LOAD1_DATA32_LANE
  863. // TODO(Yang Zhang): Investigate less load instructions for
  864. // the x/dx stepping
  865. #define LOAD2_DATA32_LANE(dn1, dn2, n) \
  866. "lsr %5, %3, #16 \n" \
  867. "add %6, %1, %5, lsl #2 \n" \
  868. "add %3, %3, %4 \n" \
  869. MEMACCESS(6) \
  870. "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
  871. void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
  872. int dst_width, int x, int dx) {
  873. int dx_offset[4] = {0, 1, 2, 3};
  874. int* tmp = dx_offset;
  875. const uint8* src_tmp = src_argb;
  876. asm volatile (
  877. "vdup.32 q0, %3 \n" // x
  878. "vdup.32 q1, %4 \n" // dx
  879. "vld1.32 {q2}, [%5] \n" // 0 1 2 3
  880. "vshl.i32 q9, q1, #2 \n" // 4 * dx
  881. "vmul.s32 q1, q1, q2 \n"
  882. "vmov.i8 q3, #0x7f \n" // 0x7F
  883. "vmov.i16 q15, #0x7f \n" // 0x7F
  884. // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
  885. "vadd.s32 q8, q1, q0 \n"
  886. "1: \n"
  887. // d0, d1: a
  888. // d2, d3: b
  889. LOAD2_DATA32_LANE(d0, d2, 0)
  890. LOAD2_DATA32_LANE(d0, d2, 1)
  891. LOAD2_DATA32_LANE(d1, d3, 0)
  892. LOAD2_DATA32_LANE(d1, d3, 1)
  893. "vshrn.i32 d22, q8, #9 \n"
  894. "vand.16 d22, d22, d30 \n"
  895. "vdup.8 d24, d22[0] \n"
  896. "vdup.8 d25, d22[2] \n"
  897. "vdup.8 d26, d22[4] \n"
  898. "vdup.8 d27, d22[6] \n"
  899. "vext.8 d4, d24, d25, #4 \n"
  900. "vext.8 d5, d26, d27, #4 \n" // f
  901. "veor.8 q10, q2, q3 \n" // 0x7f ^ f
  902. "vmull.u8 q11, d0, d20 \n"
  903. "vmull.u8 q12, d1, d21 \n"
  904. "vmull.u8 q13, d2, d4 \n"
  905. "vmull.u8 q14, d3, d5 \n"
  906. "vadd.i16 q11, q11, q13 \n"
  907. "vadd.i16 q12, q12, q14 \n"
  908. "vshrn.i16 d0, q11, #7 \n"
  909. "vshrn.i16 d1, q12, #7 \n"
  910. MEMACCESS(0)
  911. "vst1.32 {d0, d1}, [%0]! \n" // store pixels
  912. "vadd.s32 q8, q8, q9 \n"
  913. "subs %2, %2, #4 \n" // 4 processed per loop
  914. "bgt 1b \n"
  915. : "+r"(dst_argb), // %0
  916. "+r"(src_argb), // %1
  917. "+r"(dst_width), // %2
  918. "+r"(x), // %3
  919. "+r"(dx), // %4
  920. "+r"(tmp), // %5
  921. "+r"(src_tmp) // %6
  922. :
  923. : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
  924. "q10", "q11", "q12", "q13", "q14", "q15"
  925. );
  926. }
  927. #undef LOAD2_DATA32_LANE
  928. #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
  929. #ifdef __cplusplus
  930. } // extern "C"
  931. } // namespace libyuv
  932. #endif