row_neon64.cc 133 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884
  1. /*
  2. * Copyright 2014 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/row.h"
  11. #ifdef __cplusplus
  12. namespace libyuv {
  13. extern "C" {
  14. #endif
  15. // This module is for GCC Neon armv8 64 bit.
  16. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
  17. // Read 8 Y, 4 U and 4 V from 422: Y lands in v0, U in v1.s[0], V in v1.s[1].
  18. #define READYUV422 \
  19. "ld1 {v0.8b}, [%0], #8 \n" /* 8 bytes from operand %0, pointer += 8 */ \
  20. "ld1 {v1.s}[0], [%1], #4 \n" /* 4 bytes from operand %1, pointer += 4 */ \
  21. "ld1 {v1.s}[1], [%2], #4 \n" /* 4 bytes from operand %2, pointer += 4 */
  22. // Read 8 Y, 8 U and 8 V from 444, then average each adjacent pair of chroma
  // bytes (with rounding) so the low 8 bytes of v1 hold 4 U followed by 4 V.
  23. #define READYUV444 \
  24. "ld1 {v0.8b}, [%0], #8 \n" /* 8 Y bytes into v0 */ \
  25. "ld1 {v1.d}[0], [%1], #8 \n" /* 8 bytes from operand %1 into low half */ \
  26. "ld1 {v1.d}[1], [%2], #8 \n" /* 8 bytes from operand %2 into high half */ \
  27. "uaddlp v1.8h, v1.16b \n" /* sum adjacent byte pairs into 16 bit lanes */ \
  28. "rshrn v1.8b, v1.8h, #1 \n" /* rounding halve, narrow back to bytes */
  29. // Read 8 Y into v0, and set all 8 chroma bytes of v1 (4 U and 4 V) to 128.
  30. #define READYUV400 \
  31. "ld1 {v0.8b}, [%0], #8 \n" /* 8 Y bytes, pointer += 8 */ \
  32. "movi v1.8b , #128 \n" /* constant chroma, no memory read */
  33. // Read 8 Y and 4 UV from NV12
  34. #define READNV12 \
  35. "ld1 {v0.8b}, [%0], #8 \n" \
  36. "ld1 {v2.8b}, [%1], #8 \n" \
  37. "uzp1 v1.8b, v2.8b, v2.8b \n" \
  38. "uzp2 v3.8b, v2.8b, v2.8b \n" \
  39. "ins v1.s[1], v3.s[0] \n"
  40. // Read 8 Y and 4 VU from NV21
  41. #define READNV21 \
  42. "ld1 {v0.8b}, [%0], #8 \n" \
  43. "ld1 {v2.8b}, [%1], #8 \n" \
  44. "uzp1 v3.8b, v2.8b, v2.8b \n" \
  45. "uzp2 v1.8b, v2.8b, v2.8b \n" \
  46. "ins v1.s[1], v3.s[0] \n"
  47. // Read 8 YUY2
  48. #define READYUY2 \
  49. "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
  50. "uzp2 v3.8b, v1.8b, v1.8b \n" \
  51. "uzp1 v1.8b, v1.8b, v1.8b \n" \
  52. "ins v1.s[1], v3.s[0] \n"
  53. // Read 8 UYVY
  54. #define READUYVY \
  55. "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
  56. "orr v0.8b, v3.8b, v3.8b \n" \
  57. "uzp1 v1.8b, v2.8b, v2.8b \n" \
  58. "uzp2 v3.8b, v2.8b, v2.8b \n" \
  59. "ins v1.s[1], v3.s[0] \n"
  60. #define YUVTORGB_SETUP \
  61. "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
  62. "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
  63. "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
  64. "ld1r {v31.4s}, [%[kYToRgb]] \n" \
  65. "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
  66. "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
  67. #define YUVTORGB(vR, vG, vB) \
  68. "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
  69. "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
  70. "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
  71. "ushll v0.4s, v0.4h, #0 \n" \
  72. "mul v3.4s, v3.4s, v31.4s \n" \
  73. "mul v0.4s, v0.4s, v31.4s \n" \
  74. "sqshrun v0.4h, v0.4s, #16 \n" \
  75. "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
  76. "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
  77. "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
  78. "uxtl v2.8h, v2.8b \n" \
  79. "uxtl v1.8h, v1.8b \n" /* Extract U */ \
  80. "mul v3.8h, v1.8h, v27.8h \n" \
  81. "mul v5.8h, v1.8h, v29.8h \n" \
  82. "mul v6.8h, v2.8h, v30.8h \n" \
  83. "mul v7.8h, v2.8h, v28.8h \n" \
  84. "sqadd v6.8h, v6.8h, v5.8h \n" \
  85. "sqadd " #vB \
  86. ".8h, v24.8h, v0.8h \n" /* B */ \
  87. "sqadd " #vG \
  88. ".8h, v25.8h, v0.8h \n" /* G */ \
  89. "sqadd " #vR \
  90. ".8h, v26.8h, v0.8h \n" /* R */ \
  91. "sqadd " #vB ".8h, " #vB \
  92. ".8h, v3.8h \n" /* B */ \
  93. "sqsub " #vG ".8h, " #vG \
  94. ".8h, v6.8h \n" /* G */ \
  95. "sqadd " #vR ".8h, " #vR \
  96. ".8h, v7.8h \n" /* R */ \
  97. "sqshrun " #vB ".8b, " #vB \
  98. ".8h, #6 \n" /* B */ \
  99. "sqshrun " #vG ".8b, " #vG \
  100. ".8h, #6 \n" /* G */ \
  101. "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
  102. void I444ToARGBRow_NEON(const uint8_t* src_y,
  103. const uint8_t* src_u,
  104. const uint8_t* src_v,
  105. uint8_t* dst_argb,
  106. const struct YuvConstants* yuvconstants,
  107. int width) {
  108. asm volatile (
  109. YUVTORGB_SETUP
  110. "movi v23.8b, #255 \n" /* A */
  111. "1: \n"
  112. READYUV444
  113. YUVTORGB(v22, v21, v20)
  114. "subs %w4, %w4, #8 \n"
  115. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
  116. "b.gt 1b \n"
  117. : "+r"(src_y), // %0
  118. "+r"(src_u), // %1
  119. "+r"(src_v), // %2
  120. "+r"(dst_argb), // %3
  121. "+r"(width) // %4
  122. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  123. [kUVToG]"r"(&yuvconstants->kUVToG),
  124. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  125. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  126. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  127. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  128. );
  129. }
  130. void I422ToARGBRow_NEON(const uint8_t* src_y,
  131. const uint8_t* src_u,
  132. const uint8_t* src_v,
  133. uint8_t* dst_argb,
  134. const struct YuvConstants* yuvconstants,
  135. int width) {
  136. asm volatile (
  137. YUVTORGB_SETUP
  138. "movi v23.8b, #255 \n" /* A */
  139. "1: \n"
  140. READYUV422
  141. YUVTORGB(v22, v21, v20)
  142. "subs %w4, %w4, #8 \n"
  143. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
  144. "b.gt 1b \n"
  145. : "+r"(src_y), // %0
  146. "+r"(src_u), // %1
  147. "+r"(src_v), // %2
  148. "+r"(dst_argb), // %3
  149. "+r"(width) // %4
  150. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  151. [kUVToG]"r"(&yuvconstants->kUVToG),
  152. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  153. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  154. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  155. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  156. );
  157. }
  158. void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
  159. const uint8_t* src_u,
  160. const uint8_t* src_v,
  161. const uint8_t* src_a,
  162. uint8_t* dst_argb,
  163. const struct YuvConstants* yuvconstants,
  164. int width) {
  165. asm volatile (
  166. YUVTORGB_SETUP
  167. "1: \n"
  168. READYUV422
  169. YUVTORGB(v22, v21, v20)
  170. "ld1 {v23.8b}, [%3], #8 \n"
  171. "subs %w5, %w5, #8 \n"
  172. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
  173. "b.gt 1b \n"
  174. : "+r"(src_y), // %0
  175. "+r"(src_u), // %1
  176. "+r"(src_v), // %2
  177. "+r"(src_a), // %3
  178. "+r"(dst_argb), // %4
  179. "+r"(width) // %5
  180. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  181. [kUVToG]"r"(&yuvconstants->kUVToG),
  182. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  183. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  184. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  185. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  186. );
  187. }
  188. void I422ToRGBARow_NEON(const uint8_t* src_y,
  189. const uint8_t* src_u,
  190. const uint8_t* src_v,
  191. uint8_t* dst_rgba,
  192. const struct YuvConstants* yuvconstants,
  193. int width) {
  194. asm volatile (
  195. YUVTORGB_SETUP
  196. "movi v20.8b, #255 \n" /* A */
  197. "1: \n"
  198. READYUV422
  199. YUVTORGB(v23, v22, v21)
  200. "subs %w4, %w4, #8 \n"
  201. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
  202. "b.gt 1b \n"
  203. : "+r"(src_y), // %0
  204. "+r"(src_u), // %1
  205. "+r"(src_v), // %2
  206. "+r"(dst_rgba), // %3
  207. "+r"(width) // %4
  208. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  209. [kUVToG]"r"(&yuvconstants->kUVToG),
  210. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  211. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  212. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  213. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  214. );
  215. }
  216. void I422ToRGB24Row_NEON(const uint8_t* src_y,
  217. const uint8_t* src_u,
  218. const uint8_t* src_v,
  219. uint8_t* dst_rgb24,
  220. const struct YuvConstants* yuvconstants,
  221. int width) {
  222. asm volatile (
  223. YUVTORGB_SETUP
  224. "1: \n"
  225. READYUV422
  226. YUVTORGB(v22, v21, v20)
  227. "subs %w4, %w4, #8 \n"
  228. "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
  229. "b.gt 1b \n"
  230. : "+r"(src_y), // %0
  231. "+r"(src_u), // %1
  232. "+r"(src_v), // %2
  233. "+r"(dst_rgb24), // %3
  234. "+r"(width) // %4
  235. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  236. [kUVToG]"r"(&yuvconstants->kUVToG),
  237. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  238. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  239. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  240. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  241. );
  242. }
  // ARGBTORGB565: pack 8 pixels from v22 (R), v21 (G), v20 (B) bytes into eight
  // 16-bit values in v0: R in bits 15..11, G in bits 10..5, B in bits 4..0.
  // v20 and v21 are overwritten with their widened (<< 8) values.
  243. #define ARGBTORGB565 \
  244. "shll v0.8h, v22.8b, #8 \n" /* R */ \
  245. "shll v21.8h, v21.8b, #8 \n" /* G */ \
  246. "shll v20.8h, v20.8b, #8 \n" /* B */ \
  247. "sri v0.8h, v21.8h, #5 \n" /* RG */ \
  248. "sri v0.8h, v20.8h, #11 \n" /* RGB */
  249. void I422ToRGB565Row_NEON(const uint8_t* src_y,
  250. const uint8_t* src_u,
  251. const uint8_t* src_v,
  252. uint8_t* dst_rgb565,
  253. const struct YuvConstants* yuvconstants,
  254. int width) {
  255. asm volatile(
  256. YUVTORGB_SETUP
  257. "1: \n" READYUV422 YUVTORGB(
  258. v22, v21,
  259. v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
  260. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
  261. // RGB565.
  262. "b.gt 1b \n"
  263. : "+r"(src_y), // %0
  264. "+r"(src_u), // %1
  265. "+r"(src_v), // %2
  266. "+r"(dst_rgb565), // %3
  267. "+r"(width) // %4
  268. : [kUVToRB] "r"(&yuvconstants->kUVToRB),
  269. [kUVToG] "r"(&yuvconstants->kUVToG),
  270. [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
  271. [kYToRgb] "r"(&yuvconstants->kYToRgb)
  272. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  273. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
  274. }
  // ARGBTOARGB1555: pack 8 pixels from v23 (A), v22 (R), v21 (G), v20 (B) bytes
  // into eight 16-bit values in v0: A in bit 15, R in bits 14..10, G in bits
  // 9..5, B in bits 4..0. v20, v21 and v22 are overwritten with their widened
  // (<< 8) values.
  275. #define ARGBTOARGB1555 \
  276. "shll v0.8h, v23.8b, #8 \n" /* A */ \
  277. "shll v22.8h, v22.8b, #8 \n" /* R */ \
  278. "shll v21.8h, v21.8b, #8 \n" /* G */ \
  279. "shll v20.8h, v20.8b, #8 \n" /* B */ \
  280. "sri v0.8h, v22.8h, #1 \n" /* AR */ \
  281. "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
  282. "sri v0.8h, v20.8h, #11 \n" /* ARGB */
  283. void I422ToARGB1555Row_NEON(const uint8_t* src_y,
  284. const uint8_t* src_u,
  285. const uint8_t* src_v,
  286. uint8_t* dst_argb1555,
  287. const struct YuvConstants* yuvconstants,
  288. int width) {
  289. asm volatile(
  290. YUVTORGB_SETUP
  291. "movi v23.8b, #255 \n"
  292. "1: \n" READYUV422 YUVTORGB(
  293. v22, v21,
  294. v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
  295. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
  296. // RGB565.
  297. "b.gt 1b \n"
  298. : "+r"(src_y), // %0
  299. "+r"(src_u), // %1
  300. "+r"(src_v), // %2
  301. "+r"(dst_argb1555), // %3
  302. "+r"(width) // %4
  303. : [kUVToRB] "r"(&yuvconstants->kUVToRB),
  304. [kUVToG] "r"(&yuvconstants->kUVToG),
  305. [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
  306. [kYToRgb] "r"(&yuvconstants->kYToRgb)
  307. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  308. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
  309. }
  // ARGBTOARGB4444: pack 8 pixels into 16 bytes in v0. Each pixel becomes two
  // bytes: (G high nibble | B high nibble >> 4) then (A high nibble | R high
  // nibble >> 4). Overwrites v1 and v20..v23; v4 must hold 0x0f in every byte.
  310. #define ARGBTOARGB4444 \
  311. /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
  312. "ushr v20.8b, v20.8b, #4 \n" /* B */ \
  313. "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
  314. "ushr v22.8b, v22.8b, #4 \n" /* R */ \
  315. "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
  316. "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
  317. "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
  318. "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
  319. void I422ToARGB4444Row_NEON(const uint8_t* src_y,
  320. const uint8_t* src_u,
  321. const uint8_t* src_v,
  322. uint8_t* dst_argb4444,
  323. const struct YuvConstants* yuvconstants,
  324. int width) {
  325. asm volatile (
  326. YUVTORGB_SETUP
  327. "movi v4.16b, #0x0f \n" // bits to clear with vbic.
  328. "1: \n"
  329. READYUV422
  330. YUVTORGB(v22, v21, v20)
  331. "subs %w4, %w4, #8 \n"
  332. "movi v23.8b, #255 \n"
  333. ARGBTOARGB4444
  334. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
  335. "b.gt 1b \n"
  336. : "+r"(src_y), // %0
  337. "+r"(src_u), // %1
  338. "+r"(src_v), // %2
  339. "+r"(dst_argb4444), // %3
  340. "+r"(width) // %4
  341. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  342. [kUVToG]"r"(&yuvconstants->kUVToG),
  343. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  344. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  345. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  346. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  347. );
  348. }
  349. void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  350. asm volatile (
  351. YUVTORGB_SETUP
  352. "movi v23.8b, #255 \n"
  353. "1: \n"
  354. READYUV400
  355. YUVTORGB(v22, v21, v20)
  356. "subs %w2, %w2, #8 \n"
  357. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
  358. "b.gt 1b \n"
  359. : "+r"(src_y), // %0
  360. "+r"(dst_argb), // %1
  361. "+r"(width) // %2
  362. : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
  363. [kUVToG]"r"(&kYuvI601Constants.kUVToG),
  364. [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
  365. [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
  366. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  367. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  368. );
  369. }
  370. void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  371. asm volatile(
  372. "movi v23.8b, #255 \n"
  373. "1: \n"
  374. "ld1 {v20.8b}, [%0], #8 \n"
  375. "orr v21.8b, v20.8b, v20.8b \n"
  376. "orr v22.8b, v20.8b, v20.8b \n"
  377. "subs %w2, %w2, #8 \n"
  378. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
  379. "b.gt 1b \n"
  380. : "+r"(src_y), // %0
  381. "+r"(dst_argb), // %1
  382. "+r"(width) // %2
  383. :
  384. : "cc", "memory", "v20", "v21", "v22", "v23");
  385. }
  386. void NV12ToARGBRow_NEON(const uint8_t* src_y,
  387. const uint8_t* src_uv,
  388. uint8_t* dst_argb,
  389. const struct YuvConstants* yuvconstants,
  390. int width) {
  391. asm volatile (
  392. YUVTORGB_SETUP
  393. "movi v23.8b, #255 \n"
  394. "1: \n"
  395. READNV12
  396. YUVTORGB(v22, v21, v20)
  397. "subs %w3, %w3, #8 \n"
  398. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
  399. "b.gt 1b \n"
  400. : "+r"(src_y), // %0
  401. "+r"(src_uv), // %1
  402. "+r"(dst_argb), // %2
  403. "+r"(width) // %3
  404. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  405. [kUVToG]"r"(&yuvconstants->kUVToG),
  406. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  407. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  408. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  409. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  410. );
  411. }
  412. void NV21ToARGBRow_NEON(const uint8_t* src_y,
  413. const uint8_t* src_vu,
  414. uint8_t* dst_argb,
  415. const struct YuvConstants* yuvconstants,
  416. int width) {
  417. asm volatile (
  418. YUVTORGB_SETUP
  419. "movi v23.8b, #255 \n"
  420. "1: \n"
  421. READNV21
  422. YUVTORGB(v22, v21, v20)
  423. "subs %w3, %w3, #8 \n"
  424. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
  425. "b.gt 1b \n"
  426. : "+r"(src_y), // %0
  427. "+r"(src_vu), // %1
  428. "+r"(dst_argb), // %2
  429. "+r"(width) // %3
  430. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  431. [kUVToG]"r"(&yuvconstants->kUVToG),
  432. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  433. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  434. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  435. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  436. );
  437. }
  438. void NV12ToRGB24Row_NEON(const uint8_t* src_y,
  439. const uint8_t* src_uv,
  440. uint8_t* dst_rgb24,
  441. const struct YuvConstants* yuvconstants,
  442. int width) {
  443. asm volatile (
  444. YUVTORGB_SETUP
  445. "1: \n"
  446. READNV12
  447. YUVTORGB(v22, v21, v20)
  448. "subs %w3, %w3, #8 \n"
  449. "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
  450. "b.gt 1b \n"
  451. : "+r"(src_y), // %0
  452. "+r"(src_uv), // %1
  453. "+r"(dst_rgb24), // %2
  454. "+r"(width) // %3
  455. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  456. [kUVToG]"r"(&yuvconstants->kUVToG),
  457. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  458. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  459. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  460. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  461. );
  462. }
  463. void NV21ToRGB24Row_NEON(const uint8_t* src_y,
  464. const uint8_t* src_vu,
  465. uint8_t* dst_rgb24,
  466. const struct YuvConstants* yuvconstants,
  467. int width) {
  468. asm volatile (
  469. YUVTORGB_SETUP
  470. "1: \n"
  471. READNV21
  472. YUVTORGB(v22, v21, v20)
  473. "subs %w3, %w3, #8 \n"
  474. "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
  475. "b.gt 1b \n"
  476. : "+r"(src_y), // %0
  477. "+r"(src_vu), // %1
  478. "+r"(dst_rgb24), // %2
  479. "+r"(width) // %3
  480. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  481. [kUVToG]"r"(&yuvconstants->kUVToG),
  482. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  483. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  484. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  485. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  486. );
  487. }
  488. void NV12ToRGB565Row_NEON(const uint8_t* src_y,
  489. const uint8_t* src_uv,
  490. uint8_t* dst_rgb565,
  491. const struct YuvConstants* yuvconstants,
  492. int width) {
  493. asm volatile(
  494. YUVTORGB_SETUP
  495. "1: \n" READNV12 YUVTORGB(
  496. v22, v21,
  497. v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
  498. "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
  499. // RGB565.
  500. "b.gt 1b \n"
  501. : "+r"(src_y), // %0
  502. "+r"(src_uv), // %1
  503. "+r"(dst_rgb565), // %2
  504. "+r"(width) // %3
  505. : [kUVToRB] "r"(&yuvconstants->kUVToRB),
  506. [kUVToG] "r"(&yuvconstants->kUVToG),
  507. [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
  508. [kYToRgb] "r"(&yuvconstants->kYToRgb)
  509. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  510. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
  511. }
  512. void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
  513. uint8_t* dst_argb,
  514. const struct YuvConstants* yuvconstants,
  515. int width) {
  516. asm volatile (
  517. YUVTORGB_SETUP
  518. "movi v23.8b, #255 \n"
  519. "1: \n"
  520. READYUY2
  521. YUVTORGB(v22, v21, v20)
  522. "subs %w2, %w2, #8 \n"
  523. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
  524. "b.gt 1b \n"
  525. : "+r"(src_yuy2), // %0
  526. "+r"(dst_argb), // %1
  527. "+r"(width) // %2
  528. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  529. [kUVToG]"r"(&yuvconstants->kUVToG),
  530. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  531. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  532. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  533. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  534. );
  535. }
  536. void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
  537. uint8_t* dst_argb,
  538. const struct YuvConstants* yuvconstants,
  539. int width) {
  540. asm volatile (
  541. YUVTORGB_SETUP
  542. "movi v23.8b, #255 \n"
  543. "1: \n"
  544. READUYVY
  545. YUVTORGB(v22, v21, v20)
  546. "subs %w2, %w2, #8 \n"
  547. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
  548. "b.gt 1b \n"
  549. : "+r"(src_uyvy), // %0
  550. "+r"(dst_argb), // %1
  551. "+r"(width) // %2
  552. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  553. [kUVToG]"r"(&yuvconstants->kUVToG),
  554. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  555. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  556. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  557. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  558. );
  559. }
  560. // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
  561. void SplitUVRow_NEON(const uint8_t* src_uv,
  562. uint8_t* dst_u,
  563. uint8_t* dst_v,
  564. int width) {
  565. asm volatile(
  566. "1: \n"
  567. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
  568. "subs %w3, %w3, #16 \n" // 16 processed per loop
  569. "st1 {v0.16b}, [%1], #16 \n" // store U
  570. "st1 {v1.16b}, [%2], #16 \n" // store V
  571. "b.gt 1b \n"
  572. : "+r"(src_uv), // %0
  573. "+r"(dst_u), // %1
  574. "+r"(dst_v), // %2
  575. "+r"(width) // %3 // Output registers
  576. : // Input registers
  577. : "cc", "memory", "v0", "v1" // Clobber List
  578. );
  579. }
  580. // Reads 16 U's and V's and writes out 16 pairs of UV.
  581. void MergeUVRow_NEON(const uint8_t* src_u,
  582. const uint8_t* src_v,
  583. uint8_t* dst_uv,
  584. int width) {
  585. asm volatile(
  586. "1: \n"
  587. "ld1 {v0.16b}, [%0], #16 \n" // load U
  588. "ld1 {v1.16b}, [%1], #16 \n" // load V
  589. "subs %w3, %w3, #16 \n" // 16 processed per loop
  590. "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
  591. "b.gt 1b \n"
  592. : "+r"(src_u), // %0
  593. "+r"(src_v), // %1
  594. "+r"(dst_uv), // %2
  595. "+r"(width) // %3 // Output registers
  596. : // Input registers
  597. : "cc", "memory", "v0", "v1" // Clobber List
  598. );
  599. }
  600. // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
  601. void SplitRGBRow_NEON(const uint8_t* src_rgb,
  602. uint8_t* dst_r,
  603. uint8_t* dst_g,
  604. uint8_t* dst_b,
  605. int width) {
  606. asm volatile(
  607. "1: \n"
  608. "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
  609. "subs %w4, %w4, #16 \n" // 16 processed per loop
  610. "st1 {v0.16b}, [%1], #16 \n" // store R
  611. "st1 {v1.16b}, [%2], #16 \n" // store G
  612. "st1 {v2.16b}, [%3], #16 \n" // store B
  613. "b.gt 1b \n"
  614. : "+r"(src_rgb), // %0
  615. "+r"(dst_r), // %1
  616. "+r"(dst_g), // %2
  617. "+r"(dst_b), // %3
  618. "+r"(width) // %4
  619. : // Input registers
  620. : "cc", "memory", "v0", "v1", "v2" // Clobber List
  621. );
  622. }
  623. // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
  624. void MergeRGBRow_NEON(const uint8_t* src_r,
  625. const uint8_t* src_g,
  626. const uint8_t* src_b,
  627. uint8_t* dst_rgb,
  628. int width) {
  629. asm volatile(
  630. "1: \n"
  631. "ld1 {v0.16b}, [%0], #16 \n" // load R
  632. "ld1 {v1.16b}, [%1], #16 \n" // load G
  633. "ld1 {v2.16b}, [%2], #16 \n" // load B
  634. "subs %w4, %w4, #16 \n" // 16 processed per loop
  635. "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
  636. "b.gt 1b \n"
  637. : "+r"(src_r), // %0
  638. "+r"(src_g), // %1
  639. "+r"(src_b), // %2
  640. "+r"(dst_rgb), // %3
  641. "+r"(width) // %4
  642. : // Input registers
  643. : "cc", "memory", "v0", "v1", "v2" // Clobber List
  644. );
  645. }
  646. // Copy multiple of 32.
  647. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  648. asm volatile(
  649. "1: \n"
  650. "ldp q0, q1, [%0], #32 \n"
  651. "subs %w2, %w2, #32 \n" // 32 processed per loop
  652. "stp q0, q1, [%1], #32 \n"
  653. "b.gt 1b \n"
  654. : "+r"(src), // %0
  655. "+r"(dst), // %1
  656. "+r"(width) // %2 // Output registers
  657. : // Input registers
  658. : "cc", "memory", "v0", "v1" // Clobber List
  659. );
  660. }
  661. // SetRow writes 'width' bytes using an 8 bit value repeated.
  662. void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
  663. asm volatile(
  664. "dup v0.16b, %w2 \n" // duplicate 16 bytes
  665. "1: \n"
  666. "subs %w1, %w1, #16 \n" // 16 bytes per loop
  667. "st1 {v0.16b}, [%0], #16 \n" // store
  668. "b.gt 1b \n"
  669. : "+r"(dst), // %0
  670. "+r"(width) // %1
  671. : "r"(v8) // %2
  672. : "cc", "memory", "v0");
  673. }
  674. void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
  675. asm volatile(
  676. "dup v0.4s, %w2 \n" // duplicate 4 ints
  677. "1: \n"
  678. "subs %w1, %w1, #4 \n" // 4 ints per loop
  679. "st1 {v0.16b}, [%0], #16 \n" // store
  680. "b.gt 1b \n"
  681. : "+r"(dst), // %0
  682. "+r"(width) // %1
  683. : "r"(v32) // %2
  684. : "cc", "memory", "v0");
  685. }
  686. void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  687. asm volatile(
  688. // Start at end of source row.
  689. "add %0, %0, %w2, sxtw \n"
  690. "sub %0, %0, #16 \n"
  691. "1: \n"
  692. "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
  693. "subs %w2, %w2, #16 \n" // 16 pixels per loop.
  694. "rev64 v0.16b, v0.16b \n"
  695. "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
  696. "st1 {v0.D}[0], [%1], #8 \n"
  697. "b.gt 1b \n"
  698. : "+r"(src), // %0
  699. "+r"(dst), // %1
  700. "+r"(width) // %2
  701. : "r"((ptrdiff_t)-16) // %3
  702. : "cc", "memory", "v0");
  703. }
  704. void MirrorUVRow_NEON(const uint8_t* src_uv,
  705. uint8_t* dst_u,
  706. uint8_t* dst_v,
  707. int width) {
  708. asm volatile(
  709. // Start at end of source row.
  710. "add %0, %0, %w3, sxtw #1 \n"
  711. "sub %0, %0, #16 \n"
  712. "1: \n"
  713. "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
  714. "subs %w3, %w3, #8 \n" // 8 pixels per loop.
  715. "rev64 v0.8b, v0.8b \n"
  716. "rev64 v1.8b, v1.8b \n"
  717. "st1 {v0.8b}, [%1], #8 \n" // dst += 8
  718. "st1 {v1.8b}, [%2], #8 \n"
  719. "b.gt 1b \n"
  720. : "+r"(src_uv), // %0
  721. "+r"(dst_u), // %1
  722. "+r"(dst_v), // %2
  723. "+r"(width) // %3
  724. : "r"((ptrdiff_t)-16) // %4
  725. : "cc", "memory", "v0", "v1");
  726. }
  727. void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  728. asm volatile(
  729. // Start at end of source row.
  730. "add %0, %0, %w2, sxtw #2 \n"
  731. "sub %0, %0, #16 \n"
  732. "1: \n"
  733. "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
  734. "subs %w2, %w2, #4 \n" // 4 pixels per loop.
  735. "rev64 v0.4s, v0.4s \n"
  736. "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
  737. "st1 {v0.D}[0], [%1], #8 \n"
  738. "b.gt 1b \n"
  739. : "+r"(src), // %0
  740. "+r"(dst), // %1
  741. "+r"(width) // %2
  742. : "r"((ptrdiff_t)-16) // %3
  743. : "cc", "memory", "v0");
  744. }
  745. void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
  746. uint8_t* dst_argb,
  747. int width) {
  748. asm volatile(
  749. "movi v4.8b, #255 \n" // Alpha
  750. "1: \n"
  751. "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
  752. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  753. "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
  754. "b.gt 1b \n"
  755. : "+r"(src_rgb24), // %0
  756. "+r"(dst_argb), // %1
  757. "+r"(width) // %2
  758. :
  759. : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
  760. );
  761. }
  762. void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  763. asm volatile(
  764. "movi v5.8b, #255 \n" // Alpha
  765. "1: \n"
  766. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
  767. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  768. "orr v3.8b, v1.8b, v1.8b \n" // move g
  769. "orr v4.8b, v0.8b, v0.8b \n" // move r
  770. "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
  771. "b.gt 1b \n"
  772. : "+r"(src_raw), // %0
  773. "+r"(dst_argb), // %1
  774. "+r"(width) // %2
  775. :
  776. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
  777. );
  778. }
  779. void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  780. asm volatile(
  781. "1: \n"
  782. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
  783. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  784. "orr v3.8b, v1.8b, v1.8b \n" // move g
  785. "orr v4.8b, v0.8b, v0.8b \n" // move r
  786. "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
  787. "b.gt 1b \n"
  788. : "+r"(src_raw), // %0
  789. "+r"(dst_rgb24), // %1
  790. "+r"(width) // %2
  791. :
  792. : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
  793. );
  794. }
  // Asm fragment: expands 8 RGB565 pixels held in v0.8h (R in bits 15..11, G in
  // bits 10..5, B in bits 4..0) to 8 bits per channel.
  //   out: v0.8b = B, v1.8b = G, v2.8b = R
  //   scratch: v4, v6
  // Each channel is shifted to the top of its byte and its own high bits are
  // OR-ed into the vacated low bits. B and R share one 16-byte register (B in
  // the low half, R in the high half) until the final dup splits them.
  #define RGB565TOARGB \
    "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
    "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
    "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
    "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
    "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
    "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
    "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
    "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
    "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
    "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
    "dup v2.2D, v0.D[1] \n" /* R */
  807. void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
  808. uint8_t* dst_argb,
  809. int width) {
  810. asm volatile(
  811. "movi v3.8b, #255 \n" // Alpha
  812. "1: \n"
  813. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
  814. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  815. RGB565TOARGB
  816. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
  817. "b.gt 1b \n"
  818. : "+r"(src_rgb565), // %0
  819. "+r"(dst_argb), // %1
  820. "+r"(width) // %2
  821. :
  822. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
  823. );
  824. }
  // Asm fragment: expands 8 ARGB1555 pixels held in v0.8h (A in bit 15, R in
  // bits 14..10, G in bits 9..5, B in bits 4..0) to 8 bits per channel.
  //   out: v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A (0x00 or 0xff)
  // No registers other than v0..v3 are used. The arithmetic shift by 15
  // replicates the alpha bit across the lane, which gives the 0x00 / 0xff
  // alpha. Colour channels are shifted to the top of the byte and their own
  // high bits are OR-ed into the low 3 bits.
  #define ARGB1555TOARGB \
    "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
    "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
    "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
    \
    "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
    "xtn2 v3.16b, v2.8h \n" \
    \
    "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
    "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
    \
    "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
    "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
    "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
    \
    "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
    "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
    "dup v1.2D, v0.D[1] \n" /* G moved to low half of v1 */ \
    "dup v3.2D, v2.D[1] \n" /* A moved to low half of v3 */
  // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
  // Asm fragment: expands 8 pixels held in v0.8h (R in bits 14..10, G in bits
  // 9..5, B in bits 4..0) to 8 bits per channel.
  //   out: v0.8b = B, v1.8b = G, v2.8b = R
  //   scratch: v3 (left holding the RRRRR000 intermediate, not an alpha value)
  #define RGB555TOARGB \
    "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
    "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
    "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
    \
    "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
    "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
    \
    "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
    "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
    "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
    \
    "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
    "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
    "dup v1.2D, v0.D[1] \n" /* G */
  860. void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
  861. uint8_t* dst_argb,
  862. int width) {
  863. asm volatile(
  864. "movi v3.8b, #255 \n" // Alpha
  865. "1: \n"
  866. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
  867. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  868. ARGB1555TOARGB
  869. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
  870. // pixels
  871. "b.gt 1b \n"
  872. : "+r"(src_argb1555), // %0
  873. "+r"(dst_argb), // %1
  874. "+r"(width) // %2
  875. :
  876. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  877. );
  878. }
  // Asm fragment: expands 8 ARGB4444 pixels held in v0.8h (A in bits 15..12, R
  // in bits 11..8, G in bits 7..4, B in bits 3..0) to 8 bits per channel by
  // duplicating each nibble into both halves of its byte.
  //   out: v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A
  // No registers other than v0..v3 are used. Until the final two dups, the low
  // 8 bytes of the working registers carry the A/R data and the high 8 bytes
  // carry the G/B data.
  #define ARGB4444TOARGB \
    "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
    "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
    "shl v2.16b, v1.16b, #4 \n" /* R (low), B (high) xxxx0000 */ \
    "ushr v3.16b, v1.16b, #4 \n" /* A (low), G (high) 0000xxxx */ \
    "ushr v0.16b, v2.16b, #4 \n" /* R (low), B (high) 0000xxxx */ \
    "shl v1.16b, v3.16b, #4 \n" /* A (low), G (high) xxxx0000 */ \
    "orr v2.16b, v0.16b, v2.16b \n" /* R (low), B (high) 8 bit */ \
    "orr v3.16b, v1.16b, v3.16b \n" /* A (low), G (high) 8 bit */ \
    "dup v0.2D, v2.D[1] \n" /* B moved to low half of v0 */ \
    "dup v1.2D, v3.D[1] \n" /* G moved to low half of v1 */
  890. void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
  891. uint8_t* dst_argb,
  892. int width) {
  893. asm volatile(
  894. "1: \n"
  895. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
  896. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  897. ARGB4444TOARGB
  898. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
  899. // pixels
  900. "b.gt 1b \n"
  901. : "+r"(src_argb4444), // %0
  902. "+r"(dst_argb), // %1
  903. "+r"(width) // %2
  904. :
  905. : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
  906. );
  907. }
  908. void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
  909. uint8_t* dst_rgb24,
  910. int width) {
  911. asm volatile(
  912. "1: \n"
  913. "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
  914. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  915. "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
  916. // RGB24.
  917. "b.gt 1b \n"
  918. : "+r"(src_argb), // %0
  919. "+r"(dst_rgb24), // %1
  920. "+r"(width) // %2
  921. :
  922. : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
  923. );
  924. }
  925. void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
  926. asm volatile(
  927. "1: \n"
  928. "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
  929. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  930. "orr v4.8b, v2.8b, v2.8b \n" // mov g
  931. "orr v5.8b, v1.8b, v1.8b \n" // mov b
  932. "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
  933. "b.gt 1b \n"
  934. : "+r"(src_argb), // %0
  935. "+r"(dst_raw), // %1
  936. "+r"(width) // %2
  937. :
  938. : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
  939. );
  940. }
  941. void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  942. asm volatile(
  943. "1: \n"
  944. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
  945. "subs %w2, %w2, #16 \n" // 16 processed per loop.
  946. "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
  947. "b.gt 1b \n"
  948. : "+r"(src_yuy2), // %0
  949. "+r"(dst_y), // %1
  950. "+r"(width) // %2
  951. :
  952. : "cc", "memory", "v0", "v1" // Clobber List
  953. );
  954. }
  955. void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  956. asm volatile(
  957. "1: \n"
  958. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
  959. "subs %w2, %w2, #16 \n" // 16 processed per loop.
  960. "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
  961. "b.gt 1b \n"
  962. : "+r"(src_uyvy), // %0
  963. "+r"(dst_y), // %1
  964. "+r"(width) // %2
  965. :
  966. : "cc", "memory", "v0", "v1" // Clobber List
  967. );
  968. }
  969. void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
  970. uint8_t* dst_u,
  971. uint8_t* dst_v,
  972. int width) {
  973. asm volatile(
  974. "1: \n"
  975. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
  976. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
  977. "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
  978. "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
  979. "b.gt 1b \n"
  980. : "+r"(src_yuy2), // %0
  981. "+r"(dst_u), // %1
  982. "+r"(dst_v), // %2
  983. "+r"(width) // %3
  984. :
  985. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  986. );
  987. }
  988. void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
  989. uint8_t* dst_u,
  990. uint8_t* dst_v,
  991. int width) {
  992. asm volatile(
  993. "1: \n"
  994. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
  995. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
  996. "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
  997. "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
  998. "b.gt 1b \n"
  999. : "+r"(src_uyvy), // %0
  1000. "+r"(dst_u), // %1
  1001. "+r"(dst_v), // %2
  1002. "+r"(width) // %3
  1003. :
  1004. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  1005. );
  1006. }
  1007. void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
  1008. int stride_yuy2,
  1009. uint8_t* dst_u,
  1010. uint8_t* dst_v,
  1011. int width) {
  1012. const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
  1013. asm volatile(
  1014. "1: \n"
  1015. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
  1016. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
  1017. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
  1018. "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
  1019. "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
  1020. "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
  1021. "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
  1022. "b.gt 1b \n"
  1023. : "+r"(src_yuy2), // %0
  1024. "+r"(src_yuy2b), // %1
  1025. "+r"(dst_u), // %2
  1026. "+r"(dst_v), // %3
  1027. "+r"(width) // %4
  1028. :
  1029. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
  1030. "v7" // Clobber List
  1031. );
  1032. }
  1033. void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
  1034. int stride_uyvy,
  1035. uint8_t* dst_u,
  1036. uint8_t* dst_v,
  1037. int width) {
  1038. const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
  1039. asm volatile(
  1040. "1: \n"
  1041. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
  1042. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
  1043. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
  1044. "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
  1045. "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
  1046. "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
  1047. "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
  1048. "b.gt 1b \n"
  1049. : "+r"(src_uyvy), // %0
  1050. "+r"(src_uyvyb), // %1
  1051. "+r"(dst_u), // %2
  1052. "+r"(dst_v), // %3
  1053. "+r"(width) // %4
  1054. :
  1055. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
  1056. "v7" // Clobber List
  1057. );
  1058. }
  1059. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  1060. void ARGBShuffleRow_NEON(const uint8_t* src_argb,
  1061. uint8_t* dst_argb,
  1062. const uint8_t* shuffler,
  1063. int width) {
  1064. asm volatile(
  1065. "ld1 {v2.16b}, [%3] \n" // shuffler
  1066. "1: \n"
  1067. "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
  1068. "subs %w2, %w2, #4 \n" // 4 processed per loop
  1069. "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
  1070. "st1 {v1.16b}, [%1], #16 \n" // store 4.
  1071. "b.gt 1b \n"
  1072. : "+r"(src_argb), // %0
  1073. "+r"(dst_argb), // %1
  1074. "+r"(width) // %2
  1075. : "r"(shuffler) // %3
  1076. : "cc", "memory", "v0", "v1", "v2" // Clobber List
  1077. );
  1078. }
  1079. void I422ToYUY2Row_NEON(const uint8_t* src_y,
  1080. const uint8_t* src_u,
  1081. const uint8_t* src_v,
  1082. uint8_t* dst_yuy2,
  1083. int width) {
  1084. asm volatile(
  1085. "1: \n"
  1086. "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
  1087. "orr v2.8b, v1.8b, v1.8b \n"
  1088. "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
  1089. "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
  1090. "subs %w4, %w4, #16 \n" // 16 pixels
  1091. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
  1092. "b.gt 1b \n"
  1093. : "+r"(src_y), // %0
  1094. "+r"(src_u), // %1
  1095. "+r"(src_v), // %2
  1096. "+r"(dst_yuy2), // %3
  1097. "+r"(width) // %4
  1098. :
  1099. : "cc", "memory", "v0", "v1", "v2", "v3");
  1100. }
  1101. void I422ToUYVYRow_NEON(const uint8_t* src_y,
  1102. const uint8_t* src_u,
  1103. const uint8_t* src_v,
  1104. uint8_t* dst_uyvy,
  1105. int width) {
  1106. asm volatile(
  1107. "1: \n"
  1108. "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
  1109. "orr v3.8b, v2.8b, v2.8b \n"
  1110. "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
  1111. "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
  1112. "subs %w4, %w4, #16 \n" // 16 pixels
  1113. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
  1114. "b.gt 1b \n"
  1115. : "+r"(src_y), // %0
  1116. "+r"(src_u), // %1
  1117. "+r"(src_v), // %2
  1118. "+r"(dst_uyvy), // %3
  1119. "+r"(width) // %4
  1120. :
  1121. : "cc", "memory", "v0", "v1", "v2", "v3");
  1122. }
  1123. void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
  1124. uint8_t* dst_rgb565,
  1125. int width) {
  1126. asm volatile(
  1127. "1: \n"
  1128. "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
  1129. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1130. ARGBTORGB565
  1131. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
  1132. "b.gt 1b \n"
  1133. : "+r"(src_argb), // %0
  1134. "+r"(dst_rgb565), // %1
  1135. "+r"(width) // %2
  1136. :
  1137. : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
  1138. }
  1139. void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
  1140. uint8_t* dst_rgb,
  1141. const uint32_t dither4,
  1142. int width) {
  1143. asm volatile(
  1144. "dup v1.4s, %w2 \n" // dither4
  1145. "1: \n"
  1146. "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
  1147. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  1148. "uqadd v20.8b, v20.8b, v1.8b \n"
  1149. "uqadd v21.8b, v21.8b, v1.8b \n"
  1150. "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
  1151. "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
  1152. "b.gt 1b \n"
  1153. : "+r"(dst_rgb) // %0
  1154. : "r"(src_argb), // %1
  1155. "r"(dither4), // %2
  1156. "r"(width) // %3
  1157. : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
  1158. }
  1159. void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
  1160. uint8_t* dst_argb1555,
  1161. int width) {
  1162. asm volatile(
  1163. "1: \n"
  1164. "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
  1165. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1166. ARGBTOARGB1555
  1167. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
  1168. // ARGB1555.
  1169. "b.gt 1b \n"
  1170. : "+r"(src_argb), // %0
  1171. "+r"(dst_argb1555), // %1
  1172. "+r"(width) // %2
  1173. :
  1174. : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
  1175. }
  1176. void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
  1177. uint8_t* dst_argb4444,
  1178. int width) {
  1179. asm volatile(
  1180. "movi v4.16b, #0x0f \n" // bits to clear with
  1181. // vbic.
  1182. "1: \n"
  1183. "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
  1184. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1185. ARGBTOARGB4444
  1186. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
  1187. // ARGB4444.
  1188. "b.gt 1b \n"
  1189. : "+r"(src_argb), // %0
  1190. "+r"(dst_argb4444), // %1
  1191. "+r"(width) // %2
  1192. :
  1193. : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
  1194. }
  1195. void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  1196. asm volatile(
  1197. "movi v4.8b, #13 \n" // B * 0.1016 coefficient
  1198. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1199. "movi v6.8b, #33 \n" // R * 0.2578 coefficient
  1200. "movi v7.8b, #16 \n" // Add 16 constant
  1201. "1: \n"
  1202. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  1203. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1204. "umull v3.8h, v0.8b, v4.8b \n" // B
  1205. "umlal v3.8h, v1.8b, v5.8b \n" // G
  1206. "umlal v3.8h, v2.8b, v6.8b \n" // R
  1207. "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
  1208. "uqadd v0.8b, v0.8b, v7.8b \n"
  1209. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1210. "b.gt 1b \n"
  1211. : "+r"(src_argb), // %0
  1212. "+r"(dst_y), // %1
  1213. "+r"(width) // %2
  1214. :
  1215. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  1216. }
  1217. void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
  1218. uint8_t* dst_a,
  1219. int width) {
  1220. asm volatile(
  1221. "1: \n"
  1222. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
  1223. // pixels
  1224. "subs %w2, %w2, #16 \n" // 16 processed per loop
  1225. "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
  1226. "b.gt 1b \n"
  1227. : "+r"(src_argb), // %0
  1228. "+r"(dst_a), // %1
  1229. "+r"(width) // %2
  1230. :
  1231. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  1232. );
  1233. }
  1234. void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  1235. asm volatile(
  1236. "movi v4.8b, #15 \n" // B * 0.11400 coefficient
  1237. "movi v5.8b, #75 \n" // G * 0.58700 coefficient
  1238. "movi v6.8b, #38 \n" // R * 0.29900 coefficient
  1239. "1: \n"
  1240. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  1241. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1242. "umull v3.8h, v0.8b, v4.8b \n" // B
  1243. "umlal v3.8h, v1.8b, v5.8b \n" // G
  1244. "umlal v3.8h, v2.8b, v6.8b \n" // R
  1245. "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
  1246. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1247. "b.gt 1b \n"
  1248. : "+r"(src_argb), // %0
  1249. "+r"(dst_y), // %1
  1250. "+r"(width) // %2
  1251. :
  1252. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
  1253. }
  1254. // 8x1 pixels.
  1255. void ARGBToUV444Row_NEON(const uint8_t* src_argb,
  1256. uint8_t* dst_u,
  1257. uint8_t* dst_v,
  1258. int width) {
  1259. asm volatile(
  1260. "movi v24.8b, #112 \n" // UB / VR 0.875
  1261. // coefficient
  1262. "movi v25.8b, #74 \n" // UG -0.5781 coefficient
  1263. "movi v26.8b, #38 \n" // UR -0.2969 coefficient
  1264. "movi v27.8b, #18 \n" // VB -0.1406 coefficient
  1265. "movi v28.8b, #94 \n" // VG -0.7344 coefficient
  1266. "movi v29.16b,#0x80 \n" // 128.5
  1267. "1: \n"
  1268. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  1269. // pixels.
  1270. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  1271. "umull v4.8h, v0.8b, v24.8b \n" // B
  1272. "umlsl v4.8h, v1.8b, v25.8b \n" // G
  1273. "umlsl v4.8h, v2.8b, v26.8b \n" // R
  1274. "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
  1275. "umull v3.8h, v2.8b, v24.8b \n" // R
  1276. "umlsl v3.8h, v1.8b, v28.8b \n" // G
  1277. "umlsl v3.8h, v0.8b, v27.8b \n" // B
  1278. "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
  1279. "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
  1280. "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
  1281. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
  1282. "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
  1283. "b.gt 1b \n"
  1284. : "+r"(src_argb), // %0
  1285. "+r"(dst_u), // %1
  1286. "+r"(dst_v), // %2
  1287. "+r"(width) // %3
  1288. :
  1289. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
  1290. "v27", "v28", "v29");
  1291. }
  // Asm fragment: loads the constants used by RGBTOUV into v20..v25 as 16-bit
  // lanes. The coefficients are half of the 8-bit-scale values (112, 74, 38,
  // 18, 94) because the row functions feed RGBTOUV with channel values at
  // twice the pixel average. v25 is 0x80 in every byte, i.e. 0x8080 per 16-bit
  // lane.
  #define RGBTOUV_SETUP_REG \
    "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
    "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
    "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
    "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
    "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
    "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
  // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
  //
  // Asm fragment: turns 8 lanes of 16-bit B, G, R values into 8 U bytes (v0.8b)
  // and 8 V bytes (v1.8b):
  //   U = (QB * v20 - QG * v21 - QR * v22 + v25) >> 8
  //   V = (QR * v20 - QG * v24 - QB * v23 + v25) >> 8
  // QB/QG/QR are register names such as v0.8h that get pasted into the asm
  // text. v20..v25 must already hold the coefficients and bias. v3 and v4 are
  // used as accumulators, and v0/v1 are only written by the last two
  // instructions, so they may also be passed as inputs.
  // clang-format off
  #define RGBTOUV(QB, QG, QR) \
    "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
    "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
    "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
    "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
    "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
    "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
    "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
    "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
    "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
    "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
  // clang-format on
  1313. // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
  1314. // TODO(fbarchard): consider ptrdiff_t for all strides.
  1315. void ARGBToUVRow_NEON(const uint8_t* src_argb,
  1316. int src_stride_argb,
  1317. uint8_t* dst_u,
  1318. uint8_t* dst_v,
  1319. int width) {
  1320. const uint8_t* src_argb_1 = src_argb + src_stride_argb;
  1321. asm volatile (
  1322. RGBTOUV_SETUP_REG
  1323. "1: \n"
  1324. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
  1325. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
  1326. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
  1327. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
  1328. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
  1329. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
  1330. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
  1331. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
  1332. "urshr v0.8h, v0.8h, #1 \n" // 2x average
  1333. "urshr v1.8h, v1.8h, #1 \n"
  1334. "urshr v2.8h, v2.8h, #1 \n"
  1335. "subs %w4, %w4, #16 \n" // 32 processed per loop.
  1336. RGBTOUV(v0.8h, v1.8h, v2.8h)
  1337. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1338. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1339. "b.gt 1b \n"
  1340. : "+r"(src_argb), // %0
  1341. "+r"(src_argb_1), // %1
  1342. "+r"(dst_u), // %2
  1343. "+r"(dst_v), // %3
  1344. "+r"(width) // %4
  1345. :
  1346. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1347. "v20", "v21", "v22", "v23", "v24", "v25"
  1348. );
  1349. }
  1350. // TODO(fbarchard): Subsample match C code.
  1351. void ARGBToUVJRow_NEON(const uint8_t* src_argb,
  1352. int src_stride_argb,
  1353. uint8_t* dst_u,
  1354. uint8_t* dst_v,
  1355. int width) {
  1356. const uint8_t* src_argb_1 = src_argb + src_stride_argb;
  1357. asm volatile (
  1358. "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
  1359. "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
  1360. "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
  1361. "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
  1362. "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
  1363. "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
  1364. "1: \n"
  1365. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
  1366. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
  1367. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
  1368. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
  1369. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
  1370. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
  1371. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
  1372. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
  1373. "urshr v0.8h, v0.8h, #1 \n" // 2x average
  1374. "urshr v1.8h, v1.8h, #1 \n"
  1375. "urshr v2.8h, v2.8h, #1 \n"
  1376. "subs %w4, %w4, #16 \n" // 32 processed per loop.
  1377. RGBTOUV(v0.8h, v1.8h, v2.8h)
  1378. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1379. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1380. "b.gt 1b \n"
  1381. : "+r"(src_argb), // %0
  1382. "+r"(src_argb_1), // %1
  1383. "+r"(dst_u), // %2
  1384. "+r"(dst_v), // %3
  1385. "+r"(width) // %4
  1386. :
  1387. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1388. "v20", "v21", "v22", "v23", "v24", "v25"
  1389. );
  1390. }
  1391. void BGRAToUVRow_NEON(const uint8_t* src_bgra,
  1392. int src_stride_bgra,
  1393. uint8_t* dst_u,
  1394. uint8_t* dst_v,
  1395. int width) {
  1396. const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
  1397. asm volatile (
  1398. RGBTOUV_SETUP_REG
  1399. "1: \n"
  1400. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
  1401. "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
  1402. "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
  1403. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
  1404. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
  1405. "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
  1406. "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
  1407. "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
  1408. "urshr v0.8h, v0.8h, #1 \n" // 2x average
  1409. "urshr v1.8h, v3.8h, #1 \n"
  1410. "urshr v2.8h, v2.8h, #1 \n"
  1411. "subs %w4, %w4, #16 \n" // 32 processed per loop.
  1412. RGBTOUV(v0.8h, v1.8h, v2.8h)
  1413. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1414. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1415. "b.gt 1b \n"
  1416. : "+r"(src_bgra), // %0
  1417. "+r"(src_bgra_1), // %1
  1418. "+r"(dst_u), // %2
  1419. "+r"(dst_v), // %3
  1420. "+r"(width) // %4
  1421. :
  1422. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1423. "v20", "v21", "v22", "v23", "v24", "v25"
  1424. );
  1425. }
  1426. void ABGRToUVRow_NEON(const uint8_t* src_abgr,
  1427. int src_stride_abgr,
  1428. uint8_t* dst_u,
  1429. uint8_t* dst_v,
  1430. int width) {
  1431. const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
  1432. asm volatile (
  1433. RGBTOUV_SETUP_REG
  1434. "1: \n"
  1435. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
  1436. "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
  1437. "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
  1438. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
  1439. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
  1440. "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
  1441. "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
  1442. "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
  1443. "urshr v0.8h, v3.8h, #1 \n" // 2x average
  1444. "urshr v2.8h, v2.8h, #1 \n"
  1445. "urshr v1.8h, v1.8h, #1 \n"
  1446. "subs %w4, %w4, #16 \n" // 32 processed per loop.
  1447. RGBTOUV(v0.8h, v2.8h, v1.8h)
  1448. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1449. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1450. "b.gt 1b \n"
  1451. : "+r"(src_abgr), // %0
  1452. "+r"(src_abgr_1), // %1
  1453. "+r"(dst_u), // %2
  1454. "+r"(dst_v), // %3
  1455. "+r"(width) // %4
  1456. :
  1457. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1458. "v20", "v21", "v22", "v23", "v24", "v25"
  1459. );
  1460. }
  1461. void RGBAToUVRow_NEON(const uint8_t* src_rgba,
  1462. int src_stride_rgba,
  1463. uint8_t* dst_u,
  1464. uint8_t* dst_v,
  1465. int width) {
  1466. const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
  1467. asm volatile (
  1468. RGBTOUV_SETUP_REG
  1469. "1: \n"
  1470. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
  1471. "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
  1472. "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
  1473. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
  1474. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
  1475. "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
  1476. "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
  1477. "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
  1478. "urshr v0.8h, v0.8h, #1 \n" // 2x average
  1479. "urshr v1.8h, v1.8h, #1 \n"
  1480. "urshr v2.8h, v2.8h, #1 \n"
  1481. "subs %w4, %w4, #16 \n" // 32 processed per loop.
  1482. RGBTOUV(v0.8h, v1.8h, v2.8h)
  1483. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1484. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1485. "b.gt 1b \n"
  1486. : "+r"(src_rgba), // %0
  1487. "+r"(src_rgba_1), // %1
  1488. "+r"(dst_u), // %2
  1489. "+r"(dst_v), // %3
  1490. "+r"(width) // %4
  1491. :
  1492. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1493. "v20", "v21", "v22", "v23", "v24", "v25"
  1494. );
  1495. }
  1496. void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
  1497. int src_stride_rgb24,
  1498. uint8_t* dst_u,
  1499. uint8_t* dst_v,
  1500. int width) {
  1501. const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
  1502. asm volatile (
  1503. RGBTOUV_SETUP_REG
  1504. "1: \n"
  1505. "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
  1506. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
  1507. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
  1508. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
  1509. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
  1510. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
  1511. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
  1512. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
  1513. "urshr v0.8h, v0.8h, #1 \n" // 2x average
  1514. "urshr v1.8h, v1.8h, #1 \n"
  1515. "urshr v2.8h, v2.8h, #1 \n"
  1516. "subs %w4, %w4, #16 \n" // 32 processed per loop.
  1517. RGBTOUV(v0.8h, v1.8h, v2.8h)
  1518. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1519. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1520. "b.gt 1b \n"
  1521. : "+r"(src_rgb24), // %0
  1522. "+r"(src_rgb24_1), // %1
  1523. "+r"(dst_u), // %2
  1524. "+r"(dst_v), // %3
  1525. "+r"(width) // %4
  1526. :
  1527. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1528. "v20", "v21", "v22", "v23", "v24", "v25"
  1529. );
  1530. }
  1531. void RAWToUVRow_NEON(const uint8_t* src_raw,
  1532. int src_stride_raw,
  1533. uint8_t* dst_u,
  1534. uint8_t* dst_v,
  1535. int width) {
  1536. const uint8_t* src_raw_1 = src_raw + src_stride_raw;
  1537. asm volatile (
  1538. RGBTOUV_SETUP_REG
  1539. "1: \n"
  1540. "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
  1541. "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
  1542. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
  1543. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
  1544. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
  1545. "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
  1546. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
  1547. "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
  1548. "urshr v2.8h, v2.8h, #1 \n" // 2x average
  1549. "urshr v1.8h, v1.8h, #1 \n"
  1550. "urshr v0.8h, v0.8h, #1 \n"
  1551. "subs %w4, %w4, #16 \n" // 32 processed per loop.
  1552. RGBTOUV(v2.8h, v1.8h, v0.8h)
  1553. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1554. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1555. "b.gt 1b \n"
  1556. : "+r"(src_raw), // %0
  1557. "+r"(src_raw_1), // %1
  1558. "+r"(dst_u), // %2
  1559. "+r"(dst_v), // %3
  1560. "+r"(width) // %4
  1561. :
  1562. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1563. "v20", "v21", "v22", "v23", "v24", "v25"
  1564. );
  1565. }
  1566. // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
  1567. void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
  1568. int src_stride_rgb565,
  1569. uint8_t* dst_u,
  1570. uint8_t* dst_v,
  1571. int width) {
  1572. const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
  1573. asm volatile(
  1574. "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) /
  1575. // 2
  1576. "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
  1577. "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
  1578. "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
  1579. "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
  1580. "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
  1581. "1: \n"
  1582. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
  1583. RGB565TOARGB
  1584. "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1585. "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1586. "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1587. "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
  1588. RGB565TOARGB
  1589. "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1590. "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1591. "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1592. "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
  1593. RGB565TOARGB
  1594. "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1595. "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1596. "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1597. "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
  1598. RGB565TOARGB
  1599. "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1600. "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1601. "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1602. "ins v16.D[1], v17.D[0] \n"
  1603. "ins v18.D[1], v19.D[0] \n"
  1604. "ins v20.D[1], v21.D[0] \n"
  1605. "urshr v4.8h, v16.8h, #1 \n" // 2x average
  1606. "urshr v5.8h, v18.8h, #1 \n"
  1607. "urshr v6.8h, v20.8h, #1 \n"
  1608. "subs %w4, %w4, #16 \n" // 16 processed per loop.
  1609. "mul v16.8h, v4.8h, v22.8h \n" // B
  1610. "mls v16.8h, v5.8h, v23.8h \n" // G
  1611. "mls v16.8h, v6.8h, v24.8h \n" // R
  1612. "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
  1613. "mul v17.8h, v6.8h, v22.8h \n" // R
  1614. "mls v17.8h, v5.8h, v26.8h \n" // G
  1615. "mls v17.8h, v4.8h, v25.8h \n" // B
  1616. "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
  1617. "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
  1618. "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
  1619. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1620. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1621. "b.gt 1b \n"
  1622. : "+r"(src_rgb565), // %0
  1623. "+r"(src_rgb565_1), // %1
  1624. "+r"(dst_u), // %2
  1625. "+r"(dst_v), // %3
  1626. "+r"(width) // %4
  1627. :
  1628. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
  1629. "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
  1630. "v27");
  1631. }
  1632. // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
  1633. void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
  1634. int src_stride_argb1555,
  1635. uint8_t* dst_u,
  1636. uint8_t* dst_v,
  1637. int width) {
  1638. const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
  1639. asm volatile(
  1640. RGBTOUV_SETUP_REG
  1641. "1: \n"
  1642. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
  1643. RGB555TOARGB
  1644. "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1645. "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1646. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1647. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
  1648. RGB555TOARGB
  1649. "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1650. "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1651. "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1652. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
  1653. RGB555TOARGB
  1654. "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1655. "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1656. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1657. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
  1658. RGB555TOARGB
  1659. "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1660. "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1661. "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1662. "ins v16.D[1], v26.D[0] \n"
  1663. "ins v17.D[1], v27.D[0] \n"
  1664. "ins v18.D[1], v28.D[0] \n"
  1665. "urshr v4.8h, v16.8h, #1 \n" // 2x average
  1666. "urshr v5.8h, v17.8h, #1 \n"
  1667. "urshr v6.8h, v18.8h, #1 \n"
  1668. "subs %w4, %w4, #16 \n" // 16 processed per loop.
  1669. "mul v2.8h, v4.8h, v20.8h \n" // B
  1670. "mls v2.8h, v5.8h, v21.8h \n" // G
  1671. "mls v2.8h, v6.8h, v22.8h \n" // R
  1672. "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
  1673. "mul v3.8h, v6.8h, v20.8h \n" // R
  1674. "mls v3.8h, v5.8h, v24.8h \n" // G
  1675. "mls v3.8h, v4.8h, v23.8h \n" // B
  1676. "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
  1677. "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
  1678. "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
  1679. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1680. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1681. "b.gt 1b \n"
  1682. : "+r"(src_argb1555), // %0
  1683. "+r"(src_argb1555_1), // %1
  1684. "+r"(dst_u), // %2
  1685. "+r"(dst_v), // %3
  1686. "+r"(width) // %4
  1687. :
  1688. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
  1689. "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
  1690. "v28");
  1691. }
  1692. // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
  1693. void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
  1694. int src_stride_argb4444,
  1695. uint8_t* dst_u,
  1696. uint8_t* dst_v,
  1697. int width) {
  1698. const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
  1699. asm volatile(
  1700. RGBTOUV_SETUP_REG
  1701. "1: \n"
  1702. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
  1703. ARGB4444TOARGB
  1704. "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1705. "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1706. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1707. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
  1708. ARGB4444TOARGB
  1709. "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1710. "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1711. "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1712. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
  1713. ARGB4444TOARGB
  1714. "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1715. "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1716. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1717. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
  1718. ARGB4444TOARGB
  1719. "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1720. "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1721. "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1722. "ins v16.D[1], v26.D[0] \n"
  1723. "ins v17.D[1], v27.D[0] \n"
  1724. "ins v18.D[1], v28.D[0] \n"
  1725. "urshr v4.8h, v16.8h, #1 \n" // 2x average
  1726. "urshr v5.8h, v17.8h, #1 \n"
  1727. "urshr v6.8h, v18.8h, #1 \n"
  1728. "subs %w4, %w4, #16 \n" // 16 processed per loop.
  1729. "mul v2.8h, v4.8h, v20.8h \n" // B
  1730. "mls v2.8h, v5.8h, v21.8h \n" // G
  1731. "mls v2.8h, v6.8h, v22.8h \n" // R
  1732. "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
  1733. "mul v3.8h, v6.8h, v20.8h \n" // R
  1734. "mls v3.8h, v5.8h, v24.8h \n" // G
  1735. "mls v3.8h, v4.8h, v23.8h \n" // B
  1736. "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
  1737. "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
  1738. "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
  1739. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1740. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1741. "b.gt 1b \n"
  1742. : "+r"(src_argb4444), // %0
  1743. "+r"(src_argb4444_1), // %1
  1744. "+r"(dst_u), // %2
  1745. "+r"(dst_v), // %3
  1746. "+r"(width) // %4
  1747. :
  1748. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
  1749. "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
  1750. "v28"
  1751. );
  1752. }
  1753. void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  1754. asm volatile(
  1755. "movi v24.8b, #13 \n" // B * 0.1016 coefficient
  1756. "movi v25.8b, #65 \n" // G * 0.5078 coefficient
  1757. "movi v26.8b, #33 \n" // R * 0.2578 coefficient
  1758. "movi v27.8b, #16 \n" // Add 16 constant
  1759. "1: \n"
  1760. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
  1761. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1762. RGB565TOARGB
  1763. "umull v3.8h, v0.8b, v24.8b \n" // B
  1764. "umlal v3.8h, v1.8b, v25.8b \n" // G
  1765. "umlal v3.8h, v2.8b, v26.8b \n" // R
  1766. "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
  1767. "uqadd v0.8b, v0.8b, v27.8b \n"
  1768. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1769. "b.gt 1b \n"
  1770. : "+r"(src_rgb565), // %0
  1771. "+r"(dst_y), // %1
  1772. "+r"(width) // %2
  1773. :
  1774. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
  1775. "v27");
  1776. }
  1777. void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
  1778. uint8_t* dst_y,
  1779. int width) {
  1780. asm volatile(
  1781. "movi v4.8b, #13 \n" // B * 0.1016 coefficient
  1782. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1783. "movi v6.8b, #33 \n" // R * 0.2578 coefficient
  1784. "movi v7.8b, #16 \n" // Add 16 constant
  1785. "1: \n"
  1786. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
  1787. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1788. ARGB1555TOARGB
  1789. "umull v3.8h, v0.8b, v4.8b \n" // B
  1790. "umlal v3.8h, v1.8b, v5.8b \n" // G
  1791. "umlal v3.8h, v2.8b, v6.8b \n" // R
  1792. "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
  1793. "uqadd v0.8b, v0.8b, v7.8b \n"
  1794. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1795. "b.gt 1b \n"
  1796. : "+r"(src_argb1555), // %0
  1797. "+r"(dst_y), // %1
  1798. "+r"(width) // %2
  1799. :
  1800. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  1801. }
  1802. void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
  1803. uint8_t* dst_y,
  1804. int width) {
  1805. asm volatile(
  1806. "movi v24.8b, #13 \n" // B * 0.1016 coefficient
  1807. "movi v25.8b, #65 \n" // G * 0.5078 coefficient
  1808. "movi v26.8b, #33 \n" // R * 0.2578 coefficient
  1809. "movi v27.8b, #16 \n" // Add 16 constant
  1810. "1: \n"
  1811. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
  1812. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1813. ARGB4444TOARGB
  1814. "umull v3.8h, v0.8b, v24.8b \n" // B
  1815. "umlal v3.8h, v1.8b, v25.8b \n" // G
  1816. "umlal v3.8h, v2.8b, v26.8b \n" // R
  1817. "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
  1818. "uqadd v0.8b, v0.8b, v27.8b \n"
  1819. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1820. "b.gt 1b \n"
  1821. : "+r"(src_argb4444), // %0
  1822. "+r"(dst_y), // %1
  1823. "+r"(width) // %2
  1824. :
  1825. : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
  1826. }
  1827. void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  1828. asm volatile(
  1829. "movi v4.8b, #33 \n" // R * 0.2578 coefficient
  1830. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1831. "movi v6.8b, #13 \n" // B * 0.1016 coefficient
  1832. "movi v7.8b, #16 \n" // Add 16 constant
  1833. "1: \n"
  1834. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
  1835. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1836. "umull v16.8h, v1.8b, v4.8b \n" // R
  1837. "umlal v16.8h, v2.8b, v5.8b \n" // G
  1838. "umlal v16.8h, v3.8b, v6.8b \n" // B
  1839. "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
  1840. "uqadd v0.8b, v0.8b, v7.8b \n"
  1841. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1842. "b.gt 1b \n"
  1843. : "+r"(src_bgra), // %0
  1844. "+r"(dst_y), // %1
  1845. "+r"(width) // %2
  1846. :
  1847. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
  1848. }
  1849. void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  1850. asm volatile(
  1851. "movi v4.8b, #33 \n" // R * 0.2578 coefficient
  1852. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1853. "movi v6.8b, #13 \n" // B * 0.1016 coefficient
  1854. "movi v7.8b, #16 \n" // Add 16 constant
  1855. "1: \n"
  1856. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
  1857. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1858. "umull v16.8h, v0.8b, v4.8b \n" // R
  1859. "umlal v16.8h, v1.8b, v5.8b \n" // G
  1860. "umlal v16.8h, v2.8b, v6.8b \n" // B
  1861. "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
  1862. "uqadd v0.8b, v0.8b, v7.8b \n"
  1863. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1864. "b.gt 1b \n"
  1865. : "+r"(src_abgr), // %0
  1866. "+r"(dst_y), // %1
  1867. "+r"(width) // %2
  1868. :
  1869. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
  1870. }
  1871. void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  1872. asm volatile(
  1873. "movi v4.8b, #13 \n" // B * 0.1016 coefficient
  1874. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1875. "movi v6.8b, #33 \n" // R * 0.2578 coefficient
  1876. "movi v7.8b, #16 \n" // Add 16 constant
  1877. "1: \n"
  1878. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
  1879. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1880. "umull v16.8h, v1.8b, v4.8b \n" // B
  1881. "umlal v16.8h, v2.8b, v5.8b \n" // G
  1882. "umlal v16.8h, v3.8b, v6.8b \n" // R
  1883. "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
  1884. "uqadd v0.8b, v0.8b, v7.8b \n"
  1885. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1886. "b.gt 1b \n"
  1887. : "+r"(src_rgba), // %0
  1888. "+r"(dst_y), // %1
  1889. "+r"(width) // %2
  1890. :
  1891. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
  1892. }
  1893. void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
  1894. asm volatile(
  1895. "movi v4.8b, #13 \n" // B * 0.1016 coefficient
  1896. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1897. "movi v6.8b, #33 \n" // R * 0.2578 coefficient
  1898. "movi v7.8b, #16 \n" // Add 16 constant
  1899. "1: \n"
  1900. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
  1901. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1902. "umull v16.8h, v0.8b, v4.8b \n" // B
  1903. "umlal v16.8h, v1.8b, v5.8b \n" // G
  1904. "umlal v16.8h, v2.8b, v6.8b \n" // R
  1905. "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
  1906. "uqadd v0.8b, v0.8b, v7.8b \n"
  1907. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1908. "b.gt 1b \n"
  1909. : "+r"(src_rgb24), // %0
  1910. "+r"(dst_y), // %1
  1911. "+r"(width) // %2
  1912. :
  1913. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
  1914. }
  1915. void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
  1916. asm volatile(
  1917. "movi v4.8b, #33 \n" // R * 0.2578 coefficient
  1918. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1919. "movi v6.8b, #13 \n" // B * 0.1016 coefficient
  1920. "movi v7.8b, #16 \n" // Add 16 constant
  1921. "1: \n"
  1922. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
  1923. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1924. "umull v16.8h, v0.8b, v4.8b \n" // B
  1925. "umlal v16.8h, v1.8b, v5.8b \n" // G
  1926. "umlal v16.8h, v2.8b, v6.8b \n" // R
  1927. "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
  1928. "uqadd v0.8b, v0.8b, v7.8b \n"
  1929. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1930. "b.gt 1b \n"
  1931. : "+r"(src_raw), // %0
  1932. "+r"(dst_y), // %1
  1933. "+r"(width) // %2
  1934. :
  1935. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
  1936. }
  1937. // Bilinear filter 16x2 -> 16x1
  1938. void InterpolateRow_NEON(uint8_t* dst_ptr,
  1939. const uint8_t* src_ptr,
  1940. ptrdiff_t src_stride,
  1941. int dst_width,
  1942. int source_y_fraction) {
  1943. int y1_fraction = source_y_fraction;
  1944. int y0_fraction = 256 - y1_fraction;
  1945. const uint8_t* src_ptr1 = src_ptr + src_stride;
  1946. asm volatile(
  1947. "cmp %w4, #0 \n"
  1948. "b.eq 100f \n"
  1949. "cmp %w4, #128 \n"
  1950. "b.eq 50f \n"
  1951. "dup v5.16b, %w4 \n"
  1952. "dup v4.16b, %w5 \n"
  1953. // General purpose row blend.
  1954. "1: \n"
  1955. "ld1 {v0.16b}, [%1], #16 \n"
  1956. "ld1 {v1.16b}, [%2], #16 \n"
  1957. "subs %w3, %w3, #16 \n"
  1958. "umull v2.8h, v0.8b, v4.8b \n"
  1959. "umull2 v3.8h, v0.16b, v4.16b \n"
  1960. "umlal v2.8h, v1.8b, v5.8b \n"
  1961. "umlal2 v3.8h, v1.16b, v5.16b \n"
  1962. "rshrn v0.8b, v2.8h, #8 \n"
  1963. "rshrn2 v0.16b, v3.8h, #8 \n"
  1964. "st1 {v0.16b}, [%0], #16 \n"
  1965. "b.gt 1b \n"
  1966. "b 99f \n"
  1967. // Blend 50 / 50.
  1968. "50: \n"
  1969. "ld1 {v0.16b}, [%1], #16 \n"
  1970. "ld1 {v1.16b}, [%2], #16 \n"
  1971. "subs %w3, %w3, #16 \n"
  1972. "urhadd v0.16b, v0.16b, v1.16b \n"
  1973. "st1 {v0.16b}, [%0], #16 \n"
  1974. "b.gt 50b \n"
  1975. "b 99f \n"
  1976. // Blend 100 / 0 - Copy row unchanged.
  1977. "100: \n"
  1978. "ld1 {v0.16b}, [%1], #16 \n"
  1979. "subs %w3, %w3, #16 \n"
  1980. "st1 {v0.16b}, [%0], #16 \n"
  1981. "b.gt 100b \n"
  1982. "99: \n"
  1983. : "+r"(dst_ptr), // %0
  1984. "+r"(src_ptr), // %1
  1985. "+r"(src_ptr1), // %2
  1986. "+r"(dst_width), // %3
  1987. "+r"(y1_fraction), // %4
  1988. "+r"(y0_fraction) // %5
  1989. :
  1990. : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
  1991. }
  1992. // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
  1993. void ARGBBlendRow_NEON(const uint8_t* src_argb0,
  1994. const uint8_t* src_argb1,
  1995. uint8_t* dst_argb,
  1996. int width) {
  1997. asm volatile(
  1998. "subs %w3, %w3, #8 \n"
  1999. "b.lt 89f \n"
  2000. // Blend 8 pixels.
  2001. "8: \n"
  2002. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
  2003. // pixels
  2004. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
  2005. // pixels
  2006. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2007. "umull v16.8h, v4.8b, v3.8b \n" // db * a
  2008. "umull v17.8h, v5.8b, v3.8b \n" // dg * a
  2009. "umull v18.8h, v6.8b, v3.8b \n" // dr * a
  2010. "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
  2011. "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
  2012. "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
  2013. "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
  2014. "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
  2015. "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
  2016. "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
  2017. "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
  2018. "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
  2019. "movi v3.8b, #255 \n" // a = 255
  2020. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  2021. // pixels
  2022. "b.ge 8b \n"
  2023. "89: \n"
  2024. "adds %w3, %w3, #8-1 \n"
  2025. "b.lt 99f \n"
  2026. // Blend 1 pixels.
  2027. "1: \n"
  2028. "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
  2029. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
  2030. "subs %w3, %w3, #1 \n" // 1 processed per loop.
  2031. "umull v16.8h, v4.8b, v3.8b \n" // db * a
  2032. "umull v17.8h, v5.8b, v3.8b \n" // dg * a
  2033. "umull v18.8h, v6.8b, v3.8b \n" // dr * a
  2034. "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
  2035. "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
  2036. "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
  2037. "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
  2038. "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
  2039. "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
  2040. "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
  2041. "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
  2042. "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
  2043. "movi v3.8b, #255 \n" // a = 255
  2044. "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
  2045. "b.ge 1b \n"
  2046. "99: \n"
  2047. : "+r"(src_argb0), // %0
  2048. "+r"(src_argb1), // %1
  2049. "+r"(dst_argb), // %2
  2050. "+r"(width) // %3
  2051. :
  2052. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
  2053. "v17", "v18");
  2054. }
  2055. // Attenuate 8 pixels at a time.
  2056. void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
  2057. uint8_t* dst_argb,
  2058. int width) {
  2059. asm volatile(
  2060. // Attenuate 8 pixels.
  2061. "1: \n"
  2062. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  2063. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  2064. "umull v4.8h, v0.8b, v3.8b \n" // b * a
  2065. "umull v5.8h, v1.8b, v3.8b \n" // g * a
  2066. "umull v6.8h, v2.8b, v3.8b \n" // r * a
  2067. "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
  2068. "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
  2069. "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
  2070. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
  2071. // pixels
  2072. "b.gt 1b \n"
  2073. : "+r"(src_argb), // %0
  2074. "+r"(dst_argb), // %1
  2075. "+r"(width) // %2
  2076. :
  2077. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
  2078. }
  2079. // Quantize 8 ARGB pixels (32 bytes).
  2080. // dst = (dst * scale >> 16) * interval_size + interval_offset;
  2081. void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
  2082. int scale,
  2083. int interval_size,
  2084. int interval_offset,
  2085. int width) {
  2086. asm volatile(
  2087. "dup v4.8h, %w2 \n"
  2088. "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
  2089. "dup v5.8h, %w3 \n" // interval multiply.
  2090. "dup v6.8h, %w4 \n" // interval add
  2091. // 8 pixel loop.
  2092. "1: \n"
  2093. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
  2094. "subs %w1, %w1, #8 \n" // 8 processed per loop.
  2095. "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
  2096. "uxtl v1.8h, v1.8b \n"
  2097. "uxtl v2.8h, v2.8b \n"
  2098. "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
  2099. "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
  2100. "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
  2101. "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
  2102. "mul v1.8h, v1.8h, v5.8h \n" // g
  2103. "mul v2.8h, v2.8h, v5.8h \n" // r
  2104. "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
  2105. "add v1.8h, v1.8h, v6.8h \n" // g
  2106. "add v2.8h, v2.8h, v6.8h \n" // r
  2107. "uqxtn v0.8b, v0.8h \n"
  2108. "uqxtn v1.8b, v1.8h \n"
  2109. "uqxtn v2.8b, v2.8h \n"
  2110. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
  2111. "b.gt 1b \n"
  2112. : "+r"(dst_argb), // %0
  2113. "+r"(width) // %1
  2114. : "r"(scale), // %2
  2115. "r"(interval_size), // %3
  2116. "r"(interval_offset) // %4
  2117. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
  2118. }
  2119. // Shade 8 pixels at a time by specified value.
  2120. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
  2121. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
  2122. void ARGBShadeRow_NEON(const uint8_t* src_argb,
  2123. uint8_t* dst_argb,
  2124. int width,
  2125. uint32_t value) {
  2126. asm volatile(
  2127. "dup v0.4s, %w3 \n" // duplicate scale value.
  2128. "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
  2129. "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
  2130. // 8 pixel loop.
  2131. "1: \n"
  2132. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
  2133. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  2134. "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
  2135. "uxtl v5.8h, v5.8b \n"
  2136. "uxtl v6.8h, v6.8b \n"
  2137. "uxtl v7.8h, v7.8b \n"
  2138. "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
  2139. "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
  2140. "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
  2141. "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
  2142. "uqxtn v4.8b, v4.8h \n"
  2143. "uqxtn v5.8b, v5.8h \n"
  2144. "uqxtn v6.8b, v6.8h \n"
  2145. "uqxtn v7.8b, v7.8h \n"
  2146. "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
  2147. "b.gt 1b \n"
  2148. : "+r"(src_argb), // %0
  2149. "+r"(dst_argb), // %1
  2150. "+r"(width) // %2
  2151. : "r"(value) // %3
  2152. : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
  2153. }
  2154. // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
  2155. // Similar to ARGBToYJ but stores ARGB.
  2156. // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
  2157. void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  2158. asm volatile(
  2159. "movi v24.8b, #15 \n" // B * 0.11400 coefficient
  2160. "movi v25.8b, #75 \n" // G * 0.58700 coefficient
  2161. "movi v26.8b, #38 \n" // R * 0.29900 coefficient
  2162. "1: \n"
  2163. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  2164. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  2165. "umull v4.8h, v0.8b, v24.8b \n" // B
  2166. "umlal v4.8h, v1.8b, v25.8b \n" // G
  2167. "umlal v4.8h, v2.8b, v26.8b \n" // R
  2168. "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
  2169. "orr v1.8b, v0.8b, v0.8b \n" // G
  2170. "orr v2.8b, v0.8b, v0.8b \n" // R
  2171. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
  2172. "b.gt 1b \n"
  2173. : "+r"(src_argb), // %0
  2174. "+r"(dst_argb), // %1
  2175. "+r"(width) // %2
  2176. :
  2177. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
  2178. }
  2179. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  2180. // b = (r * 35 + g * 68 + b * 17) >> 7
  2181. // g = (r * 45 + g * 88 + b * 22) >> 7
  2182. // r = (r * 50 + g * 98 + b * 24) >> 7
  // Works in place on dst_argb: each iteration loads 8 pixels, recomputes B, G
  // and R, and stores them back to the same address. Alpha (v3) is stored
  // unchanged. The narrowing shift saturates, so results clamp at 255.
  // The loop condition is tested after the body, so 8 pixels are always
  // processed before width is checked; width is consumed in steps of 8.
  2183. void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
  2184. asm volatile(
  2185. "movi v20.8b, #17 \n" // BB coefficient (output B from input B)
  2186. "movi v21.8b, #68 \n" // BG coefficient
  2187. "movi v22.8b, #35 \n" // BR coefficient
  2188. "movi v24.8b, #22 \n" // GB coefficient
  2189. "movi v25.8b, #88 \n" // GG coefficient
  2190. "movi v26.8b, #45 \n" // GR coefficient
  2191. "movi v28.8b, #24 \n" // RB coefficient
  2192. "movi v29.8b, #98 \n" // RG coefficient
  2193. "movi v30.8b, #50 \n" // RR coefficient
  2194. "1: \n"
  2195. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels (no advance).
  2196. "subs %w1, %w1, #8 \n" // 8 processed per loop.
  2197. "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
  2198. "umlal v4.8h, v1.8b, v21.8b \n" // G
  2199. "umlal v4.8h, v2.8b, v22.8b \n" // R
  2200. "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
  2201. "umlal v5.8h, v1.8b, v25.8b \n" // G
  2202. "umlal v5.8h, v2.8b, v26.8b \n" // R
  2203. "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
  2204. "umlal v6.8h, v1.8b, v29.8b \n" // G
  2205. "umlal v6.8h, v2.8b, v30.8b \n" // R
  2206. "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B (>> 7, saturating)
  2207. "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
  2208. "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
  2209. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels, advance.
  2210. "b.gt 1b \n"
  2211. : "+r"(dst_argb), // %0
  2212. "+r"(width) // %1
  2213. :
  2214. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  2215. "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
  2216. }
  2217. // Transform 8 ARGB pixels (32 bytes) with color matrix.
  2218. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function
  2219. // needs to saturate. Consider doing a non-saturating version.
  // matrix_argb supplies 16 signed 8-bit coefficients: 4 per output channel
  // (applied to b, g, r, a in that order), with the output channels in
  // B, G, R, A order. For each output channel:
  //   out = clamp_u8((b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6)
  // Products are formed in 16 bits and summed with signed saturation (sqadd).
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2220. void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
  2221. uint8_t* dst_argb,
  2222. const int8_t* matrix_argb,
  2223. int width) {
  2224. asm volatile(
  2225. "ld1 {v2.16b}, [%3] \n" // load 16 matrix coefficients (s8).
  2226. "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
  2227. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
  2228. "1: \n"
  2229. "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
  2230. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  2231. "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
  2232. "uxtl v17.8h, v17.8b \n" // g
  2233. "uxtl v18.8h, v18.8b \n" // r
  2234. "uxtl v19.8h, v19.8b \n" // a
  2235. "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
  2236. "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
  2237. "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
  2238. "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
  2239. "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
  2240. "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
  2241. "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
  2242. "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
  2243. "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
  2244. "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
  2245. "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
  2246. "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
  2247. "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
  2248. "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
  2249. "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
  2250. "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
  2251. "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
  2252. "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
  2253. "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
  2254. "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
  2255. "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
  2256. "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
  2257. "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
  2258. "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
  2259. "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
  2260. "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
  2261. "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
  2262. "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
  2263. "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B (>> 6, clamp 0..255)
  2264. "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
  2265. "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
  2266. "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
  2267. "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
  2268. "b.gt 1b \n"
  2269. : "+r"(src_argb), // %0
  2270. "+r"(dst_argb), // %1
  2271. "+r"(width) // %2
  2272. : "r"(matrix_argb) // %3
  2273. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
  2274. "v17", "v18", "v19", "v22", "v23", "v24", "v25");
  2275. }
  2276. // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
  // NOTE(review): the code below narrows with rshrn, not vqshrun, so the TODO
  // above may be stale - confirm before acting on it.
  2277. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
  // Every channel (B, G, R and A) is computed as (src0 * src1 + 128) >> 8:
  // a widening multiply followed by a rounding narrowing shift. With this
  // formula 255 * 255 produces 254.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2278. void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
  2279. const uint8_t* src_argb1,
  2280. uint8_t* dst_argb,
  2281. int width) {
  2282. asm volatile(
  2283. // 8 pixel loop.
  2284. "1: \n"
  2285. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  2286. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
  2287. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2288. "umull v0.8h, v0.8b, v4.8b \n" // multiply B
  2289. "umull v1.8h, v1.8b, v5.8b \n" // multiply G
  2290. "umull v2.8h, v2.8b, v6.8b \n" // multiply R
  2291. "umull v3.8h, v3.8b, v7.8b \n" // multiply A
  2292. "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B (rounded)
  2293. "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
  2294. "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
  2295. "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
  2296. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  2297. "b.gt 1b \n"
  2298. : "+r"(src_argb0), // %0
  2299. "+r"(src_argb1), // %1
  2300. "+r"(dst_argb), // %2
  2301. "+r"(width) // %3
  2302. :
  2303. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  2304. }
  2305. // Add 2 rows of ARGB pixels together, 8 pixels at a time.
  // Each channel, including alpha, is added with unsigned saturation, so sums
  // above 255 clamp to 255.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2306. void ARGBAddRow_NEON(const uint8_t* src_argb0,
  2307. const uint8_t* src_argb1,
  2308. uint8_t* dst_argb,
  2309. int width) {
  2310. asm volatile(
  2311. // 8 pixel loop.
  2312. "1: \n"
  2313. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  2314. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
  2315. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2316. "uqadd v0.8b, v0.8b, v4.8b \n" // saturating add, channel 0
  2317. "uqadd v1.8b, v1.8b, v5.8b \n" // channel 1
  2318. "uqadd v2.8b, v2.8b, v6.8b \n" // channel 2
  2319. "uqadd v3.8b, v3.8b, v7.8b \n" // channel 3
  2320. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  2321. "b.gt 1b \n"
  2322. : "+r"(src_argb0), // %0
  2323. "+r"(src_argb1), // %1
  2324. "+r"(dst_argb), // %2
  2325. "+r"(width) // %3
  2326. :
  2327. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  2328. }
  2329. // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
  // Computes src_argb0 - src_argb1 per channel, including alpha, with unsigned
  // saturation, so negative differences clamp to 0.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2330. void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
  2331. const uint8_t* src_argb1,
  2332. uint8_t* dst_argb,
  2333. int width) {
  2334. asm volatile(
  2335. // 8 pixel loop.
  2336. "1: \n"
  2337. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  2338. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
  2339. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2340. "uqsub v0.8b, v0.8b, v4.8b \n" // saturating subtract, channel 0
  2341. "uqsub v1.8b, v1.8b, v5.8b \n" // channel 1
  2342. "uqsub v2.8b, v2.8b, v6.8b \n" // channel 2
  2343. "uqsub v3.8b, v3.8b, v7.8b \n" // channel 3
  2344. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  2345. "b.gt 1b \n"
  2346. : "+r"(src_argb0), // %0
  2347. "+r"(src_argb1), // %1
  2348. "+r"(dst_argb), // %2
  2349. "+r"(width) // %3
  2350. :
  2351. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  2352. }
  2353. // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
  2354. // A = 255
  2355. // R = Sobel
  2356. // G = Sobel
  2357. // B = Sobel
  // Sobel is the unsigned saturating sum of the two inputs (clamps at 255).
  // Reads 8 bytes from each input and writes 32 bytes per iteration. The loop
  // condition is tested after the body; width is consumed 8 at a time.
  2358. void SobelRow_NEON(const uint8_t* src_sobelx,
  2359. const uint8_t* src_sobely,
  2360. uint8_t* dst_argb,
  2361. int width) {
  2362. asm volatile(
  2363. "movi v3.8b, #255 \n" // alpha
  2364. // 8 pixel loop.
  2365. "1: \n"
  2366. "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
  2367. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
  2368. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2369. "uqadd v0.8b, v0.8b, v1.8b \n" // add
  2370. "orr v1.8b, v0.8b, v0.8b \n" // copy sum into second channel
  2371. "orr v2.8b, v0.8b, v0.8b \n" // copy sum into third channel
  2372. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  2373. "b.gt 1b \n"
  2374. : "+r"(src_sobelx), // %0
  2375. "+r"(src_sobely), // %1
  2376. "+r"(dst_argb), // %2
  2377. "+r"(width) // %3
  2378. :
  2379. : "cc", "memory", "v0", "v1", "v2", "v3");
  2380. }
  2381. // Adds Sobel X and Sobel Y and stores Sobel into plane.
  // dst_y receives the unsigned saturating sum of the two inputs (clamps at
  // 255). Unlike the ARGB variants, this processes 16 pixels per iteration.
  // The loop condition is tested after the body; width is consumed 16 at a
  // time.
  2382. void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
  2383. const uint8_t* src_sobely,
  2384. uint8_t* dst_y,
  2385. int width) {
  2386. asm volatile(
  2387. // 16 pixel loop.
  2388. "1: \n"
  2389. "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
  2390. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
  2391. "subs %w3, %w3, #16 \n" // 16 processed per loop.
  2392. "uqadd v0.16b, v0.16b, v1.16b \n" // add
  2393. "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
  2394. "b.gt 1b \n"
  2395. : "+r"(src_sobelx), // %0
  2396. "+r"(src_sobely), // %1
  2397. "+r"(dst_y), // %2
  2398. "+r"(width) // %3
  2399. :
  2400. : "cc", "memory", "v0", "v1");
  2401. }
  2402. // Mixes Sobel X, Sobel Y and Sobel into ARGB.
  2403. // A = 255
  2404. // R = Sobel X
  2405. // G = Sobel
  2406. // B = Sobel Y
  // Sobel is the unsigned saturating sum of the two inputs. The inputs are
  // loaded straight into the registers that st4 stores as the R and B
  // channels, so no copies are needed.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2407. void SobelXYRow_NEON(const uint8_t* src_sobelx,
  2408. const uint8_t* src_sobely,
  2409. uint8_t* dst_argb,
  2410. int width) {
  2411. asm volatile(
  2412. "movi v3.8b, #255 \n" // alpha
  2413. // 8 pixel loop.
  2414. "1: \n"
  2415. "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
  2416. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
  2417. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2418. "uqadd v1.8b, v0.8b, v2.8b \n" // add
  2419. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  2420. "b.gt 1b \n"
  2421. : "+r"(src_sobelx), // %0
  2422. "+r"(src_sobely), // %1
  2423. "+r"(dst_argb), // %2
  2424. "+r"(width) // %3
  2425. :
  2426. : "cc", "memory", "v0", "v1", "v2", "v3");
  2427. }
  2428. // SobelX as a matrix is
  2429. // -1 0 1
  2430. // -2 0 2
  2431. // -1 0 1
  // For each of the three rows the code subtracts the sample 2 columns to the
  // right from the current sample, weights the src_y1 difference by 2, sums
  // the three differences, takes the absolute value and saturates to 8 bits.
  // Because of the abs, the sign convention of the differences does not matter.
  // Each row pointer is advanced by 2 and then by 6 (8 in total) per iteration,
  // so every iteration reads 10 bytes from each row: 2 bytes more than the 8
  // outputs it produces.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2432. void SobelXRow_NEON(const uint8_t* src_y0,
  2433. const uint8_t* src_y1,
  2434. const uint8_t* src_y2,
  2435. uint8_t* dst_sobelx,
  2436. int width) {
  2437. asm volatile(
  2438. "1: \n"
  2439. "ld1 {v0.8b}, [%0],%5 \n" // top
  2440. "ld1 {v1.8b}, [%0],%6 \n" // top, 2 columns right
  2441. "usubl v0.8h, v0.8b, v1.8b \n" // widen to 16 bit difference
  2442. "ld1 {v2.8b}, [%1],%5 \n" // center * 2
  2443. "ld1 {v3.8b}, [%1],%6 \n"
  2444. "usubl v1.8h, v2.8b, v3.8b \n"
  2445. "add v0.8h, v0.8h, v1.8h \n" // add center difference once...
  2446. "add v0.8h, v0.8h, v1.8h \n" // ...and again for the weight of 2
  2447. "ld1 {v2.8b}, [%2],%5 \n" // bottom
  2448. "ld1 {v3.8b}, [%2],%6 \n"
  2449. "subs %w4, %w4, #8 \n" // 8 pixels
  2450. "usubl v1.8h, v2.8b, v3.8b \n"
  2451. "add v0.8h, v0.8h, v1.8h \n"
  2452. "abs v0.8h, v0.8h \n" // magnitude
  2453. "uqxtn v0.8b, v0.8h \n" // saturate to 8 bit
  2454. "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
  2455. "b.gt 1b \n"
  2456. : "+r"(src_y0), // %0
  2457. "+r"(src_y1), // %1
  2458. "+r"(src_y2), // %2
  2459. "+r"(dst_sobelx), // %3
  2460. "+r"(width) // %4
  2461. : "r"(2LL), // %5 first post-increment
  2462. "r"(6LL) // %6 second post-increment (2 + 6 = 8)
  2463. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  2464. );
  2465. }
  2466. // SobelY as a matrix is
  2467. // -1 -2 -1
  2468. // 0 0 0
  2469. // 1 2 1
  // For three adjacent columns the code computes src_y0 - src_y1, weights the
  // middle column by 2, sums the three differences, takes the absolute value
  // and saturates to 8 bits. Because of the abs, the sign convention of the
  // differences does not matter.
  // NOTE(review): src_y0 / src_y1 are presumably the rows above and below the
  // output row (the matrix has a zero middle row) - confirm against callers.
  // Both pointers are advanced by 1, 1 and then 6 (8 in total) per iteration,
  // so every iteration reads 10 bytes from each row: 2 bytes more than the 8
  // outputs it produces.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2470. void SobelYRow_NEON(const uint8_t* src_y0,
  2471. const uint8_t* src_y1,
  2472. uint8_t* dst_sobely,
  2473. int width) {
  2474. asm volatile(
  2475. "1: \n"
  2476. "ld1 {v0.8b}, [%0],%4 \n" // left
  2477. "ld1 {v1.8b}, [%1],%4 \n"
  2478. "usubl v0.8h, v0.8b, v1.8b \n" // widen to 16 bit difference
  2479. "ld1 {v2.8b}, [%0],%4 \n" // center * 2
  2480. "ld1 {v3.8b}, [%1],%4 \n"
  2481. "usubl v1.8h, v2.8b, v3.8b \n"
  2482. "add v0.8h, v0.8h, v1.8h \n" // add center difference once...
  2483. "add v0.8h, v0.8h, v1.8h \n" // ...and again for the weight of 2
  2484. "ld1 {v2.8b}, [%0],%5 \n" // right
  2485. "ld1 {v3.8b}, [%1],%5 \n"
  2486. "subs %w3, %w3, #8 \n" // 8 pixels
  2487. "usubl v1.8h, v2.8b, v3.8b \n"
  2488. "add v0.8h, v0.8h, v1.8h \n"
  2489. "abs v0.8h, v0.8h \n" // magnitude
  2490. "uqxtn v0.8b, v0.8h \n" // saturate to 8 bit
  2491. "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
  2492. "b.gt 1b \n"
  2493. : "+r"(src_y0), // %0
  2494. "+r"(src_y1), // %1
  2495. "+r"(dst_sobely), // %2
  2496. "+r"(width) // %3
  2497. : "r"(1LL), // %4 post-increment after left and center loads
  2498. "r"(6LL) // %5 post-increment after right load (1 + 1 + 6 = 8)
  2499. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  2500. );
  2501. }
  // Convert 16-bit unsigned samples to half floats with no scaling: the float
  // parameter is unnamed and never used. Each sample is widened to 32 bits,
  // converted to float, then narrowed to half precision with fcvtn.
  2502. // Caveat - rounds float to half float whereas scaling version truncates.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2503. void HalfFloat1Row_NEON(const uint16_t* src,
  2504. uint16_t* dst,
  2505. float /*unused*/,
  2506. int width) {
  2507. asm volatile(
  2508. "1: \n"
  2509. "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
  2510. "subs %w2, %w2, #8 \n" // 8 pixels per loop
  2511. "uxtl v2.4s, v1.4h \n" // 8 int's (zero extended)
  2512. "uxtl2 v3.4s, v1.8h \n"
  2513. "scvtf v2.4s, v2.4s \n" // 8 floats
  2514. "scvtf v3.4s, v3.4s \n"
  2515. "fcvtn v1.4h, v2.4s \n" // 8 half floats
  2516. "fcvtn2 v1.8h, v3.4s \n"
  2517. "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
  2518. "b.gt 1b \n"
  2519. : "+r"(src), // %0
  2520. "+r"(dst), // %1
  2521. "+r"(width) // %2
  2522. :
  2523. : "cc", "memory", "v1", "v2", "v3");
  2524. }
  // Convert 16-bit unsigned samples to half floats, multiplying by scale.
  // No float-to-half convert instruction is used. Each sample is converted to
  // a 32-bit float and multiplied by scale * 1.9259299444e-34f (2^-112), which
  // moves the float exponent into the half float's range (bias 127 -> 15).
  // The float's bit pattern is then shifted right by 13 as an unsigned
  // integer, dropping the low mantissa bits (truncation, not rounding) and
  // saturating to 16 bits.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2525. void HalfFloatRow_NEON(const uint16_t* src,
  2526. uint16_t* dst,
  2527. float scale,
  2528. int width) {
  2529. asm volatile(
  2530. "1: \n"
  2531. "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
  2532. "subs %w2, %w2, #8 \n" // 8 pixels per loop
  2533. "uxtl v2.4s, v1.4h \n" // 8 int's (zero extended)
  2534. "uxtl2 v3.4s, v1.8h \n"
  2535. "scvtf v2.4s, v2.4s \n" // 8 floats
  2536. "scvtf v3.4s, v3.4s \n"
  2537. "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
  2538. "fmul v3.4s, v3.4s, %3.s[0] \n"
  2539. "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
  2540. "uqshrn2 v1.8h, v3.4s, #13 \n"
  2541. "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
  2542. "b.gt 1b \n"
  2543. : "+r"(src), // %0
  2544. "+r"(dst), // %1
  2545. "+r"(width) // %2
  2546. : "w"(scale * 1.9259299444e-34f) // %3 scale with exponent rebias folded in
  2547. : "cc", "memory", "v1", "v2", "v3");
  2548. }
  // Convert 8-bit unsigned samples to 32-bit floats multiplied by scale:
  // dst[i] = (float)src[i] * scale.
  // Reads 8 bytes and writes 32 bytes per iteration. The loop condition is
  // tested after the body; width is consumed 8 at a time.
  2549. void ByteToFloatRow_NEON(const uint8_t* src,
  2550. float* dst,
  2551. float scale,
  2552. int width) {
  2553. asm volatile(
  2554. "1: \n"
  2555. "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
  2556. "subs %w2, %w2, #8 \n" // 8 pixels per loop
  2557. "uxtl v1.8h, v1.8b \n" // 8 shorts
  2558. "uxtl v2.4s, v1.4h \n" // 8 ints
  2559. "uxtl2 v3.4s, v1.8h \n"
  2560. "scvtf v2.4s, v2.4s \n" // 8 floats
  2561. "scvtf v3.4s, v3.4s \n"
  2562. "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
  2563. "fmul v3.4s, v3.4s, %3.s[0] \n"
  2564. "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
  2565. "b.gt 1b \n"
  2566. : "+r"(src), // %0
  2567. "+r"(dst), // %1
  2568. "+r"(width) // %2
  2569. : "w"(scale) // %3
  2570. : "cc", "memory", "v1", "v2", "v3");
  2571. }
  // Writes src[i] * scale to dst and returns the maximum sample.
  // The maximum is taken over the unscaled source samples, not the scaled
  // output. The accumulators start at 0, so the result is never below 0.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2572. float ScaleMaxSamples_NEON(const float* src,
  2573. float* dst,
  2574. float scale,
  2575. int width) {
  2576. float fmax;
  2577. asm volatile(
  2578. "movi v5.4s, #0 \n" // max
  2579. "movi v6.4s, #0 \n"
  2580. "1: \n"
  2581. "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
  2582. "subs %w2, %w2, #8 \n" // 8 processed per loop
  2583. "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
  2584. "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
  2585. "fmax v5.4s, v5.4s, v1.4s \n" // max of unscaled samples
  2586. "fmax v6.4s, v6.4s, v2.4s \n"
  2587. "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
  2588. "b.gt 1b \n"
  2589. "fmax v5.4s, v5.4s, v6.4s \n" // merge the two accumulators
  2590. "fmaxv %s3, v5.4s \n" // reduce max accumulator across lanes
  2591. : "+r"(src), // %0
  2592. "+r"(dst), // %1
  2593. "+r"(width), // %2
  2594. "=w"(fmax) // %3
  2595. : "w"(scale) // %4
  2596. : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
  2597. return fmax;
  2598. }
  // Writes src[i] * scale to dst and returns the sum of squares of the samples.
  // The squares are taken from the unscaled source samples, not the scaled
  // output. Two 4-lane accumulators are used and then reduced to one value
  // with three pairwise adds.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2599. float ScaleSumSamples_NEON(const float* src,
  2600. float* dst,
  2601. float scale,
  2602. int width) {
  2603. float fsum;
  2604. asm volatile(
  2605. "movi v5.4s, #0 \n" // sum accumulator
  2606. "movi v6.4s, #0 \n" // sum accumulator
  2607. "1: \n"
  2608. "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
  2609. "subs %w2, %w2, #8 \n" // 8 processed per loop
  2610. "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
  2611. "fmul v4.4s, v2.4s, %4.s[0] \n"
  2612. "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
  2613. "fmla v6.4s, v2.4s, v2.4s \n"
  2614. "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
  2615. "b.gt 1b \n"
  2616. "faddp v5.4s, v5.4s, v6.4s \n" // 8 partial sums -> 4
  2617. "faddp v5.4s, v5.4s, v5.4s \n" // 4 -> 2
  2618. "faddp %3.4s, v5.4s, v5.4s \n" // sum
  2619. : "+r"(src), // %0
  2620. "+r"(dst), // %1
  2621. "+r"(width), // %2
  2622. "=w"(fsum) // %3
  2623. : "w"(scale) // %4
  2624. : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
  2625. return fsum;
  2626. }
  // Writes src[i] * scale to dst, 8 float samples per iteration.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2627. void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
  2628. asm volatile(
  2629. "1: \n"
  2630. "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
  2631. "subs %w2, %w2, #8 \n" // 8 processed per loop
  2632. "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
  2633. "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
  2634. "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
  2635. "b.gt 1b \n"
  2636. : "+r"(src), // %0
  2637. "+r"(dst), // %1
  2638. "+r"(width) // %2
  2639. : "w"(scale) // %3
  2640. : "cc", "memory", "v1", "v2");
  2641. }
  2642. // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
  // Inputs are 16-bit samples and the output is the 32-bit weighted sum
  //   dst = src0 + 4 * src1 + 6 * src2 + 4 * src3 + src4
  // with no normalizing shift applied here.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2643. void GaussCol_NEON(const uint16_t* src0,
  2644. const uint16_t* src1,
  2645. const uint16_t* src2,
  2646. const uint16_t* src3,
  2647. const uint16_t* src4,
  2648. uint32_t* dst,
  2649. int width) {
  2650. asm volatile(
  2651. "movi v6.8h, #4 \n" // constant 4
  2652. "movi v7.8h, #6 \n" // constant 6
  2653. "1: \n"
  2654. "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
  2655. "ld1 {v2.8h}, [%4], #16 \n" // src4
  2656. "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
  2657. "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
  2658. "ld1 {v2.8h}, [%1], #16 \n" // src1
  2659. "umlal v0.4s, v2.4h, v6.4h \n" // * 4
  2660. "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
  2661. "ld1 {v2.8h}, [%2], #16 \n" // src2
  2662. "umlal v0.4s, v2.4h, v7.4h \n" // * 6
  2663. "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
  2664. "ld1 {v2.8h}, [%3], #16 \n" // src3
  2665. "umlal v0.4s, v2.4h, v6.4h \n" // * 4
  2666. "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
  2667. "subs %w6, %w6, #8 \n" // 8 processed per loop
  2668. "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
  2669. "b.gt 1b \n"
  2670. : "+r"(src0), // %0
  2671. "+r"(src1), // %1
  2672. "+r"(src2), // %2
  2673. "+r"(src3), // %3
  2674. "+r"(src4), // %4
  2675. "+r"(dst), // %5
  2676. "+r"(width) // %6
  2677. :
  2678. : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
  2679. }
  2680. // filter 5 adjacent samples of a row with 1, 4, 6, 4, 1 coefficients.
  // For each output i:
  //   sum = src[i] + 4 * src[i+1] + 6 * src[i+2] + 4 * src[i+3] + src[i+4]
  //   dst[i] = saturate_u16((sum + 128) >> 8)
  // Inputs are 32-bit samples and outputs are 16-bit.
  // NOTE(review): the >> 8 presumably normalizes a column pass followed by this
  // row pass (16 * 16 = 256) - confirm against callers.
  // Each iteration loads 12 samples from src but advances by only 8, so it
  // reads 4 samples past the 8 outputs it produces.
  // The loop condition is tested after the body; width is consumed 8 at a time.
  2681. void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
  2682. const uint32_t* src1 = src + 1;
  2683. const uint32_t* src2 = src + 2;
  2684. const uint32_t* src3 = src + 3;
  2685. asm volatile(
  2686. "movi v6.4s, #4 \n" // constant 4
  2687. "movi v7.4s, #6 \n" // constant 6
  2688. "1: \n"
  2689. "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
  2690. "add v0.4s, v0.4s, v1.4s \n" // * 1 (src[i] + src[i+4])
  2691. "add v1.4s, v1.4s, v2.4s \n" // * 1
  2692. "ld1 {v2.4s,v3.4s}, [%2], #32 \n" // src[i+2]
  2693. "mla v0.4s, v2.4s, v7.4s \n" // * 6
  2694. "mla v1.4s, v3.4s, v7.4s \n" // * 6
  2695. "ld1 {v2.4s,v3.4s}, [%1], #32 \n" // src[i+1]
  2696. "ld1 {v4.4s,v5.4s}, [%3], #32 \n" // src[i+3]
  2697. "add v2.4s, v2.4s, v4.4s \n" // add taps for * 4
  2698. "add v3.4s, v3.4s, v5.4s \n"
  2699. "mla v0.4s, v2.4s, v6.4s \n" // * 4
  2700. "mla v1.4s, v3.4s, v6.4s \n" // * 4
  2701. "subs %w5, %w5, #8 \n" // 8 processed per loop
  2702. "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
  2703. "uqrshrn2 v0.8h, v1.4s, #8 \n"
  2704. "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
  2705. "b.gt 1b \n"
  2706. : "+r"(src), // %0
  2707. "+r"(src1), // %1
  2708. "+r"(src2), // %2
  2709. "+r"(src3), // %3
  2710. "+r"(dst), // %4
  2711. "+r"(width) // %5
  2712. : "r"(32LL) // %6 advance src by 8 samples after the 12-sample load
  2713. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  2714. }
  2715. #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
  2716. #ifdef __cplusplus
  2717. } // extern "C"
  2718. } // namespace libyuv
  2719. #endif