/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include <stdio.h>

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// Read 8 Y, 4 U and 4 V from 422
#define READYUV422              \
  "vld1.8 {d0}, [%0]! \n"       \
  "vld1.32 {d2[0]}, [%1]! \n"   \
  "vld1.32 {d2[1]}, [%2]! \n"

// Read 8 Y, 8 U and 8 V from 444
#define READYUV444              \
  "vld1.8 {d0}, [%0]! \n"       \
  "vld1.8 {d2}, [%1]! \n"       \
  "vld1.8 {d3}, [%2]! \n"       \
  "vpaddl.u8 q1, q1 \n"         \
  "vrshrn.u16 d2, q1, #1 \n"

// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400              \
  "vld1.8 {d0}, [%0]! \n"       \
  "vmov.u8 d2, #128 \n"

// Read 8 Y and 4 UV from NV12
#define READNV12                                      \
  "vld1.8 {d0}, [%0]! \n"                             \
  "vld1.8 {d2}, [%1]! \n"                             \
  "vmov.u8 d3, d2 \n" /* split odd/even uv apart */   \
  "vuzp.u8 d2, d3 \n"                                 \
  "vtrn.u32 d2, d3 \n"

// Read 8 Y and 4 VU from NV21
#define READNV21                                      \
  "vld1.8 {d0}, [%0]! \n"                             \
  "vld1.8 {d2}, [%1]! \n"                             \
  "vmov.u8 d3, d2 \n" /* split odd/even uv apart */   \
  "vuzp.u8 d3, d2 \n"                                 \
  "vtrn.u32 d2, d3 \n"

// Read 8 YUY2
#define READYUY2                \
  "vld2.8 {d0, d2}, [%0]! \n"   \
  "vmov.u8 d3, d2 \n"           \
  "vuzp.u8 d2, d3 \n"           \
  "vtrn.u32 d2, d3 \n"

// Read 8 UYVY
#define READUYVY                \
  "vld2.8 {d2, d3}, [%0]! \n"   \
  "vmov.u8 d0, d3 \n"           \
  "vmov.u8 d3, d2 \n"           \
  "vuzp.u8 d2, d3 \n"           \
  "vtrn.u32 d2, d3 \n"
#define YUVTORGB_SETUP                            \
  "vld1.8 {d24}, [%[kUVToRB]] \n"                 \
  "vld1.8 {d25}, [%[kUVToG]] \n"                  \
  "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n"   \
  "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n"     \
  "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n"    \
  "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"

#define YUVTORGB                                            \
  "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */         \
  "vmull.u8 q9, d2, d25 \n" /* u/v G component */           \
  "vmovl.u8 q0, d0 \n" /* Y */                              \
  "vmovl.s16 q10, d1 \n"                                    \
  "vmovl.s16 q0, d0 \n"                                     \
  "vmul.s32 q10, q10, q15 \n"                               \
  "vmul.s32 q0, q0, q15 \n"                                 \
  "vqshrun.s32 d0, q0, #16 \n"                              \
  "vqshrun.s32 d1, q10, #16 \n" /* Y */                     \
  "vadd.s16 d18, d19 \n"                                    \
  "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */        \
  "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */       \
  "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/  \
  "vaddw.u16 q1, q1, d16 \n"                                \
  "vaddw.u16 q10, q10, d17 \n"                              \
  "vaddw.u16 q3, q3, d18 \n"                                \
  "vqadd.s16 q8, q0, q13 \n" /* B */                        \
  "vqadd.s16 q9, q0, q14 \n" /* R */                        \
  "vqadd.s16 q0, q0, q4 \n" /* G */                         \
  "vqadd.s16 q8, q8, q1 \n" /* B */                         \
  "vqadd.s16 q9, q9, q10 \n" /* R */                        \
  "vqsub.s16 q0, q0, q3 \n" /* G */                         \
  "vqshrun.s16 d20, q8, #6 \n" /* B */                      \
  "vqshrun.s16 d22, q9, #6 \n" /* R */                      \
  "vqshrun.s16 d21, q0, #6 \n" /* G */
void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READYUV444 YUVTORGB
      "subs %4, %4, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READYUV422 YUVTORGB
      "subs %4, %4, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}
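
// Illustrative sketch (not part of libyuv): driving a row kernel such as
// I422ToARGBRow_NEON over a whole image. Assumes width is a multiple of 8,
// since the kernels above process 8 pixels per iteration; kYuvI601Constants
// is the BT.601 constant set also referenced by I400ToARGBRow_NEON below.
// In I422 the chroma planes are subsampled horizontally only, so the U/V
// pointers advance by their stride on every row.
#if defined(ROW_NEON_USAGE_SKETCH)  // hypothetical guard; never defined here
static void I422ToARGBImage(const uint8_t* src_y, int stride_y,
                            const uint8_t* src_u, int stride_u,
                            const uint8_t* src_v, int stride_v,
                            uint8_t* dst_argb, int stride_argb,
                            int width, int height) {
  for (int y = 0; y < height; ++y) {
    I422ToARGBRow_NEON(src_y, src_u, src_v, dst_argb, &kYuvI601Constants,
                       width);
    src_y += stride_y;
    src_u += stride_u;
    src_v += stride_v;
    dst_argb += stride_argb;
  }
}
#endif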
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      "subs %5, %5, #8 \n"
      "vld1.8 {d23}, [%3]! \n"
      "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      "subs %4, %4, #8 \n"
      "vmov.u8 d19, #255 \n"  // YUVTORGB modified d19
      "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_rgba),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      "subs %4, %4, #8 \n"
      "vst3.8 {d20, d21, d22}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_y),      // %0
        "+r"(src_u),      // %1
        "+r"(src_v),      // %2
        "+r"(dst_rgb24),  // %3
        "+r"(width)       // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

#define ARGBTORGB565                    \
  "vshll.u8 q0, d22, #8 \n" /* R */     \
  "vshll.u8 q8, d21, #8 \n" /* G */     \
  "vshll.u8 q9, d20, #8 \n" /* B */     \
  "vsri.16 q0, q8, #5 \n" /* RG */      \
  "vsri.16 q0, q9, #11 \n" /* RGB */
void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      "subs %4, %4, #8 \n" ARGBTORGB565
      "vst1.8 {q0}, [%3]! \n"  // store 8 pixels RGB565.
      "bgt 1b \n"
      : "+r"(src_y),       // %0
        "+r"(src_u),       // %1
        "+r"(src_v),       // %2
        "+r"(dst_rgb565),  // %3
        "+r"(width)        // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

#define ARGBTOARGB1555                  \
  "vshll.u8 q0, d23, #8 \n" /* A */     \
  "vshll.u8 q8, d22, #8 \n" /* R */     \
  "vshll.u8 q9, d21, #8 \n" /* G */     \
  "vshll.u8 q10, d20, #8 \n" /* B */    \
  "vsri.16 q0, q8, #1 \n" /* AR */      \
  "vsri.16 q0, q9, #6 \n" /* ARG */     \
  "vsri.16 q0, q10, #11 \n" /* ARGB */

void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      "subs %4, %4, #8 \n"
      "vmov.u8 d23, #255 \n" ARGBTOARGB1555
      "vst1.8 {q0}, [%3]! \n"  // store 8 pixels
      "bgt 1b \n"
      : "+r"(src_y),         // %0
        "+r"(src_u),         // %1
        "+r"(src_v),         // %2
        "+r"(dst_argb1555),  // %3
        "+r"(width)          // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

#define ARGBTOARGB4444                  \
  "vshr.u8 d20, d20, #4 \n" /* B */     \
  "vbic.32 d21, d21, d4 \n" /* G */     \
  "vshr.u8 d22, d22, #4 \n" /* R */     \
  "vbic.32 d23, d23, d4 \n" /* A */     \
  "vorr d0, d20, d21 \n" /* BG */       \
  "vorr d1, d22, d23 \n" /* RA */       \
  "vzip.u8 d0, d1 \n" /* BGRA */

void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d4, #0x0f \n"  // vbic bits to clear
      "1: \n"
      READYUV422 YUVTORGB
      "subs %4, %4, #8 \n"
      "vmov.u8 d23, #255 \n" ARGBTOARGB4444
      "vst1.8 {q0}, [%3]! \n"  // store 8 pixels
      "bgt 1b \n"
      : "+r"(src_y),         // %0
        "+r"(src_u),         // %1
        "+r"(src_v),         // %2
        "+r"(dst_argb4444),  // %3
        "+r"(width)          // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}
void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READYUV400 YUVTORGB
      "subs %2, %2, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
        [kUVToG] "r"(&kYuvI601Constants.kUVToG),
        [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
        [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8 d23, #255 \n"
      "1: \n"
      "vld1.8 {d20}, [%0]! \n"
      "vmov d21, d20 \n"
      "vmov d22, d20 \n"
      "subs %2, %2, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d20", "d21", "d22", "d23");
}

void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READNV12 YUVTORGB
      "subs %3, %3, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_uv),    // %1
        "+r"(dst_argb),  // %2
        "+r"(width)      // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}

void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READNV21 YUVTORGB
      "subs %3, %3, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_vu),    // %1
        "+r"(dst_argb),  // %2
        "+r"(width)      // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}
void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      READNV12 YUVTORGB
      "subs %3, %3, #8 \n"
      "vst3.8 {d20, d21, d22}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),      // %0
        "+r"(src_uv),     // %1
        "+r"(dst_rgb24),  // %2
        "+r"(width)       // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      READNV21 YUVTORGB
      "subs %3, %3, #8 \n"
      "vst3.8 {d20, d21, d22}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),      // %0
        "+r"(src_vu),     // %1
        "+r"(dst_rgb24),  // %2
        "+r"(width)       // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READNV12 YUVTORGB
      "subs %3, %3, #8 \n" ARGBTORGB565
      "vst1.8 {q0}, [%2]! \n"  // store 8 pixels RGB565.
      "bgt 1b \n"
      : "+r"(src_y),       // %0
        "+r"(src_uv),      // %1
        "+r"(dst_rgb565),  // %2
        "+r"(width)        // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READYUY2 YUVTORGB
      "subs %2, %2, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}

void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READUYVY YUVTORGB
      "subs %2, %2, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}
// Reads 16 pairs of UV and writes even values to dst_u and odd values to
// dst_v.
void SplitUVRow_NEON(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pairs of UV
      "subs %3, %3, #16 \n"        // 16 processed per loop
      "vst1.8 {q0}, [%1]! \n"      // store U
      "vst1.8 {q1}, [%2]! \n"      // store V
      "bgt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3  // Output registers
      :                       // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"      // load U
      "vld1.8 {q1}, [%1]! \n"      // load V
      "subs %3, %3, #16 \n"        // 16 processed per loop
      "vst2.8 {q0, q1}, [%2]! \n"  // store 16 pairs of UV
      "bgt 1b \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3  // Output registers
      :                       // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
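
// Illustrative sketch (not part of libyuv): SplitUVRow_NEON and
// MergeUVRow_NEON are inverses for widths (counted in UV pairs) that are
// multiples of 16, so an NV12 chroma row can be split to planar U/V and
// re-interleaved losslessly. The helper name and guard are hypothetical.
#if defined(ROW_NEON_USAGE_SKETCH)  // hypothetical guard; never defined here
static void SplitMergeRoundTrip(const uint8_t* src_uv, uint8_t* tmp_u,
                                uint8_t* tmp_v, uint8_t* dst_uv, int width) {
  SplitUVRow_NEON(src_uv, tmp_u, tmp_v, width);  // UVUV... -> U..., V...
  MergeUVRow_NEON(tmp_u, tmp_v, dst_uv, width);  // back to UVUV...
}
#endif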
// Reads 16 packed RGB pixels and writes to planar dst_r, dst_g, dst_b.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  asm volatile(
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RGB
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // next 8 RGB
      "subs %4, %4, #16 \n"            // 16 processed per loop
      "vst1.8 {q0}, [%1]! \n"          // store R
      "vst1.8 {q1}, [%2]! \n"          // store G
      "vst1.8 {q2}, [%3]! \n"          // store B
      "bgt 1b \n"
      : "+r"(src_rgb),  // %0
        "+r"(dst_r),    // %1
        "+r"(dst_g),    // %2
        "+r"(dst_b),    // %3
        "+r"(width)     // %4
      :                              // Input registers
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time.
void MergeRGBRow_NEON(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_rgb,
                      int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"          // load R
      "vld1.8 {q1}, [%1]! \n"          // load G
      "vld1.8 {q2}, [%2]! \n"          // load B
      "subs %4, %4, #16 \n"            // 16 processed per loop
      "vst3.8 {d0, d2, d4}, [%3]! \n"  // store 8 RGB
      "vst3.8 {d1, d3, d5}, [%3]! \n"  // next 8 RGB
      "bgt 1b \n"
      : "+r"(src_r),    // %0
        "+r"(src_g),    // %1
        "+r"(src_b),    // %2
        "+r"(dst_rgb),  // %3
        "+r"(width)     // %4
      :                              // Input registers
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}
// Copies multiples of 32 bytes. vld1.8 with four registers allows unaligned
// access and is fastest on Cortex-A15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {d0, d1, d2, d3}, [%0]! \n"  // load 32
      "subs %2, %2, #32 \n"                // 32 processed per loop
      "vst1.8 {d0, d1, d2, d3}, [%1]! \n"  // store 32
      "bgt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2  // Output registers
      :                      // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
  asm volatile(
      "vdup.8 q0, %2 \n"       // duplicate 16 bytes
      "1: \n"
      "subs %1, %1, #16 \n"    // 16 bytes per loop
      "vst1.8 {q0}, [%0]! \n"  // store
      "bgt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v8)      // %2
      : "cc", "memory", "q0");
}
// ARGBSetRow writes 'width' pixels using a 32 bit value repeated.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
  asm volatile(
      "vdup.u32 q0, %2 \n"     // duplicate 4 ints
      "1: \n"
      "subs %1, %1, #4 \n"     // 4 pixels per loop
      "vst1.8 {q0}, [%0]! \n"  // store
      "bgt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v32)     // %2
      : "cc", "memory", "q0");
}
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Start at end of source row.
      "mov r3, #-16 \n"
      "add %0, %0, %2 \n"
      "sub %0, #16 \n"
      "1: \n"
      "vld1.8 {q0}, [%0], r3 \n"  // src -= 16
      "subs %2, #16 \n"           // 16 pixels per loop.
      "vrev64.8 q0, q0 \n"
      "vst1.8 {d1}, [%1]! \n"     // dst += 16
      "vst1.8 {d0}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "cc", "memory", "r3", "q0");
}

void MirrorUVRow_NEON(const uint8_t* src_uv,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      // Start at end of source row.
      "mov r12, #-16 \n"
      "add %0, %0, %3, lsl #1 \n"
      "sub %0, #16 \n"
      "1: \n"
      "vld2.8 {d0, d1}, [%0], r12 \n"  // src -= 16
      "subs %3, #8 \n"                 // 8 pixels per loop.
      "vrev64.8 q0, q0 \n"
      "vst1.8 {d0}, [%1]! \n"          // dst += 8
      "vst1.8 {d1}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "cc", "memory", "r12", "q0");
}

void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Start at end of source row.
      "mov r3, #-16 \n"
      "add %0, %0, %2, lsl #2 \n"
      "sub %0, #16 \n"
      "1: \n"
      "vld1.8 {q0}, [%0], r3 \n"  // src -= 16
      "subs %2, #4 \n"            // 4 pixels per loop.
      "vrev64.32 q0, q0 \n"
      "vst1.8 {d1}, [%1]! \n"     // dst += 16
      "vst1.8 {d0}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "cc", "memory", "r3", "q0");
}
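
// The three mirror kernels above share one technique: step the source
// pointer backwards 16 bytes at a time via a negative post-index register
// (r3/r12 = -16), reverse lanes within the vector (vrev64, storing d1 before
// d0 where the whole q register must flip), and write the destination
// forwards.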
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"      // load 8 pixels of RGB24.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"      // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vswp.u8 d1, d3 \n"                  // swap R, B
      "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  asm volatile(
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"             // 8 processed per loop.
      "vswp.u8 d1, d3 \n"              // swap R, B
      "vst3.8 {d1, d2, d3}, [%1]! \n"  // store 8 pixels of RGB24.
      "bgt 1b \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3"  // Clobber List
  );
}

#define RGB565TOARGB                                        \
  "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */                \
  "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */            \
  "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */          \
  "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */          \
  "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */        \
  "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */        \
  "vorr.u8 d0, d0, d4 \n" /* B */                           \
  "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */          \
  "vorr.u8 d2, d1, d5 \n" /* R */                           \
  "vorr.u8 d1, d4, d6 \n" /* G */
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 RGB565 pixels.
      "subs %2, %2, #8 \n"     // 8 processed per loop.
      RGB565TOARGB
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

#define ARGB1555TOARGB                                      \
  "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */                \
  "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */                  \
  "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */                \
  "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */                    \
  "vshr.u8 d7, d7, #7 \n" /* A 0000000A */                  \
  "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */              \
  "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */          \
  "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */        \
  "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */        \
  "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */        \
  "vorr.u8 q1, q1, q3 \n" /* R,A */                         \
  "vorr.u8 q0, q0, q2 \n" /* B,G */

// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
#define RGB555TOARGB                                        \
  "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */                \
  "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */            \
  "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */          \
  "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */          \
  "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */        \
  "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */        \
  "vorr.u8 d0, d0, d4 \n" /* B */                           \
  "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */          \
  "vorr.u8 d2, d1, d5 \n" /* R */                           \
  "vorr.u8 d1, d4, d6 \n" /* G */
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB1555 pixels.
      "subs %2, %2, #8 \n"     // 8 processed per loop.
      ARGB1555TOARGB
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

#define ARGB4444TOARGB                                      \
  "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */                    \
  "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */                \
  "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */                \
  "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */                \
  "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */                \
  "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */                \
  "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */                \
  "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB4444 pixels.
      "subs %2, %2, #8 \n"     // 8 processed per loop.
      ARGB4444TOARGB
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}

void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d1, d2, d3, d4}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vst3.8 {d1, d2, d3}, [%1]! \n"      // store 8 pixels of RGB24.
      "bgt 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d1, d2, d3, d4}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vswp.u8 d1, d3 \n"                  // swap R, B
      "vst3.8 {d1, d2, d3}, [%1]! \n"      // store 8 pixels of RAW.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_raw),   // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %2, %2, #16 \n"        // 16 processed per loop.
      "vst1.8 {q0}, [%1]! \n"      // store 16 pixels of Y.
      "bgt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %2, %2, #16 \n"        // 16 processed per loop.
      "vst1.8 {q1}, [%1]! \n"      // store 16 pixels of Y.
      "bgt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %3, %3, #16 \n"                // 16 pixels = 8 UVs.
      "vst1.8 {d1}, [%1]! \n"              // store 8 U.
      "vst1.8 {d3}, [%2]! \n"              // store 8 V.
      "bgt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}

void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %3, %3, #16 \n"                // 16 pixels = 8 UVs.
      "vst1.8 {d0}, [%1]! \n"              // store 8 U.
      "vst1.8 {d2}, [%2]! \n"              // store 8 V.
      "bgt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // stride + src_yuy2
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %4, %4, #16 \n"                // 16 pixels = 8 UVs.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load next row YUY2.
      "vrhadd.u8 d1, d1, d5 \n"            // average rows of U
      "vrhadd.u8 d3, d3, d7 \n"            // average rows of V
      "vst1.8 {d1}, [%2]! \n"              // store 8 U.
      "vst1.8 {d3}, [%3]! \n"              // store 8 V.
      "bgt 1b \n"
      : "+r"(src_yuy2),     // %0
        "+r"(stride_yuy2),  // %1
        "+r"(dst_u),        // %2
        "+r"(dst_v),        // %3
        "+r"(width)         // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
        "d7"  // Clobber List
  );
}

void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // stride + src_uyvy
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %4, %4, #16 \n"                // 16 pixels = 8 UVs.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load next row UYVY.
      "vrhadd.u8 d0, d0, d4 \n"            // average rows of U
      "vrhadd.u8 d2, d2, d6 \n"            // average rows of V
      "vst1.8 {d0}, [%2]! \n"              // store 8 U.
      "vst1.8 {d2}, [%3]! \n"              // store 8 V.
      "bgt 1b \n"
      : "+r"(src_uyvy),     // %0
        "+r"(stride_uyvy),  // %1
        "+r"(dst_u),        // %2
        "+r"(dst_v),        // %3
        "+r"(width)         // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
        "d7"  // Clobber List
  );
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(
      "vld1.8 {q2}, [%3] \n"  // shuffler
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"       // load 4 pixels.
      "subs %2, %2, #4 \n"          // 4 processed per loop
      "vtbl.8 d2, {d0, d1}, d4 \n"  // look up 2 first pixels
      "vtbl.8 d3, {d0, d1}, d5 \n"  // look up 2 next pixels
      "vst1.8 {q1}, [%1]! \n"       // store 4.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}
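
// Illustrative sketch (not part of libyuv): ARGBShuffleRow_NEON takes a
// 16-byte table of source-byte indices that vtbl consumes. The table and
// wrapper below are hypothetical; they swap bytes 0 and 2 of every 4-byte
// pixel, i.e. an ABGR -> ARGB channel swap.
#if defined(ROW_NEON_USAGE_SKETCH)  // hypothetical guard; never defined here
static const uint8_t kShuffleABGRToARGB[16] = {2,  1, 0, 3,  6,  5,  4,  7,
                                               10, 9, 8, 11, 14, 13, 12, 15};
static void AbgrToArgbRow(const uint8_t* src_abgr, uint8_t* dst_argb,
                          int width) {
  ARGBShuffleRow_NEON(src_abgr, dst_argb, kShuffleABGRToARGB, width);
}
#endif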
void I422ToYUY2Row_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {d0, d2}, [%0]! \n"          // load 16 Ys
      "vld1.8 {d1}, [%1]! \n"              // load 8 Us
      "vld1.8 {d3}, [%2]! \n"              // load 8 Vs
      "subs %4, %4, #16 \n"                // 16 pixels
      "vst4.8 {d0, d1, d2, d3}, [%3]! \n"  // Store 8 YUY2/16 pixels.
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3");
}

void I422ToUYVYRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {d1, d3}, [%0]! \n"          // load 16 Ys
      "vld1.8 {d0}, [%1]! \n"              // load 8 Us
      "vld1.8 {d2}, [%2]! \n"              // load 8 Vs
      "subs %4, %4, #16 \n"                // 16 pixels
      "vst4.8 {d0, d1, d2, d3}, [%3]! \n"  // Store 8 UYVY/16 pixels.
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3");
}
void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_rgb565,
                          int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d20, d21, d22, d23}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"                     // 8 processed per loop.
      ARGBTORGB565
      "vst1.8 {q0}, [%1]! \n"  // store 8 pixels RGB565.
      "bgt 1b \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_rgb565),  // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
}

void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vdup.32 d2, %2 \n"  // dither4
      "1: \n"
      "vld4.8 {d20, d21, d22, d23}, [%1]! \n"  // load 8 pixels of ARGB.
      "subs %3, %3, #8 \n"                     // 8 processed per loop.
      "vqadd.u8 d20, d20, d2 \n"
      "vqadd.u8 d21, d21, d2 \n"
      "vqadd.u8 d22, d22, d2 \n"  // add for dither
      ARGBTORGB565
      "vst1.8 {q0}, [%0]! \n"  // store 8 RGB565.
      "bgt 1b \n"
      : "+r"(dst_rgb)    // %0
      : "r"(src_argb),   // %1
        "r"(dither4),    // %2
        "r"(width)       // %3
      : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
}
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb1555,
                            int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d20, d21, d22, d23}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"                     // 8 processed per loop.
      ARGBTOARGB1555
      "vst1.8 {q0}, [%1]! \n"  // store 8 ARGB1555.
      "bgt 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb1555),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
}

void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb4444,
                            int width) {
  asm volatile(
      "vmov.u8 d4, #0x0f \n"  // bits to clear with vbic.
      "1: \n"
      "vld4.8 {d20, d21, d22, d23}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"                     // 8 processed per loop.
      ARGBTOARGB4444
      "vst1.8 {q0}, [%1]! \n"  // store 8 ARGB4444.
      "bgt 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb4444),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
}
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d24, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d27, #16 \n"  // Add 16 constant
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vmull.u8 q2, d0, d24 \n"            // B
      "vmlal.u8 q2, d1, d25 \n"            // G
      "vmlal.u8 q2, d2, d26 \n"            // R
      "vqrshrun.s16 d0, q2, #7 \n"         // 16 bit to 8 bit Y
      "vqadd.u8 d0, d27 \n"
      "vst1.8 {d0}, [%1]! \n"              // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}
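
// Written out, the kernel above computes video-range luma in Q7 fixed point:
// the coefficients 13 + 65 + 33 = 111 approximate 0.867 * 128 (the 219/255
// video range), so vqrshrun.s16 #7 yields
//   Y = (13 * B + 65 * G + 33 * R + 64) >> 7
// with rounding and saturation, and the final vqadd adds the offset 16.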
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels
      "subs %2, %2, #16 \n"                // 16 processed per loop
      "vst1.8 {q3}, [%1]! \n"              // store 16 A's.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d24, #15 \n"  // B * 0.11400 coefficient
      "vmov.u8 d25, #75 \n"  // G * 0.58700 coefficient
      "vmov.u8 d26, #38 \n"  // R * 0.29900 coefficient
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vmull.u8 q2, d0, d24 \n"            // B
      "vmlal.u8 q2, d1, d25 \n"            // G
      "vmlal.u8 q2, d2, d26 \n"            // R
      "vqrshrun.s16 d0, q2, #7 \n"         // 15 bit to 8 bit Y
      "vst1.8 {d0}, [%1]! \n"              // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}
// 8x1 pixels: full-resolution (444) chroma sampling, no subsampling.
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vmov.u8 d24, #112 \n"     // UB / VR 0.875 coefficient
      "vmov.u8 d25, #74 \n"      // UG -0.5781 coefficient
      "vmov.u8 d26, #38 \n"      // UR -0.2969 coefficient
      "vmov.u8 d27, #18 \n"      // VB -0.1406 coefficient
      "vmov.u8 d28, #94 \n"      // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n" // 128.5
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %3, %3, #8 \n"                 // 8 processed per loop.
      "vmull.u8 q2, d0, d24 \n"            // B
      "vmlsl.u8 q2, d1, d25 \n"            // G
      "vmlsl.u8 q2, d2, d26 \n"            // R
      "vadd.u16 q2, q2, q15 \n"            // +128 -> unsigned
      "vmull.u8 q3, d2, d24 \n"            // R
      "vmlsl.u8 q3, d1, d28 \n"            // G
      "vmlsl.u8 q3, d0, d27 \n"            // B
      "vadd.u16 q3, q3, q15 \n"            // +128 -> unsigned
      "vqshrn.u16 d0, q2, #8 \n"           // 16 bit to 8 bit U
      "vqshrn.u16 d1, q3, #8 \n"           // 16 bit to 8 bit V
      "vst1.8 {d0}, [%1]! \n"              // store 8 pixels U.
      "vst1.8 {d1}, [%2]! \n"              // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
        "q15");
}
// clang-format off
// 16x2 pixels -> 8x1. width is the number of ARGB pixels, e.g. 16.
#define RGBTOUV(QB, QG, QR)                                 \
  "vmul.s16 q8, " #QB ", q10 \n" /* B */                    \
  "vmls.s16 q8, " #QG ", q11 \n" /* G */                    \
  "vmls.s16 q8, " #QR ", q12 \n" /* R */                    \
  "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */          \
  "vmul.s16 q9, " #QR ", q10 \n" /* R */                    \
  "vmls.s16 q9, " #QG ", q14 \n" /* G */                    \
  "vmls.s16 q9, " #QB ", q13 \n" /* B */                    \
  "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */          \
  "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */        \
  "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
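
// The "/ 2" on the coefficients below compensates for the 2x2 subsampling in
// the functions that use RGBTOUV: after vpaddl/vpadal/vrshr the B/G/R values
// are half-sums in the 9-bit range 0..510, so halving the Q7 coefficients
// keeps every vmul/vmls product inside signed 16 bits.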
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVRow_NEON(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"          // src_stride + src_argb
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"   // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"   // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"   // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"   // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"      // load 8 ARGB pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"      // load next 8 ARGB pixels.
      "vpaddl.u8 q0, q0 \n"                    // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"                    // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"                    // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"   // load 8 more ARGB pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"   // load last 8 ARGB pixels.
      "vpadal.u8 q0, q4 \n"                    // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"                    // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"                    // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #1 \n"                // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"
      "subs %4, %4, #16 \n"                    // 32 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"                  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"                  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb),         // %0
        "+r"(src_stride_argb),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "add %1, %0, %1 \n"          // src_stride + src_argb
      "vmov.s16 q10, #127 / 2 \n"  // UB / VR 0.500 coefficient
      "vmov.s16 q11, #84 / 2 \n"   // UG -0.33126 coefficient
      "vmov.s16 q12, #43 / 2 \n"   // UR -0.16874 coefficient
      "vmov.s16 q13, #20 / 2 \n"   // VB -0.08131 coefficient
      "vmov.s16 q14, #107 / 2 \n"  // VG -0.41869 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"      // load 8 ARGB pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"      // load next 8 ARGB pixels.
      "vpaddl.u8 q0, q0 \n"                    // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"                    // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"                    // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"   // load 8 more ARGB pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"   // load last 8 ARGB pixels.
      "vpadal.u8 q0, q4 \n"                    // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"                    // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"                    // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #1 \n"                // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"
      "subs %4, %4, #16 \n"                    // 32 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"                  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"                  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb),         // %0
        "+r"(src_stride_argb),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                      int src_stride_bgra,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_bgra
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"   // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"   // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"   // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"   // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 BGRA pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 BGRA pixels.
      "vpaddl.u8 q3, q3 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more BGRA pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 BGRA pixels.
      "vpadal.u8 q3, q7 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16 q1, q1, #1 \n"  // 2x average
      "vrshr.u16 q2, q2, #1 \n"
      "vrshr.u16 q3, q3, #1 \n"
      "subs %4, %4, #16 \n"  // 32 processed per loop.
      RGBTOUV(q3, q2, q1)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_bgra),         // %0
        "+r"(src_stride_bgra),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_abgr
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"   // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"   // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"   // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"   // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ABGR pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ABGR pixels.
      "vpaddl.u8 q2, q2 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q0, q0 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ABGR pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ABGR pixels.
      "vpadal.u8 q2, q6 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q0, q4 \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"
      "subs %4, %4, #16 \n"  // 32 processed per loop.
      RGBTOUV(q2, q1, q0)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_abgr),         // %0
        "+r"(src_stride_abgr),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                      int src_stride_rgba,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_rgba
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"   // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"   // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"   // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"   // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 RGBA pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 RGBA pixels.
      "vpaddl.u8 q0, q1 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q2 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q3 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more RGBA pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 RGBA pixels.
      "vpadal.u8 q0, q5 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q6 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q7 \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"
      "subs %4, %4, #16 \n"  // 32 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_rgba),         // %0
        "+r"(src_stride_rgba),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                       int src_stride_rgb24,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_rgb24
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"   // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"   // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"   // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"   // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RGB24 pixels.
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // load next 8 RGB24 pixels.
      "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
      "vld3.8 {d8, d10, d12}, [%1]! \n"  // load 8 more RGB24 pixels.
      "vld3.8 {d9, d11, d13}, [%1]! \n"  // load last 8 RGB24 pixels.
      "vpadal.u8 q0, q4 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"
      "subs %4, %4, #16 \n"  // 32 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_rgb24),         // %0
        "+r"(src_stride_rgb24),  // %1
        "+r"(dst_u),             // %2
        "+r"(dst_v),             // %3
        "+r"(width)              // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
void RAWToUVRow_NEON(const uint8_t* src_raw,
                     int src_stride_raw,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_raw
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"   // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"   // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"   // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"   // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RAW pixels.
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // load next 8 RAW pixels.
      "vpaddl.u8 q2, q2 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q0, q0 \n"  // R 16 bytes -> 8 shorts.
      "vld3.8 {d8, d10, d12}, [%1]! \n"  // load 8 more RAW pixels.
      "vld3.8 {d9, d11, d13}, [%1]! \n"  // load last 8 RAW pixels.
      "vpadal.u8 q2, q6 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q0, q4 \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"
      "subs %4, %4, #16 \n"  // 32 processed per loop.
      RGBTOUV(q2, q1, q0)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_raw),         // %0
        "+r"(src_stride_raw),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(width)            // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// 16x2 pixels -> 8x1. width is number of rgb565 pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
                        int src_stride_rgb565,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_rgb565
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"   // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"   // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"   // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"   // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 RGB565 pixels.
      RGB565TOARGB
      "vpaddl.u8 d8, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%0]! \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "vpaddl.u8 d9, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // load 8 RGB565 pixels.
      RGB565TOARGB
      "vpadal.u8 d8, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpadal.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "vpadal.u8 d9, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpadal.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vrshr.u16 q4, q4, #1 \n"  // 2x average
      "vrshr.u16 q5, q5, #1 \n"
      "vrshr.u16 q6, q6, #1 \n"
      "subs %4, %4, #16 \n"  // 16 processed per loop.
      "vmul.s16 q8, q4, q10 \n"  // B
      "vmls.s16 q8, q5, q11 \n"  // G
      "vmls.s16 q8, q6, q12 \n"  // R
      "vadd.u16 q8, q8, q15 \n"  // +128 -> unsigned
      "vmul.s16 q9, q6, q10 \n"  // R
      "vmls.s16 q9, q5, q14 \n"  // G
      "vmls.s16 q9, q4, q13 \n"  // B
      "vadd.u16 q9, q9, q15 \n"  // +128 -> unsigned
      "vqshrn.u16 d0, q8, #8 \n"  // 16 bit to 8 bit U
      "vqshrn.u16 d1, q9, #8 \n"  // 16 bit to 8 bit V
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_rgb565),         // %0
        "+r"(src_stride_rgb565),  // %1
        "+r"(dst_u),              // %2
        "+r"(dst_v),              // %3
        "+r"(width)               // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// 16x2 pixels -> 8x1. width is number of argb1555 pixels. e.g. 16.
void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
                          int src_stride_argb1555,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_argb1555
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"   // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"   // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"   // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"   // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpaddl.u8 d8, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%0]! \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpaddl.u8 d9, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // load 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpadal.u8 d8, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpadal.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpadal.u8 d9, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpadal.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vrshr.u16 q4, q4, #1 \n"  // 2x average
      "vrshr.u16 q5, q5, #1 \n"
      "vrshr.u16 q6, q6, #1 \n"
      "subs %4, %4, #16 \n"  // 16 processed per loop.
      "vmul.s16 q8, q4, q10 \n"  // B
      "vmls.s16 q8, q5, q11 \n"  // G
      "vmls.s16 q8, q6, q12 \n"  // R
      "vadd.u16 q8, q8, q15 \n"  // +128 -> unsigned
      "vmul.s16 q9, q6, q10 \n"  // R
      "vmls.s16 q9, q5, q14 \n"  // G
      "vmls.s16 q9, q4, q13 \n"  // B
      "vadd.u16 q9, q9, q15 \n"  // +128 -> unsigned
      "vqshrn.u16 d0, q8, #8 \n"  // 16 bit to 8 bit U
      "vqshrn.u16 d1, q9, #8 \n"  // 16 bit to 8 bit V
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb1555),         // %0
        "+r"(src_stride_argb1555),  // %1
        "+r"(dst_u),                // %2
        "+r"(dst_v),                // %3
        "+r"(width)                 // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// 16x2 pixels -> 8x1. width is number of argb4444 pixels. e.g. 16.
void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
                          int src_stride_argb4444,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_argb4444
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"   // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"   // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"   // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"   // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpaddl.u8 d8, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%0]! \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpaddl.u8 d9, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // load 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpadal.u8 d8, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpadal.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpadal.u8 d9, d0 \n"   // B 8 bytes -> 4 shorts.
      "vpadal.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vrshr.u16 q4, q4, #1 \n"  // 2x average
      "vrshr.u16 q5, q5, #1 \n"
      "vrshr.u16 q6, q6, #1 \n"
      "subs %4, %4, #16 \n"  // 16 processed per loop.
      "vmul.s16 q8, q4, q10 \n"  // B
      "vmls.s16 q8, q5, q11 \n"  // G
      "vmls.s16 q8, q6, q12 \n"  // R
      "vadd.u16 q8, q8, q15 \n"  // +128 -> unsigned
      "vmul.s16 q9, q6, q10 \n"  // R
      "vmls.s16 q9, q5, q14 \n"  // G
      "vmls.s16 q9, q4, q13 \n"  // B
      "vadd.u16 q9, q9, q15 \n"  // +128 -> unsigned
      "vqshrn.u16 d0, q8, #8 \n"  // 16 bit to 8 bit U
      "vqshrn.u16 d1, q9, #8 \n"  // 16 bit to 8 bit V
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb4444),         // %0
        "+r"(src_stride_argb4444),  // %1
        "+r"(dst_u),                // %2
        "+r"(dst_v),                // %3
        "+r"(width)                 // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d24, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d27, #16 \n"  // Add 16 constant
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 RGB565 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      RGB565TOARGB
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrun.s16 d0, q2, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d27 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "vmov.u8 d24, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d27, #16 \n"  // Add 16 constant
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB1555 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrun.s16 d0, q2, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d27 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "vmov.u8 d24, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d27, #16 \n"  // Add 16 constant
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB4444 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrun.s16 d0, q2, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d27 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d4, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d5, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d6, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d7, #16 \n"  // Add 16 constant
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of BGRA.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q8, d1, d4 \n"  // R
      "vmlal.u8 q8, d2, d5 \n"  // G
      "vmlal.u8 q8, d3, d6 \n"  // B
      "vqrshrun.s16 d0, q8, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d7 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d4, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d5, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d6, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d7, #16 \n"  // Add 16 constant
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of ABGR.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q8, d0, d4 \n"  // R
      "vmlal.u8 q8, d1, d5 \n"  // G
      "vmlal.u8 q8, d2, d6 \n"  // B
      "vqrshrun.s16 d0, q8, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d7 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d4, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d5, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d6, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d7, #16 \n"  // Add 16 constant
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of RGBA.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q8, d1, d4 \n"  // B
      "vmlal.u8 q8, d2, d5 \n"  // G
      "vmlal.u8 q8, d3, d6 \n"  // R
      "vqrshrun.s16 d0, q8, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d7 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d4, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d5, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d6, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d7, #16 \n"  // Add 16 constant
      "1: \n"
      "vld3.8 {d0, d1, d2}, [%0]! \n"  // load 8 pixels of RGB24.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q8, d0, d4 \n"  // B
      "vmlal.u8 q8, d1, d5 \n"  // G
      "vmlal.u8 q8, d2, d6 \n"  // R
      "vqrshrun.s16 d0, q8, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d7 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_y),      // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d4, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d5, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d6, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d7, #16 \n"  // Add 16 constant
      "1: \n"
      "vld3.8 {d0, d1, d2}, [%0]! \n"  // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q8, d0, d4 \n"  // R (RAW is R,G,B in memory)
      "vmlal.u8 q8, d1, d5 \n"  // G
      "vmlal.u8 q8, d2, d6 \n"  // B
      "vqrshrun.s16 d0, q8, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d7 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_raw),  // %0
        "+r"(dst_y),    // %1
        "+r"(width)     // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  asm volatile(
      "cmp %4, #0 \n"
      "beq 100f \n"
      "add %2, %1 \n"
      "cmp %4, #128 \n"
      "beq 50f \n"
      "vdup.8 d5, %4 \n"
      "rsb %4, #256 \n"
      "vdup.8 d4, %4 \n"
      // General purpose row blend.
      "1: \n"
      "vld1.8 {q0}, [%1]! \n"
      "vld1.8 {q1}, [%2]! \n"
      "subs %3, %3, #16 \n"
      "vmull.u8 q13, d0, d4 \n"
      "vmull.u8 q14, d1, d4 \n"
      "vmlal.u8 q13, d2, d5 \n"
      "vmlal.u8 q14, d3, d5 \n"
      "vrshrn.u16 d0, q13, #8 \n"
      "vrshrn.u16 d1, q14, #8 \n"
      "vst1.8 {q0}, [%0]! \n"
      "bgt 1b \n"
      "b 99f \n"
      // Blend 50 / 50.
      "50: \n"
      "vld1.8 {q0}, [%1]! \n"
      "vld1.8 {q1}, [%2]! \n"
      "subs %3, %3, #16 \n"
      "vrhadd.u8 q0, q1 \n"
      "vst1.8 {q0}, [%0]! \n"
      "bgt 50b \n"
      "b 99f \n"
      // Blend 100 / 0 - Copy row unchanged.
      "100: \n"
      "vld1.8 {q0}, [%1]! \n"
      "subs %3, %3, #16 \n"
      "vst1.8 {q0}, [%0]! \n"
      "bgt 100b \n"
      "99: \n"
      : "+r"(dst_ptr),      // %0
        "+r"(src_ptr),      // %1
        "+r"(src_stride),   // %2
        "+r"(dst_width),    // %3
        "+r"(y1_fraction)   // %4
      :
      : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
}
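// Editor's note: scalar sketch of the general blend path above (illustrative
// only, not part of libyuv). y1_fraction weights the lower row and
// 256 - y1_fraction the upper row, matching vmull/vmlal followed by the
// rounding narrow vrshrn #8.
static __attribute__((unused)) uint8_t InterpolatePixelSketch(
    uint8_t row0, uint8_t row1, int y1_fraction) {
  return (uint8_t)(
      (row0 * (256 - y1_fraction) + row1 * y1_fraction + 128) >> 8);
}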
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
void ARGBBlendRow_NEON(const uint8_t* src_argb0,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "subs %3, #8 \n"
      "blt 89f \n"
      // Blend 8 pixels.
      "8: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of ARGB0.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load 8 pixels of ARGB1.
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vmull.u8 q10, d4, d3 \n"  // db * a
      "vmull.u8 q11, d5, d3 \n"  // dg * a
      "vmull.u8 q12, d6, d3 \n"  // dr * a
      "vqrshrn.u16 d20, q10, #8 \n"  // db >>= 8
      "vqrshrn.u16 d21, q11, #8 \n"  // dg >>= 8
      "vqrshrn.u16 d22, q12, #8 \n"  // dr >>= 8
      "vqsub.u8 q2, q2, q10 \n"  // dbg - dbg * a / 256
      "vqsub.u8 d6, d6, d22 \n"  // dr - dr * a / 256
      "vqadd.u8 q0, q0, q2 \n"  // + sbg
      "vqadd.u8 d2, d2, d6 \n"  // + sr
      "vmov.u8 d3, #255 \n"  // a = 255
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 pixels of ARGB.
      "bge 8b \n"
      "89: \n"
      "adds %3, #8-1 \n"
      "blt 99f \n"
      // Blend 1 pixel at a time.
      "1: \n"
      "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
      "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
      "subs %3, %3, #1 \n"  // 1 processed per loop.
      "vmull.u8 q10, d4, d3 \n"  // db * a
      "vmull.u8 q11, d5, d3 \n"  // dg * a
      "vmull.u8 q12, d6, d3 \n"  // dr * a
      "vqrshrn.u16 d20, q10, #8 \n"  // db >>= 8
      "vqrshrn.u16 d21, q11, #8 \n"  // dg >>= 8
      "vqrshrn.u16 d22, q12, #8 \n"  // dr >>= 8
      "vqsub.u8 q2, q2, q10 \n"  // dbg - dbg * a / 256
      "vqsub.u8 d6, d6, d22 \n"  // dr - dr * a / 256
      "vqadd.u8 q0, q0, q2 \n"  // + sbg
      "vqadd.u8 d2, d2, d6 \n"  // + sr
      "vmov.u8 d3, #255 \n"  // a = 255
      "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
      "bge 1b \n"
      "99: \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
}
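// Editor's note: scalar sketch of the per-channel blend above (illustrative
// only), following the comment "dr * (256 - sa) / 256 + sr = dr - dr * sa /
// 256 + sr". s and sa come from ARGB0, d from ARGB1; the rounding and
// saturation mirror vqrshrn #8 / vqsub / vqadd.
static __attribute__((unused)) uint8_t BlendChannelSketch(uint8_t s,
                                                          uint8_t sa,
                                                          uint8_t d) {
  int v = d - ((d * sa + 128) >> 8) + s;
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}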
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      // Attenuate 8 pixels.
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q10, d0, d3 \n"  // b * a
      "vmull.u8 q11, d1, d3 \n"  // g * a
      "vmull.u8 q12, d2, d3 \n"  // r * a
      "vqrshrn.u16 d0, q10, #8 \n"  // b >>= 8
      "vqrshrn.u16 d1, q11, #8 \n"  // g >>= 8
      "vqrshrn.u16 d2, q12, #8 \n"  // r >>= 8
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
}
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile(
      "vdup.u16 q8, %2 \n"
      "vshr.u16 q8, q8, #1 \n"  // scale >>= 1
      "vdup.u16 q9, %3 \n"   // interval multiply.
      "vdup.u16 q10, %4 \n"  // interval add
      // 8 pixel loop.
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0] \n"  // load 8 pixels of ARGB.
      "subs %1, %1, #8 \n"  // 8 processed per loop.
      "vmovl.u8 q0, d0 \n"  // b (0 .. 255)
      "vmovl.u8 q1, d2 \n"
      "vmovl.u8 q2, d4 \n"
      "vqdmulh.s16 q0, q0, q8 \n"  // b * scale
      "vqdmulh.s16 q1, q1, q8 \n"  // g
      "vqdmulh.s16 q2, q2, q8 \n"  // r
      "vmul.u16 q0, q0, q9 \n"  // b * interval_size
      "vmul.u16 q1, q1, q9 \n"  // g
      "vmul.u16 q2, q2, q9 \n"  // r
      "vadd.u16 q0, q0, q10 \n"  // b + interval_offset
      "vadd.u16 q1, q1, q10 \n"  // g
      "vadd.u16 q2, q2, q10 \n"  // r
      "vqmovn.u16 d0, q0 \n"
      "vqmovn.u16 d2, q1 \n"
      "vqmovn.u16 d4, q2 \n"
      "vst4.8 {d0, d2, d4, d6}, [%0]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(dst_argb),  // %0
        "+r"(width)      // %1
      : "r"(scale),            // %2
        "r"(interval_size),    // %3
        "r"(interval_offset)   // %4
      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
}
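// Editor's note: scalar sketch of the quantize formula above (illustrative
// only, saturation omitted). vqdmulh.s16 with the scale pre-shifted right
// by 1 approximates (v * scale) >> 16, after which the interval multiply
// and add snap the channel onto the quantization grid.
static __attribute__((unused)) uint8_t QuantizeChannelSketch(
    uint8_t v, int scale, int interval_size, int interval_offset) {
  return (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
}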
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from d0 to d7.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
void ARGBShadeRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  asm volatile(
      "vdup.u32 q0, %3 \n"  // duplicate scale value.
      "vzip.u8 d0, d1 \n"   // d0 aarrggbb.
      "vshr.u16 q0, q0, #1 \n"  // scale / 2.
      // 8 pixel loop.
      "1: \n"
      "vld4.8 {d20, d22, d24, d26}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmovl.u8 q10, d20 \n"  // b (0 .. 255)
      "vmovl.u8 q11, d22 \n"
      "vmovl.u8 q12, d24 \n"
      "vmovl.u8 q13, d26 \n"
      "vqrdmulh.s16 q10, q10, d0[0] \n"  // b * scale * 2
      "vqrdmulh.s16 q11, q11, d0[1] \n"  // g
      "vqrdmulh.s16 q12, q12, d0[2] \n"  // r
      "vqrdmulh.s16 q13, q13, d0[3] \n"  // a
      "vqmovn.u16 d20, q10 \n"
      "vqmovn.u16 d22, q11 \n"
      "vqmovn.u16 d24, q12 \n"
      "vqmovn.u16 d26, q13 \n"
      "vst4.8 {d20, d22, d24, d26}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
}
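// Editor's note: scalar sketch of the shade math (illustrative only). value
// packs four scale bytes; vzip doubles each into a 16-bit lane (v * 0x101),
// the shift halves it, and vqrdmulh then yields roughly channel * scale_byte
// / 255, which the integer approximation below mimics.
static __attribute__((unused)) uint8_t ShadeChannelSketch(uint8_t c,
                                                          uint8_t s) {
  return (uint8_t)((c * s + 127) / 255);  // approximates the vqrdmulh path
}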
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8 d24, #15 \n"  // B * 0.11400 coefficient
      "vmov.u8 d25, #75 \n"  // G * 0.58700 coefficient
      "vmov.u8 d26, #38 \n"  // R * 0.29900 coefficient
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrun.s16 d0, q2, #7 \n"  // 15 bit to 8 bit B
      "vmov d1, d0 \n"  // G
      "vmov d2, d0 \n"  // R
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8 d20, #17 \n"  // BB coefficient
      "vmov.u8 d21, #68 \n"  // BG coefficient
      "vmov.u8 d22, #35 \n"  // BR coefficient
      "vmov.u8 d24, #22 \n"  // GB coefficient
      "vmov.u8 d25, #88 \n"  // GG coefficient
      "vmov.u8 d26, #45 \n"  // GR coefficient
      "vmov.u8 d28, #24 \n"  // RB coefficient
      "vmov.u8 d29, #98 \n"  // RG coefficient
      "vmov.u8 d30, #50 \n"  // RR coefficient
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0] \n"  // load 8 ARGB pixels.
      "subs %1, %1, #8 \n"  // 8 processed per loop.
      "vmull.u8 q2, d0, d20 \n"  // B to Sepia B
      "vmlal.u8 q2, d1, d21 \n"  // G
      "vmlal.u8 q2, d2, d22 \n"  // R
      "vmull.u8 q3, d0, d24 \n"  // B to Sepia G
      "vmlal.u8 q3, d1, d25 \n"  // G
      "vmlal.u8 q3, d2, d26 \n"  // R
      "vmull.u8 q8, d0, d28 \n"  // B to Sepia R
      "vmlal.u8 q8, d1, d29 \n"  // G
      "vmlal.u8 q8, d2, d30 \n"  // R
      "vqshrn.u16 d0, q2, #7 \n"  // 16 bit to 8 bit B
      "vqshrn.u16 d1, q3, #7 \n"  // 16 bit to 8 bit G
      "vqshrn.u16 d2, q8, #7 \n"  // 16 bit to 8 bit R
      "vst4.8 {d0, d1, d2, d3}, [%0]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(dst_argb),  // %0
        "+r"(width)      // %1
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
        "q14", "q15");
}
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const int8_t* matrix_argb,
                             int width) {
  asm volatile(
      "vld1.8 {q2}, [%3] \n"  // load 16 s8 color matrix coefficients.
      "vmovl.s8 q0, d4 \n"  // B,G coefficients s16.
      "vmovl.s8 q1, d5 \n"  // R,A coefficients s16.
      "1: \n"
      "vld4.8 {d16, d18, d20, d22}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmovl.u8 q8, d16 \n"  // b (0 .. 255) 16 bit
      "vmovl.u8 q9, d18 \n"   // g
      "vmovl.u8 q10, d20 \n"  // r
      "vmovl.u8 q11, d22 \n"  // a
      "vmul.s16 q12, q8, d0[0] \n"  // B = B * Matrix B
      "vmul.s16 q13, q8, d1[0] \n"  // G = B * Matrix G
      "vmul.s16 q14, q8, d2[0] \n"  // R = B * Matrix R
      "vmul.s16 q15, q8, d3[0] \n"  // A = B * Matrix A
      "vmul.s16 q4, q9, d0[1] \n"  // B += G * Matrix B
      "vmul.s16 q5, q9, d1[1] \n"  // G += G * Matrix G
      "vmul.s16 q6, q9, d2[1] \n"  // R += G * Matrix R
      "vmul.s16 q7, q9, d3[1] \n"  // A += G * Matrix A
      "vqadd.s16 q12, q12, q4 \n"  // Accumulate B
      "vqadd.s16 q13, q13, q5 \n"  // Accumulate G
      "vqadd.s16 q14, q14, q6 \n"  // Accumulate R
      "vqadd.s16 q15, q15, q7 \n"  // Accumulate A
      "vmul.s16 q4, q10, d0[2] \n"  // B += R * Matrix B
      "vmul.s16 q5, q10, d1[2] \n"  // G += R * Matrix G
      "vmul.s16 q6, q10, d2[2] \n"  // R += R * Matrix R
      "vmul.s16 q7, q10, d3[2] \n"  // A += R * Matrix A
      "vqadd.s16 q12, q12, q4 \n"  // Accumulate B
      "vqadd.s16 q13, q13, q5 \n"  // Accumulate G
      "vqadd.s16 q14, q14, q6 \n"  // Accumulate R
      "vqadd.s16 q15, q15, q7 \n"  // Accumulate A
      "vmul.s16 q4, q11, d0[3] \n"  // B += A * Matrix B
      "vmul.s16 q5, q11, d1[3] \n"  // G += A * Matrix G
      "vmul.s16 q6, q11, d2[3] \n"  // R += A * Matrix R
      "vmul.s16 q7, q11, d3[3] \n"  // A += A * Matrix A
      "vqadd.s16 q12, q12, q4 \n"  // Accumulate B
      "vqadd.s16 q13, q13, q5 \n"  // Accumulate G
      "vqadd.s16 q14, q14, q6 \n"  // Accumulate R
      "vqadd.s16 q15, q15, q7 \n"  // Accumulate A
      "vqshrun.s16 d16, q12, #6 \n"  // 16 bit to 8 bit B
      "vqshrun.s16 d18, q13, #6 \n"  // 16 bit to 8 bit G
      "vqshrun.s16 d20, q14, #6 \n"  // 16 bit to 8 bit R
      "vqshrun.s16 d22, q15, #6 \n"  // 16 bit to 8 bit A
      "vst4.8 {d16, d18, d20, d22}, [%1]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
      : "r"(matrix_argb)   // %3
      : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.8 {d1, d3, d5, d7}, [%1]! \n"  // load 8 more ARGB pixels.
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vmull.u8 q0, d0, d1 \n"  // multiply B
      "vmull.u8 q1, d2, d3 \n"  // multiply G
      "vmull.u8 q2, d4, d5 \n"  // multiply R
      "vmull.u8 q3, d6, d7 \n"  // multiply A
      "vrshrn.u16 d0, q0, #8 \n"  // 16 bit to 8 bit B
      "vrshrn.u16 d1, q1, #8 \n"  // 16 bit to 8 bit G
      "vrshrn.u16 d2, q2, #8 \n"  // 16 bit to 8 bit R
      "vrshrn.u16 d3, q3, #8 \n"  // 16 bit to 8 bit A
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_NEON(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load 8 more ARGB pixels.
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vqadd.u8 q0, q0, q2 \n"  // add B, G
      "vqadd.u8 q1, q1, q3 \n"  // add R, A
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load 8 more ARGB pixels.
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vqsub.u8 q0, q0, q2 \n"  // subtract B, G
      "vqsub.u8 q1, q1, q3 \n"  // subtract R, A
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_NEON(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // alpha
      // 8 pixel loop.
      "1: \n"
      "vld1.8 {d0}, [%0]! \n"  // load 8 sobelx.
      "vld1.8 {d1}, [%1]! \n"  // load 8 sobely.
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vqadd.u8 d0, d0, d1 \n"  // add
      "vmov.u8 d1, d0 \n"
      "vmov.u8 d2, d0 \n"
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
}
// Adds Sobel X and Sobel Y and stores Sobel into plane.
void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width) {
  asm volatile(
      // 16 pixel loop.
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 16 sobelx.
      "vld1.8 {q1}, [%1]! \n"  // load 16 sobely.
      "subs %3, %3, #16 \n"  // 16 processed per loop.
      "vqadd.u8 q0, q0, q1 \n"  // add
      "vst1.8 {q0}, [%2]! \n"  // store 16 pixels.
      "bgt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
}
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_NEON(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // alpha
      // 8 pixel loop.
      "1: \n"
      "vld1.8 {d2}, [%0]! \n"  // load 8 sobelx.
      "vld1.8 {d0}, [%1]! \n"  // load 8 sobely.
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vqadd.u8 d1, d0, d2 \n"  // add
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
}
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
void SobelXRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {d0}, [%0],%5 \n"  // top
      "vld1.8 {d1}, [%0],%6 \n"
      "vsubl.u8 q0, d0, d1 \n"
      "vld1.8 {d2}, [%1],%5 \n"  // center * 2
      "vld1.8 {d3}, [%1],%6 \n"
      "vsubl.u8 q1, d2, d3 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vld1.8 {d2}, [%2],%5 \n"  // bottom
      "vld1.8 {d3}, [%2],%6 \n"
      "subs %4, %4, #8 \n"  // 8 pixels
      "vsubl.u8 q1, d2, d3 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vabs.s16 q0, q0 \n"
      "vqmovn.u16 d0, q0 \n"
      "vst1.8 {d0}, [%3]! \n"  // store 8 sobelx
      "bgt 1b \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(src_y2),      // %2
        "+r"(dst_sobelx),  // %3
        "+r"(width)        // %4
      : "r"(2),  // %5
        "r"(6)   // %6
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
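// Editor's note: scalar sketch of one SobelX output (illustrative only).
// The three row pointers supply the -1 0 1 / -2 0 2 / -1 0 1 taps: columns
// x and x + 2 are differenced, the center row is counted twice, and the
// absolute value is saturated to a byte. SobelY below is the transposed
// kernel, using two rows and three column offsets.
static __attribute__((unused)) uint8_t SobelXPixelSketch(const uint8_t* y0,
                                                         const uint8_t* y1,
                                                         const uint8_t* y2,
                                                         int x) {
  int sobel = (y0[x] - y0[x + 2]) + 2 * (y1[x] - y1[x + 2]) +
              (y2[x] - y2[x + 2]);
  if (sobel < 0) sobel = -sobel;  // vabs.s16
  return (uint8_t)(sobel > 255 ? 255 : sobel);  // vqmovn.u16
}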
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
void SobelYRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {d0}, [%0],%4 \n"  // left
      "vld1.8 {d1}, [%1],%4 \n"
      "vsubl.u8 q0, d0, d1 \n"
      "vld1.8 {d2}, [%0],%4 \n"  // center * 2
      "vld1.8 {d3}, [%1],%4 \n"
      "vsubl.u8 q1, d2, d3 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vld1.8 {d2}, [%0],%5 \n"  // right
      "vld1.8 {d3}, [%1],%5 \n"
      "subs %3, %3, #8 \n"  // 8 pixels
      "vsubl.u8 q1, d2, d3 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vabs.s16 q0, q0 \n"
      "vqmovn.u16 d0, q0 \n"
      "vst1.8 {d0}, [%2]! \n"  // store 8 sobely
      "bgt 1b \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(dst_sobely),  // %2
        "+r"(width)        // %3
      : "r"(1),  // %4
        "r"(6)   // %5
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
// %y passes a float as a scalar vector for vector * scalar multiply.
// The register must be d0 to d15 and indexed with [0] or [1] to access
// the first or second float of the d-register.
void HalfFloat1Row_NEON(const uint16_t* src,
                        uint16_t* dst,
                        float /*unused*/,
                        int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q1}, [%0]! \n"  // load 8 shorts
      "subs %2, %2, #8 \n"  // 8 pixels per loop
      "vmovl.u16 q2, d2 \n"  // 8 ints
      "vmovl.u16 q3, d3 \n"
      "vcvt.f32.u32 q2, q2 \n"  // 8 floats
      "vcvt.f32.u32 q3, q3 \n"
      "vmul.f32 q2, q2, %y3 \n"  // adjust exponent
      "vmul.f32 q3, q3, %y3 \n"
      "vqshrn.u32 d2, q2, #13 \n"  // isolate halffloat
      "vqshrn.u32 d3, q3, #13 \n"
      "vst1.8 {q1}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "w"(1.9259299444e-34f)  // %3
      : "cc", "memory", "q1", "q2", "q3");
}
void HalfFloatRow_NEON(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q1}, [%0]! \n"  // load 8 shorts
      "subs %2, %2, #8 \n"  // 8 pixels per loop
      "vmovl.u16 q2, d2 \n"  // 8 ints
      "vmovl.u16 q3, d3 \n"
      "vcvt.f32.u32 q2, q2 \n"  // 8 floats
      "vcvt.f32.u32 q3, q3 \n"
      "vmul.f32 q2, q2, %y3 \n"  // adjust exponent
      "vmul.f32 q3, q3, %y3 \n"
      "vqshrn.u32 d2, q2, #13 \n"  // isolate halffloat
      "vqshrn.u32 d3, q3, #13 \n"
      "vst1.8 {q1}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "w"(scale * 1.9259299444e-34f)  // %3
      : "cc", "memory", "q1", "q2", "q3");
}
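// Editor's note: sketch of the exponent trick above (illustrative only).
// 1.9259299444e-34f is 2^-112; multiplying a float by it subtracts 112 from
// the exponent, re-biasing from float's 127 to half-float's 15, so bits
// 13..28 of the product are exactly the half-float encoding that
// vqshrn.u32 #13 extracts. Assumes the scaled input stays in half range.
static __attribute__((unused)) uint16_t ToHalfSketch(uint16_t v, float scale) {
  union {
    float f;
    uint32_t u;
  } bits;  // GCC-style type pun, matching this file's toolchain assumptions
  bits.f = (float)v * scale * 1.9259299444e-34f;  // * 2^-112
  return (uint16_t)(bits.u >> 13);
}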
void ByteToFloatRow_NEON(const uint8_t* src,
                         float* dst,
                         float scale,
                         int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {d2}, [%0]! \n"  // load 8 bytes
      "subs %2, %2, #8 \n"  // 8 pixels per loop
      "vmovl.u8 q1, d2 \n"  // 8 shorts
      "vmovl.u16 q2, d2 \n"  // 8 ints
      "vmovl.u16 q3, d3 \n"
      "vcvt.f32.u32 q2, q2 \n"  // 8 floats
      "vcvt.f32.u32 q3, q3 \n"
      "vmul.f32 q2, q2, %y3 \n"  // scale
      "vmul.f32 q3, q3, %y3 \n"
      "vst1.8 {q2, q3}, [%1]! \n"  // store 8 floats
      "bgt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "w"(scale)    // %3
      : "cc", "memory", "q1", "q2", "q3");
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_NEON(const uint16_t* src0,
                   const uint16_t* src1,
                   const uint16_t* src2,
                   const uint16_t* src3,
                   const uint16_t* src4,
                   uint32_t* dst,
                   int width) {
  asm volatile(
      "vmov.u16 d6, #4 \n"  // constant 4
      "vmov.u16 d7, #6 \n"  // constant 6
      "1: \n"
      "vld1.16 {q1}, [%0]! \n"  // load 8 samples, 5 rows
      "vld1.16 {q2}, [%4]! \n"
      "vaddl.u16 q0, d2, d4 \n"  // * 1
      "vaddl.u16 q1, d3, d5 \n"  // * 1
      "vld1.16 {q2}, [%1]! \n"
      "vmlal.u16 q0, d4, d6 \n"  // * 4
      "vmlal.u16 q1, d5, d6 \n"  // * 4
      "vld1.16 {q2}, [%2]! \n"
      "vmlal.u16 q0, d4, d7 \n"  // * 6
      "vmlal.u16 q1, d5, d7 \n"  // * 6
      "vld1.16 {q2}, [%3]! \n"
      "vmlal.u16 q0, d4, d6 \n"  // * 4
      "vmlal.u16 q1, d5, d6 \n"  // * 4
      "subs %6, %6, #8 \n"  // 8 processed per loop
      "vst1.32 {q0, q1}, [%5]! \n"  // store 8 samples
      "bgt 1b \n"
      : "+r"(src0),  // %0
        "+r"(src1),  // %1
        "+r"(src2),  // %2
        "+r"(src3),  // %3
        "+r"(src4),  // %4
        "+r"(dst),   // %5
        "+r"(width)  // %6
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
  const uint32_t* src1 = src + 1;
  const uint32_t* src2 = src + 2;
  const uint32_t* src3 = src + 3;
  asm volatile(
      "vmov.u32 q10, #4 \n"  // constant 4
      "vmov.u32 q11, #6 \n"  // constant 6
      "1: \n"
      "vld1.32 {q0, q1}, [%0]! \n"  // load 12 source samples
      "vld1.32 {q2}, [%0] \n"
      "vadd.u32 q0, q0, q1 \n"  // * 1
      "vadd.u32 q1, q1, q2 \n"  // * 1
      "vld1.32 {q2, q3}, [%2]! \n"
      "vmla.u32 q0, q2, q11 \n"  // * 6
      "vmla.u32 q1, q3, q11 \n"  // * 6
      "vld1.32 {q2, q3}, [%1]! \n"
      "vld1.32 {q8, q9}, [%3]! \n"
      "vadd.u32 q2, q2, q8 \n"  // add rows for * 4
      "vadd.u32 q3, q3, q9 \n"
      "vmla.u32 q0, q2, q10 \n"  // * 4
      "vmla.u32 q1, q3, q10 \n"  // * 4
      "subs %5, %5, #8 \n"  // 8 processed per loop
      "vqshrn.u32 d0, q0, #8 \n"  // round and pack
      "vqshrn.u32 d1, q1, #8 \n"
      "vst1.u16 {q0}, [%4]! \n"  // store 8 samples
      "bgt 1b \n"
      : "+r"(src),   // %0
        "+r"(src1),  // %1
        "+r"(src2),  // %2
        "+r"(src3),  // %3
        "+r"(dst),   // %4
        "+r"(width)  // %5
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
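// Editor's note: scalar sketch of the separable 1-4-6-4-1 filter
// (illustrative only, saturation omitted). GaussCol applies the kernel
// vertically, widening 16-bit samples to 32-bit sums; GaussRow applies it
// horizontally and narrows by 8 bits; the two passes together divide by
// the combined kernel weight 16 * 16 = 256.
static __attribute__((unused)) uint16_t GaussPixelSketch(const uint16_t* r0,
                                                         const uint16_t* r1,
                                                         const uint16_t* r2,
                                                         const uint16_t* r3,
                                                         const uint16_t* r4,
                                                         int x) {
  uint32_t col[5];
  for (int i = 0; i < 5; ++i) {  // vertical pass (GaussCol_NEON)
    col[i] = r0[x + i] + 4 * r1[x + i] + 6 * r2[x + i] + 4 * r3[x + i] +
             r4[x + i];
  }
  uint32_t sum = col[0] + 4 * col[1] + 6 * col[2] + 4 * col[3] + col[4];
  return (uint16_t)(sum >> 8);  // horizontal pass + narrow (GaussRow_NEON)
}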
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q2}, [%0]! \n"  // load 16 Y values
      "vld2.8 {d0, d2}, [%1]! \n"  // load 8 VU values
      "vmov d1, d0 \n"
      "vzip.u8 d0, d1 \n"  // VV
      "vmov d3, d2 \n"
      "vzip.u8 d2, d3 \n"  // UU
      "subs %3, %3, #16 \n"  // 16 pixels per loop
      "vst3.8 {d0, d2, d4}, [%2]! \n"  // store 16 YUV pixels
      "vst3.8 {d1, d3, d5}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),      // %0
        "+r"(src_vu),     // %1
        "+r"(dst_yuv24),  // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2");
}
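// Editor's note: scalar sketch of the store pattern above (illustrative
// only). Each VU pair covers two Y samples; vzip replicates V and U so the
// vst3 emits one (V, U, Y) triplet per pixel, matching the register order
// {d0, d2, d4} used in the stores.
static __attribute__((unused)) void NV21ToYUV24PixelSketch(
    const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int x) {
  dst_yuv24[x * 3 + 0] = src_vu[(x & ~1) + 0];  // V, replicated 2x
  dst_yuv24[x * 3 + 1] = src_vu[(x & ~1) + 1];  // U, replicated 2x
  dst_yuv24[x * 3 + 2] = src_y[x];              // Y
}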
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_uv,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_AYUV
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 AYUV pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 AYUV pixels.
      "vpaddl.u8 q0, q0 \n"  // V 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // U 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more AYUV pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 AYUV pixels.
      "vpadal.u8 q0, q4 \n"  // V 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // U 16 bytes -> 8 shorts.
      "vqrshrun.s16 d1, q0, #2 \n"  // 2x2 average
      "vqrshrun.s16 d0, q1, #2 \n"
      "subs %3, %3, #16 \n"  // 16 processed per loop.
      "vst2.8 {d0, d1}, [%2]! \n"  // store 8 pixels UV.
      "bgt 1b \n"
      : "+r"(src_ayuv),         // %0
        "+r"(src_stride_ayuv),  // %1
        "+r"(dst_uv),           // %2
        "+r"(width)             // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
}
void AYUVToVURow_NEON(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_vu,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_AYUV
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 AYUV pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 AYUV pixels.
      "vpaddl.u8 q0, q0 \n"  // V 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // U 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more AYUV pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 AYUV pixels.
      "vpadal.u8 q0, q4 \n"  // V 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // U 16 bytes -> 8 shorts.
      "vqrshrun.s16 d0, q0, #2 \n"  // 2x2 average
      "vqrshrun.s16 d1, q1, #2 \n"
      "subs %3, %3, #16 \n"  // 16 processed per loop.
      "vst2.8 {d0, d1}, [%2]! \n"  // store 8 pixels VU.
      "bgt 1b \n"
      : "+r"(src_ayuv),         // %0
        "+r"(src_stride_ayuv),  // %1
        "+r"(dst_vu),           // %2
        "+r"(width)             // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
}
// Copy row of AYUV Y's into Y.
// Similar to ARGBExtractAlphaRow_NEON
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 AYUV pixels
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 AYUV pixels
      "subs %2, %2, #16 \n"  // 16 processed per loop
      "vst1.8 {q2}, [%1]! \n"  // store 16 Y's.
      "bgt 1b \n"
      : "+r"(src_ayuv),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {d0, d2}, [%0]! \n"  // load 16 UV values
      "vld2.8 {d1, d3}, [%0]! \n"
      "vorr.u8 q2, q0, q0 \n"  // move U after V
      "subs %2, %2, #16 \n"  // 16 pixels per loop
      "vst2.8 {q1, q2}, [%1]! \n"  // store 16 VU pixels
      "bgt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_vu),  // %1
        "+r"(width)    // %2
      :
      : "cc", "memory", "q0", "q1", "q2");
}
#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif