/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include <stdio.h>

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
  "vld1.8 {d0}, [%0]! \n" \
  "vld1.32 {d2[0]}, [%1]! \n" \
  "vld1.32 {d2[1]}, [%2]! \n"

// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
  "vld1.8 {d0}, [%0]! \n" \
  "vld1.8 {d2}, [%1]! \n" \
  "vld1.8 {d3}, [%2]! \n" \
  "vpaddl.u8 q1, q1 \n" \
  "vrshrn.u16 d2, q1, #1 \n"

// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
  "vld1.8 {d0}, [%0]! \n" \
  "vmov.u8 d2, #128 \n"

// Read 8 Y and 4 UV from NV12
#define READNV12 \
  "vld1.8 {d0}, [%0]! \n" \
  "vld1.8 {d2}, [%1]! \n" \
  "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
  "vuzp.u8 d2, d3 \n" \
  "vtrn.u32 d2, d3 \n"

// Read 8 Y and 4 VU from NV21
#define READNV21 \
  "vld1.8 {d0}, [%0]! \n" \
  "vld1.8 {d2}, [%1]! \n" \
  "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
  "vuzp.u8 d3, d2 \n" \
  "vtrn.u32 d2, d3 \n"

// Read 8 YUY2
#define READYUY2 \
  "vld2.8 {d0, d2}, [%0]! \n" \
  "vmov.u8 d3, d2 \n" \
  "vuzp.u8 d2, d3 \n" \
  "vtrn.u32 d2, d3 \n"

// Read 8 UYVY
#define READUYVY \
  "vld2.8 {d2, d3}, [%0]! \n" \
  "vmov.u8 d0, d3 \n" \
  "vmov.u8 d3, d2 \n" \
  "vuzp.u8 d2, d3 \n" \
  "vtrn.u32 d2, d3 \n"

#define YUVTORGB_SETUP \
  "vld1.8 {d24}, [%[kUVToRB]] \n" \
  "vld1.8 {d25}, [%[kUVToG]] \n" \
  "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
  "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
  "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
  "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"

#define YUVTORGB \
  "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \
  "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \
  "vmovl.u8 q0, d0 \n" /* Y */ \
  "vmovl.s16 q10, d1 \n" \
  "vmovl.s16 q0, d0 \n" \
  "vmul.s32 q10, q10, q15 \n" \
  "vmul.s32 q0, q0, q15 \n" \
  "vqshrun.s32 d0, q0, #16 \n" \
  "vqshrun.s32 d1, q10, #16 \n" /* Y */ \
  "vadd.s16 d18, d19 \n" \
  "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \
  "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \
  "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG) */ \
  "vaddw.u16 q1, q1, d16 \n" \
  "vaddw.u16 q10, q10, d17 \n" \
  "vaddw.u16 q3, q3, d18 \n" \
  "vqadd.s16 q8, q0, q13 \n" /* B */ \
  "vqadd.s16 q9, q0, q14 \n" /* R */ \
  "vqadd.s16 q0, q0, q4 \n" /* G */ \
  "vqadd.s16 q8, q8, q1 \n" /* B */ \
  "vqadd.s16 q9, q9, q10 \n" /* R */ \
  "vqsub.s16 q0, q0, q3 \n" /* G */ \
  "vqshrun.s16 d20, q8, #6 \n" /* B */ \
  "vqshrun.s16 d22, q9, #6 \n" /* R */ \
  "vqshrun.s16 d21, q0, #6 \n" /* G */
void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READYUV444 YUVTORGB
      "subs %4, %4, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READYUV422 YUVTORGB
      "subs %4, %4, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}
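// Illustrative use (not part of this file; buffer names are assumptions):
// converting one 64-pixel row of I422 to ARGB with the BT.601 constants.
//
//   uint8_t argb_row[64 * 4];  // 4 bytes per ARGB pixel
//   I422ToARGBRow_NEON(row_y, row_u, row_v, argb_row, &kYuvI601Constants, 64);
//
// Each loop iteration produces 8 pixels, so callers are expected to pass a
// width that is a multiple of 8 (libyuv's *_Any wrappers handle remainders).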
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      "subs %5, %5, #8 \n"
      "vld1.8 {d23}, [%3]! \n"
      "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      "subs %4, %4, #8 \n"
      "vmov.u8 d19, #255 \n"  // YUVTORGB modified d19
      "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_rgba),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      "subs %4, %4, #8 \n"
      "vst3.8 {d20, d21, d22}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_y),      // %0
        "+r"(src_u),      // %1
        "+r"(src_v),      // %2
        "+r"(dst_rgb24),  // %3
        "+r"(width)       // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

#define ARGBTORGB565 \
  "vshll.u8 q0, d22, #8 \n" /* R */ \
  "vshll.u8 q8, d21, #8 \n" /* G */ \
  "vshll.u8 q9, d20, #8 \n" /* B */ \
  "vsri.16 q0, q8, #5 \n" /* RG */ \
  "vsri.16 q0, q9, #11 \n" /* RGB */
void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      "subs %4, %4, #8 \n" ARGBTORGB565
      "vst1.8 {q0}, [%3]! \n"  // store 8 pixels RGB565.
      "bgt 1b \n"
      : "+r"(src_y),       // %0
        "+r"(src_u),       // %1
        "+r"(src_v),       // %2
        "+r"(dst_rgb565),  // %3
        "+r"(width)        // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

#define ARGBTOARGB1555 \
  "vshll.u8 q0, d23, #8 \n" /* A */ \
  "vshll.u8 q8, d22, #8 \n" /* R */ \
  "vshll.u8 q9, d21, #8 \n" /* G */ \
  "vshll.u8 q10, d20, #8 \n" /* B */ \
  "vsri.16 q0, q8, #1 \n" /* AR */ \
  "vsri.16 q0, q9, #6 \n" /* ARG */ \
  "vsri.16 q0, q10, #11 \n" /* ARGB */

void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      "subs %4, %4, #8 \n"
      "vmov.u8 d23, #255 \n" ARGBTOARGB1555
      "vst1.8 {q0}, [%3]! \n"  // store 8 pixels
      "bgt 1b \n"
      : "+r"(src_y),         // %0
        "+r"(src_u),         // %1
        "+r"(src_v),         // %2
        "+r"(dst_argb1555),  // %3
        "+r"(width)          // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

#define ARGBTOARGB4444 \
  "vshr.u8 d20, d20, #4 \n" /* B */ \
  "vbic.32 d21, d21, d4 \n" /* G */ \
  "vshr.u8 d22, d22, #4 \n" /* R */ \
  "vbic.32 d23, d23, d4 \n" /* A */ \
  "vorr d0, d20, d21 \n" /* BG */ \
  "vorr d1, d22, d23 \n" /* RA */ \
  "vzip.u8 d0, d1 \n" /* BGRA */

void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d4, #0x0f \n"  // vbic bits to clear
      "1: \n"
      READYUV422 YUVTORGB
      "subs %4, %4, #8 \n"
      "vmov.u8 d23, #255 \n" ARGBTOARGB4444
      "vst1.8 {q0}, [%3]! \n"  // store 8 pixels
      "bgt 1b \n"
      : "+r"(src_y),         // %0
        "+r"(src_u),         // %1
        "+r"(src_v),         // %2
        "+r"(dst_argb4444),  // %3
        "+r"(width)          // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READYUV400 YUVTORGB
      "subs %2, %2, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
        [kUVToG] "r"(&kYuvI601Constants.kUVToG),
        [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
        [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8 d23, #255 \n"
      "1: \n"
      "vld1.8 {d20}, [%0]! \n"
      "vmov d21, d20 \n"
      "vmov d22, d20 \n"
      "subs %2, %2, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d20", "d21", "d22", "d23");
}

void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READNV12 YUVTORGB
      "subs %3, %3, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_uv),    // %1
        "+r"(dst_argb),  // %2
        "+r"(width)      // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}

void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READNV21 YUVTORGB
      "subs %3, %3, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_vu),    // %1
        "+r"(dst_argb),  // %2
        "+r"(width)      // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}

void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      READNV12 YUVTORGB
      "subs %3, %3, #8 \n"
      "vst3.8 {d20, d21, d22}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),      // %0
        "+r"(src_uv),     // %1
        "+r"(dst_rgb24),  // %2
        "+r"(width)       // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      READNV21 YUVTORGB
      "subs %3, %3, #8 \n"
      "vst3.8 {d20, d21, d22}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),      // %0
        "+r"(src_vu),     // %1
        "+r"(dst_rgb24),  // %2
        "+r"(width)       // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READNV12 YUVTORGB
      "subs %3, %3, #8 \n" ARGBTORGB565
      "vst1.8 {q0}, [%2]! \n"  // store 8 pixels RGB565.
      "bgt 1b \n"
      : "+r"(src_y),       // %0
        "+r"(src_uv),      // %1
        "+r"(dst_rgb565),  // %2
        "+r"(width)        // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READYUY2 YUVTORGB
      "subs %2, %2, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}

void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READUYVY YUVTORGB
      "subs %2, %2, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}
// Reads 16 pairs of UV and writes even values to dst_u and odd values to
// dst_v.
void SplitUVRow_NEON(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pairs of UV
      "subs %3, %3, #16 \n"  // 16 processed per loop
      "vst1.8 {q0}, [%1]! \n"  // store U
      "vst1.8 {q1}, [%2]! \n"  // store V
      "bgt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3  // Output registers
      :                       // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
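// Illustrative use (not part of this file; variable names are assumptions):
// de-interleaving one row of an NV12 chroma plane into separate U and V
// planes.
//
//   SplitUVRow_NEON(uv + row * uv_stride, u_plane + row * u_stride,
//                   v_plane + row * v_stride, half_width);
//
// Here width counts UV pairs and should be a multiple of 16 for this kernel.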
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load U
      "vld1.8 {q1}, [%1]! \n"  // load V
      "subs %3, %3, #16 \n"  // 16 processed per loop
      "vst2.8 {q0, q1}, [%2]! \n"  // store 16 pairs of UV
      "bgt 1b \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3  // Output registers
      :                       // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  asm volatile(
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RGB
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // next 8 RGB
      "subs %4, %4, #16 \n"  // 16 processed per loop
      "vst1.8 {q0}, [%1]! \n"  // store R
      "vst1.8 {q1}, [%2]! \n"  // store G
      "vst1.8 {q2}, [%3]! \n"  // store B
      "bgt 1b \n"
      : "+r"(src_rgb),  // %0
        "+r"(dst_r),    // %1
        "+r"(dst_g),    // %2
        "+r"(dst_b),    // %3
        "+r"(width)     // %4
      :  // Input registers
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List (q0-q2 cover d0-d5)
  );
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time.
void MergeRGBRow_NEON(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_rgb,
                      int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load R
      "vld1.8 {q1}, [%1]! \n"  // load G
      "vld1.8 {q2}, [%2]! \n"  // load B
      "subs %4, %4, #16 \n"  // 16 processed per loop
      "vst3.8 {d0, d2, d4}, [%3]! \n"  // store 8 RGB
      "vst3.8 {d1, d3, d5}, [%3]! \n"  // next 8 RGB
      "bgt 1b \n"
      : "+r"(src_r),    // %0
        "+r"(src_g),    // %1
        "+r"(src_b),    // %2
        "+r"(dst_rgb),  // %3
        "+r"(width)     // %4
      :  // Input registers
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}
// Copy multiple of 32. vld1.8 of four registers allows unaligned access and
// is fastest on A15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {d0, d1, d2, d3}, [%0]! \n"  // load 32
      "subs %2, %2, #32 \n"  // 32 processed per loop
      "vst1.8 {d0, d1, d2, d3}, [%1]! \n"  // store 32
      "bgt 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2  // Output registers
      :  // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
  asm volatile(
      "vdup.8 q0, %2 \n"  // duplicate 16 bytes
      "1: \n"
      "subs %1, %1, #16 \n"  // 16 bytes per loop
      "vst1.8 {q0}, [%0]! \n"  // store
      "bgt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v8)      // %2
      : "cc", "memory", "q0");
}
// ARGBSetRow writes 'width' pixels using a 32 bit value repeated.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
  asm volatile(
      "vdup.u32 q0, %2 \n"  // duplicate 4 ints
      "1: \n"
      "subs %1, %1, #4 \n"  // 4 pixels per loop
      "vst1.8 {q0}, [%0]! \n"  // store
      "bgt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v32)     // %2
      : "cc", "memory", "q0");
}
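// MirrorRow reverses a row by walking the source backwards 16 bytes at a time
// (the post-index register r3 holds -16), byte-reversing each 8-byte half
// with vrev64.8, and then storing d1 before d0 so the two halves land in
// fully reversed order.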
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Start at end of source row.
      "mov r3, #-16 \n"
      "add %0, %0, %2 \n"
      "sub %0, #16 \n"
      "1: \n"
      "vld1.8 {q0}, [%0], r3 \n"  // src -= 16
      "subs %2, #16 \n"  // 16 pixels per loop.
      "vrev64.8 q0, q0 \n"
      "vst1.8 {d1}, [%1]! \n"  // dst += 16
      "vst1.8 {d0}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "cc", "memory", "r3", "q0");
}

void MirrorUVRow_NEON(const uint8_t* src_uv,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      // Start at end of source row.
      "mov r12, #-16 \n"
      "add %0, %0, %3, lsl #1 \n"
      "sub %0, #16 \n"
      "1: \n"
      "vld2.8 {d0, d1}, [%0], r12 \n"  // src -= 16
      "subs %3, #8 \n"  // 8 pixels per loop.
      "vrev64.8 q0, q0 \n"
      "vst1.8 {d0}, [%1]! \n"  // dst += 8
      "vst1.8 {d1}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "cc", "memory", "r12", "q0");
}

void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Start at end of source row.
      "mov r3, #-16 \n"
      "add %0, %0, %2, lsl #2 \n"
      "sub %0, #16 \n"
      "1: \n"
      "vld1.8 {q0}, [%0], r3 \n"  // src -= 16
      "subs %2, #4 \n"  // 4 pixels per loop.
      "vrev64.32 q0, q0 \n"
      "vst1.8 {d1}, [%1]! \n"  // dst += 16
      "vst1.8 {d0}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "cc", "memory", "r3", "q0");
}

void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RGB24.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vswp.u8 d1, d3 \n"  // swap R, B
      "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  asm volatile(
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vswp.u8 d1, d3 \n"  // swap R, B
      "vst3.8 {d1, d2, d3}, [%1]! \n"  // store 8 pixels of RGB24.
      "bgt 1b \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3"  // Clobber List
  );
}

#define RGB565TOARGB \
  "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
  "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
  "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
  "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
  "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8 d0, d0, d4 \n" /* B */ \
  "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
  "vorr.u8 d2, d1, d5 \n" /* R */ \
  "vorr.u8 d1, d4, d6 \n" /* G */
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 RGB565 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      RGB565TOARGB
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

#define ARGB1555TOARGB \
  "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
  "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
  "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
  "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
  "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
  "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
  "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
  "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
  "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
  "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
  "vorr.u8 q1, q1, q3 \n" /* R,A */ \
  "vorr.u8 q0, q0, q2 \n" /* B,G */

// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
#define RGB555TOARGB \
  "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
  "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
  "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
  "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
  "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8 d0, d0, d4 \n" /* B */ \
  "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
  "vorr.u8 d2, d1, d5 \n" /* R */ \
  "vorr.u8 d1, d4, d6 \n" /* G */

void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB1555 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

#define ARGB4444TOARGB \
  "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
  "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
  "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
  "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
  "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
  "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
  "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
  "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */

void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB4444 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}

void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d1, d2, d3, d4}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vst3.8 {d1, d2, d3}, [%1]! \n"  // store 8 pixels of RGB24.
      "bgt 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d1, d2, d3, d4}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vswp.u8 d1, d3 \n"  // swap R, B
      "vst3.8 {d1, d2, d3}, [%1]! \n"  // store 8 pixels of RAW.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_raw),   // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %2, %2, #16 \n"  // 16 processed per loop.
      "vst1.8 {q0}, [%1]! \n"  // store 16 pixels of Y.
      "bgt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %2, %2, #16 \n"  // 16 processed per loop.
      "vst1.8 {q1}, [%1]! \n"  // store 16 pixels of Y.
      "bgt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.
      "vst1.8 {d1}, [%1]! \n"  // store 8 U.
      "vst1.8 {d3}, [%2]! \n"  // store 8 V.
      "bgt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}

void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.
      "vst1.8 {d0}, [%1]! \n"  // store 8 U.
      "vst1.8 {d2}, [%2]! \n"  // store 8 V.
      "bgt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}

void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // stride + src_yuy2
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %4, %4, #16 \n"  // 16 pixels = 8 UVs.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load next row YUY2.
      "vrhadd.u8 d1, d1, d5 \n"  // average rows of U
      "vrhadd.u8 d3, d3, d7 \n"  // average rows of V
      "vst1.8 {d1}, [%2]! \n"  // store 8 U.
      "vst1.8 {d3}, [%3]! \n"  // store 8 V.
      "bgt 1b \n"
      : "+r"(src_yuy2),     // %0
        "+r"(stride_yuy2),  // %1
        "+r"(dst_u),        // %2
        "+r"(dst_v),        // %3
        "+r"(width)         // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
        "d7"  // Clobber List
  );
}

void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // stride + src_uyvy
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %4, %4, #16 \n"  // 16 pixels = 8 UVs.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load next row UYVY.
      "vrhadd.u8 d0, d0, d4 \n"  // average rows of U
      "vrhadd.u8 d2, d2, d6 \n"  // average rows of V
      "vst1.8 {d0}, [%2]! \n"  // store 8 U.
      "vst1.8 {d2}, [%3]! \n"  // store 8 V.
      "bgt 1b \n"
      : "+r"(src_uyvy),     // %0
        "+r"(stride_uyvy),  // %1
        "+r"(dst_u),        // %2
        "+r"(dst_v),        // %3
        "+r"(width)         // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
        "d7"  // Clobber List
  );
}

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(
      "vld1.8 {q2}, [%3] \n"  // shuffler
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 4 pixels.
      "subs %2, %2, #4 \n"  // 4 processed per loop
      "vtbl.8 d2, {d0, d1}, d4 \n"  // look up 2 first pixels
      "vtbl.8 d3, {d0, d1}, d5 \n"  // look up 2 next pixels
      "vst1.8 {q1}, [%1]! \n"  // store 4.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}
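// For the two packers below: YUY2 stores pixels as Y0 U Y1 V and UYVY as
// U Y0 V Y1, so loading even and odd Y samples into alternating registers and
// issuing a 4-way interleaved vst4.8 emits the packed layout directly.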
void I422ToYUY2Row_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {d0, d2}, [%0]! \n"  // load 16 Ys
      "vld1.8 {d1}, [%1]! \n"  // load 8 Us
      "vld1.8 {d3}, [%2]! \n"  // load 8 Vs
      "subs %4, %4, #16 \n"  // 16 pixels
      "vst4.8 {d0, d1, d2, d3}, [%3]! \n"  // Store 8 YUY2/16 pixels.
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3");
}

void I422ToUYVYRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {d1, d3}, [%0]! \n"  // load 16 Ys
      "vld1.8 {d0}, [%1]! \n"  // load 8 Us
      "vld1.8 {d2}, [%2]! \n"  // load 8 Vs
      "subs %4, %4, #16 \n"  // 16 pixels
      "vst4.8 {d0, d1, d2, d3}, [%3]! \n"  // Store 8 UYVY/16 pixels.
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3");
}

void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_rgb565,
                          int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d20, d21, d22, d23}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      ARGBTORGB565
      "vst1.8 {q0}, [%1]! \n"  // store 8 pixels RGB565.
      "bgt 1b \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_rgb565),  // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
}

void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vdup.32 d2, %2 \n"  // dither4
      "1: \n"
      "vld4.8 {d20, d21, d22, d23}, [%1]! \n"  // load 8 pixels of ARGB.
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vqadd.u8 d20, d20, d2 \n"
      "vqadd.u8 d21, d21, d2 \n"
      "vqadd.u8 d22, d22, d2 \n"  // add for dither
      ARGBTORGB565
      "vst1.8 {q0}, [%0]! \n"  // store 8 RGB565.
      "bgt 1b \n"
      : "+r"(dst_rgb)    // %0
      : "r"(src_argb),   // %1
        "r"(dither4),    // %2
        "r"(width)       // %3
      : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
}

void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb1555,
                            int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d20, d21, d22, d23}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      ARGBTOARGB1555
      "vst1.8 {q0}, [%1]! \n"  // store 8 ARGB1555.
      "bgt 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb1555),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
}

void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb4444,
                            int width) {
  asm volatile(
      "vmov.u8 d4, #0x0f \n"  // bits to clear with vbic.
      "1: \n"
      "vld4.8 {d20, d21, d22, d23}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      ARGBTOARGB4444
      "vst1.8 {q0}, [%1]! \n"  // store 8 ARGB4444.
      "bgt 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb4444),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q2", "q8", "q9", "q10", "q11");
}
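// ARGBToYRow computes video-range BT.601 luma. With the 7-bit fixed-point
// coefficients below it is effectively
//   Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16
// where the +64 is the rounding added by vqrshrun #7 and the final add
// saturates to 8 bits.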
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d24, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d27, #16 \n"  // Add 16 constant
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrun.s16 d0, q2, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d27 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}

void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels
      "subs %2, %2, #16 \n"  // 16 processed per loop
      "vst1.8 {q3}, [%1]! \n"  // store 16 A's.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d24, #15 \n"  // B * 0.11400 coefficient
      "vmov.u8 d25, #75 \n"  // G * 0.58700 coefficient
      "vmov.u8 d26, #38 \n"  // R * 0.29900 coefficient
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrun.s16 d0, q2, #7 \n"  // 15 bit to 8 bit Y
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}
// 8x1 pixels.
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vmov.u8 d24, #112 \n"  // UB / VR 0.875 coefficient
      "vmov.u8 d25, #74 \n"   // UG -0.5781 coefficient
      "vmov.u8 d26, #38 \n"   // UR -0.2969 coefficient
      "vmov.u8 d27, #18 \n"   // VB -0.1406 coefficient
      "vmov.u8 d28, #94 \n"   // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlsl.u8 q2, d1, d25 \n"  // G
      "vmlsl.u8 q2, d2, d26 \n"  // R
      "vadd.u16 q2, q2, q15 \n"  // +128 -> unsigned
      "vmull.u8 q3, d2, d24 \n"  // R
      "vmlsl.u8 q3, d1, d28 \n"  // G
      "vmlsl.u8 q3, d0, d27 \n"  // B
      "vadd.u16 q3, q3, q15 \n"  // +128 -> unsigned
      "vqshrn.u16 d0, q2, #8 \n"  // 16 bit to 8 bit U
      "vqshrn.u16 d1, q3, #8 \n"  // 16 bit to 8 bit V
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%2]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
        "q15");
}

// clang-format off
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
  "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
  "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
  "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
  "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
  "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
  "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
  "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
  "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
  "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
  "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
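// With the halved coefficients each caller loads (112/2, 74/2, ...) and the
// 2x2 box average computed before the macro runs, RGBTOUV is effectively
//   U = ((112 * B - 74 * G - 38 * R) >> 8) + 128
//   V = ((112 * R - 94 * G - 18 * B) >> 8) + 128
// per averaged pixel; the 0x8080 bias plus vqshrn #8 fold in the +128 offset
// while narrowing back to unsigned 8 bits.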
  1237. // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
  1238. void ARGBToUVRow_NEON(const uint8_t* src_argb,
  1239. int src_stride_argb,
  1240. uint8_t* dst_u,
  1241. uint8_t* dst_v,
  1242. int width) {
  1243. asm volatile (
  1244. "add %1, %0, %1 \n" // src_stride + src_argb
  1245. "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
  1246. "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
  1247. "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
  1248. "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
  1249. "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
  1250. "vmov.u16 q15, #0x8080 \n" // 128.5
  1251. "1: \n"
  1252. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
  1253. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
  1254. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
  1255. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
  1256. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
  1257. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
  1258. "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
  1259. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
  1260. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
  1261. "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
  1262. "vrshr.u16 q0, q0, #1 \n" // 2x average
  1263. "vrshr.u16 q1, q1, #1 \n"
  1264. "vrshr.u16 q2, q2, #1 \n"
  1265. "subs %4, %4, #16 \n" // 32 processed per loop.
  1266. RGBTOUV(q0, q1, q2)
  1267. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
  1268. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
  1269. "bgt 1b \n"
  1270. : "+r"(src_argb), // %0
  1271. "+r"(src_stride_argb), // %1
  1272. "+r"(dst_u), // %2
  1273. "+r"(dst_v), // %3
  1274. "+r"(width) // %4
  1275. :
  1276. : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
  1277. "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  1278. );
  1279. }
// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_argb
      "vmov.s16 q10, #127 / 2 \n"  // UB / VR 0.500 coefficient
      "vmov.s16 q11, #84 / 2 \n"  // UG -0.33126 coefficient
      "vmov.s16 q12, #43 / 2 \n"  // UR -0.16874 coefficient
      "vmov.s16 q13, #20 / 2 \n"  // VB -0.08131 coefficient
      "vmov.s16 q14, #107 / 2 \n"  // VG -0.41869 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
      "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ARGB pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ARGB pixels.
      "vpadal.u8 q0, q4 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"
      "subs %4, %4, #16 \n"  // 32 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb),         // %0
        "+r"(src_stride_argb),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
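// A scalar sketch of the J (full-range JPEG) variant above, using the
// constants loaded into q10-q14 (127/2, 84/2, 43/2, 20/2, 107/2):
//   u = sat8((b * 63 - g * 42 - r * 21 + 0x8080) >> 8);
//   v = sat8((r * 63 - g * 53 - b * 10 + 0x8080) >> 8);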
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                      int src_stride_bgra,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_bgra
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 BGRA pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 BGRA pixels.
      "vpaddl.u8 q3, q3 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more BGRA pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 BGRA pixels.
      "vpadal.u8 q3, q7 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16 q1, q1, #1 \n"  // 2x average
      "vrshr.u16 q2, q2, #1 \n"
      "vrshr.u16 q3, q3, #1 \n"
      "subs %4, %4, #16 \n"  // 32 processed per loop.
      RGBTOUV(q3, q2, q1)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_bgra),         // %0
        "+r"(src_stride_bgra),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_abgr
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ABGR pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ABGR pixels.
      "vpaddl.u8 q2, q2 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q0, q0 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ABGR pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ABGR pixels.
      "vpadal.u8 q2, q6 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q0, q4 \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"
      "subs %4, %4, #16 \n"  // 32 processed per loop.
      RGBTOUV(q2, q1, q0)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_abgr),         // %0
        "+r"(src_stride_abgr),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                      int src_stride_rgba,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_rgba
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 RGBA pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 RGBA pixels.
      "vpaddl.u8 q0, q1 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q2 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q3 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more RGBA pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 RGBA pixels.
      "vpadal.u8 q0, q5 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q6 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q7 \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"
      "subs %4, %4, #16 \n"  // 32 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_rgba),         // %0
        "+r"(src_stride_rgba),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                       int src_stride_rgb24,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_rgb24
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RGB24 pixels.
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // load next 8 RGB24 pixels.
      "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
      "vld3.8 {d8, d10, d12}, [%1]! \n"  // load 8 more RGB24 pixels.
      "vld3.8 {d9, d11, d13}, [%1]! \n"  // load last 8 RGB24 pixels.
      "vpadal.u8 q0, q4 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"
      "subs %4, %4, #16 \n"  // 32 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_rgb24),         // %0
        "+r"(src_stride_rgb24),  // %1
        "+r"(dst_u),             // %2
        "+r"(dst_v),             // %3
        "+r"(width)              // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
void RAWToUVRow_NEON(const uint8_t* src_raw,
                     int src_stride_raw,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_raw
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RAW pixels.
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // load next 8 RAW pixels.
      "vpaddl.u8 q2, q2 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q0, q0 \n"  // R 16 bytes -> 8 shorts.
      "vld3.8 {d8, d10, d12}, [%1]! \n"  // load 8 more RAW pixels.
      "vld3.8 {d9, d11, d13}, [%1]! \n"  // load last 8 RAW pixels.
      "vpadal.u8 q2, q6 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q0, q4 \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"
      "subs %4, %4, #16 \n"  // 32 processed per loop.
      RGBTOUV(q2, q1, q0)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_raw),         // %0
        "+r"(src_stride_raw),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(width)            // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// 16x2 pixels -> 8x1. width is number of RGB565 pixels, e.g. 16.
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
                        int src_stride_rgb565,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_rgb565
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 RGB565 pixels.
      RGB565TOARGB
      "vpaddl.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%0]! \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "vpaddl.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // load 8 RGB565 pixels.
      RGB565TOARGB
      "vpadal.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "vpadal.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vrshr.u16 q4, q4, #1 \n"  // 2x average
      "vrshr.u16 q5, q5, #1 \n"
      "vrshr.u16 q6, q6, #1 \n"
      "subs %4, %4, #16 \n"  // 16 processed per loop.
      "vmul.s16 q8, q4, q10 \n"  // B
      "vmls.s16 q8, q5, q11 \n"  // G
      "vmls.s16 q8, q6, q12 \n"  // R
      "vadd.u16 q8, q8, q15 \n"  // +128 -> unsigned
      "vmul.s16 q9, q6, q10 \n"  // R
      "vmls.s16 q9, q5, q14 \n"  // G
      "vmls.s16 q9, q4, q13 \n"  // B
      "vadd.u16 q9, q9, q15 \n"  // +128 -> unsigned
      "vqshrn.u16 d0, q8, #8 \n"  // 16 bit to 8 bit U
      "vqshrn.u16 d1, q9, #8 \n"  // 16 bit to 8 bit V
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_rgb565),         // %0
        "+r"(src_stride_rgb565),  // %1
        "+r"(dst_u),              // %2
        "+r"(dst_v),              // %3
        "+r"(width)               // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
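// RGB565TOARGB (defined earlier in this file) widens 5/6/5 bits to 8 by
// replicating the top bits into the low bits; a scalar sketch of that
// expansion for one pixel:
//   uint8_t b = (pix << 3) & 0xf8;  b |= b >> 5;  // 5 -> 8 bits
//   uint8_t g = (pix >> 3) & 0xfc;  g |= g >> 6;  // 6 -> 8 bits
//   uint8_t r = (pix >> 8) & 0xf8;  r |= r >> 5;  // 5 -> 8 bits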
// 16x2 pixels -> 8x1. width is number of ARGB1555 pixels, e.g. 16.
void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
                          int src_stride_argb1555,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_argb1555
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpaddl.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%0]! \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpaddl.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // load 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpadal.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpadal.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vrshr.u16 q4, q4, #1 \n"  // 2x average
      "vrshr.u16 q5, q5, #1 \n"
      "vrshr.u16 q6, q6, #1 \n"
      "subs %4, %4, #16 \n"  // 16 processed per loop.
      "vmul.s16 q8, q4, q10 \n"  // B
      "vmls.s16 q8, q5, q11 \n"  // G
      "vmls.s16 q8, q6, q12 \n"  // R
      "vadd.u16 q8, q8, q15 \n"  // +128 -> unsigned
      "vmul.s16 q9, q6, q10 \n"  // R
      "vmls.s16 q9, q5, q14 \n"  // G
      "vmls.s16 q9, q4, q13 \n"  // B
      "vadd.u16 q9, q9, q15 \n"  // +128 -> unsigned
      "vqshrn.u16 d0, q8, #8 \n"  // 16 bit to 8 bit U
      "vqshrn.u16 d1, q9, #8 \n"  // 16 bit to 8 bit V
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb1555),         // %0
        "+r"(src_stride_argb1555),  // %1
        "+r"(dst_u),                // %2
        "+r"(dst_v),                // %3
        "+r"(width)                 // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// 16x2 pixels -> 8x1. width is number of ARGB4444 pixels, e.g. 16.
void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
                          int src_stride_argb4444,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_argb4444
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpaddl.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%0]! \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpaddl.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // load 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpadal.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpadal.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.
      "vrshr.u16 q4, q4, #1 \n"  // 2x average
      "vrshr.u16 q5, q5, #1 \n"
      "vrshr.u16 q6, q6, #1 \n"
      "subs %4, %4, #16 \n"  // 16 processed per loop.
      "vmul.s16 q8, q4, q10 \n"  // B
      "vmls.s16 q8, q5, q11 \n"  // G
      "vmls.s16 q8, q6, q12 \n"  // R
      "vadd.u16 q8, q8, q15 \n"  // +128 -> unsigned
      "vmul.s16 q9, q6, q10 \n"  // R
      "vmls.s16 q9, q5, q14 \n"  // G
      "vmls.s16 q9, q4, q13 \n"  // B
      "vadd.u16 q9, q9, q15 \n"  // +128 -> unsigned
      "vqshrn.u16 d0, q8, #8 \n"  // 16 bit to 8 bit U
      "vqshrn.u16 d1, q9, #8 \n"  // 16 bit to 8 bit V
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb4444),         // %0
        "+r"(src_stride_argb4444),  // %1
        "+r"(dst_u),                // %2
        "+r"(dst_v),                // %3
        "+r"(width)                 // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d24, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d27, #16 \n"  // Add 16 constant
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 RGB565 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      RGB565TOARGB
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrun.s16 d0, q2, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d27 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
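// A scalar sketch of the Y computation above (the same d24-d27 constants are
// used by all the ToY variants that follow; only the channel order differs):
//   y = sat8((13 * b + 65 * g + 33 * r + 64) >> 7) + 16;  // vqrshrun, vqadd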
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "vmov.u8 d24, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d27, #16 \n"  // Add 16 constant
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB1555 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrun.s16 d0, q2, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d27 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "vmov.u8 d24, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d27, #16 \n"  // Add 16 constant
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB4444 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrun.s16 d0, q2, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d27 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d4, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d5, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d6, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d7, #16 \n"  // Add 16 constant
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of BGRA.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q8, d1, d4 \n"  // R
      "vmlal.u8 q8, d2, d5 \n"  // G
      "vmlal.u8 q8, d3, d6 \n"  // B
      "vqrshrun.s16 d0, q8, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d7 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d4, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d5, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d6, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d7, #16 \n"  // Add 16 constant
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of ABGR.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q8, d0, d4 \n"  // R
      "vmlal.u8 q8, d1, d5 \n"  // G
      "vmlal.u8 q8, d2, d6 \n"  // B
      "vqrshrun.s16 d0, q8, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d7 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d4, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d5, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d6, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d7, #16 \n"  // Add 16 constant
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of RGBA.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q8, d1, d4 \n"  // B
      "vmlal.u8 q8, d2, d5 \n"  // G
      "vmlal.u8 q8, d3, d6 \n"  // R
      "vqrshrun.s16 d0, q8, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d7 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d4, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d5, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d6, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d7, #16 \n"  // Add 16 constant
      "1: \n"
      "vld3.8 {d0, d1, d2}, [%0]! \n"  // load 8 pixels of RGB24.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q8, d0, d4 \n"  // B
      "vmlal.u8 q8, d1, d5 \n"  // G
      "vmlal.u8 q8, d2, d6 \n"  // R
      "vqrshrun.s16 d0, q8, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d7 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_y),      // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d4, #33 \n"  // R * 0.2578 coefficient
      "vmov.u8 d5, #65 \n"  // G * 0.5078 coefficient
      "vmov.u8 d6, #13 \n"  // B * 0.1016 coefficient
      "vmov.u8 d7, #16 \n"  // Add 16 constant
      "1: \n"
      "vld3.8 {d0, d1, d2}, [%0]! \n"  // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q8, d0, d4 \n"  // R
      "vmlal.u8 q8, d1, d5 \n"  // G
      "vmlal.u8 q8, d2, d6 \n"  // B
      "vqrshrun.s16 d0, q8, #7 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d7 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_raw),  // %0
        "+r"(dst_y),    // %1
        "+r"(width)     // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  asm volatile(
      "cmp %4, #0 \n"
      "beq 100f \n"
      "add %2, %1 \n"
      "cmp %4, #128 \n"
      "beq 50f \n"
      "vdup.8 d5, %4 \n"
      "rsb %4, #256 \n"
      "vdup.8 d4, %4 \n"
      // General purpose row blend.
      "1: \n"
      "vld1.8 {q0}, [%1]! \n"
      "vld1.8 {q1}, [%2]! \n"
      "subs %3, %3, #16 \n"
      "vmull.u8 q13, d0, d4 \n"
      "vmull.u8 q14, d1, d4 \n"
      "vmlal.u8 q13, d2, d5 \n"
      "vmlal.u8 q14, d3, d5 \n"
      "vrshrn.u16 d0, q13, #8 \n"
      "vrshrn.u16 d1, q14, #8 \n"
      "vst1.8 {q0}, [%0]! \n"
      "bgt 1b \n"
      "b 99f \n"
      // Blend 50 / 50.
      "50: \n"
      "vld1.8 {q0}, [%1]! \n"
      "vld1.8 {q1}, [%2]! \n"
      "subs %3, %3, #16 \n"
      "vrhadd.u8 q0, q1 \n"
      "vst1.8 {q0}, [%0]! \n"
      "bgt 50b \n"
      "b 99f \n"
      // Blend 100 / 0 - Copy row unchanged.
      "100: \n"
      "vld1.8 {q0}, [%1]! \n"
      "subs %3, %3, #16 \n"
      "vst1.8 {q0}, [%0]! \n"
      "bgt 100b \n"
      "99: \n"
      : "+r"(dst_ptr),      // %0
        "+r"(src_ptr),      // %1
        "+r"(src_stride),   // %2
        "+r"(dst_width),    // %3
        "+r"(y1_fraction)   // %4
      :
      : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
}
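// A scalar sketch of the general blend path above, with f = source_y_fraction
// and the rsb supplying 256 - f:
//   dst[i] = (src[i] * (256 - f) + src[i + stride] * f + 128) >> 8;
// f == 0 (plain copy) and f == 128 (vrhadd rounding average) are
// special-cased.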
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
void ARGBBlendRow_NEON(const uint8_t* src_argb0,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "subs %3, #8 \n"
      "blt 89f \n"
      // Blend 8 pixels.
      "8: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of ARGB0.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load 8 pixels of ARGB1.
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vmull.u8 q10, d4, d3 \n"  // db * a
      "vmull.u8 q11, d5, d3 \n"  // dg * a
      "vmull.u8 q12, d6, d3 \n"  // dr * a
      "vqrshrn.u16 d20, q10, #8 \n"  // db >>= 8
      "vqrshrn.u16 d21, q11, #8 \n"  // dg >>= 8
      "vqrshrn.u16 d22, q12, #8 \n"  // dr >>= 8
      "vqsub.u8 q2, q2, q10 \n"  // dbg - dbg * a / 256
      "vqsub.u8 d6, d6, d22 \n"  // dr - dr * a / 256
      "vqadd.u8 q0, q0, q2 \n"  // + sbg
      "vqadd.u8 d2, d2, d6 \n"  // + sr
      "vmov.u8 d3, #255 \n"  // a = 255
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 pixels of ARGB.
      "bge 8b \n"
      "89: \n"
      "adds %3, #8-1 \n"
      "blt 99f \n"
      // Blend 1 pixel at a time.
      "1: \n"
      "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
      "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
      "subs %3, %3, #1 \n"  // 1 processed per loop.
      "vmull.u8 q10, d4, d3 \n"  // db * a
      "vmull.u8 q11, d5, d3 \n"  // dg * a
      "vmull.u8 q12, d6, d3 \n"  // dr * a
      "vqrshrn.u16 d20, q10, #8 \n"  // db >>= 8
      "vqrshrn.u16 d21, q11, #8 \n"  // dg >>= 8
      "vqrshrn.u16 d22, q12, #8 \n"  // dr >>= 8
      "vqsub.u8 q2, q2, q10 \n"  // dbg - dbg * a / 256
      "vqsub.u8 d6, d6, d22 \n"  // dr - dr * a / 256
      "vqadd.u8 q0, q0, q2 \n"  // + sbg
      "vqadd.u8 d2, d2, d6 \n"  // + sr
      "vmov.u8 d3, #255 \n"  // a = 255
      "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
      "bge 1b \n"
      "99: \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
}
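// A scalar sketch of the per-channel math above, matching the equation in
// the comment before this function (sat = saturating u8 arithmetic):
//   dst_b = sat(sb + sat(db - ((db * sa + 128) >> 8)));
// i.e. the destination is attenuated by (256 - sa) / 256, then the source,
// taken as premultiplied, is added and alpha is forced to 255.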
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      // Attenuate 8 pixels.
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q10, d0, d3 \n"  // b * a
      "vmull.u8 q11, d1, d3 \n"  // g * a
      "vmull.u8 q12, d2, d3 \n"  // r * a
      "vqrshrn.u16 d0, q10, #8 \n"  // b >>= 8
      "vqrshrn.u16 d1, q11, #8 \n"  // g >>= 8
      "vqrshrn.u16 d2, q12, #8 \n"  // r >>= 8
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
}
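// A scalar sketch of the attenuation above:
//   dst_b = (b * a + 128) >> 8;  // vqrshrn.u16 #8; likewise for g and r
// Dividing by 256 instead of 255 makes this a close (slightly dark)
// approximation of exact premultiplication.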
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile(
      "vdup.u16 q8, %2 \n"
      "vshr.u16 q8, q8, #1 \n"  // scale >>= 1
      "vdup.u16 q9, %3 \n"  // interval multiply.
      "vdup.u16 q10, %4 \n"  // interval add
      // 8 pixel loop.
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0] \n"  // load 8 pixels of ARGB.
      "subs %1, %1, #8 \n"  // 8 processed per loop.
      "vmovl.u8 q0, d0 \n"  // b (0 .. 255)
      "vmovl.u8 q1, d2 \n"
      "vmovl.u8 q2, d4 \n"
      "vqdmulh.s16 q0, q0, q8 \n"  // b * scale
      "vqdmulh.s16 q1, q1, q8 \n"  // g
      "vqdmulh.s16 q2, q2, q8 \n"  // r
      "vmul.u16 q0, q0, q9 \n"  // b * interval_size
      "vmul.u16 q1, q1, q9 \n"  // g
      "vmul.u16 q2, q2, q9 \n"  // r
      "vadd.u16 q0, q0, q10 \n"  // b + interval_offset
      "vadd.u16 q1, q1, q10 \n"  // g
      "vadd.u16 q2, q2, q10 \n"  // r
      "vqmovn.u16 d0, q0 \n"
      "vqmovn.u16 d2, q1 \n"
      "vqmovn.u16 d4, q2 \n"
      "vst4.8 {d0, d2, d4, d6}, [%0]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(dst_argb),        // %0
        "+r"(width)            // %1
      : "r"(scale),            // %2
        "r"(interval_size),    // %3
        "r"(interval_offset)   // %4
      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
}
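// vqdmulh.s16 computes (2 * a * b) >> 16, which is why scale is pre-shifted
// right by 1 above. Per channel, a scalar sketch of the loop:
//   v = ((v * (scale >> 1) * 2) >> 16) * interval_size + interval_offset;
// matching the dst = (dst * scale >> 16) * interval_size + interval_offset
// formula in the comment, up to the dropped low bit of scale.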
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
void ARGBShadeRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  asm volatile(
      "vdup.u32 q0, %3 \n"  // duplicate scale value.
      "vzip.u8 d0, d1 \n"  // d0 aarrggbb.
      "vshr.u16 q0, q0, #1 \n"  // scale / 2.
      // 8 pixel loop.
      "1: \n"
      "vld4.8 {d20, d22, d24, d26}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmovl.u8 q10, d20 \n"  // b (0 .. 255)
      "vmovl.u8 q11, d22 \n"
      "vmovl.u8 q12, d24 \n"
      "vmovl.u8 q13, d26 \n"
      "vqrdmulh.s16 q10, q10, d0[0] \n"  // b * scale * 2
      "vqrdmulh.s16 q11, q11, d0[1] \n"  // g
      "vqrdmulh.s16 q12, q12, d0[2] \n"  // r
      "vqrdmulh.s16 q13, q13, d0[3] \n"  // a
      "vqmovn.u16 d20, q10 \n"
      "vqmovn.u16 d22, q11 \n"
      "vqmovn.u16 d24, q12 \n"
      "vqmovn.u16 d26, q13 \n"
      "vst4.8 {d20, d22, d24, d26}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
}
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8 d24, #15 \n"  // B * 0.11400 coefficient
      "vmov.u8 d25, #75 \n"  // G * 0.58700 coefficient
      "vmov.u8 d26, #38 \n"  // R * 0.29900 coefficient
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrun.s16 d0, q2, #7 \n"  // 15 bit to 8 bit B
      "vmov d1, d0 \n"  // G
      "vmov d2, d0 \n"  // R
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8 d20, #17 \n"  // BB coefficient
      "vmov.u8 d21, #68 \n"  // BG coefficient
      "vmov.u8 d22, #35 \n"  // BR coefficient
      "vmov.u8 d24, #22 \n"  // GB coefficient
      "vmov.u8 d25, #88 \n"  // GG coefficient
      "vmov.u8 d26, #45 \n"  // GR coefficient
      "vmov.u8 d28, #24 \n"  // RB coefficient
      "vmov.u8 d29, #98 \n"  // RG coefficient
      "vmov.u8 d30, #50 \n"  // RR coefficient
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0] \n"  // load 8 ARGB pixels.
      "subs %1, %1, #8 \n"  // 8 processed per loop.
      "vmull.u8 q2, d0, d20 \n"  // B to Sepia B
      "vmlal.u8 q2, d1, d21 \n"  // G
      "vmlal.u8 q2, d2, d22 \n"  // R
      "vmull.u8 q3, d0, d24 \n"  // B to Sepia G
      "vmlal.u8 q3, d1, d25 \n"  // G
      "vmlal.u8 q3, d2, d26 \n"  // R
      "vmull.u8 q8, d0, d28 \n"  // B to Sepia R
      "vmlal.u8 q8, d1, d29 \n"  // G
      "vmlal.u8 q8, d2, d30 \n"  // R
      "vqshrn.u16 d0, q2, #7 \n"  // 16 bit to 8 bit B
      "vqshrn.u16 d1, q3, #7 \n"  // 16 bit to 8 bit G
      "vqshrn.u16 d2, q8, #7 \n"  // 16 bit to 8 bit R
      "vst4.8 {d0, d1, d2, d3}, [%0]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(dst_argb),  // %0
        "+r"(width)      // %1
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
        "q14", "q15");
}
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const int8_t* matrix_argb,
                             int width) {
  asm volatile(
      "vld1.8 {q2}, [%3] \n"  // load color matrix (16 int8_t).
      "vmovl.s8 q0, d4 \n"  // B,G coefficients s16.
      "vmovl.s8 q1, d5 \n"  // R,A coefficients s16.
      "1: \n"
      "vld4.8 {d16, d18, d20, d22}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vmovl.u8 q8, d16 \n"  // b (0 .. 255) 16 bit
      "vmovl.u8 q9, d18 \n"  // g
      "vmovl.u8 q10, d20 \n"  // r
      "vmovl.u8 q11, d22 \n"  // a
      "vmul.s16 q12, q8, d0[0] \n"  // B = B * Matrix B
      "vmul.s16 q13, q8, d1[0] \n"  // G = B * Matrix G
      "vmul.s16 q14, q8, d2[0] \n"  // R = B * Matrix R
      "vmul.s16 q15, q8, d3[0] \n"  // A = B * Matrix A
      "vmul.s16 q4, q9, d0[1] \n"  // B += G * Matrix B
      "vmul.s16 q5, q9, d1[1] \n"  // G += G * Matrix G
      "vmul.s16 q6, q9, d2[1] \n"  // R += G * Matrix R
      "vmul.s16 q7, q9, d3[1] \n"  // A += G * Matrix A
      "vqadd.s16 q12, q12, q4 \n"  // Accumulate B
      "vqadd.s16 q13, q13, q5 \n"  // Accumulate G
      "vqadd.s16 q14, q14, q6 \n"  // Accumulate R
      "vqadd.s16 q15, q15, q7 \n"  // Accumulate A
      "vmul.s16 q4, q10, d0[2] \n"  // B += R * Matrix B
      "vmul.s16 q5, q10, d1[2] \n"  // G += R * Matrix G
      "vmul.s16 q6, q10, d2[2] \n"  // R += R * Matrix R
      "vmul.s16 q7, q10, d3[2] \n"  // A += R * Matrix A
      "vqadd.s16 q12, q12, q4 \n"  // Accumulate B
      "vqadd.s16 q13, q13, q5 \n"  // Accumulate G
      "vqadd.s16 q14, q14, q6 \n"  // Accumulate R
      "vqadd.s16 q15, q15, q7 \n"  // Accumulate A
      "vmul.s16 q4, q11, d0[3] \n"  // B += A * Matrix B
      "vmul.s16 q5, q11, d1[3] \n"  // G += A * Matrix G
      "vmul.s16 q6, q11, d2[3] \n"  // R += A * Matrix R
      "vmul.s16 q7, q11, d3[3] \n"  // A += A * Matrix A
      "vqadd.s16 q12, q12, q4 \n"  // Accumulate B
      "vqadd.s16 q13, q13, q5 \n"  // Accumulate G
      "vqadd.s16 q14, q14, q6 \n"  // Accumulate R
      "vqadd.s16 q15, q15, q7 \n"  // Accumulate A
      "vqshrun.s16 d16, q12, #6 \n"  // 16 bit to 8 bit B
      "vqshrun.s16 d18, q13, #6 \n"  // 16 bit to 8 bit G
      "vqshrun.s16 d20, q14, #6 \n"  // 16 bit to 8 bit R
      "vqshrun.s16 d22, q15, #6 \n"  // 16 bit to 8 bit A
      "vst4.8 {d16, d18, d20, d22}, [%1]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_argb),     // %0
        "+r"(dst_argb),     // %1
        "+r"(width)         // %2
      : "r"(matrix_argb)    // %3
      : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}
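// A scalar sketch of the transform above: matrix_argb holds 4 signed-byte
// coefficients per output channel (B, G, R, A order), and the accumulators
// saturate to s16 (vqadd) before the final >> 6 narrow (vqshrun):
//   out[c] = sat8((b * m[c][0] + g * m[c][1] + r * m[c][2] + a * m[c][3])
//                 >> 6);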
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.8 {d1, d3, d5, d7}, [%1]! \n"  // load 8 more ARGB
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vmull.u8 q0, d0, d1 \n"  // multiply B
      "vmull.u8 q1, d2, d3 \n"  // multiply G
      "vmull.u8 q2, d4, d5 \n"  // multiply R
      "vmull.u8 q3, d6, d7 \n"  // multiply A
      "vrshrn.u16 d0, q0, #8 \n"  // 16 bit to 8 bit B
      "vrshrn.u16 d1, q1, #8 \n"  // 16 bit to 8 bit G
      "vrshrn.u16 d2, q2, #8 \n"  // 16 bit to 8 bit R
      "vrshrn.u16 d3, q3, #8 \n"  // 16 bit to 8 bit A
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_NEON(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load 8 more ARGB
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vqadd.u8 q0, q0, q2 \n"  // add B, G
      "vqadd.u8 q1, q1, q3 \n"  // add R, A
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load 8 more ARGB
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vqsub.u8 q0, q0, q2 \n"  // subtract B, G
      "vqsub.u8 q1, q1, q3 \n"  // subtract R, A
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_NEON(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // alpha
      // 8 pixel loop.
      "1: \n"
      "vld1.8 {d0}, [%0]! \n"  // load 8 sobelx.
      "vld1.8 {d1}, [%1]! \n"  // load 8 sobely.
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vqadd.u8 d0, d0, d1 \n"  // add
      "vmov.u8 d1, d0 \n"
      "vmov.u8 d2, d0 \n"
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
}
// Adds Sobel X and Sobel Y and stores Sobel into plane.
void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width) {
  asm volatile(
      // 16 pixel loop.
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 16 sobelx.
      "vld1.8 {q1}, [%1]! \n"  // load 16 sobely.
      "subs %3, %3, #16 \n"  // 16 processed per loop.
      "vqadd.u8 q0, q0, q1 \n"  // add
      "vst1.8 {q0}, [%2]! \n"  // store 16 pixels.
      "bgt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
}
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_NEON(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // alpha
      // 8 pixel loop.
      "1: \n"
      "vld1.8 {d2}, [%0]! \n"  // load 8 sobelx.
      "vld1.8 {d0}, [%1]! \n"  // load 8 sobely.
      "subs %3, %3, #8 \n"  // 8 processed per loop.
      "vqadd.u8 d1, d0, d2 \n"  // add
      "vst4.8 {d0, d1, d2, d3}, [%2]! \n"  // store 8 ARGB pixels.
      "bgt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
}
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
void SobelXRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {d0}, [%0],%5 \n"  // top
      "vld1.8 {d1}, [%0],%6 \n"
      "vsubl.u8 q0, d0, d1 \n"
      "vld1.8 {d2}, [%1],%5 \n"  // center * 2
      "vld1.8 {d3}, [%1],%6 \n"
      "vsubl.u8 q1, d2, d3 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vld1.8 {d2}, [%2],%5 \n"  // bottom
      "vld1.8 {d3}, [%2],%6 \n"
      "subs %4, %4, #8 \n"  // 8 pixels
      "vsubl.u8 q1, d2, d3 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vabs.s16 q0, q0 \n"
      "vqmovn.u16 d0, q0 \n"
      "vst1.8 {d0}, [%3]! \n"  // store 8 sobelx
      "bgt 1b \n"
      : "+r"(src_y0),       // %0
        "+r"(src_y1),       // %1
        "+r"(src_y2),       // %2
        "+r"(dst_sobelx),   // %3
        "+r"(width)         // %4
      : "r"(2),  // %5
        "r"(6)   // %6
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
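// A scalar sketch of the tap pattern above (y0/y1/y2 are the three input
// rows; the 2 and 6 pointer steps advance each row by 8 per iteration):
//   sobelx[i] = sat8(abs((y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
//                        (y2[i] - y2[i + 2])));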
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
void SobelYRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {d0}, [%0],%4 \n"  // left
      "vld1.8 {d1}, [%1],%4 \n"
      "vsubl.u8 q0, d0, d1 \n"
      "vld1.8 {d2}, [%0],%4 \n"  // center * 2
      "vld1.8 {d3}, [%1],%4 \n"
      "vsubl.u8 q1, d2, d3 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vld1.8 {d2}, [%0],%5 \n"  // right
      "vld1.8 {d3}, [%1],%5 \n"
      "subs %3, %3, #8 \n"  // 8 pixels
      "vsubl.u8 q1, d2, d3 \n"
      "vadd.s16 q0, q0, q1 \n"
      "vabs.s16 q0, q0 \n"
      "vqmovn.u16 d0, q0 \n"
      "vst1.8 {d0}, [%2]! \n"  // store 8 sobely
      "bgt 1b \n"
      : "+r"(src_y0),       // %0
        "+r"(src_y1),       // %1
        "+r"(dst_sobely),   // %2
        "+r"(width)         // %3
      : "r"(1),  // %4
        "r"(6)   // %5
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
// %y passes a float as a scalar vector for vector * scalar multiply.
// The register must be d0 to d15 and indexed with [0] or [1] to access
// the first or second float of the d-reg.
void HalfFloat1Row_NEON(const uint16_t* src,
                        uint16_t* dst,
                        float /*unused*/,
                        int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q1}, [%0]! \n"  // load 8 shorts
      "subs %2, %2, #8 \n"  // 8 pixels per loop
      "vmovl.u16 q2, d2 \n"  // 8 ints
      "vmovl.u16 q3, d3 \n"
      "vcvt.f32.u32 q2, q2 \n"  // 8 floats
      "vcvt.f32.u32 q3, q3 \n"
      "vmul.f32 q2, q2, %y3 \n"  // adjust exponent
      "vmul.f32 q3, q3, %y3 \n"
      "vqshrn.u32 d2, q2, #13 \n"  // isolate halffloat
      "vqshrn.u32 d3, q3, #13 \n"
      "vst1.8 {q1}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "w"(1.9259299444e-34f)  // %3
      : "cc", "memory", "q1", "q2", "q3");
}
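// About the 1.9259299444e-34f constant above: it is 2^-112, and multiplying
// by it rebiases a float's exponent by 127 - 15 = 112, from single to half
// precision. After the multiply the upper float bits already hold a valid
// half, so vqshrn.u32 #13 drops the 13 extra mantissa bits and narrows
// (with saturation) to the 16-bit half-float pattern.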
void HalfFloatRow_NEON(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q1}, [%0]! \n"  // load 8 shorts
      "subs %2, %2, #8 \n"  // 8 pixels per loop
      "vmovl.u16 q2, d2 \n"  // 8 ints
      "vmovl.u16 q3, d3 \n"
      "vcvt.f32.u32 q2, q2 \n"  // 8 floats
      "vcvt.f32.u32 q3, q3 \n"
      "vmul.f32 q2, q2, %y3 \n"  // adjust exponent
      "vmul.f32 q3, q3, %y3 \n"
      "vqshrn.u32 d2, q2, #13 \n"  // isolate halffloat
      "vqshrn.u32 d3, q3, #13 \n"
      "vst1.8 {q1}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "w"(scale * 1.9259299444e-34f)  // %3
      : "cc", "memory", "q1", "q2", "q3");
}
void ByteToFloatRow_NEON(const uint8_t* src,
                         float* dst,
                         float scale,
                         int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {d2}, [%0]! \n"  // load 8 bytes
      "subs %2, %2, #8 \n"  // 8 pixels per loop
      "vmovl.u8 q1, d2 \n"  // 8 shorts
      "vmovl.u16 q2, d2 \n"  // 8 ints
      "vmovl.u16 q3, d3 \n"
      "vcvt.f32.u32 q2, q2 \n"  // 8 floats
      "vcvt.f32.u32 q3, q3 \n"
      "vmul.f32 q2, q2, %y3 \n"  // scale
      "vmul.f32 q3, q3, %y3 \n"
      "vst1.8 {q2, q3}, [%1]! \n"  // store 8 floats
      "bgt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "w"(scale)    // %3
      : "cc", "memory", "q1", "q2", "q3");
}
#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif