/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
  MEMACCESS(0) \
  "vld1.8 {d0}, [%0]! \n" \
  MEMACCESS(1) \
  "vld1.32 {d2[0]}, [%1]! \n" \
  MEMACCESS(2) \
  "vld1.32 {d2[1]}, [%2]! \n"

// Read 8 Y, 2 U and 2 V from 411
#define READYUV411 \
  MEMACCESS(0) \
  "vld1.8 {d0}, [%0]! \n" \
  MEMACCESS(1) \
  "vld1.16 {d2[0]}, [%1]! \n" \
  MEMACCESS(2) \
  "vld1.16 {d2[1]}, [%2]! \n" \
  "vmov.u8 d3, d2 \n" \
  "vzip.u8 d2, d3 \n"

// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
  MEMACCESS(0) \
  "vld1.8 {d0}, [%0]! \n" \
  MEMACCESS(1) \
  "vld1.8 {d2}, [%1]! \n" \
  MEMACCESS(2) \
  "vld1.8 {d3}, [%2]! \n" \
  "vpaddl.u8 q1, q1 \n" \
  "vrshrn.u16 d2, q1, #1 \n"

// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
  MEMACCESS(0) \
  "vld1.8 {d0}, [%0]! \n" \
  "vmov.u8 d2, #128 \n"

// Read 8 Y and 4 UV from NV12
#define READNV12 \
  MEMACCESS(0) \
  "vld1.8 {d0}, [%0]! \n" \
  MEMACCESS(1) \
  "vld1.8 {d2}, [%1]! \n" \
  "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
  "vuzp.u8 d2, d3 \n" \
  "vtrn.u32 d2, d3 \n"

// Read 8 Y and 4 VU from NV21
#define READNV21 \
  MEMACCESS(0) \
  "vld1.8 {d0}, [%0]! \n" \
  MEMACCESS(1) \
  "vld1.8 {d2}, [%1]! \n" \
  "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
  "vuzp.u8 d3, d2 \n" \
  "vtrn.u32 d2, d3 \n"

// Read 8 YUY2
#define READYUY2 \
  MEMACCESS(0) \
  "vld2.8 {d0, d2}, [%0]! \n" \
  "vmov.u8 d3, d2 \n" \
  "vuzp.u8 d2, d3 \n" \
  "vtrn.u32 d2, d3 \n"

// Read 8 UYVY
#define READUYVY \
  MEMACCESS(0) \
  "vld2.8 {d2, d3}, [%0]! \n" \
  "vmov.u8 d0, d3 \n" \
  "vmov.u8 d3, d2 \n" \
  "vuzp.u8 d2, d3 \n" \
  "vtrn.u32 d2, d3 \n"

#define YUVTORGB_SETUP \
  MEMACCESS([kUVToRB]) \
  "vld1.8 {d24}, [%[kUVToRB]] \n" \
  MEMACCESS([kUVToG]) \
  "vld1.8 {d25}, [%[kUVToG]] \n" \
  MEMACCESS([kUVBiasBGR]) \
  "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
  MEMACCESS([kUVBiasBGR]) \
  "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
  MEMACCESS([kUVBiasBGR]) \
  "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
  MEMACCESS([kYToRgb]) \
  "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"

#define YUVTORGB \
  "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \
  "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \
  "vmovl.u8 q0, d0 \n" /* Y */ \
  "vmovl.s16 q10, d1 \n" \
  "vmovl.s16 q0, d0 \n" \
  "vmul.s32 q10, q10, q15 \n" \
  "vmul.s32 q0, q0, q15 \n" \
  "vqshrun.s32 d0, q0, #16 \n" \
  "vqshrun.s32 d1, q10, #16 \n" /* Y */ \
  "vadd.s16 d18, d19 \n" \
  "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \
  "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \
  "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG) */ \
  "vaddw.u16 q1, q1, d16 \n" \
  "vaddw.u16 q10, q10, d17 \n" \
  "vaddw.u16 q3, q3, d18 \n" \
  "vqadd.s16 q8, q0, q13 \n" /* B */ \
  "vqadd.s16 q9, q0, q14 \n" /* R */ \
  "vqadd.s16 q0, q0, q4 \n" /* G */ \
  "vqadd.s16 q8, q8, q1 \n" /* B */ \
  "vqadd.s16 q9, q9, q10 \n" /* R */ \
  "vqsub.s16 q0, q0, q3 \n" /* G */ \
  "vqshrun.s16 d20, q8, #6 \n" /* B */ \
  "vqshrun.s16 d22, q9, #6 \n" /* R */ \
  "vqshrun.s16 d21, q0, #6 \n" /* G */

void I444ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
    "1: \n"
    READYUV444
    YUVTORGB
    "subs %4, %4, #8 \n"
    MEMACCESS(3)
    "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
    "bgt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_argb), // %3
      "+r"(width)     // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
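
// Example usage (a sketch, not part of the library): convert one row of I444
// to ARGB with the BT.601 constants defined elsewhere in libyuv. The loop
// consumes 8 pixels per iteration, so width is expected to be sized
// accordingly by the caller:
//   I444ToARGBRow_NEON(y, u, v, argb, &kYuvI601Constants, 640);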

void I422ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
    "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    MEMACCESS(3)
    "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
    "bgt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_argb), // %3
      "+r"(width)     // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I422AlphaToARGBRow_NEON(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             const uint8* src_a,
                             uint8* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
    YUVTORGB_SETUP
    "1: \n"
    READYUV422
    YUVTORGB
    "subs %5, %5, #8 \n"
    MEMACCESS(3)
    "vld1.8 {d23}, [%3]! \n"
    MEMACCESS(4)
    "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
    "bgt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(src_a),    // %3
      "+r"(dst_argb), // %4
      "+r"(width)     // %5
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I411ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
    "1: \n"
    READYUV411
    YUVTORGB
    "subs %4, %4, #8 \n"
    MEMACCESS(3)
    "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
    "bgt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_argb), // %3
      "+r"(width)     // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I422ToRGBARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    "vmov.u8 d19, #255 \n" // d19 modified by YUVTORGB
    MEMACCESS(3)
    "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
    "bgt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_rgba), // %3
      "+r"(width)     // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I422ToRGB24Row_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
    YUVTORGB_SETUP
    "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    MEMACCESS(3)
    "vst3.8 {d20, d21, d22}, [%3]! \n"
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgb24), // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#define ARGBTORGB565 \
  "vshll.u8 q0, d22, #8 \n" /* R */ \
  "vshll.u8 q8, d21, #8 \n" /* G */ \
  "vshll.u8 q9, d20, #8 \n" /* B */ \
  "vsri.16 q0, q8, #5 \n" /* RG */ \
  "vsri.16 q0, q9, #11 \n" /* RGB */

void I422ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
    YUVTORGB_SETUP
    "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    ARGBTORGB565
    MEMACCESS(3)
    "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
    "bgt 1b \n"
    : "+r"(src_y),      // %0
      "+r"(src_u),      // %1
      "+r"(src_v),      // %2
      "+r"(dst_rgb565), // %3
      "+r"(width)       // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#define ARGBTOARGB1555 \
  "vshll.u8 q0, d23, #8 \n" /* A */ \
  "vshll.u8 q8, d22, #8 \n" /* R */ \
  "vshll.u8 q9, d21, #8 \n" /* G */ \
  "vshll.u8 q10, d20, #8 \n" /* B */ \
  "vsri.16 q0, q8, #1 \n" /* AR */ \
  "vsri.16 q0, q9, #6 \n" /* ARG */ \
  "vsri.16 q0, q10, #11 \n" /* ARGB */

void I422ToARGB1555Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
    YUVTORGB_SETUP
    "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    "vmov.u8 d23, #255 \n"
    ARGBTOARGB1555
    MEMACCESS(3)
    "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
    "bgt 1b \n"
    : "+r"(src_y),        // %0
      "+r"(src_u),        // %1
      "+r"(src_v),        // %2
      "+r"(dst_argb1555), // %3
      "+r"(width)         // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#define ARGBTOARGB4444 \
  "vshr.u8 d20, d20, #4 \n" /* B */ \
  "vbic.32 d21, d21, d4 \n" /* G */ \
  "vshr.u8 d22, d22, #4 \n" /* R */ \
  "vbic.32 d23, d23, d4 \n" /* A */ \
  "vorr d0, d20, d21 \n" /* BG */ \
  "vorr d1, d22, d23 \n" /* RA */ \
  "vzip.u8 d0, d1 \n" /* BGRA */

void I422ToARGB4444Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
    "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    "vmov.u8 d23, #255 \n"
    ARGBTOARGB4444
    MEMACCESS(3)
    "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
    "bgt 1b \n"
    : "+r"(src_y),        // %0
      "+r"(src_u),        // %1
      "+r"(src_v),        // %2
      "+r"(dst_argb4444), // %3
      "+r"(width)         // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
    "1: \n"
    READYUV400
    YUVTORGB
    "subs %2, %2, #8 \n"
    MEMACCESS(1)
    "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(dst_argb), // %1
      "+r"(width)     // %2
    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void J400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "vmov.u8 d23, #255 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d20}, [%0]! \n"
    "vmov d21, d20 \n"
    "vmov d22, d20 \n"
    "subs %2, %2, #8 \n"
    MEMACCESS(1)
    "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(dst_argb), // %1
      "+r"(width)     // %2
    :
    : "cc", "memory", "d20", "d21", "d22", "d23"
  );
}

void NV12ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
    "1: \n"
    READNV12
    YUVTORGB
    "subs %3, %3, #8 \n"
    MEMACCESS(2)
    "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
    "bgt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_uv),   // %1
      "+r"(dst_argb), // %2
      "+r"(width)     // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void NV21ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_vu,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
    "1: \n"
    READNV21
    YUVTORGB
    "subs %3, %3, #8 \n"
    MEMACCESS(2)
    "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
    "bgt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_vu),   // %1
      "+r"(dst_argb), // %2
      "+r"(width)     // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void NV12ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
    YUVTORGB_SETUP
    "1: \n"
    READNV12
    YUVTORGB
    "subs %3, %3, #8 \n"
    ARGBTORGB565
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
    "bgt 1b \n"
    : "+r"(src_y),      // %0
      "+r"(src_uv),     // %1
      "+r"(dst_rgb565), // %2
      "+r"(width)       // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
    "1: \n"
    READYUY2
    YUVTORGB
    "subs %2, %2, #8 \n"
    MEMACCESS(1)
    "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src_yuy2), // %0
      "+r"(dst_argb), // %1
      "+r"(width)     // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
    "1: \n"
    READUYVY
    YUVTORGB
    "subs %2, %2, #8 \n"
    MEMACCESS(1)
    "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src_uyvy), // %0
      "+r"(dst_argb), // %1
      "+r"(width)     // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
    "subs %3, %3, #16 \n" // 16 processed per loop
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n" // store U
    MEMACCESS(2)
    "vst1.8 {q1}, [%2]! \n" // store V
    "bgt 1b \n"
    : "+r"(src_uv), // %0
      "+r"(dst_u),  // %1
      "+r"(dst_v),  // %2
      "+r"(width)   // %3  // Output registers
    :               // Input registers
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}

// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load U
    MEMACCESS(1)
    "vld1.8 {q1}, [%1]! \n" // load V
    "subs %3, %3, #16 \n" // 16 processed per loop
    MEMACCESS(2)
    "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
    "bgt 1b \n"
    : "+r"(src_u),  // %0
      "+r"(src_v),  // %1
      "+r"(dst_uv), // %2
      "+r"(width)   // %3  // Output registers
    :               // Input registers
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}

// Copy multiples of 32 bytes. vld1.8 of four registers allows unaligned
// access and is fastest on A15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
    "subs %2, %2, #32 \n" // 32 processed per loop
    MEMACCESS(1)
    "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
    "bgt 1b \n"
    : "+r"(src),   // %0
      "+r"(dst),   // %1
      "+r"(count)  // %2  // Output registers
    :              // Input registers
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}

// SetRow writes 'count' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
  asm volatile (
    "vdup.8 q0, %2 \n" // duplicate 16 bytes
    "1: \n"
    "subs %1, %1, #16 \n" // 16 bytes per loop
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n" // store
    "bgt 1b \n"
    : "+r"(dst),   // %0
      "+r"(count)  // %1
    : "r"(v8)      // %2
    : "cc", "memory", "q0"
  );
}

// ARGBSetRow writes 'count' pixels using a 32 bit value repeated.
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
  asm volatile (
    "vdup.u32 q0, %2 \n" // duplicate 4 ints
    "1: \n"
    "subs %1, %1, #4 \n" // 4 pixels per loop
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n" // store
    "bgt 1b \n"
    : "+r"(dst),   // %0
      "+r"(count)  // %1
    : "r"(v32)     // %2
    : "cc", "memory", "q0"
  );
}

void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row.
    "mov r3, #-16 \n"
    "add %0, %0, %2 \n"
    "sub %0, #16 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0], r3 \n" // src -= 16
    "subs %2, #16 \n" // 16 pixels per loop.
    "vrev64.8 q0, q0 \n"
    MEMACCESS(1)
    "vst1.8 {d1}, [%1]! \n" // dst += 16
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src),   // %0
      "+r"(dst),   // %1
      "+r"(width)  // %2
    :
    : "cc", "memory", "r3", "q0"
  );
}

void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) {
  asm volatile (
    // Start at end of source row.
    "mov r12, #-16 \n"
    "add %0, %0, %3, lsl #1 \n"
    "sub %0, #16 \n"
    "1: \n"
    MEMACCESS(0)
    "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
    "subs %3, #8 \n" // 8 pixels per loop.
    "vrev64.8 q0, q0 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // dst += 8
    MEMACCESS(2)
    "vst1.8 {d1}, [%2]! \n"
    "bgt 1b \n"
    : "+r"(src_uv), // %0
      "+r"(dst_u),  // %1
      "+r"(dst_v),  // %2
      "+r"(width)   // %3
    :
    : "cc", "memory", "r12", "q0"
  );
}

void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row.
    "mov r3, #-16 \n"
    "add %0, %0, %2, lsl #2 \n"
    "sub %0, #16 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0], r3 \n" // src -= 16
    "subs %2, #4 \n" // 4 pixels per loop.
    "vrev64.32 q0, q0 \n"
    MEMACCESS(1)
    "vst1.8 {d1}, [%1]! \n" // dst += 16
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src),   // %0
      "+r"(dst),   // %1
      "+r"(width)  // %2
    :
    : "cc", "memory", "r3", "q0"
  );
}

void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d4, #255 \n" // Alpha
    "1: \n"
    MEMACCESS(0)
    "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    MEMACCESS(1)
    "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_rgb24), // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
  );
}

void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d4, #255 \n" // Alpha
    "1: \n"
    MEMACCESS(0)
    "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vswp.u8 d1, d3 \n" // swap R, B
    MEMACCESS(1)
    "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_raw),  // %0
      "+r"(dst_argb), // %1
      "+r"(width)     // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
  );
}

void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vswp.u8 d1, d3 \n" // swap R, B
    MEMACCESS(1)
    "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
    "bgt 1b \n"
    : "+r"(src_raw),   // %0
      "+r"(dst_rgb24), // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "d1", "d2", "d3" // Clobber List
  );
}

#define RGB565TOARGB \
  "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
  "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
  "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
  "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
  "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8 d0, d0, d4 \n" /* B */ \
  "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
  "vorr.u8 d2, d1, d5 \n" /* R */ \
  "vorr.u8 d1, d4, d6 \n" /* G */

void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // Alpha
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    RGB565TOARGB
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_rgb565), // %0
      "+r"(dst_argb),   // %1
      "+r"(width)       // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
  );
}

#define ARGB1555TOARGB \
  "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
  "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
  "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
  "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
  "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
  "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
  "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
  "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
  "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
  "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
  "vorr.u8 q1, q1, q3 \n" /* R,A */ \
  "vorr.u8 q0, q0, q2 \n" /* B,G */

// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
#define RGB555TOARGB \
  "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
  "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
  "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
  "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
  "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8 d0, d0, d4 \n" /* B */ \
  "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
  "vorr.u8 d2, d1, d5 \n" /* R */ \
  "vorr.u8 d1, d4, d6 \n" /* G */

void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
                            int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // Alpha
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGB1555TOARGB
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_argb1555), // %0
      "+r"(dst_argb),     // %1
      "+r"(width)         // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
  );
}

#define ARGB4444TOARGB \
  "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
  "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
  "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
  "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
  "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
  "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
  "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
  "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */

void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // Alpha
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGB4444TOARGB
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_argb4444), // %0
      "+r"(dst_argb),     // %1
      "+r"(width)         // %2
    :
    : "cc", "memory", "q0", "q1", "q2" // Clobber List
  );
}

void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    MEMACCESS(1)
    "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
    "bgt 1b \n"
    : "+r"(src_argb),  // %0
      "+r"(dst_rgb24), // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
  );
}

void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vswp.u8 d1, d3 \n" // swap R, B
    MEMACCESS(1)
    "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_raw),  // %1
      "+r"(width)     // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
  );
}

void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
    "subs %2, %2, #16 \n" // 16 processed per loop.
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
    "bgt 1b \n"
    : "+r"(src_yuy2), // %0
      "+r"(dst_y),    // %1
      "+r"(width)     // %2
    :
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}

void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
    "subs %2, %2, #16 \n" // 16 processed per loop.
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
    "bgt 1b \n"
    : "+r"(src_uyvy), // %0
      "+r"(dst_y),    // %1
      "+r"(width)     // %2
    :
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}

void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                         int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
    "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "vst1.8 {d1}, [%1]! \n" // store 8 U.
    MEMACCESS(2)
    "vst1.8 {d3}, [%2]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_yuy2), // %0
      "+r"(dst_u),    // %1
      "+r"(dst_v),    // %2
      "+r"(width)     // %3
    :
    : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
  );
}

void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                         int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
    "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 U.
    MEMACCESS(2)
    "vst1.8 {d2}, [%2]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_uyvy), // %0
      "+r"(dst_u),    // %1
      "+r"(dst_v),    // %2
      "+r"(width)     // %3
    :
    : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
  );
}

void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // stride + src_yuy2
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
    "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
    "vrhadd.u8 d1, d1, d5 \n" // average rows of U
    "vrhadd.u8 d3, d3, d7 \n" // average rows of V
    MEMACCESS(2)
    "vst1.8 {d1}, [%2]! \n" // store 8 U.
    MEMACCESS(3)
    "vst1.8 {d3}, [%3]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_yuy2),    // %0
      "+r"(stride_yuy2), // %1
      "+r"(dst_u),       // %2
      "+r"(dst_v),       // %3
      "+r"(width)        // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
  );
}

void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // stride + src_uyvy
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
    "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
    "vrhadd.u8 d0, d0, d4 \n" // average rows of U
    "vrhadd.u8 d2, d2, d6 \n" // average rows of V
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 U.
    MEMACCESS(3)
    "vst1.8 {d2}, [%3]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_uyvy),    // %0
      "+r"(stride_uyvy), // %1
      "+r"(dst_u),       // %2
      "+r"(dst_v),       // %3
      "+r"(width)        // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
  );
}

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8 {q2}, [%3] \n" // shuffler
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
    "subs %2, %2, #4 \n" // 4 processed per loop
    "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
    "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n" // store 4.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(width)     // %2
    : "r"(shuffler)   // %3
    : "cc", "memory", "q0", "q1", "q2" // Clobber List
  );
}

void I422ToYUY2Row_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_yuy2, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
    MEMACCESS(1)
    "vld1.8 {d1}, [%1]! \n" // load 8 Us
    MEMACCESS(2)
    "vld1.8 {d3}, [%2]! \n" // load 8 Vs
    "subs %4, %4, #16 \n" // 16 pixels
    MEMACCESS(3)
    "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
    "bgt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_yuy2), // %3
      "+r"(width)     // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3"
  );
}

void I422ToUYVYRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_uyvy, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
    MEMACCESS(1)
    "vld1.8 {d0}, [%1]! \n" // load 8 Us
    MEMACCESS(2)
    "vld1.8 {d2}, [%2]! \n" // load 8 Vs
    "subs %4, %4, #16 \n" // 16 pixels
    MEMACCESS(3)
    "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
    "bgt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_uyvy), // %3
      "+r"(width)     // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3"
  );
}

void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGBTORGB565
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
    "bgt 1b \n"
    : "+r"(src_argb),   // %0
      "+r"(dst_rgb565), // %1
      "+r"(width)       // %2
    :
    : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
  );
}

void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  asm volatile (
    "vdup.32 d2, %2 \n" // dither4
    "1: \n"
    MEMACCESS(1)
    "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqadd.u8 d20, d20, d2 \n"
    "vqadd.u8 d21, d21, d2 \n"
    "vqadd.u8 d22, d22, d2 \n"
    ARGBTORGB565
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
    "bgt 1b \n"
    : "+r"(dst_rgb)   // %0
    : "r"(src_argb),  // %1
      "r"(dither4),   // %2
      "r"(width)      // %3
    : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
  );
}

void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
                            int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGBTOARGB1555
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
    "bgt 1b \n"
    : "+r"(src_argb),     // %0
      "+r"(dst_argb1555), // %1
      "+r"(width)         // %2
    :
    : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
  );
}

void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
                            int width) {
  asm volatile (
    "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGBTOARGB4444
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
    "bgt 1b \n"
    : "+r"(src_argb),     // %0
      "+r"(dst_argb4444), // %1
      "+r"(width)         // %2
    :
    : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
  );
}

void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d27, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d27 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_y),    // %1
      "+r"(width)     // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}

void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
    "subs %2, %2, #16 \n" // 16 processed per loop
    MEMACCESS(1)
    "vst1.8 {q3}, [%1]! \n" // store 16 A's.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_a),    // %1
      "+r"(width)     // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
  );
}

void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
    "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
    "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_y),    // %1
      "+r"(width)     // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}

// 8x1 pixels.
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int width) {
  asm volatile (
    "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
    "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
    "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
    "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
    "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlsl.u8 q2, d1, d25 \n" // G
    "vmlsl.u8 q2, d2, d26 \n" // R
    "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
    "vmull.u8 q3, d2, d24 \n" // R
    "vmlsl.u8 q3, d1, d28 \n" // G
    "vmlsl.u8 q3, d0, d27 \n" // B
    "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
    "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
    "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
    MEMACCESS(2)
    "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_u),    // %1
      "+r"(dst_v),    // %2
      "+r"(width)     // %3
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
  );
}

// 32x1 pixels -> 8x1. width is the number of ARGB pixels, e.g. 32.
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int width) {
  asm volatile (
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(0)
    "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
    "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
    "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
    "vpadd.u16 d1, d8, d9 \n" // B
    "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
    "vpadd.u16 d3, d10, d11 \n" // G
    "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
    "vpadd.u16 d5, d12, d13 \n" // R
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %3, %3, #32 \n" // 32 processed per loop.
    "vmul.s16 q8, q0, q10 \n" // B
    "vmls.s16 q8, q1, q11 \n" // G
    "vmls.s16 q8, q2, q12 \n" // R
    "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
    "vmul.s16 q9, q2, q10 \n" // R
    "vmls.s16 q9, q1, q14 \n" // G
    "vmls.s16 q9, q0, q13 \n" // B
    "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
    "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
    "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
    MEMACCESS(2)
    "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_u),    // %1
      "+r"(dst_v),    // %2
      "+r"(width)     // %3
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
  "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
  "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
  "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
  "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
  "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
  "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
  "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
  "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
  "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
  "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_argb
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
    "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q0, q1, q2)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(src_stride_argb), // %1
      "+r"(dst_u), // %2
      "+r"(dst_v), // %3
      "+r"(width) // %4
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_argb
    "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
    "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
    "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
    "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
    "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
    "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q0, q1, q2)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(src_stride_argb), // %1
      "+r"(dst_u), // %2
      "+r"(dst_v), // %3
      "+r"(width) // %4
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_bgra
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
    "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
    MEMACCESS(1)
    "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
    "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q1, q1, #1 \n" // 2x average
    "vrshr.u16 q2, q2, #1 \n"
    "vrshr.u16 q3, q3, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q3, q2, q1)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_bgra), // %0
      "+r"(src_stride_bgra), // %1
      "+r"(dst_u), // %2
      "+r"(dst_v), // %3
      "+r"(width) // %4
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_abgr
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
    "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
    MEMACCESS(1)
    "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
    "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q2, q1, q0)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_abgr), // %0
      "+r"(src_stride_abgr), // %1
      "+r"(dst_u), // %2
      "+r"(dst_v), // %3
      "+r"(width) // %4
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_rgba
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
    "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
    MEMACCESS(1)
    "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
    "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q0, q1, q2)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_rgba), // %0
      "+r"(src_stride_rgba), // %1
      "+r"(dst_u), // %2
      "+r"(dst_v), // %3
      "+r"(width) // %4
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_rgb24
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
    MEMACCESS(0)
    "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
    MEMACCESS(1)
    "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
    "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q0, q1, q2)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_rgb24), // %0
      "+r"(src_stride_rgb24), // %1
      "+r"(dst_u), // %2
      "+r"(dst_v), // %3
      "+r"(width) // %4
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
                     uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_raw
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
    MEMACCESS(0)
    "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
    "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
    MEMACCESS(1)
    "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
    "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q2, q1, q0)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_raw), // %0
      "+r"(src_stride_raw), // %1
      "+r"(dst_u), // %2
      "+r"(dst_v), // %3
      "+r"(width) // %4
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// 16x2 pixels -> 8x1. width is number of rgb565 pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_rgb565
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
    RGB565TOARGB
    "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
    RGB565TOARGB
    "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
    RGB565TOARGB
    "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
    RGB565TOARGB
    "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    "vrshr.u16 q4, q4, #1 \n" // 2x average
    "vrshr.u16 q5, q5, #1 \n"
    "vrshr.u16 q6, q6, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    "vmul.s16 q8, q4, q10 \n" // B
    "vmls.s16 q8, q5, q11 \n" // G
    "vmls.s16 q8, q6, q12 \n" // R
    "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
    "vmul.s16 q9, q6, q10 \n" // R
    "vmls.s16 q9, q5, q14 \n" // G
    "vmls.s16 q9, q4, q13 \n" // B
    "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
    "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
    "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_rgb565), // %0
      "+r"(src_stride_rgb565), // %1
      "+r"(dst_u), // %2
      "+r"(dst_v), // %3
      "+r"(width) // %4
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// 16x2 pixels -> 8x1. width is number of argb1555 pixels. e.g. 16.
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
                          uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_argb1555
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    "vrshr.u16 q4, q4, #1 \n" // 2x average
    "vrshr.u16 q5, q5, #1 \n"
    "vrshr.u16 q6, q6, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    "vmul.s16 q8, q4, q10 \n" // B
    "vmls.s16 q8, q5, q11 \n" // G
    "vmls.s16 q8, q6, q12 \n" // R
    "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
    "vmul.s16 q9, q6, q10 \n" // R
    "vmls.s16 q9, q5, q14 \n" // G
    "vmls.s16 q9, q4, q13 \n" // B
    "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
    "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
    "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_argb1555), // %0
      "+r"(src_stride_argb1555), // %1
      "+r"(dst_u), // %2
      "+r"(dst_v), // %3
      "+r"(width) // %4
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// 16x2 pixels -> 8x1. width is number of argb4444 pixels. e.g. 16.
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
                          uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_argb4444
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    "vrshr.u16 q4, q4, #1 \n" // 2x average
    "vrshr.u16 q5, q5, #1 \n"
    "vrshr.u16 q6, q6, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    "vmul.s16 q8, q4, q10 \n" // B
    "vmls.s16 q8, q5, q11 \n" // G
    "vmls.s16 q8, q6, q12 \n" // R
    "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
    "vmul.s16 q9, q6, q10 \n" // R
    "vmls.s16 q9, q5, q14 \n" // G
    "vmls.s16 q9, q4, q13 \n" // B
    "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
    "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
    "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_argb4444), // %0
      "+r"(src_stride_argb4444), // %1
      "+r"(dst_u), // %2
      "+r"(dst_v), // %3
      "+r"(width) // %4
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d27, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    RGB565TOARGB
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d27 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_rgb565), // %0
      "+r"(dst_y), // %1
      "+r"(width) // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
  );
}
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d27, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGB1555TOARGB
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d27 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_argb1555), // %0
      "+r"(dst_y), // %1
      "+r"(width) // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
  );
}
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d27, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGB4444TOARGB
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d27 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_argb4444), // %0
      "+r"(dst_y), // %1
      "+r"(width) // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
  );
}
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d7, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q8, d1, d4 \n" // R
    "vmlal.u8 q8, d2, d5 \n" // G
    "vmlal.u8 q8, d3, d6 \n" // B
    "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_bgra), // %0
      "+r"(dst_y), // %1
      "+r"(width) // %2
    :
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
  );
}
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d7, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q8, d0, d4 \n" // R
    "vmlal.u8 q8, d1, d5 \n" // G
    "vmlal.u8 q8, d2, d6 \n" // B
    "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_abgr), // %0
      "+r"(dst_y), // %1
      "+r"(width) // %2
    :
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
  );
}
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d7, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q8, d1, d4 \n" // B
    "vmlal.u8 q8, d2, d5 \n" // G
    "vmlal.u8 q8, d3, d6 \n" // R
    "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_rgba), // %0
      "+r"(dst_y), // %1
      "+r"(width) // %2
    :
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
  );
}
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d7, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q8, d0, d4 \n" // B
    "vmlal.u8 q8, d1, d5 \n" // G
    "vmlal.u8 q8, d2, d6 \n" // R
    "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_rgb24), // %0
      "+r"(dst_y), // %1
      "+r"(width) // %2
    :
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
  );
}
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d7, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q8, d0, d4 \n" // R
    "vmlal.u8 q8, d1, d5 \n" // G
    "vmlal.u8 q8, d2, d6 \n" // B
    "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_raw), // %0
      "+r"(dst_y), // %1
      "+r"(width) // %2
    :
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
  );
}
// Bilinear filter 16x2 -> 16x1
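// Per byte the general case is roughly
//   dst = (src * (256 - f) + src[src_stride] * f + 128) >> 8
// with f = source_y_fraction; f == 0 takes the copy path and f == 128 the
// vrhadd 50/50 path below.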
void InterpolateRow_NEON(uint8* dst_ptr,
                         const uint8* src_ptr, ptrdiff_t src_stride,
                         int dst_width, int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  asm volatile (
    "cmp %4, #0 \n"
    "beq 100f \n"
    "add %2, %1 \n"
    "cmp %4, #128 \n"
    "beq 50f \n"
    "vdup.8 d5, %4 \n"
    "rsb %4, #256 \n"
    "vdup.8 d4, %4 \n"
    // General purpose row blend.
    "1: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vmull.u8 q13, d0, d4 \n"
    "vmull.u8 q14, d1, d4 \n"
    "vmlal.u8 q13, d2, d5 \n"
    "vmlal.u8 q14, d3, d5 \n"
    "vrshrn.u16 d0, q13, #8 \n"
    "vrshrn.u16 d1, q14, #8 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 1b \n"
    "b 99f \n"
    // Blend 50 / 50.
    "50: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 50b \n"
    "b 99f \n"
    // Blend 100 / 0 - Copy row unchanged.
    "100: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    "subs %3, %3, #16 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 100b \n"
    "99: \n"
    : "+r"(dst_ptr), // %0
      "+r"(src_ptr), // %1
      "+r"(src_stride), // %2
      "+r"(dst_width), // %3
      "+r"(y1_fraction) // %4
    :
    : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
  );
}
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
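// e.g. sa = 255 cancels the destination term (dr - dr * 255 / 256 is ~0) so
// the source dominates; sa = 0 passes the destination through. vqsub/vqadd
// keep the sums from wrapping.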
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "subs %3, #8 \n"
    "blt 89f \n"
    // Blend 8 pixels.
    "8: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
    MEMACCESS(1)
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vmull.u8 q10, d4, d3 \n" // db * a
    "vmull.u8 q11, d5, d3 \n" // dg * a
    "vmull.u8 q12, d6, d3 \n" // dr * a
    "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
    "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
    "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
    "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
    "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
    "vqadd.u8 q0, q0, q2 \n" // + sbg
    "vqadd.u8 d2, d2, d6 \n" // + sr
    "vmov.u8 d3, #255 \n" // a = 255
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
    "bge 8b \n"
    "89: \n"
    "adds %3, #8-1 \n"
    "blt 99f \n"
    // Blend 1 pixel.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
    MEMACCESS(1)
    "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
    "subs %3, %3, #1 \n" // 1 processed per loop.
    "vmull.u8 q10, d4, d3 \n" // db * a
    "vmull.u8 q11, d5, d3 \n" // dg * a
    "vmull.u8 q12, d6, d3 \n" // dr * a
    "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
    "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
    "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
    "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
    "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
    "vqadd.u8 q0, q0, q2 \n" // + sbg
    "vqadd.u8 d2, d2, d6 \n" // + sr
    "vmov.u8 d3, #255 \n" // a = 255
    MEMACCESS(2)
    "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
    "bge 1b \n"
    "99: \n"
    : "+r"(src_argb0), // %0
      "+r"(src_argb1), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
  );
}
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Attenuate 8 pixels.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q10, d0, d3 \n" // b * a
    "vmull.u8 q11, d1, d3 \n" // g * a
    "vmull.u8 q12, d2, d3 \n" // r * a
    "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
    "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
    "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(width) // %2
    :
    : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
  );
}
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
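// vqdmulh doubles its product, so scale is pre-shifted right by 1 to realize
// the plain (dst * scale >> 16). e.g. scale = 0x8000 with interval_size = 1
// and interval_offset = 0 roughly halves each channel.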
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "vdup.u16 q8, %2 \n"
    "vshr.u16 q8, q8, #1 \n" // scale >>= 1
    "vdup.u16 q9, %3 \n" // interval multiply.
    "vdup.u16 q10, %4 \n" // interval add
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
    "subs %1, %1, #8 \n" // 8 processed per loop.
    "vmovl.u8 q0, d0 \n" // b (0 .. 255)
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q2, d4 \n"
    "vqdmulh.s16 q0, q0, q8 \n" // b * scale
    "vqdmulh.s16 q1, q1, q8 \n" // g
    "vqdmulh.s16 q2, q2, q8 \n" // r
    "vmul.u16 q0, q0, q9 \n" // b * interval_size
    "vmul.u16 q1, q1, q9 \n" // g
    "vmul.u16 q2, q2, q9 \n" // r
    "vadd.u16 q0, q0, q10 \n" // b + interval_offset
    "vadd.u16 q1, q1, q10 \n" // g
    "vadd.u16 q2, q2, q10 \n" // r
    "vqmovn.u16 d0, q0 \n"
    "vqmovn.u16 d2, q1 \n"
    "vqmovn.u16 d4, q2 \n"
    MEMACCESS(0)
    "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(dst_argb), // %0
      "+r"(width) // %1
    : "r"(scale), // %2
      "r"(interval_size), // %3
      "r"(interval_offset) // %4
    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
  );
}
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register in d0 to d7.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
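// vzip duplicates each byte of value to 16 bits (v -> v * 257), making the
// effective per-channel scale about v / 255: value 0xFFFFFFFF is (nearly)
// identity, value 0x80808080 roughly halves every channel.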
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "vdup.u32 q0, %3 \n" // duplicate scale value.
    "vzip.u8 d0, d1 \n" // d0 aarrggbb.
    "vshr.u16 q0, q0, #1 \n" // scale / 2.
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmovl.u8 q10, d20 \n" // b (0 .. 255)
    "vmovl.u8 q11, d22 \n"
    "vmovl.u8 q12, d24 \n"
    "vmovl.u8 q13, d26 \n"
    "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
    "vqrdmulh.s16 q11, q11, d0[1] \n" // g
    "vqrdmulh.s16 q12, q12, d0[2] \n" // r
    "vqrdmulh.s16 q13, q13, d0[3] \n" // a
    "vqmovn.u16 d20, q10 \n"
    "vqmovn.u16 d22, q11 \n"
    "vqmovn.u16 d24, q12 \n"
    "vqmovn.u16 d26, q13 \n"
    MEMACCESS(1)
    "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(width) // %2
    : "r"(value) // %3
    : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
  );
}
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
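// e.g. white (255, 255, 255): ((15 + 75 + 38) * 255 + 64) >> 7 == 255, so
// full white stays full white.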
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
    "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
    "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
    "vmov d1, d0 \n" // G
    "vmov d2, d0 \n" // R
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(width) // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
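// The G and R weights sum past 128 (155 and 172), so bright pixels saturate
// in vqshrn: pure white maps to about (239, 255, 255) in B, G, R order, the
// warm sepia cast.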
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d20, #17 \n" // BB coefficient
    "vmov.u8 d21, #68 \n" // BG coefficient
    "vmov.u8 d22, #35 \n" // BR coefficient
    "vmov.u8 d24, #22 \n" // GB coefficient
    "vmov.u8 d25, #88 \n" // GG coefficient
    "vmov.u8 d26, #45 \n" // GR coefficient
    "vmov.u8 d28, #24 \n" // RB coefficient
    "vmov.u8 d29, #98 \n" // RG coefficient
    "vmov.u8 d30, #50 \n" // RR coefficient
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
    "subs %1, %1, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d20 \n" // B to Sepia B
    "vmlal.u8 q2, d1, d21 \n" // G
    "vmlal.u8 q2, d2, d22 \n" // R
    "vmull.u8 q3, d0, d24 \n" // B to Sepia G
    "vmlal.u8 q3, d1, d25 \n" // G
    "vmlal.u8 q3, d2, d26 \n" // R
    "vmull.u8 q8, d0, d28 \n" // B to Sepia R
    "vmlal.u8 q8, d1, d29 \n" // G
    "vmlal.u8 q8, d2, d30 \n" // R
    "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
    "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
    "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
    MEMACCESS(0)
    "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
    : "+r"(dst_argb), // %0
      "+r"(width) // %1
    :
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
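// Per output channel c in {B = 0, G = 1, R = 2, A = 3} this computes, with
// 16-bit saturating accumulation,
//   out[c] = sat8((b * m[4c] + g * m[4c+1] + r * m[4c+2] + a * m[4c+3]) >> 6)
// where m is matrix_argb viewed as 16 signed bytes.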
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
                             const int8* matrix_argb, int width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
    "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
    "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
    "vmovl.u8 q9, d18 \n" // g
    "vmovl.u8 q10, d20 \n" // r
    "vmovl.u8 q11, d22 \n" // a
    "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
    "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
    "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
    "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
    "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
    "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
    "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
    "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
    "vqadd.s16 q12, q12, q4 \n" // Accumulate B
    "vqadd.s16 q13, q13, q5 \n" // Accumulate G
    "vqadd.s16 q14, q14, q6 \n" // Accumulate R
    "vqadd.s16 q15, q15, q7 \n" // Accumulate A
    "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
    "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
    "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
    "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
    "vqadd.s16 q12, q12, q4 \n" // Accumulate B
    "vqadd.s16 q13, q13, q5 \n" // Accumulate G
    "vqadd.s16 q14, q14, q6 \n" // Accumulate R
    "vqadd.s16 q15, q15, q7 \n" // Accumulate A
    "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
    "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
    "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
    "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
    "vqadd.s16 q12, q12, q4 \n" // Accumulate B
    "vqadd.s16 q13, q13, q5 \n" // Accumulate G
    "vqadd.s16 q14, q14, q6 \n" // Accumulate R
    "vqadd.s16 q15, q15, q7 \n" // Accumulate A
    "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
    "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
    "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
    "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
    MEMACCESS(1)
    "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(width) // %2
    : "r"(matrix_argb) // %3
    : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vmull.u8 q0, d0, d1 \n" // multiply B
    "vmull.u8 q1, d2, d3 \n" // multiply G
    "vmull.u8 q2, d4, d5 \n" // multiply R
    "vmull.u8 q3, d6, d7 \n" // multiply A
    "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
    "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
    "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
    "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
    : "+r"(src_argb0), // %0
      "+r"(src_argb1), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqadd.u8 q0, q0, q2 \n" // add B, G
    "vqadd.u8 q1, q1, q3 \n" // add R, A
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
    : "+r"(src_argb0), // %0
      "+r"(src_argb1), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqsub.u8 q0, q0, q2 \n" // subtract B, G
    "vqsub.u8 q1, q1, q3 \n" // subtract R, A
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
    : "+r"(src_argb0), // %0
      "+r"(src_argb1), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // alpha
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
    MEMACCESS(1)
    "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqadd.u8 d0, d0, d1 \n" // add
    "vmov.u8 d1, d0 \n"
    "vmov.u8 d2, d0 \n"
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
    : "+r"(src_sobelx), // %0
      "+r"(src_sobely), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1"
  );
}
// Adds Sobel X and Sobel Y and stores Sobel into plane.
void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  asm volatile (
    // 16 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
    MEMACCESS(1)
    "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
    "subs %3, %3, #16 \n" // 16 processed per loop.
    "vqadd.u8 q0, q0, q1 \n" // add
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
    "bgt 1b \n"
    : "+r"(src_sobelx), // %0
      "+r"(src_sobely), // %1
      "+r"(dst_y), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1"
  );
}
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // alpha
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
    MEMACCESS(1)
    "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqadd.u8 d1, d0, d2 \n" // add
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
    : "+r"(src_sobelx), // %0
      "+r"(src_sobely), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1"
  );
}
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
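// Each row contributes (left - right) sampled 2 columns apart: %5 == 2 steps
// to the +2 column, %6 == 6 completes the 8-byte advance, the center row is
// accumulated twice, and vabs/vqmovn produce the saturated magnitude (so the
// sign convention of the matrix is immaterial).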
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0],%5 \n" // top
    MEMACCESS(0)
    "vld1.8 {d1}, [%0],%6 \n"
    "vsubl.u8 q0, d0, d1 \n"
    MEMACCESS(1)
    "vld1.8 {d2}, [%1],%5 \n" // center * 2
    MEMACCESS(1)
    "vld1.8 {d3}, [%1],%6 \n"
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n"
    "vadd.s16 q0, q0, q1 \n"
    MEMACCESS(2)
    "vld1.8 {d2}, [%2],%5 \n" // bottom
    MEMACCESS(2)
    "vld1.8 {d3}, [%2],%6 \n"
    "subs %4, %4, #8 \n" // 8 pixels
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n"
    "vabs.s16 q0, q0 \n"
    "vqmovn.u16 d0, q0 \n"
    MEMACCESS(3)
    "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
    "bgt 1b \n"
    : "+r"(src_y0), // %0
      "+r"(src_y1), // %1
      "+r"(src_y2), // %2
      "+r"(dst_sobelx), // %3
      "+r"(width) // %4
    : "r"(2), // %5
      "r"(6) // %6
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
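// Each column contributes (top - bottom): %4 == 1 steps across three adjacent
// columns on both rows, %5 == 6 completes the 8-byte advance, the center
// column is accumulated twice, and vabs/vqmovn give the saturated magnitude.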
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0],%4 \n" // left
    MEMACCESS(1)
    "vld1.8 {d1}, [%1],%4 \n"
    "vsubl.u8 q0, d0, d1 \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0],%4 \n" // center * 2
    MEMACCESS(1)
    "vld1.8 {d3}, [%1],%4 \n"
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n"
    "vadd.s16 q0, q0, q1 \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0],%5 \n" // right
    MEMACCESS(1)
    "vld1.8 {d3}, [%1],%5 \n"
    "subs %3, %3, #8 \n" // 8 pixels
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n"
    "vabs.s16 q0, q0 \n"
    "vqmovn.u16 d0, q0 \n"
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 sobely
    "bgt 1b \n"
    : "+r"(src_y0), // %0
      "+r"(src_y1), // %1
      "+r"(dst_sobely), // %2
      "+r"(width) // %3
    : "r"(1), // %4
      "r"(6) // %5
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif