row_neon64.cc 121 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810
  1. /*
  2. * Copyright 2014 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/row.h"
  11. #ifdef __cplusplus
  12. namespace libyuv {
  13. extern "C" {
  14. #endif
  15. // This module is for GCC Neon armv8 64 bit.
  16. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422.
// Output: v0.8b = Y, v1.8b = {U0..U3, V0..V3} (U in low word, V in high word).
#define READYUV422 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" /* 8 Y bytes */ \
MEMACCESS(1) \
"ld1 {v1.s}[0], [%1], #4 \n" /* 4 U into low 32 bits of v1 */ \
MEMACCESS(2) \
"ld1 {v1.s}[1], [%2], #4 \n" /* 4 V into next 32 bits of v1 */
// Read 8 Y, 2 U and 2 V from 411.
// zip1 duplicates each U/V byte so v1 ends up in the same
// {4 U, 4 V} layout that READYUV422 produces.
#define READYUV411 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" /* 8 Y bytes */ \
MEMACCESS(1) \
"ld1 {v2.h}[0], [%1], #2 \n" /* 2 U */ \
MEMACCESS(2) \
"ld1 {v2.h}[1], [%2], #2 \n" /* 2 V */ \
"zip1 v1.8b, v2.8b, v2.8b \n" /* double each U/V: UUUUVVVV */
// Read 8 Y, 8 U and 8 V from 444.
// Adjacent U (and V) pairs are averaged (uaddlp + rounding narrow by 1)
// down to 4 U and 4 V so v1 matches the READYUV422 layout.
#define READYUV444 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" /* 8 Y bytes */ \
MEMACCESS(1) \
"ld1 {v1.d}[0], [%1], #8 \n" /* 8 U into low half of v1 */ \
MEMACCESS(2) \
"ld1 {v1.d}[1], [%2], #8 \n" /* 8 V into high half of v1 */ \
"uaddlp v1.8h, v1.16b \n" /* pairwise add U and V */ \
"rshrn v1.8b, v1.8h, #1 \n" /* rounding average -> 4 U, 4 V */
// Read 8 Y, and set 4 U and 4 V to 128 (neutral chroma for grey-scale).
#define READYUV400 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" /* 8 Y bytes */ \
"movi v1.8b , #128 \n" /* U = V = 128 */
// Read 8 Y and 4 interleaved UV pairs from NV12.
// uzp1/uzp2 deinterleave U and V; ins packs them into the
// {4 U, 4 V} layout of READYUV422.
#define READNV12 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" /* 8 Y bytes */ \
MEMACCESS(1) \
"ld1 {v2.8b}, [%1], #8 \n" /* 4 UV pairs: UVUVUVUV */ \
"uzp1 v1.8b, v2.8b, v2.8b \n" /* even bytes = U */ \
"uzp2 v3.8b, v2.8b, v2.8b \n" /* odd bytes = V */ \
"ins v1.s[1], v3.s[0] \n" /* v1 = {U0..U3, V0..V3} */
// Read 8 Y and 4 interleaved VU pairs from NV21.
// Same as READNV12 but the even bytes are V and the odd bytes are U.
#define READNV21 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" /* 8 Y bytes */ \
MEMACCESS(1) \
"ld1 {v2.8b}, [%1], #8 \n" /* 4 VU pairs: VUVUVUVU */ \
"uzp1 v3.8b, v2.8b, v2.8b \n" /* even bytes = V */ \
"uzp2 v1.8b, v2.8b, v2.8b \n" /* odd bytes = U */ \
"ins v1.s[1], v3.s[0] \n" /* v1 = {U0..U3, V0..V3} */
// Read 8 pixels of YUY2 (Y0 U Y1 V ...).
// ld2 deinterleaves: v0 = 8 Y, v1 = UVUV...; then split UV into
// the {4 U, 4 V} layout of READYUV422.
#define READYUY2 \
MEMACCESS(0) \
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" /* v0 = Y, v1 = interleaved UV */ \
"uzp2 v3.8b, v1.8b, v1.8b \n" /* odd bytes = V */ \
"uzp1 v1.8b, v1.8b, v1.8b \n" /* even bytes = U */ \
"ins v1.s[1], v3.s[0] \n" /* v1 = {U0..U3, V0..V3} */
// Read 8 pixels of UYVY (U Y0 V Y1 ...).
// ld2 deinterleaves: v2 = UVUV..., v3 = 8 Y; Y is moved to v0 and
// UV split into the {4 U, 4 V} layout of READYUV422.
#define READUYVY \
MEMACCESS(0) \
"ld2 {v2.8b, v3.8b}, [%0], #16 \n" /* v2 = interleaved UV, v3 = Y */ \
"orr v0.8b, v3.8b, v3.8b \n" /* v0 = Y (register copy) */ \
"uzp1 v1.8b, v2.8b, v2.8b \n" /* even bytes = U */ \
"uzp2 v3.8b, v2.8b, v2.8b \n" /* odd bytes = V */ \
"ins v1.s[1], v3.s[0] \n" /* v1 = {U0..U3, V0..V3} */
// Load YUV conversion constants into v24-v31:
//   v24/v25/v26 = B/G/R bias (replicated halfwords from kUVBiasBGR),
//   v27/v28 = UV-to-R/B coefficients, v29/v30 = UV-to-G coefficients,
//   v31 = Y-to-RGB multiplier (replicated words).
// NOTE(review): any asm statement using this must list v24-v31 —
// including v31 — as clobbers.
#define YUVTORGB_SETUP \
"ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" /* B bias */ \
"ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" /* G bias */ \
"ld1r {v26.8h}, [%[kUVBiasBGR]] \n" /* R bias */ \
"ld1r {v31.4s}, [%[kYToRgb]] \n" /* Y multiplier */ \
"ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
"ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
// Convert 8 YUV values (v0 = Y, v1 = {4 U, 4 V}, as produced by the
// READ* macros) to 8 bytes each of B, G, R in the named registers.
// Scratch: v0-v3, v5-v7. Requires the constants loaded by
// YUVTORGB_SETUP in v24-v31.
#define YUVTORGB(vR, vG, vB) \
"uxtl v0.8h, v0.8b \n" /* Extract Y */ \
"shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
"ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
"ushll v0.4s, v0.4h, #0 \n" \
"mul v3.4s, v3.4s, v31.4s \n" /* scale Y by kYToRgb */ \
"mul v0.4s, v0.4s, v31.4s \n" \
"sqshrun v0.4h, v0.4s, #16 \n" \
"sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
"uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
"mov v2.d[0], v1.d[1] \n" /* Extract V */ \
"uxtl v2.8h, v2.8b \n" \
"uxtl v1.8h, v1.8b \n" /* Extract U */ \
"mul v3.8h, v1.8h, v27.8h \n" /* U contribution to B */ \
"mul v5.8h, v1.8h, v29.8h \n" /* U contribution to G */ \
"mul v6.8h, v2.8h, v30.8h \n" /* V contribution to G */ \
"mul v7.8h, v2.8h, v28.8h \n" /* V contribution to R */ \
"sqadd v6.8h, v6.8h, v5.8h \n" /* combined G adjustment */ \
"sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
"sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
"sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
"sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
"sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
"sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
"sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
"sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
"sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
  116. void I444ToARGBRow_NEON(const uint8* src_y,
  117. const uint8* src_u,
  118. const uint8* src_v,
  119. uint8* dst_argb,
  120. const struct YuvConstants* yuvconstants,
  121. int width) {
  122. asm volatile (
  123. YUVTORGB_SETUP
  124. "movi v23.8b, #255 \n" /* A */
  125. "1: \n"
  126. READYUV444
  127. YUVTORGB(v22, v21, v20)
  128. "subs %w4, %w4, #8 \n"
  129. MEMACCESS(3)
  130. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
  131. "b.gt 1b \n"
  132. : "+r"(src_y), // %0
  133. "+r"(src_u), // %1
  134. "+r"(src_v), // %2
  135. "+r"(dst_argb), // %3
  136. "+r"(width) // %4
  137. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  138. [kUVToG]"r"(&yuvconstants->kUVToG),
  139. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  140. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  141. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  142. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  143. );
  144. }
  145. void I422ToARGBRow_NEON(const uint8* src_y,
  146. const uint8* src_u,
  147. const uint8* src_v,
  148. uint8* dst_argb,
  149. const struct YuvConstants* yuvconstants,
  150. int width) {
  151. asm volatile (
  152. YUVTORGB_SETUP
  153. "movi v23.8b, #255 \n" /* A */
  154. "1: \n"
  155. READYUV422
  156. YUVTORGB(v22, v21, v20)
  157. "subs %w4, %w4, #8 \n"
  158. MEMACCESS(3)
  159. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
  160. "b.gt 1b \n"
  161. : "+r"(src_y), // %0
  162. "+r"(src_u), // %1
  163. "+r"(src_v), // %2
  164. "+r"(dst_argb), // %3
  165. "+r"(width) // %4
  166. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  167. [kUVToG]"r"(&yuvconstants->kUVToG),
  168. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  169. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  170. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  171. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  172. );
  173. }
  174. void I422AlphaToARGBRow_NEON(const uint8* src_y,
  175. const uint8* src_u,
  176. const uint8* src_v,
  177. const uint8* src_a,
  178. uint8* dst_argb,
  179. const struct YuvConstants* yuvconstants,
  180. int width) {
  181. asm volatile (
  182. YUVTORGB_SETUP
  183. "1: \n"
  184. READYUV422
  185. YUVTORGB(v22, v21, v20)
  186. MEMACCESS(3)
  187. "ld1 {v23.8b}, [%3], #8 \n"
  188. "subs %w5, %w5, #8 \n"
  189. MEMACCESS(4)
  190. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
  191. "b.gt 1b \n"
  192. : "+r"(src_y), // %0
  193. "+r"(src_u), // %1
  194. "+r"(src_v), // %2
  195. "+r"(src_a), // %3
  196. "+r"(dst_argb), // %4
  197. "+r"(width) // %5
  198. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  199. [kUVToG]"r"(&yuvconstants->kUVToG),
  200. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  201. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  202. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  203. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  204. );
  205. }
  206. void I411ToARGBRow_NEON(const uint8* src_y,
  207. const uint8* src_u,
  208. const uint8* src_v,
  209. uint8* dst_argb,
  210. const struct YuvConstants* yuvconstants,
  211. int width) {
  212. asm volatile (
  213. YUVTORGB_SETUP
  214. "movi v23.8b, #255 \n" /* A */
  215. "1: \n"
  216. READYUV411
  217. YUVTORGB(v22, v21, v20)
  218. "subs %w4, %w4, #8 \n"
  219. MEMACCESS(3)
  220. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
  221. "b.gt 1b \n"
  222. : "+r"(src_y), // %0
  223. "+r"(src_u), // %1
  224. "+r"(src_v), // %2
  225. "+r"(dst_argb), // %3
  226. "+r"(width) // %4
  227. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  228. [kUVToG]"r"(&yuvconstants->kUVToG),
  229. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  230. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  231. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  232. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  233. );
  234. }
  235. void I422ToRGBARow_NEON(const uint8* src_y,
  236. const uint8* src_u,
  237. const uint8* src_v,
  238. uint8* dst_rgba,
  239. const struct YuvConstants* yuvconstants,
  240. int width) {
  241. asm volatile (
  242. YUVTORGB_SETUP
  243. "movi v20.8b, #255 \n" /* A */
  244. "1: \n"
  245. READYUV422
  246. YUVTORGB(v23, v22, v21)
  247. "subs %w4, %w4, #8 \n"
  248. MEMACCESS(3)
  249. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
  250. "b.gt 1b \n"
  251. : "+r"(src_y), // %0
  252. "+r"(src_u), // %1
  253. "+r"(src_v), // %2
  254. "+r"(dst_rgba), // %3
  255. "+r"(width) // %4
  256. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  257. [kUVToG]"r"(&yuvconstants->kUVToG),
  258. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  259. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  260. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  261. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  262. );
  263. }
  264. void I422ToRGB24Row_NEON(const uint8* src_y,
  265. const uint8* src_u,
  266. const uint8* src_v,
  267. uint8* dst_rgb24,
  268. const struct YuvConstants* yuvconstants,
  269. int width) {
  270. asm volatile (
  271. YUVTORGB_SETUP
  272. "1: \n"
  273. READYUV422
  274. YUVTORGB(v22, v21, v20)
  275. "subs %w4, %w4, #8 \n"
  276. MEMACCESS(3)
  277. "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
  278. "b.gt 1b \n"
  279. : "+r"(src_y), // %0
  280. "+r"(src_u), // %1
  281. "+r"(src_v), // %2
  282. "+r"(dst_rgb24), // %3
  283. "+r"(width) // %4
  284. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  285. [kUVToG]"r"(&yuvconstants->kUVToG),
  286. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  287. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  288. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  289. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  290. );
  291. }
// Pack v20/v21/v22 (B/G/R bytes) into v0.8h as 8 RGB565 pixels.
// Each channel is shifted to the top of a halfword, then sri
// shift-right-inserts it into the packed result.
#define ARGBTORGB565 \
"shll v0.8h, v22.8b, #8 \n" /* R */ \
"shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
"sri v0.8h, v21.8h, #5 \n" /* RG */ \
"sri v0.8h, v20.8h, #11 \n" /* RGB */
  298. void I422ToRGB565Row_NEON(const uint8* src_y,
  299. const uint8* src_u,
  300. const uint8* src_v,
  301. uint8* dst_rgb565,
  302. const struct YuvConstants* yuvconstants,
  303. int width) {
  304. asm volatile (
  305. YUVTORGB_SETUP
  306. "1: \n"
  307. READYUV422
  308. YUVTORGB(v22, v21, v20)
  309. "subs %w4, %w4, #8 \n"
  310. ARGBTORGB565
  311. MEMACCESS(3)
  312. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
  313. "b.gt 1b \n"
  314. : "+r"(src_y), // %0
  315. "+r"(src_u), // %1
  316. "+r"(src_v), // %2
  317. "+r"(dst_rgb565), // %3
  318. "+r"(width) // %4
  319. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  320. [kUVToG]"r"(&yuvconstants->kUVToG),
  321. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  322. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  323. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  324. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  325. );
  326. }
// Pack v20/v21/v22/v23 (B/G/R/A bytes) into v0.8h as 8 ARGB1555
// pixels using successive shift-right-insert operations.
#define ARGBTOARGB1555 \
"shll v0.8h, v23.8b, #8 \n" /* A */ \
"shll v22.8h, v22.8b, #8 \n" /* R */ \
"shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
"sri v0.8h, v22.8h, #1 \n" /* AR */ \
"sri v0.8h, v21.8h, #6 \n" /* ARG */ \
"sri v0.8h, v20.8h, #11 \n" /* ARGB */
  335. void I422ToARGB1555Row_NEON(const uint8* src_y,
  336. const uint8* src_u,
  337. const uint8* src_v,
  338. uint8* dst_argb1555,
  339. const struct YuvConstants* yuvconstants,
  340. int width) {
  341. asm volatile (
  342. YUVTORGB_SETUP
  343. "movi v23.8b, #255 \n"
  344. "1: \n"
  345. READYUV422
  346. YUVTORGB(v22, v21, v20)
  347. "subs %w4, %w4, #8 \n"
  348. ARGBTOARGB1555
  349. MEMACCESS(3)
  350. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
  351. "b.gt 1b \n"
  352. : "+r"(src_y), // %0
  353. "+r"(src_u), // %1
  354. "+r"(src_v), // %2
  355. "+r"(dst_argb1555), // %3
  356. "+r"(width) // %4
  357. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  358. [kUVToG]"r"(&yuvconstants->kUVToG),
  359. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  360. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  361. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  362. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  363. );
  364. }
// Pack v20/v21/v22/v23 (B/G/R/A bytes) into 8 ARGB4444 pixels.
// Each channel keeps its top 4 bits (ushr for low nibble, bic with
// v4 = 0x0f for high nibble), then pairs are or-ed and zipped.
#define ARGBTOARGB4444 \
/* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
"ushr v20.8b, v20.8b, #4 \n" /* B */ \
"bic v21.8b, v21.8b, v4.8b \n" /* G */ \
"ushr v22.8b, v22.8b, #4 \n" /* R */ \
"bic v23.8b, v23.8b, v4.8b \n" /* A */ \
"orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
"orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
"zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
  374. void I422ToARGB4444Row_NEON(const uint8* src_y,
  375. const uint8* src_u,
  376. const uint8* src_v,
  377. uint8* dst_argb4444,
  378. const struct YuvConstants* yuvconstants,
  379. int width) {
  380. asm volatile (
  381. YUVTORGB_SETUP
  382. "movi v4.16b, #0x0f \n" // bits to clear with vbic.
  383. "1: \n"
  384. READYUV422
  385. YUVTORGB(v22, v21, v20)
  386. "subs %w4, %w4, #8 \n"
  387. "movi v23.8b, #255 \n"
  388. ARGBTOARGB4444
  389. MEMACCESS(3)
  390. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
  391. "b.gt 1b \n"
  392. : "+r"(src_y), // %0
  393. "+r"(src_u), // %1
  394. "+r"(src_v), // %2
  395. "+r"(dst_argb4444), // %3
  396. "+r"(width) // %4
  397. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  398. [kUVToG]"r"(&yuvconstants->kUVToG),
  399. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  400. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  401. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  402. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  403. );
  404. }
  405. void I400ToARGBRow_NEON(const uint8* src_y,
  406. uint8* dst_argb,
  407. int width) {
  408. asm volatile (
  409. YUVTORGB_SETUP
  410. "movi v23.8b, #255 \n"
  411. "1: \n"
  412. READYUV400
  413. YUVTORGB(v22, v21, v20)
  414. "subs %w2, %w2, #8 \n"
  415. MEMACCESS(1)
  416. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
  417. "b.gt 1b \n"
  418. : "+r"(src_y), // %0
  419. "+r"(dst_argb), // %1
  420. "+r"(width) // %2
  421. : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
  422. [kUVToG]"r"(&kYuvI601Constants.kUVToG),
  423. [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
  424. [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
  425. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  426. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  427. );
  428. }
// Convert a row of J400 (full-range grey) to ARGB: B = G = R = Y,
// A = 255. Processes 8 pixels per loop iteration.
void J400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
"movi v23.8b, #255 \n" // constant alpha
"1: \n"
MEMACCESS(0)
"ld1 {v20.8b}, [%0], #8 \n" // load 8 Y
"orr v21.8b, v20.8b, v20.8b \n" // G = Y (register copy)
"orr v22.8b, v20.8b, v20.8b \n" // R = Y (register copy)
"subs %w2, %w2, #8 \n"
MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v20", "v21", "v22", "v23"
);
}
  450. void NV12ToARGBRow_NEON(const uint8* src_y,
  451. const uint8* src_uv,
  452. uint8* dst_argb,
  453. const struct YuvConstants* yuvconstants,
  454. int width) {
  455. asm volatile (
  456. YUVTORGB_SETUP
  457. "movi v23.8b, #255 \n"
  458. "1: \n"
  459. READNV12
  460. YUVTORGB(v22, v21, v20)
  461. "subs %w3, %w3, #8 \n"
  462. MEMACCESS(2)
  463. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
  464. "b.gt 1b \n"
  465. : "+r"(src_y), // %0
  466. "+r"(src_uv), // %1
  467. "+r"(dst_argb), // %2
  468. "+r"(width) // %3
  469. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  470. [kUVToG]"r"(&yuvconstants->kUVToG),
  471. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  472. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  473. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  474. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  475. );
  476. }
  477. void NV21ToARGBRow_NEON(const uint8* src_y,
  478. const uint8* src_vu,
  479. uint8* dst_argb,
  480. const struct YuvConstants* yuvconstants,
  481. int width) {
  482. asm volatile (
  483. YUVTORGB_SETUP
  484. "movi v23.8b, #255 \n"
  485. "1: \n"
  486. READNV21
  487. YUVTORGB(v22, v21, v20)
  488. "subs %w3, %w3, #8 \n"
  489. MEMACCESS(2)
  490. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
  491. "b.gt 1b \n"
  492. : "+r"(src_y), // %0
  493. "+r"(src_vu), // %1
  494. "+r"(dst_argb), // %2
  495. "+r"(width) // %3
  496. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  497. [kUVToG]"r"(&yuvconstants->kUVToG),
  498. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  499. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  500. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  501. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  502. );
  503. }
  504. void NV12ToRGB565Row_NEON(const uint8* src_y,
  505. const uint8* src_uv,
  506. uint8* dst_rgb565,
  507. const struct YuvConstants* yuvconstants,
  508. int width) {
  509. asm volatile (
  510. YUVTORGB_SETUP
  511. "1: \n"
  512. READNV12
  513. YUVTORGB(v22, v21, v20)
  514. "subs %w3, %w3, #8 \n"
  515. ARGBTORGB565
  516. MEMACCESS(2)
  517. "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
  518. "b.gt 1b \n"
  519. : "+r"(src_y), // %0
  520. "+r"(src_uv), // %1
  521. "+r"(dst_rgb565), // %2
  522. "+r"(width) // %3
  523. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  524. [kUVToG]"r"(&yuvconstants->kUVToG),
  525. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  526. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  527. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  528. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  529. );
  530. }
  531. void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
  532. uint8* dst_argb,
  533. const struct YuvConstants* yuvconstants,
  534. int width) {
  535. asm volatile (
  536. YUVTORGB_SETUP
  537. "movi v23.8b, #255 \n"
  538. "1: \n"
  539. READYUY2
  540. YUVTORGB(v22, v21, v20)
  541. "subs %w2, %w2, #8 \n"
  542. MEMACCESS(1)
  543. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
  544. "b.gt 1b \n"
  545. : "+r"(src_yuy2), // %0
  546. "+r"(dst_argb), // %1
  547. "+r"(width) // %2
  548. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  549. [kUVToG]"r"(&yuvconstants->kUVToG),
  550. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  551. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  552. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  553. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  554. );
  555. }
  556. void UYVYToARGBRow_NEON(const uint8* src_uyvy,
  557. uint8* dst_argb,
  558. const struct YuvConstants* yuvconstants,
  559. int width) {
  560. asm volatile (
  561. YUVTORGB_SETUP
  562. "movi v23.8b, #255 \n"
  563. "1: \n"
  564. READUYVY
  565. YUVTORGB(v22, v21, v20)
  566. "subs %w2, %w2, #8 \n"
  567. MEMACCESS(1)
  568. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
  569. "b.gt 1b \n"
  570. : "+r"(src_uyvy), // %0
  571. "+r"(dst_argb), // %1
  572. "+r"(width) // %2
  573. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  574. [kUVToG]"r"(&yuvconstants->kUVToG),
  575. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  576. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  577. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  578. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  579. );
  580. }
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
// ld2 deinterleaves on load: v0 collects the U bytes, v1 the V bytes.
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %w3, %w3, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2)
"st1 {v1.16b}, [%2], #16 \n" // store V
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
// st2 interleaves on store: bytes come out as U0 V0 U1 V1 ...
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load U
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %w3, %w3, #16 \n" // 16 processed per loop
MEMACCESS(2)
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"b.gt 1b \n"
:
"+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
// Copies 32 bytes per iteration; count is assumed to be a positive
// multiple of 32 (handled by callers) — TODO confirm.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
"subs %w2, %w2, #32 \n" // 32 processed per loop
MEMACCESS(1)
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// SetRow writes 'count' bytes using an 8 bit value repeated.
// NOTE(review): 'subs' happens before the store, so the loop always writes
// at least 16 bytes even if count <= 0 -- callers must pass count >= 16 and
// a multiple of 16; confirm against the ANY wrappers.
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
"subs %w1, %w1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v8) // %2
: "cc", "memory", "v0"
);
}
// Fills 'count' ARGB pixels (32-bit words) with the value v32, 4 pixels
// (16 bytes) per iteration.
// Assumes count is a positive multiple of 4 -- TODO(review): confirm.
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (
"dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
"subs %w1, %w1, #4 \n" // 4 ints per loop
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
: "cc", "memory", "v0"
);
}
// Horizontally mirrors a row of bytes: dst[i] = src[width - 1 - i].
// Walks src backwards 16 bytes at a time (%3 holds -16 as the post-index),
// byte-reverses each half with rev64 and stores the two doublewords swapped.
// Assumes width is a positive multiple of 16 -- TODO(review): confirm.
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// Start at end of source row.
"add %0, %0, %w2, sxtw \n"
"sub %0, %0, #16 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"rev64 v0.16b, v0.16b \n"
MEMACCESS(1)
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
"st1 {v0.D}[0], [%1], #8 \n"
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0"
);
}
// Mirrors an interleaved UV row while splitting it into separate U and V
// planes, 8 UV pairs per iteration. Source is walked backwards (width is in
// UV pairs, hence the 'sxtw #1' scaling to bytes).
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
// Start at end of source row.
"add %0, %0, %w3, sxtw #1 \n"
"sub %0, %0, #16 \n"
"1: \n"
MEMACCESS(0)
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
"subs %w3, %w3, #8 \n" // 8 pixels per loop.
"rev64 v0.8b, v0.8b \n"
"rev64 v1.8b, v1.8b \n"
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // dst += 8
MEMACCESS(2)
"st1 {v1.8b}, [%2], #8 \n"
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)-16) // %4
: "cc", "memory", "v0", "v1"
);
}
// Mirrors a row of 32-bit ARGB pixels, 4 pixels (16 bytes) per iteration.
// 'rev64 v0.4s' reverses pixels within each doubleword; the two doubleword
// stores are swapped to complete the reversal. Width is in pixels, hence
// the 'sxtw #2' scaling to bytes.
// Assumes width is a positive multiple of 4 -- TODO(review): confirm.
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// Start at end of source row.
"add %0, %0, %w2, sxtw #2 \n"
"sub %0, %0, #16 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
"rev64 v0.4s, v0.4s \n"
MEMACCESS(1)
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
"st1 {v0.D}[0], [%1], #8 \n"
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0"
);
}
// Converts 24-bit RGB24 (B,G,R byte order in memory) to 32-bit ARGB by
// appending an opaque alpha byte (255), 8 pixels per iteration.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
asm volatile (
"movi v4.8b, #255 \n" // Alpha
"1: \n"
MEMACCESS(0)
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
// Converts 24-bit RAW (R,G,B byte order in memory) to 32-bit ARGB: the R and
// B channels are swapped via the two 'orr' register moves, and an opaque
// alpha (255) is appended. 8 pixels per iteration.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
asm volatile (
"movi v5.8b, #255 \n" // Alpha
"1: \n"
MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
MEMACCESS(1)
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
// Converts 24-bit RAW (R,G,B) to 24-bit RGB24 (B,G,R) by swapping the first
// and third channels, 8 pixels per iteration.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
MEMACCESS(1)
"st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
// Expands 8 RGB565 pixels held in v0.8h into full 8-bit planes:
// on exit v0.8b = B, v1.8b = G, v2.8b = R (each channel's top bits are
// replicated into the low bits to fill the 8-bit range).
// Scratch: v4, v6.
#define RGB565TOARGB \
"shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
"shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
"ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
"orr v1.8b, v4.8b, v6.8b \n" /* G */ \
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
"ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
"xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
"shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
"ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
"orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
"dup v2.2D, v0.D[1] \n" /* R */
// Converts 16-bit RGB565 pixels to 32-bit ARGB with opaque alpha,
// 8 pixels per iteration, using the RGB565TOARGB macro above.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile (
"movi v3.8b, #255 \n" // Alpha
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
);
}
// Expands 8 ARGB1555 pixels held in v0.8h into full 8-bit planes:
// on exit v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A (the 1-bit alpha is
// sign-extended to 0x00/0xFF by the arithmetic shift 'sshr #15').
#define ARGB1555TOARGB \
"ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
"shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
"xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
\
"sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
"xtn2 v3.16b, v2.8h \n" \
\
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
"shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
\
"ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
"shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
"ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
\
"orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
"orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
"dup v1.2D, v0.D[1] \n" \
"dup v3.2D, v2.D[1] \n"
  842. // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
  843. #define RGB555TOARGB \
  844. "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
  845. "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
  846. "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
  847. \
  848. "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
  849. "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
  850. \
  851. "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
  852. "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
  853. "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
  854. \
  855. "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
  856. "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
  857. "dup v1.2D, v0.D[1] \n" /* G */ \
// Converts 16-bit ARGB1555 pixels to 32-bit ARGB, 8 pixels per iteration.
// NOTE(review): the 'movi v3.8b, #255' appears redundant -- ARGB1555TOARGB
// derives alpha into v3 from the source's alpha bit, overwriting it.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
int width) {
asm volatile (
"movi v3.8b, #255 \n" // Alpha
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Expands 8 ARGB4444 pixels held in v0.8h into full 8-bit planes:
// on exit v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A (each 4-bit nibble is
// replicated into both nibbles of the output byte).
#define ARGB4444TOARGB \
"shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
"xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
"shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
"ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
"ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
"shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
"orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
"orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
"dup v0.2D, v2.D[1] \n" \
"dup v1.2D, v3.D[1] \n"
// Converts 16-bit ARGB4444 pixels to 32-bit ARGB, 8 pixels per iteration,
// using the ARGB4444TOARGB macro above.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
// Converts 32-bit ARGB to 24-bit RGB24 by dropping the alpha channel,
// 8 pixels per iteration.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
// Converts 32-bit ARGB to 24-bit RAW (R,G,B order): drops alpha and swaps
// B and R via register moves, 8 pixels per iteration.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g
"orr v5.8b, v1.8b, v1.8b \n" // mov b
MEMACCESS(1)
"st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
// Extracts the Y plane from packed YUY2 (Y0 U Y1 V): ld2 de-interleaves
// even bytes (Y) into v0 and odd bytes (U/V) into v1; only Y is stored.
// 16 pixels per iteration; width assumed a positive multiple of 16 --
// TODO(review): confirm.
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Extracts the Y plane from packed UYVY (U Y0 V Y1): ld2 puts the odd
// bytes (Y) into v1, which is stored. 16 pixels per iteration;
// width assumed a positive multiple of 16 -- TODO(review): confirm.
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Splits packed YUY2 into 4:2:2 U and V planes (no vertical subsampling).
// ld4 de-interleaves to v0=Y0, v1=U, v2=Y1, v3=V; U and V are stored.
// width is in pixels (16 pixels -> 8 U + 8 V per iteration); assumed a
// positive multiple of 16 -- TODO(review): confirm.
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Splits packed UYVY into 4:2:2 U and V planes (no vertical subsampling).
// ld4 de-interleaves to v0=U, v1=Y0, v2=V, v3=Y1; U and V are stored.
// width is in pixels (16 pixels -> 8 U + 8 V per iteration); assumed a
// positive multiple of 16 -- TODO(review): confirm.
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Produces 4:2:0 chroma from two YUY2 rows: loads 16 pixels from the
// current row and the row at 'stride_yuy2', rounding-averages U and V
// vertically (urhadd) before storing. 16 pixels -> 8 U + 8 V per iteration;
// width assumed a positive multiple of 16 -- TODO(review): confirm.
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int width) {
const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
MEMACCESS(2)
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7" // Clobber List
);
}
// Produces 4:2:0 chroma from two UYVY rows: U is in lane 0 and V in lane 2
// of the ld4 de-interleave; vertical rounding-average via urhadd.
// 16 pixels -> 8 U + 8 V per iteration; width assumed a positive multiple
// of 16 -- TODO(review): confirm.
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int width) {
const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7" // Clobber List
);
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of each 32-bit pixel using a 16-byte table lookup
// ('shuffler' supplies the tbl indices), 4 pixels per iteration.
// Assumes width is a positive multiple of 4 -- TODO(review): confirm.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int width) {
asm volatile (
MEMACCESS(3)
"ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 4.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(shuffler) // %3
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Packs planar I422 (Y + half-width U,V) into interleaved YUY2
// (Y0 U Y1 V), 16 pixels per iteration. ld2 splits 16 Ys into even (v0)
// and odd (v2, after the move) lanes for the st4 interleave.
// Assumes width is a positive multiple of 16 -- TODO(review): confirm.
void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_yuy2, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
"orr v2.8b, v1.8b, v1.8b \n"
MEMACCESS(1)
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_yuy2), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
// Packs planar I422 into interleaved UYVY (U Y0 V Y1), 16 pixels per
// iteration; same structure as I422ToYUY2Row_NEON with U/V moved to the
// even byte positions of the st4 interleave.
// Assumes width is a positive multiple of 16 -- TODO(review): confirm.
void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_uyvy, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
"orr v3.8b, v2.8b, v2.8b \n"
MEMACCESS(1)
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_uyvy), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
// Converts 32-bit ARGB to 16-bit RGB565, 8 pixels per iteration.
// Channel packing is done by the ARGBTORGB565 macro (defined earlier in
// this file, outside this view); it consumes v20-v23 and produces v0.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v20", "v21", "v22", "v23"
);
}
// Converts ARGB to RGB565 while adding a per-column dither value before
// truncation: 'dither4' packs 4 dither bytes, replicated across the row and
// saturating-added (uqadd) to B, G and R. 8 pixels per iteration.
// Note the unusual operand order: %0 is the destination here.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
asm volatile (
"dup v1.4s, %w2 \n" // dither4
"1: \n"
MEMACCESS(1)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v20.8b, v20.8b, v1.8b \n"
"uqadd v21.8b, v21.8b, v1.8b \n"
"uqadd v22.8b, v22.8b, v1.8b \n"
ARGBTORGB565
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
"r"(width) // %3
: "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
);
}
// Converts 32-bit ARGB to 16-bit ARGB1555, 8 pixels per iteration, via the
// ARGBTOARGB1555 macro (defined earlier in this file, outside this view).
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v20", "v21", "v22", "v23"
);
}
// Converts 32-bit ARGB to 16-bit ARGB4444, 8 pixels per iteration, via the
// ARGBTOARGB4444 macro (defined earlier in this file, outside this view);
// v4 holds the 0x0f nibble mask that the macro uses.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
int width) {
asm volatile (
"movi v4.16b, #0x0f \n" // bits to clear with vbic.
"1: \n"
MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
);
}
// Converts ARGB to the BT.601 studio-range luma plane:
// Y = (13*B + 65*G + 33*R) >> 7 + 16, computed in 8.7 fixed point with
// rounding (sqrshrun) and saturating add of the +16 offset.
// 8 pixels per iteration; width assumed a positive multiple of 8 --
// TODO(review): confirm.
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
asm volatile (
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R
"sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
// Extracts the alpha channel from ARGB pixels into a byte plane,
// 16 pixels per iteration (ld4 de-interleaves; lane 3 is alpha).
// Assumes width is a positive multiple of 16 -- TODO(review): confirm.
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Converts ARGB to full-range (JPeg) luma: Y = (15*B + 75*G + 38*R) >> 7
// with rounding, no +16 offset. 8 pixels per iteration.
// Assumes width is a positive multiple of 8 -- TODO(review): confirm.
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
asm volatile (
"movi v4.8b, #15 \n" // B * 0.11400 coefficient
"movi v5.8b, #75 \n" // G * 0.58700 coefficient
"movi v6.8b, #38 \n" // R * 0.29900 coefficient
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R
"sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
// 8x1 pixels.
// Computes full-resolution (4:4:4) U and V from ARGB:
// U = (112*B - 74*G - 38*R + 0x8080) >> 8, V = (112*R - 94*G - 18*B +
// 0x8080) >> 8, in 16-bit fixed point with unsigned saturation (uqshrn).
// 8 pixels per iteration; width assumed a positive multiple of 8 --
// TODO(review): confirm.
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
"movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
"movi v25.8b, #74 \n" // UG -0.5781 coefficient
"movi v26.8b, #38 \n" // UR -0.2969 coefficient
"movi v27.8b, #18 \n" // VB -0.1406 coefficient
"movi v28.8b, #94 \n" // VG -0.7344 coefficient
"movi v29.16b,#0x80 \n" // 128.5
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlsl v4.8h, v1.8b, v25.8b \n" // G
"umlsl v4.8h, v2.8b, v26.8b \n" // R
"add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
"umull v3.8h, v2.8b, v24.8b \n" // R
"umlsl v3.8h, v1.8b, v28.8b \n" // G
"umlsl v3.8h, v0.8b, v27.8b \n" // B
"add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
"uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
MEMACCESS(2)
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4",
"v24", "v25", "v26", "v27", "v28", "v29"
);
}
// Loads the halved RGB->UV coefficients into v20-v24 and the +0x8080 bias
// into v25, for use by the RGBTOUV macro below. Coefficients are half the
// 4:4:4 values because the callers first sum 2x2 pixel blocks.
#define RGBTOUV_SETUP_REG \
"movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
"movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
"movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
"movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
"movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
"movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32.
// 4:1:1 chroma subsampling: averages 4 horizontally adjacent pixels
// (uaddlp pairs, addp across the two 16-pixel loads, then urshr /2 twice
// folded into one rounding shift) before applying the RGBTOUV-style math.
// Assumes width is a positive multiple of 32 -- TODO(review): confirm.
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(0)
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16.
"uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
"addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts.
"addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts.
"addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts.
"urshr v0.8h, v0.8h, #1 \n" // 2x average
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w3, %w3, #32 \n" // 32 processed per loop.
"mul v3.8h, v0.8h, v20.8h \n" // B
"mls v3.8h, v1.8h, v21.8h \n" // G
"mls v3.8h, v2.8h, v22.8h \n" // R
"add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
"mul v4.8h, v2.8h, v20.8h \n" // R
"mls v4.8h, v1.8h, v24.8h \n" // G
"mls v4.8h, v0.8h, v23.8h \n" // B
"add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
"uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
"uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
MEMACCESS(2)
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// Computes U into v0.8b and V into v1.8b from averaged 16-bit channel sums
// QB/QG/QR, using the coefficients loaded by RGBTOUV_SETUP_REG (v20-v25).
// Scratch: v3, v4.
#define RGBTOUV(QB, QG, QR) \
"mul v3.8h, " #QB ",v20.8h \n" /* B */ \
"mul v4.8h, " #QR ",v20.8h \n" /* R */ \
"mls v3.8h, " #QG ",v21.8h \n" /* G */ \
"mls v4.8h, " #QG ",v24.8h \n" /* G */ \
"mls v3.8h, " #QR ",v22.8h \n" /* R */ \
"mls v4.8h, " #QB ",v23.8h \n" /* B */ \
"add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
"add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
"uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
"uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
// 4:2:0 chroma from ARGB: sums 2x2 pixel blocks (uaddlp horizontally,
// uadalp accumulates the next row), rounding-averages, then applies the
// RGBTOUV math. 16 pixels -> 8 U + 8 V per iteration.
// Assumes width is a positive multiple of 16 -- TODO(review): confirm.
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #1 \n" // 2x average
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
// TODO(fbarchard): Subsample match C code.
// Full-range (JPeg) variant of ARGBToUVRow_NEON: same 2x2 subsampling
// structure but with JPeg chroma coefficients loaded inline instead of
// RGBTOUV_SETUP_REG. 16 pixels -> 8 U + 8 V per iteration.
// Assumes width is a positive multiple of 16 -- TODO(review): confirm.
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
"movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
"movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
"movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
"movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
"movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #1 \n" // 2x average
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
// Like ARGBToUVRow_NEON but for BGRA byte order: the ld4 lanes are remapped
// (lane 3 = B, lane 2 = G, lane 1 = R) before the shared RGBTOUV math.
// 16 pixels -> 8 U + 8 V per iteration.
// Assumes width is a positive multiple of 16 -- TODO(review): confirm.
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int width) {
const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
"uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #1 \n" // 2x average
"urshr v1.8h, v3.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
// Subsamples 16x2 ABGR pixels (two rows) down to 8 U and 8 V bytes.
// 2x2 blocks are averaged, then fed through the RGBTOUV matrix.
// ld4 deinterleaves per-pixel bytes, so here v2=B, v1=G, v0=R (v3=A unused).
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
                      uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;  // second row
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                        \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp v3.8h, v2.16b \n"  // B 16 bytes -> 8 shorts (pairwise add).
    "uaddlp v2.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v0.16b \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
    "uadalp v3.8h, v6.16b \n"  // B: accumulate second row.
    "uadalp v2.8h, v5.16b \n"  // G: accumulate second row.
    "uadalp v1.8h, v4.16b \n"  // R: accumulate second row.
    "urshr v0.8h, v3.8h, #1 \n"  // 2x average (rounding)
    "urshr v2.8h, v2.8h, #1 \n"
    "urshr v1.8h, v1.8h, #1 \n"
    "subs %w4, %w4, #16 \n"  // 16 pixels per row (x2 rows) per loop.
    RGBTOUV(v0.8h, v2.8h, v1.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_abgr),    // %0
    "+r"(src_abgr_1),  // %1
    "+r"(dst_u),       // %2
    "+r"(dst_v),       // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
// Subsamples 16x2 RGBA pixels (two rows) down to 8 U and 8 V bytes.
// 2x2 blocks are averaged, then fed through the RGBTOUV matrix.
// ld4 deinterleaves per-pixel bytes, so here v1=B, v2=G, v3=R (v0=A unused).
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
                      uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;  // second row
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                        \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp v0.8h, v1.16b \n"  // B 16 bytes -> 8 shorts (pairwise add).
    "uaddlp v1.8h, v2.16b \n"  // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v3.16b \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
    "uadalp v0.8h, v5.16b \n"  // B: accumulate second row.
    "uadalp v1.8h, v6.16b \n"  // G: accumulate second row.
    "uadalp v2.8h, v7.16b \n"  // R: accumulate second row.
    "urshr v0.8h, v0.8h, #1 \n"  // 2x average (rounding)
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"
    "subs %w4, %w4, #16 \n"  // 16 pixels per row (x2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_rgba),    // %0
    "+r"(src_rgba_1),  // %1
    "+r"(dst_u),       // %2
    "+r"(dst_v),       // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
// Subsamples 16x2 RGB24 pixels (two rows, 3 bytes/pixel) to 8 U and 8 V.
// 2x2 blocks are averaged, then fed through the RGBTOUV matrix.
// ld3 deinterleaves, so v0=B, v1=G, v2=R (RGB24 is B,G,R in memory).
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
                       uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;  // second row
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                        \n"
    MEMACCESS(0)
    "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
    "uaddlp v0.8h, v0.16b \n"  // B 16 bytes -> 8 shorts (pairwise add).
    "uaddlp v1.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
    "uadalp v0.8h, v4.16b \n"  // B: accumulate second row.
    "uadalp v1.8h, v5.16b \n"  // G: accumulate second row.
    "uadalp v2.8h, v6.16b \n"  // R: accumulate second row.
    "urshr v0.8h, v0.8h, #1 \n"  // 2x average (rounding)
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"
    "subs %w4, %w4, #16 \n"  // 16 pixels per row (x2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_rgb24),    // %0
    "+r"(src_rgb24_1),  // %1
    "+r"(dst_u),        // %2
    "+r"(dst_v),        // %3
    "+r"(width)         // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
// Subsamples 16x2 RAW pixels (two rows, 3 bytes/pixel) to 8 U and 8 V.
// 2x2 blocks are averaged, then fed through the RGBTOUV matrix.
// RAW is R,G,B in memory, so after ld3: v0=R, v1=G, v2=B.
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
                     uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_raw_1 = src_raw + src_stride_raw;  // second row
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                        \n"
    MEMACCESS(0)
    "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
    "uaddlp v2.8h, v2.16b \n"  // B 16 bytes -> 8 shorts (pairwise add).
    "uaddlp v1.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
    "uaddlp v0.8h, v0.16b \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels
    "uadalp v2.8h, v6.16b \n"  // B: accumulate second row.
    "uadalp v1.8h, v5.16b \n"  // G: accumulate second row.
    "uadalp v0.8h, v4.16b \n"  // R: accumulate second row.
    "urshr v2.8h, v2.8h, #1 \n"  // 2x average (rounding)
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v0.8h, v0.8h, #1 \n"
    "subs %w4, %w4, #16 \n"  // 16 pixels per row (x2 rows) per loop.
    RGBTOUV(v2.8h, v1.8h, v0.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_raw),    // %0
    "+r"(src_raw_1),  // %1
    "+r"(dst_u),      // %2
    "+r"(dst_v),      // %3
    "+r"(width)       // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
// 16x2 pixels -> 8x1. width is number of rgb565 pixels. e.g. 16.
// Unpacks RGB565 to 8-bit planes via RGB565TOARGB, averages 2x2 blocks,
// and applies the UV matrix inline (this routine sets up its own
// coefficient registers v22..v27 instead of using RGBTOUV_SETUP_REG).
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
                        uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;  // second row
  asm volatile (
    "movi v22.8h, #56, lsl #0 \n"  // UB / VR coeff (0.875) / 2
    "movi v23.8h, #37, lsl #0 \n"  // UG coeff (-0.5781) / 2
    "movi v24.8h, #19, lsl #0 \n"  // UR coeff (-0.2969) / 2
    "movi v25.8h, #9 , lsl #0 \n"  // VB coeff (-0.1406) / 2
    "movi v26.8h, #47, lsl #0 \n"  // VG coeff (-0.7344) / 2
    "movi v27.16b, #0x80 \n"       // 128.5 (0x8080 in 16-bit)
  "1:                                        \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 RGB565 pixels.
    RGB565TOARGB
    "uaddlp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts (pairwise add).
    "uaddlp v18.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v20.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // next 8 RGB565 pixels.
    RGB565TOARGB
    "uaddlp v17.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uaddlp v19.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v21.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // load 8 RGB565 pixels from row 2.
    RGB565TOARGB
    "uadalp v16.4h, v0.8b \n"  // B: accumulate second row.
    "uadalp v18.4h, v1.8b \n"  // G: accumulate second row.
    "uadalp v20.4h, v2.8b \n"  // R: accumulate second row.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // next 8 RGB565 pixels from row 2.
    RGB565TOARGB
    "uadalp v17.4h, v0.8b \n"  // B: accumulate second row.
    "uadalp v19.4h, v1.8b \n"  // G: accumulate second row.
    "uadalp v21.4h, v2.8b \n"  // R: accumulate second row.
    // Merge the two half-width sums into full 8h vectors.
    "ins v16.D[1], v17.D[0] \n"
    "ins v18.D[1], v19.D[0] \n"
    "ins v20.D[1], v21.D[0] \n"
    "urshr v4.8h, v16.8h, #1 \n"  // 2x average (rounding)
    "urshr v5.8h, v18.8h, #1 \n"
    "urshr v6.8h, v20.8h, #1 \n"
    "subs %w4, %w4, #16 \n"  // 16 processed per loop.
    "mul v16.8h, v4.8h, v22.8h \n"  // B
    "mls v16.8h, v5.8h, v23.8h \n"  // G (negative coefficient via mls)
    "mls v16.8h, v6.8h, v24.8h \n"  // R
    "add v16.8h, v16.8h, v27.8h \n"  // +128 -> unsigned
    "mul v17.8h, v6.8h, v22.8h \n"  // R
    "mls v17.8h, v5.8h, v26.8h \n"  // G
    "mls v17.8h, v4.8h, v25.8h \n"  // B
    "add v17.8h, v17.8h, v27.8h \n"  // +128 -> unsigned
    "uqshrn v0.8b, v16.8h, #8 \n"  // 16 bit to 8 bit U
    "uqshrn v1.8b, v17.8h, #8 \n"  // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_rgb565),    // %0
    "+r"(src_rgb565_1),  // %1
    "+r"(dst_u),         // %2
    "+r"(dst_v),         // %3
    "+r"(width)          // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
    "v25", "v26", "v27"
  );
}
// 16x2 pixels -> 8x1. width is number of argb1555 pixels. e.g. 16.
// Unpacks via RGB555TOARGB (the alpha bit is ignored for UV), averages
// 2x2 blocks, then applies the UV matrix from RGBTOUV_SETUP_REG (v20..v25).
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
                          uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;  // row 2
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                        \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "uaddlp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts (pairwise add).
    "uaddlp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "uaddlp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uaddlp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // load 8 ARGB1555 pixels from row 2.
    RGB555TOARGB
    "uadalp v16.4h, v0.8b \n"  // B: accumulate second row.
    "uadalp v17.4h, v1.8b \n"  // G: accumulate second row.
    "uadalp v18.4h, v2.8b \n"  // R: accumulate second row.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // next 8 ARGB1555 pixels from row 2.
    RGB555TOARGB
    "uadalp v26.4h, v0.8b \n"  // B: accumulate second row.
    "uadalp v27.4h, v1.8b \n"  // G: accumulate second row.
    "uadalp v28.4h, v2.8b \n"  // R: accumulate second row.
    // Merge the two half-width sums into full 8h vectors.
    "ins v16.D[1], v26.D[0] \n"
    "ins v17.D[1], v27.D[0] \n"
    "ins v18.D[1], v28.D[0] \n"
    "urshr v4.8h, v16.8h, #1 \n"  // 2x average (rounding)
    "urshr v5.8h, v17.8h, #1 \n"
    "urshr v6.8h, v18.8h, #1 \n"
    "subs %w4, %w4, #16 \n"  // 16 processed per loop.
    "mul v2.8h, v4.8h, v20.8h \n"  // B
    "mls v2.8h, v5.8h, v21.8h \n"  // G (negative coefficient via mls)
    "mls v2.8h, v6.8h, v22.8h \n"  // R
    "add v2.8h, v2.8h, v25.8h \n"  // +128 -> unsigned
    "mul v3.8h, v6.8h, v20.8h \n"  // R
    "mls v3.8h, v5.8h, v24.8h \n"  // G
    "mls v3.8h, v4.8h, v23.8h \n"  // B
    "add v3.8h, v3.8h, v25.8h \n"  // +128 -> unsigned
    "uqshrn v0.8b, v2.8h, #8 \n"  // 16 bit to 8 bit U
    "uqshrn v1.8b, v3.8h, #8 \n"  // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb1555),    // %0
    "+r"(src_argb1555_1),  // %1
    "+r"(dst_u),           // %2
    "+r"(dst_v),           // %3
    "+r"(width)            // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
    "v26", "v27", "v28"
  );
}
// 16x2 pixels -> 8x1. width is number of argb4444 pixels. e.g. 16.
// Unpacks via ARGB4444TOARGB, averages 2x2 blocks, then applies the
// UV matrix from RGBTOUV_SETUP_REG (coefficients in v20..v25).
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
                          uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;  // row 2
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                        \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uaddlp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts (pairwise add).
    "uaddlp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uaddlp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uaddlp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // load 8 ARGB4444 pixels from row 2.
    ARGB4444TOARGB
    "uadalp v16.4h, v0.8b \n"  // B: accumulate second row.
    "uadalp v17.4h, v1.8b \n"  // G: accumulate second row.
    "uadalp v18.4h, v2.8b \n"  // R: accumulate second row.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // next 8 ARGB4444 pixels from row 2.
    ARGB4444TOARGB
    "uadalp v26.4h, v0.8b \n"  // B: accumulate second row.
    "uadalp v27.4h, v1.8b \n"  // G: accumulate second row.
    "uadalp v28.4h, v2.8b \n"  // R: accumulate second row.
    // Merge the two half-width sums into full 8h vectors.
    "ins v16.D[1], v26.D[0] \n"
    "ins v17.D[1], v27.D[0] \n"
    "ins v18.D[1], v28.D[0] \n"
    "urshr v4.8h, v16.8h, #1 \n"  // 2x average (rounding)
    "urshr v5.8h, v17.8h, #1 \n"
    "urshr v6.8h, v18.8h, #1 \n"
    "subs %w4, %w4, #16 \n"  // 16 processed per loop.
    "mul v2.8h, v4.8h, v20.8h \n"  // B
    "mls v2.8h, v5.8h, v21.8h \n"  // G (negative coefficient via mls)
    "mls v2.8h, v6.8h, v22.8h \n"  // R
    "add v2.8h, v2.8h, v25.8h \n"  // +128 -> unsigned
    "mul v3.8h, v6.8h, v20.8h \n"  // R
    "mls v3.8h, v5.8h, v24.8h \n"  // G
    "mls v3.8h, v4.8h, v23.8h \n"  // B
    "add v3.8h, v3.8h, v25.8h \n"  // +128 -> unsigned
    "uqshrn v0.8b, v2.8h, #8 \n"  // 16 bit to 8 bit U
    "uqshrn v1.8b, v3.8h, #8 \n"  // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb4444),    // %0
    "+r"(src_argb4444_1),  // %1
    "+r"(dst_u),           // %2
    "+r"(dst_v),           // %3
    "+r"(width)            // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
    "v26", "v27", "v28"
  );
}
// Converts RGB565 pixels to luma: Y = (13*B + 65*G + 33*R) >> 7 + 16.
// Processes 8 pixels per iteration; RGB565TOARGB unpacks into v0=B, v1=G,
// v2=R (scratch registers used by the macro are presumed covered by the
// clobber list — NOTE(review): v4/v6 listed but not visibly used here).
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
  asm volatile (
    "movi v24.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v25.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v26.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v27.8b, #16 \n"  // Add 16 constant
  "1:                                        \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 RGB565 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    RGB565TOARGB
    "umull v3.8h, v0.8b, v24.8b \n"  // B
    "umlal v3.8h, v1.8b, v25.8b \n"  // G
    "umlal v3.8h, v2.8b, v26.8b \n"  // R
    "sqrshrun v0.8b, v3.8h, #7 \n"  // 16 bit to 8 bit Y (rounding)
    "uqadd v0.8b, v0.8b, v27.8b \n"  // saturating +16
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_rgb565),  // %0
    "+r"(dst_y),       // %1
    "+r"(width)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
    "v24", "v25", "v26", "v27"
  );
}
// Converts ARGB1555 pixels to luma: Y = (13*B + 65*G + 33*R) >> 7 + 16.
// Processes 8 pixels per iteration; ARGB1555TOARGB unpacks into
// v0=B, v1=G, v2=R (alpha not used for Y).
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1:                                        \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    ARGB1555TOARGB
    "umull v3.8h, v0.8b, v4.8b \n"  // B
    "umlal v3.8h, v1.8b, v5.8b \n"  // G
    "umlal v3.8h, v2.8b, v6.8b \n"  // R
    "sqrshrun v0.8b, v3.8h, #7 \n"  // 16 bit to 8 bit Y (rounding)
    "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating +16
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_argb1555),  // %0
    "+r"(dst_y),         // %1
    "+r"(width)          // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
// Converts ARGB4444 pixels to luma: Y = (13*B + 65*G + 33*R) >> 7 + 16.
// Processes 8 pixels per iteration; ARGB4444TOARGB unpacks into
// v0=B, v1=G, v2=R (alpha not used for Y).
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
  asm volatile (
    "movi v24.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v25.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v26.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v27.8b, #16 \n"  // Add 16 constant
  "1:                                        \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    ARGB4444TOARGB
    "umull v3.8h, v0.8b, v24.8b \n"  // B
    "umlal v3.8h, v1.8b, v25.8b \n"  // G
    "umlal v3.8h, v2.8b, v26.8b \n"  // R
    "sqrshrun v0.8b, v3.8h, #7 \n"  // 16 bit to 8 bit Y (rounding)
    "uqadd v0.8b, v0.8b, v27.8b \n"  // saturating +16
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_argb4444),  // %0
    "+r"(dst_y),         // %1
    "+r"(width)          // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
  );
}
// Converts BGRA pixels to luma: Y = (13*B + 65*G + 33*R) >> 7 + 16.
// ld4 deinterleaves per-pixel bytes; here v1=R, v2=G, v3=B (v0=A unused).
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1:                                        \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v1.8b, v4.8b \n"  // R
    "umlal v16.8h, v2.8b, v5.8b \n"  // G
    "umlal v16.8h, v3.8b, v6.8b \n"  // B
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y (rounding)
    "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating +16
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
// Converts ABGR pixels to luma: Y = (13*B + 65*G + 33*R) >> 7 + 16.
// ld4 deinterleaves per-pixel bytes; here v0=R, v1=G, v2=B (v3=A unused).
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1:                                        \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v0.8b, v4.8b \n"  // R
    "umlal v16.8h, v1.8b, v5.8b \n"  // G
    "umlal v16.8h, v2.8b, v6.8b \n"  // B
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y (rounding)
    "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating +16
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
// Converts RGBA pixels to luma: Y = (13*B + 65*G + 33*R) >> 7 + 16.
// ld4 deinterleaves per-pixel bytes; here v1=B, v2=G, v3=R (v0=A unused).
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1:                                        \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v1.8b, v4.8b \n"  // B
    "umlal v16.8h, v2.8b, v5.8b \n"  // G
    "umlal v16.8h, v3.8b, v6.8b \n"  // R
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y (rounding)
    "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating +16
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
// Converts RGB24 pixels to luma: Y = (13*B + 65*G + 33*R) >> 7 + 16.
// RGB24 is B,G,R in memory, so after ld3: v0=B, v1=G, v2=R.
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1:                                        \n"
    MEMACCESS(0)
    "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v0.8b, v4.8b \n"  // B
    "umlal v16.8h, v1.8b, v5.8b \n"  // G
    "umlal v16.8h, v2.8b, v6.8b \n"  // R
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y (rounding)
    "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating +16
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_y),      // %1
    "+r"(width)       // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
// Converts RAW pixels to luma: Y = (13*B + 65*G + 33*R) >> 7 + 16.
// RAW is R,G,B in memory, so after ld3: v0=R, v1=G, v2=B — the
// coefficients are ordered to match (v4=R, v5=G, v6=B).
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1:                                        \n"
    MEMACCESS(0)
    "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v0.8b, v4.8b \n"  // R
    "umlal v16.8h, v1.8b, v5.8b \n"  // G
    "umlal v16.8h, v2.8b, v6.8b \n"  // B
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y (rounding)
    "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating +16
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_raw),  // %0
    "+r"(dst_y),    // %1
    "+r"(width)     // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
  2044. // Bilinear filter 16x2 -> 16x1
  2045. void InterpolateRow_NEON(uint8* dst_ptr,
  2046. const uint8* src_ptr, ptrdiff_t src_stride,
  2047. int dst_width, int source_y_fraction) {
  2048. int y1_fraction = source_y_fraction;
  2049. int y0_fraction = 256 - y1_fraction;
  2050. const uint8* src_ptr1 = src_ptr + src_stride;
  2051. asm volatile (
  2052. "cmp %w4, #0 \n"
  2053. "b.eq 100f \n"
  2054. "cmp %w4, #128 \n"
  2055. "b.eq 50f \n"
  2056. "dup v5.16b, %w4 \n"
  2057. "dup v4.16b, %w5 \n"
  2058. // General purpose row blend.
  2059. "1: \n"
  2060. MEMACCESS(1)
  2061. "ld1 {v0.16b}, [%1], #16 \n"
  2062. MEMACCESS(2)
  2063. "ld1 {v1.16b}, [%2], #16 \n"
  2064. "subs %w3, %w3, #16 \n"
  2065. "umull v2.8h, v0.8b, v4.8b \n"
  2066. "umull2 v3.8h, v0.16b, v4.16b \n"
  2067. "umlal v2.8h, v1.8b, v5.8b \n"
  2068. "umlal2 v3.8h, v1.16b, v5.16b \n"
  2069. "rshrn v0.8b, v2.8h, #8 \n"
  2070. "rshrn2 v0.16b, v3.8h, #8 \n"
  2071. MEMACCESS(0)
  2072. "st1 {v0.16b}, [%0], #16 \n"
  2073. "b.gt 1b \n"
  2074. "b 99f \n"
  2075. // Blend 50 / 50.
  2076. "50: \n"
  2077. MEMACCESS(1)
  2078. "ld1 {v0.16b}, [%1], #16 \n"
  2079. MEMACCESS(2)
  2080. "ld1 {v1.16b}, [%2], #16 \n"
  2081. "subs %w3, %w3, #16 \n"
  2082. "urhadd v0.16b, v0.16b, v1.16b \n"
  2083. MEMACCESS(0)
  2084. "st1 {v0.16b}, [%0], #16 \n"
  2085. "b.gt 50b \n"
  2086. "b 99f \n"
  2087. // Blend 100 / 0 - Copy row unchanged.
  2088. "100: \n"
  2089. MEMACCESS(1)
  2090. "ld1 {v0.16b}, [%1], #16 \n"
  2091. "subs %w3, %w3, #16 \n"
  2092. MEMACCESS(0)
  2093. "st1 {v0.16b}, [%0], #16 \n"
  2094. "b.gt 100b \n"
  2095. "99: \n"
  2096. : "+r"(dst_ptr), // %0
  2097. "+r"(src_ptr), // %1
  2098. "+r"(src_ptr1), // %2
  2099. "+r"(dst_width), // %3
  2100. "+r"(y1_fraction), // %4
  2101. "+r"(y0_fraction) // %5
  2102. :
  2103. : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
  2104. );
  2105. }
// Alpha-blends src_argb0 over src_argb1 per pixel:
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
// Main loop handles 8 pixels; a scalar tail loop handles the remainder.
// Output alpha is forced to 255.
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "subs %w3, %w3, #8 \n"
    "b.lt 89f \n"  // fewer than 8 pixels: go straight to tail loop.
    // Blend 8 pixels.
  "8:                                          \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v4.8b, v3.8b \n"  // db * a
    "umull v17.8h, v5.8b, v3.8b \n"  // dg * a
    "umull v18.8h, v6.8b, v3.8b \n"  // dr * a
    "uqrshrn v16.8b, v16.8h, #8 \n"  // db >>= 8
    "uqrshrn v17.8b, v17.8h, #8 \n"  // dg >>= 8
    "uqrshrn v18.8b, v18.8h, #8 \n"  // dr >>= 8
    "uqsub v4.8b, v4.8b, v16.8b \n"  // db - (db * a / 256)
    "uqsub v5.8b, v5.8b, v17.8b \n"  // dg - (dg * a / 256)
    "uqsub v6.8b, v6.8b, v18.8b \n"  // dr - (dr * a / 256)
    "uqadd v0.8b, v0.8b, v4.8b \n"  // + sb (saturating)
    "uqadd v1.8b, v1.8b, v5.8b \n"  // + sg
    "uqadd v2.8b, v2.8b, v6.8b \n"  // + sr
    "movi v3.8b, #255 \n"  // a = 255
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.ge 8b \n"
  "89:                                         \n"
    "adds %w3, %w3, #8-1 \n"  // restore remainder count (minus 1 for b.ge).
    "b.lt 99f \n"
    // Blend 1 pixels.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
    MEMACCESS(1)
    "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
    "subs %w3, %w3, #1 \n"  // 1 processed per loop.
    "umull v16.8h, v4.8b, v3.8b \n"  // db * a
    "umull v17.8h, v5.8b, v3.8b \n"  // dg * a
    "umull v18.8h, v6.8b, v3.8b \n"  // dr * a
    "uqrshrn v16.8b, v16.8h, #8 \n"  // db >>= 8
    "uqrshrn v17.8b, v17.8h, #8 \n"  // dg >>= 8
    "uqrshrn v18.8b, v18.8h, #8 \n"  // dr >>= 8
    "uqsub v4.8b, v4.8b, v16.8b \n"  // db - (db * a / 256)
    "uqsub v5.8b, v5.8b, v17.8b \n"  // dg - (dg * a / 256)
    "uqsub v6.8b, v6.8b, v18.8b \n"  // dr - (dr * a / 256)
    "uqadd v0.8b, v0.8b, v4.8b \n"  // + sb
    "uqadd v1.8b, v1.8b, v5.8b \n"  // + sg
    "uqadd v2.8b, v2.8b, v6.8b \n"  // + sr
    "movi v3.8b, #255 \n"  // a = 255
    MEMACCESS(2)
    "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
    "b.ge 1b \n"
  "99:                                         \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v16", "v17", "v18"
  );
}
// Attenuate 8 pixels at a time: multiplies each B/G/R channel by its
// pixel's alpha (rounded >> 8); alpha passes through unchanged.
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Attenuate 8 pixels.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v4.8h, v0.8b, v3.8b \n"  // b * a
    "umull v5.8h, v1.8b, v3.8b \n"  // g * a
    "umull v6.8h, v2.8b, v3.8b \n"  // r * a
    "uqrshrn v0.8b, v4.8h, #8 \n"  // b >>= 8 (rounding, saturating)
    "uqrshrn v1.8b, v5.8h, #8 \n"  // g >>= 8
    "uqrshrn v2.8b, v6.8h, #8 \n"  // r >>= 8
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  );
}
// Quantize 8 ARGB pixels (32 bytes) in place.
// dst = (dst * scale >> 16) * interval_size + interval_offset;
// The scale is pre-shifted by 1 because sqdmulh doubles the product
// (signed saturating doubling multiply high half). Alpha is untouched.
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "dup v4.8h, %w2 \n"
    "ushr v4.8h, v4.8h, #1 \n"  // scale >>= 1 (compensates sqdmulh doubling)
    "dup v5.8h, %w3 \n"  // interval multiply.
    "dup v6.8h, %w4 \n"  // interval add

    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB (no advance: in-place).
    "subs %w1, %w1, #8 \n"  // 8 processed per loop.
    "uxtl v0.8h, v0.8b \n"  // b (0 .. 255)
    "uxtl v1.8h, v1.8b \n"
    "uxtl v2.8h, v2.8b \n"
    "sqdmulh v0.8h, v0.8h, v4.8h \n"  // b * scale
    "sqdmulh v1.8h, v1.8h, v4.8h \n"  // g
    "sqdmulh v2.8h, v2.8h, v4.8h \n"  // r
    "mul v0.8h, v0.8h, v5.8h \n"  // b * interval_size
    "mul v1.8h, v1.8h, v5.8h \n"  // g
    "mul v2.8h, v2.8h, v5.8h \n"  // r
    "add v0.8h, v0.8h, v6.8h \n"  // b + interval_offset
    "add v1.8h, v1.8h, v6.8h \n"  // g
    "add v2.8h, v2.8h, v6.8h \n"  // r
    "uqxtn v0.8b, v0.8h \n"  // saturating narrow back to bytes
    "uqxtn v1.8b, v1.8h \n"
    "uqxtn v2.8b, v2.8h \n"
    MEMACCESS(0)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  : "r"(scale),            // %2
    "r"(interval_size),    // %3
    "r"(interval_offset)   // %4
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  );
}
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
// 'value' packs 4 per-channel 8-bit scale factors; each channel is
// multiplied by its factor via sqrdmulh (hence the scale/2 pre-shift).
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "dup v0.4s, %w3 \n"  // duplicate scale value.
    "zip1 v0.8b, v0.8b, v0.8b \n"  // v0.8b aarrggbb (bytes doubled to 16-bit lanes).
    "ushr v0.8h, v0.8h, #1 \n"  // scale / 2 (compensates sqrdmulh doubling).

    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "uxtl v4.8h, v4.8b \n"  // b (0 .. 255)
    "uxtl v5.8h, v5.8b \n"
    "uxtl v6.8h, v6.8b \n"
    "uxtl v7.8h, v7.8b \n"
    "sqrdmulh v4.8h, v4.8h, v0.h[0] \n"  // b * scale * 2
    "sqrdmulh v5.8h, v5.8h, v0.h[1] \n"  // g
    "sqrdmulh v6.8h, v6.8h, v0.h[2] \n"  // r
    "sqrdmulh v7.8h, v7.8h, v0.h[3] \n"  // a
    "uqxtn v4.8b, v4.8h \n"  // saturating narrow back to bytes
    "uqxtn v5.8b, v5.8h \n"
    "uqxtn v6.8b, v6.8h \n"
    "uqxtn v7.8b, v7.8h \n"
    MEMACCESS(1)
    "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
  );
}
  2271. // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
  2272. // Similar to ARGBToYJ but stores ARGB.
  2273. // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
  2274. void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  2275. asm volatile (
  2276. "movi v24.8b, #15 \n" // B * 0.11400 coefficient
  2277. "movi v25.8b, #75 \n" // G * 0.58700 coefficient
  2278. "movi v26.8b, #38 \n" // R * 0.29900 coefficient
  2279. "1: \n"
  2280. MEMACCESS(0)
  2281. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
  2282. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  2283. "umull v4.8h, v0.8b, v24.8b \n" // B
  2284. "umlal v4.8h, v1.8b, v25.8b \n" // G
  2285. "umlal v4.8h, v2.8b, v26.8b \n" // R
  2286. "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B (rounding shift)
  2287. "orr v1.8b, v0.8b, v0.8b \n" // G = gray (orr with self acts as mov)
  2288. "orr v2.8b, v0.8b, v0.8b \n" // R = gray
  2289. MEMACCESS(1)
  2290. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels; alpha passes through.
  2291. "b.gt 1b \n"
  2292. : "+r"(src_argb), // %0
  2293. "+r"(dst_argb), // %1
  2294. "+r"(width) // %2
  2295. :
  2296. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
  2297. );
  2298. }
  2299. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  2300. // b = (r * 35 + g * 68 + b * 17) >> 7
  2301. // g = (r * 45 + g * 88 + b * 22) >> 7
  2302. // r = (r * 50 + g * 98 + b * 24) >> 7
  2303. void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  2304. asm volatile (
  2305. "movi v20.8b, #17 \n" // BB coefficient
  2306. "movi v21.8b, #68 \n" // BG coefficient
  2307. "movi v22.8b, #35 \n" // BR coefficient
  2308. "movi v24.8b, #22 \n" // GB coefficient
  2309. "movi v25.8b, #88 \n" // GG coefficient
  2310. "movi v26.8b, #45 \n" // GR coefficient
  2311. "movi v28.8b, #24 \n" // RB coefficient
  2312. "movi v29.8b, #98 \n" // RG coefficient
  2313. "movi v30.8b, #50 \n" // RR coefficient
  2314. "1: \n"
  2315. MEMACCESS(0)
  2316. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels (no writeback; stored back below).
  2317. "subs %w1, %w1, #8 \n" // 8 processed per loop.
  2318. "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
  2319. "umlal v4.8h, v1.8b, v21.8b \n" // G
  2320. "umlal v4.8h, v2.8b, v22.8b \n" // R
  2321. "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
  2322. "umlal v5.8h, v1.8b, v25.8b \n" // G
  2323. "umlal v5.8h, v2.8b, v26.8b \n" // R
  2324. "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
  2325. "umlal v6.8h, v1.8b, v29.8b \n" // G
  2326. "umlal v6.8h, v2.8b, v30.8b \n" // R
  2327. "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
  2328. "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
  2329. "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
  2330. MEMACCESS(0)
  2331. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels in place; alpha unchanged.
  2332. "b.gt 1b \n"
  2333. : "+r"(dst_argb), // %0
  2334. "+r"(width) // %1
  2335. :
  2336. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  2337. "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
  2338. );
  2339. }
  2340. // Transform 8 ARGB pixels (32 bytes) with color matrix.
  2341. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function
  2342. // needs to saturate. Consider doing a non-saturating version.
  2343. void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
  2344. const int8* matrix_argb, int width) {
  2345. asm volatile (
  2346. MEMACCESS(3)
  2347. "ld1 {v2.16b}, [%3] \n" // load 16-byte 4x4 signed color matrix.
  2348. "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
  2349. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
  2350. "1: \n"
  2351. MEMACCESS(0)
  2352. "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
  2353. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  2354. "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
  2355. "uxtl v17.8h, v17.8b \n" // g
  2356. "uxtl v18.8h, v18.8b \n" // r
  2357. "uxtl v19.8h, v19.8b \n" // a
  2358. "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
  2359. "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
  2360. "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
  2361. "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
  2362. "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
  2363. "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
  2364. "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
  2365. "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
  2366. "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B (saturating)
  2367. "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
  2368. "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
  2369. "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
  2370. "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
  2371. "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
  2372. "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
  2373. "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
  2374. "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
  2375. "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
  2376. "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
  2377. "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
  2378. "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
  2379. "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
  2380. "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
  2381. "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
  2382. "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
  2383. "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
  2384. "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
  2385. "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
  2386. "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B (saturating, matrix is 6-bit fixed point)
  2387. "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
  2388. "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
  2389. "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
  2390. MEMACCESS(1)
  2391. "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.
  2392. "b.gt 1b \n"
  2393. : "+r"(src_argb), // %0
  2394. "+r"(dst_argb), // %1
  2395. "+r"(width) // %2
  2396. : "r"(matrix_argb) // %3 - 16 signed bytes: B,G,R,A output rows
  2397. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
  2398. "v18", "v19", "v22", "v23", "v24", "v25"
  2399. );
  2400. }
  2401. // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
  2402. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
  2403. void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
  2404. uint8* dst_argb, int width) {
  2405. asm volatile (
  2406. // 8 pixel loop.
  2407. "1: \n"
  2408. MEMACCESS(0)
  2409. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
  2410. MEMACCESS(1)
  2411. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
  2412. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2413. "umull v0.8h, v0.8b, v4.8b \n" // multiply B
  2414. "umull v1.8h, v1.8b, v5.8b \n" // multiply G
  2415. "umull v2.8h, v2.8b, v6.8b \n" // multiply R
  2416. "umull v3.8h, v3.8b, v7.8b \n" // multiply A
  2417. "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B: rounding >>8 approximates /255
  2418. "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
  2419. "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
  2420. "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
  2421. MEMACCESS(2)
  2422. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
  2423. "b.gt 1b \n"
  2424. : "+r"(src_argb0), // %0
  2425. "+r"(src_argb1), // %1
  2426. "+r"(dst_argb), // %2
  2427. "+r"(width) // %3
  2428. :
  2429. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  2430. );
  2431. }
  2432. // Add 2 rows of ARGB pixels together, 8 pixels at a time.
  2433. void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
  2434. uint8* dst_argb, int width) {
  2435. asm volatile (
  2436. // 8 pixel loop.
  2437. "1: \n"
  2438. MEMACCESS(0)
  2439. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
  2440. MEMACCESS(1)
  2441. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
  2442. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2443. "uqadd v0.8b, v0.8b, v4.8b \n" // saturating add B (clamps at 255)
  2444. "uqadd v1.8b, v1.8b, v5.8b \n" // G
  2445. "uqadd v2.8b, v2.8b, v6.8b \n" // R
  2446. "uqadd v3.8b, v3.8b, v7.8b \n" // A
  2447. MEMACCESS(2)
  2448. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
  2449. "b.gt 1b \n"
  2450. : "+r"(src_argb0), // %0
  2451. "+r"(src_argb1), // %1
  2452. "+r"(dst_argb), // %2
  2453. "+r"(width) // %3
  2454. :
  2455. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  2456. );
  2457. }
  2458. // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
  2459. void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
  2460. uint8* dst_argb, int width) {
  2461. asm volatile (
  2462. // 8 pixel loop.
  2463. "1: \n"
  2464. MEMACCESS(0)
  2465. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
  2466. MEMACCESS(1)
  2467. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
  2468. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2469. "uqsub v0.8b, v0.8b, v4.8b \n" // saturating subtract B (clamps at 0)
  2470. "uqsub v1.8b, v1.8b, v5.8b \n" // G
  2471. "uqsub v2.8b, v2.8b, v6.8b \n" // R
  2472. "uqsub v3.8b, v3.8b, v7.8b \n" // A
  2473. MEMACCESS(2)
  2474. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
  2475. "b.gt 1b \n"
  2476. : "+r"(src_argb0), // %0
  2477. "+r"(src_argb1), // %1
  2478. "+r"(dst_argb), // %2
  2479. "+r"(width) // %3
  2480. :
  2481. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  2482. );
  2483. }
  2484. // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
  2485. // A = 255
  2486. // R = Sobel
  2487. // G = Sobel
  2488. // B = Sobel
  2489. void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
  2490. uint8* dst_argb, int width) {
  2491. asm volatile (
  2492. "movi v3.8b, #255 \n" // alpha
  2493. // 8 pixel loop.
  2494. "1: \n"
  2495. MEMACCESS(0)
  2496. "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
  2497. MEMACCESS(1)
  2498. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
  2499. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2500. "uqadd v0.8b, v0.8b, v1.8b \n" // saturating add: B = sobelx + sobely
  2501. "orr v1.8b, v0.8b, v0.8b \n" // G = same sum (orr with self acts as mov)
  2502. "orr v2.8b, v0.8b, v0.8b \n" // R = same sum
  2503. MEMACCESS(2)
  2504. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
  2505. "b.gt 1b \n"
  2506. : "+r"(src_sobelx), // %0
  2507. "+r"(src_sobely), // %1
  2508. "+r"(dst_argb), // %2
  2509. "+r"(width) // %3
  2510. :
  2511. : "cc", "memory", "v0", "v1", "v2", "v3"
  2512. );
  2513. }
  2514. // Adds Sobel X and Sobel Y and stores Sobel into plane.
  2515. void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
  2516. uint8* dst_y, int width) {
  2517. asm volatile (
  2518. // 16 pixel loop.
  2519. "1: \n"
  2520. MEMACCESS(0)
  2521. "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
  2522. MEMACCESS(1)
  2523. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
  2524. "subs %w3, %w3, #16 \n" // 16 processed per loop.
  2525. "uqadd v0.16b, v0.16b, v1.16b \n" // saturating add sobelx + sobely
  2526. MEMACCESS(2)
  2527. "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
  2528. "b.gt 1b \n"
  2529. : "+r"(src_sobelx), // %0
  2530. "+r"(src_sobely), // %1
  2531. "+r"(dst_y), // %2
  2532. "+r"(width) // %3
  2533. :
  2534. : "cc", "memory", "v0", "v1"
  2535. );
  2536. }
  2537. // Mixes Sobel X, Sobel Y and Sobel into ARGB.
  2538. // A = 255
  2539. // R = Sobel X
  2540. // G = Sobel
  2541. // B = Sobel Y
  2542. void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
  2543. uint8* dst_argb, int width) {
  2544. asm volatile (
  2545. "movi v3.8b, #255 \n" // alpha
  2546. // 8 pixel loop.
  2547. "1: \n"
  2548. MEMACCESS(0)
  2549. "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. (becomes R)
  2550. MEMACCESS(1)
  2551. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. (becomes B)
  2552. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2553. "uqadd v1.8b, v0.8b, v2.8b \n" // saturating add: G = sobelx + sobely
  2554. MEMACCESS(2)
  2555. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
  2556. "b.gt 1b \n"
  2557. : "+r"(src_sobelx), // %0
  2558. "+r"(src_sobely), // %1
  2559. "+r"(dst_argb), // %2
  2560. "+r"(width) // %3
  2561. :
  2562. : "cc", "memory", "v0", "v1", "v2", "v3"
  2563. );
  2564. }
  2565. // SobelX as a matrix is
  2566. // -1 0 1
  2567. // -2 0 2
  2568. // -1 0 1
  2569. void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
  2570. const uint8* src_y2, uint8* dst_sobelx, int width) {
  2571. asm volatile (
  2572. "1: \n"
  2573. MEMACCESS(0)
  2574. "ld1 {v0.8b}, [%0],%5 \n" // top row, column 0; advance to column 2
  2575. MEMACCESS(0)
  2576. "ld1 {v1.8b}, [%0],%6 \n" // top row, column 2; advance to next 8 pixels
  2577. "usubl v0.8h, v0.8b, v1.8b \n" // top: col0 - col2
  2578. MEMACCESS(1)
  2579. "ld1 {v2.8b}, [%1],%5 \n" // center * 2
  2580. MEMACCESS(1)
  2581. "ld1 {v3.8b}, [%1],%6 \n"
  2582. "usubl v1.8h, v2.8b, v3.8b \n" // center: col0 - col2
  2583. "add v0.8h, v0.8h, v1.8h \n"
  2584. "add v0.8h, v0.8h, v1.8h \n" // center added twice (weight 2)
  2585. MEMACCESS(2)
  2586. "ld1 {v2.8b}, [%2],%5 \n" // bottom
  2587. MEMACCESS(2)
  2588. "ld1 {v3.8b}, [%2],%6 \n"
  2589. "subs %w4, %w4, #8 \n" // 8 pixels
  2590. "usubl v1.8h, v2.8b, v3.8b \n" // bottom: col0 - col2
  2591. "add v0.8h, v0.8h, v1.8h \n"
  2592. "abs v0.8h, v0.8h \n" // magnitude of gradient
  2593. "uqxtn v0.8b, v0.8h \n" // saturating narrow to u8
  2594. MEMACCESS(3)
  2595. "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
  2596. "b.gt 1b \n"
  2597. : "+r"(src_y0), // %0
  2598. "+r"(src_y1), // %1
  2599. "+r"(src_y2), // %2
  2600. "+r"(dst_sobelx), // %3
  2601. "+r"(width) // %4
  2602. : "r"(2LL), // %5 - byte step from column 0 to column 2
  2603. "r"(6LL) // %6 - byte step past column 2 (net +8 per loop)
  2604. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  2605. );
  2606. }
  2607. // SobelY as a matrix is
  2608. // -1 -2 -1
  2609. // 0 0 0
  2610. // 1 2 1
  2611. void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
  2612. uint8* dst_sobely, int width) {
  2613. asm volatile (
  2614. "1: \n"
  2615. MEMACCESS(0)
  2616. "ld1 {v0.8b}, [%0],%4 \n" // left column of top row; advance 1 byte
  2617. MEMACCESS(1)
  2618. "ld1 {v1.8b}, [%1],%4 \n" // left column of bottom row
  2619. "usubl v0.8h, v0.8b, v1.8b \n" // left: top - bottom
  2620. MEMACCESS(0)
  2621. "ld1 {v2.8b}, [%0],%4 \n" // center * 2
  2622. MEMACCESS(1)
  2623. "ld1 {v3.8b}, [%1],%4 \n"
  2624. "usubl v1.8h, v2.8b, v3.8b \n" // center: top - bottom
  2625. "add v0.8h, v0.8h, v1.8h \n"
  2626. "add v0.8h, v0.8h, v1.8h \n" // center added twice (weight 2)
  2627. MEMACCESS(0)
  2628. "ld1 {v2.8b}, [%0],%5 \n" // right column; advance to next 8 pixels
  2629. MEMACCESS(1)
  2630. "ld1 {v3.8b}, [%1],%5 \n"
  2631. "subs %w3, %w3, #8 \n" // 8 pixels
  2632. "usubl v1.8h, v2.8b, v3.8b \n" // right: top - bottom
  2633. "add v0.8h, v0.8h, v1.8h \n"
  2634. "abs v0.8h, v0.8h \n" // magnitude of gradient
  2635. "uqxtn v0.8b, v0.8h \n" // saturating narrow to u8
  2636. MEMACCESS(2)
  2637. "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
  2638. "b.gt 1b \n"
  2639. : "+r"(src_y0), // %0
  2640. "+r"(src_y1), // %1
  2641. "+r"(dst_sobely), // %2
  2642. "+r"(width) // %3
  2643. : "r"(1LL), // %4 - byte step between columns 0, 1, 2
  2644. "r"(6LL) // %5 - byte step past column 2 (net +8 per loop)
  2645. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  2646. );
  2647. }
  2648. #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
  2649. #ifdef __cplusplus
  2650. } // extern "C"
  2651. } // namespace libyuv
  2652. #endif