/*
 * MIPS DSPr2 optimizations for libjpeg-turbo
 *
 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
 * All Rights Reserved.
 * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
 *           Darko Laus      (darko.laus@imgtec.com)
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
  25. #include "jsimd_mips_dspr2_asm.h"
  26. /*****************************************************************************/
  27. LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
  28. /*
  29. * a0 - cinfo->image_width
  30. * a1 - input_buf
  31. * a2 - output_buf
  32. * a3 - output_row
  33. * 16(sp) - num_rows
  34. * 20(sp) - cinfo->num_components
  35. *
  36. * Null conversion for compression
  37. */
  38. SAVE_REGS_ON_STACK 8, s0, s1
  39. lw t9, 24(sp) // t9 = num_rows
  40. lw s0, 28(sp) // s0 = cinfo->num_components
  41. andi t0, a0, 3 // t0 = cinfo->image_width & 3
  42. beqz t0, 4f // no residual
  43. nop
  44. 0:
  45. addiu t9, t9, -1
  46. bltz t9, 7f
  47. li t1, 0
  48. 1:
  49. sll t3, t1, 2
  50. lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
  51. lw t2, 0(a1) // t2 = inptr = *input_buf
  52. sll t4, a3, 2
  53. lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
  54. addu t2, t2, t1
  55. addu s1, t5, a0
  56. addu t6, t5, t0
  57. 2:
  58. lbu t3, 0(t2)
  59. addiu t5, t5, 1
  60. sb t3, -1(t5)
  61. bne t6, t5, 2b
  62. addu t2, t2, s0
  63. 3:
  64. lbu t3, 0(t2)
  65. addu t4, t2, s0
  66. addu t7, t4, s0
  67. addu t8, t7, s0
  68. addu t2, t8, s0
  69. lbu t4, 0(t4)
  70. lbu t7, 0(t7)
  71. lbu t8, 0(t8)
  72. addiu t5, t5, 4
  73. sb t3, -4(t5)
  74. sb t4, -3(t5)
  75. sb t7, -2(t5)
  76. bne s1, t5, 3b
  77. sb t8, -1(t5)
  78. addiu t1, t1, 1
  79. bne t1, s0, 1b
  80. nop
  81. addiu a1, a1, 4
  82. bgez t9, 0b
  83. addiu a3, a3, 1
  84. b 7f
  85. nop
  86. 4:
  87. addiu t9, t9, -1
  88. bltz t9, 7f
  89. li t1, 0
  90. 5:
  91. sll t3, t1, 2
  92. lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
  93. lw t2, 0(a1) // t2 = inptr = *input_buf
  94. sll t4, a3, 2
  95. lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
  96. addu t2, t2, t1
  97. addu s1, t5, a0
  98. addu t6, t5, t0
  99. 6:
  100. lbu t3, 0(t2)
  101. addu t4, t2, s0
  102. addu t7, t4, s0
  103. addu t8, t7, s0
  104. addu t2, t8, s0
  105. lbu t4, 0(t4)
  106. lbu t7, 0(t7)
  107. lbu t8, 0(t8)
  108. addiu t5, t5, 4
  109. sb t3, -4(t5)
  110. sb t4, -3(t5)
  111. sb t7, -2(t5)
  112. bne s1, t5, 6b
  113. sb t8, -1(t5)
  114. addiu t1, t1, 1
  115. bne t1, s0, 5b
  116. nop
  117. addiu a1, a1, 4
  118. bgez t9, 4b
  119. addiu a3, a3, 1
  120. 7:
  121. RESTORE_REGS_FROM_STACK 8, s0, s1
  122. j ra
  123. nop
  124. END(jsimd_c_null_convert_mips_dspr2)
  125. /*****************************************************************************/
  126. /*
  127. * jsimd_extrgb_ycc_convert_mips_dspr2
  128. * jsimd_extbgr_ycc_convert_mips_dspr2
  129. * jsimd_extrgbx_ycc_convert_mips_dspr2
  130. * jsimd_extbgrx_ycc_convert_mips_dspr2
  131. * jsimd_extxbgr_ycc_convert_mips_dspr2
  132. * jsimd_extxrgb_ycc_convert_mips_dspr2
  133. *
  134. * Colorspace conversion RGB -> YCbCr
  135. */
  136. .macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
  137. .macro DO_RGB_TO_YCC r, \
  138. g, \
  139. b, \
  140. inptr
  141. lbu \r, \r_offs(\inptr)
  142. lbu \g, \g_offs(\inptr)
  143. lbu \b, \b_offs(\inptr)
  144. addiu \inptr, \pixel_size
  145. .endm
  146. LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
  147. /*
  148. * a0 - cinfo->image_width
  149. * a1 - input_buf
  150. * a2 - output_buf
  151. * a3 - output_row
  152. * 16(sp) - num_rows
  153. */
  154. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  155. lw t7, 48(sp) // t7 = num_rows
  156. li s0, 0x4c8b // FIX(0.29900)
  157. li s1, 0x9646 // FIX(0.58700)
  158. li s2, 0x1d2f // FIX(0.11400)
  159. li s3, 0xffffd4cd // -FIX(0.16874)
  160. li s4, 0xffffab33 // -FIX(0.33126)
  161. li s5, 0x8000 // FIX(0.50000)
  162. li s6, 0xffff94d1 // -FIX(0.41869)
  163. li s7, 0xffffeb2f // -FIX(0.08131)
  164. li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
  165. 0:
  166. addiu t7, -1 // --num_rows
  167. lw t6, 0(a1) // t6 = input_buf[0]
  168. lw t0, 0(a2)
  169. lw t1, 4(a2)
  170. lw t2, 8(a2)
  171. sll t3, a3, 2
  172. lwx t0, t3(t0) // t0 = output_buf[0][output_row]
  173. lwx t1, t3(t1) // t1 = output_buf[1][output_row]
  174. lwx t2, t3(t2) // t2 = output_buf[2][output_row]
  175. addu t9, t2, a0 // t9 = end address
  176. addiu a3, 1
  177. 1:
  178. DO_RGB_TO_YCC t3, t4, t5, t6
  179. mtlo s5, $ac0
  180. mtlo t8, $ac1
  181. mtlo t8, $ac2
  182. maddu $ac0, s2, t5
  183. maddu $ac1, s5, t5
  184. maddu $ac2, s5, t3
  185. maddu $ac0, s0, t3
  186. maddu $ac1, s3, t3
  187. maddu $ac2, s6, t4
  188. maddu $ac0, s1, t4
  189. maddu $ac1, s4, t4
  190. maddu $ac2, s7, t5
  191. extr.w t3, $ac0, 16
  192. extr.w t4, $ac1, 16
  193. extr.w t5, $ac2, 16
  194. sb t3, 0(t0)
  195. sb t4, 0(t1)
  196. sb t5, 0(t2)
  197. addiu t0, 1
  198. addiu t2, 1
  199. bne t2, t9, 1b
  200. addiu t1, 1
  201. bgtz t7, 0b
  202. addiu a1, 4
  203. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  204. j ra
  205. nop
  206. END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
  207. .purgem DO_RGB_TO_YCC
  208. .endm
  209. /*------------------------------------------id -- pix R G B */
  210. GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
  211. GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
  212. GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
  213. GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
  214. GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
  215. GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
  216. /*****************************************************************************/
  217. /*
  218. * jsimd_ycc_extrgb_convert_mips_dspr2
  219. * jsimd_ycc_extbgr_convert_mips_dspr2
  220. * jsimd_ycc_extrgbx_convert_mips_dspr2
  221. * jsimd_ycc_extbgrx_convert_mips_dspr2
  222. * jsimd_ycc_extxbgr_convert_mips_dspr2
  223. * jsimd_ycc_extxrgb_convert_mips_dspr2
  224. *
  225. * Colorspace conversion YCbCr -> RGB
  226. */
  227. .macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
  228. .macro STORE_YCC_TO_RGB scratch0 \
  229. scratch1 \
  230. scratch2 \
  231. outptr
  232. sb \scratch0, \r_offs(\outptr)
  233. sb \scratch1, \g_offs(\outptr)
  234. sb \scratch2, \b_offs(\outptr)
  235. .if (\pixel_size == 4)
  236. li t0, 0xFF
  237. sb t0, \a_offs(\outptr)
  238. .endif
  239. addiu \outptr, \pixel_size
  240. .endm
  241. LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
  242. /*
  243. * a0 - cinfo->image_width
  244. * a1 - input_buf
  245. * a2 - input_row
  246. * a3 - output_buf
  247. * 16(sp) - num_rows
  248. */
  249. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  250. lw s1, 48(sp)
  251. li t3, 0x8000
  252. li t4, 0x166e9 // FIX(1.40200)
  253. li t5, 0x1c5a2 // FIX(1.77200)
  254. li t6, 0xffff492e // -FIX(0.71414)
  255. li t7, 0xffffa7e6 // -FIX(0.34414)
  256. repl.ph t8, 128
  257. 0:
  258. lw s0, 0(a3)
  259. lw t0, 0(a1)
  260. lw t1, 4(a1)
  261. lw t2, 8(a1)
  262. sll s5, a2, 2
  263. addiu s1, -1
  264. lwx s2, s5(t0)
  265. lwx s3, s5(t1)
  266. lwx s4, s5(t2)
  267. addu t9, s2, a0
  268. addiu a2, 1
  269. 1:
  270. lbu s7, 0(s4) // cr
  271. lbu s6, 0(s3) // cb
  272. lbu s5, 0(s2) // y
  273. addiu s2, 1
  274. addiu s4, 1
  275. addiu s7, -128
  276. addiu s6, -128
  277. mul t2, t7, s6
  278. mul t0, t6, s7 // Crgtab[cr]
  279. sll s7, 15
  280. mulq_rs.w t1, t4, s7 // Crrtab[cr]
  281. sll s6, 15
  282. addu t2, t3 // Cbgtab[cb]
  283. addu t2, t0
  284. mulq_rs.w t0, t5, s6 // Cbbtab[cb]
  285. sra t2, 16
  286. addu t1, s5
  287. addu t2, s5 // add y
  288. ins t2, t1, 16, 16
  289. subu.ph t2, t2, t8
  290. addu t0, s5
  291. shll_s.ph t2, t2, 8
  292. subu t0, 128
  293. shra.ph t2, t2, 8
  294. shll_s.w t0, t0, 24
  295. addu.ph t2, t2, t8 // clip & store
  296. sra t0, t0, 24
  297. sra t1, t2, 16
  298. addiu t0, 128
  299. STORE_YCC_TO_RGB t1, t2, t0, s0
  300. bne s2, t9, 1b
  301. addiu s3, 1
  302. bgtz s1, 0b
  303. addiu a3, 4
  304. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  305. j ra
  306. nop
  307. END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
  308. .purgem STORE_YCC_TO_RGB
  309. .endm
  310. /*------------------------------------------id -- pix R G B A */
  311. GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3
  312. GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3
  313. GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
  314. GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
  315. GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
  316. GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
  317. /*****************************************************************************/
  318. /*
  319. * jsimd_extrgb_gray_convert_mips_dspr2
  320. * jsimd_extbgr_gray_convert_mips_dspr2
  321. * jsimd_extrgbx_gray_convert_mips_dspr2
  322. * jsimd_extbgrx_gray_convert_mips_dspr2
  323. * jsimd_extxbgr_gray_convert_mips_dspr2
  324. * jsimd_extxrgb_gray_convert_mips_dspr2
  325. *
  326. * Colorspace conversion RGB -> GRAY
  327. */
  328. .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
  329. .macro DO_RGB_TO_GRAY r, \
  330. g, \
  331. b, \
  332. inptr
  333. lbu \r, \r_offs(\inptr)
  334. lbu \g, \g_offs(\inptr)
  335. lbu \b, \b_offs(\inptr)
  336. addiu \inptr, \pixel_size
  337. .endm
  338. LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
  339. /*
  340. * a0 - cinfo->image_width
  341. * a1 - input_buf
  342. * a2 - output_buf
  343. * a3 - output_row
  344. * 16(sp) - num_rows
  345. */
  346. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  347. li s0, 0x4c8b // s0 = FIX(0.29900)
  348. li s1, 0x9646 // s1 = FIX(0.58700)
  349. li s2, 0x1d2f // s2 = FIX(0.11400)
  350. li s7, 0x8000 // s7 = FIX(0.50000)
  351. lw s6, 48(sp)
  352. andi t7, a0, 3
  353. 0:
  354. addiu s6, -1 // s6 = num_rows
  355. lw t0, 0(a1)
  356. lw t1, 0(a2)
  357. sll t3, a3, 2
  358. lwx t1, t3(t1)
  359. addiu a3, 1
  360. addu t9, t1, a0
  361. subu t8, t9, t7
  362. beq t1, t8, 2f
  363. nop
  364. 1:
  365. DO_RGB_TO_GRAY t3, t4, t5, t0
  366. DO_RGB_TO_GRAY s3, s4, s5, t0
  367. mtlo s7, $ac0
  368. maddu $ac0, s2, t5
  369. maddu $ac0, s1, t4
  370. maddu $ac0, s0, t3
  371. mtlo s7, $ac1
  372. maddu $ac1, s2, s5
  373. maddu $ac1, s1, s4
  374. maddu $ac1, s0, s3
  375. extr.w t6, $ac0, 16
  376. DO_RGB_TO_GRAY t3, t4, t5, t0
  377. DO_RGB_TO_GRAY s3, s4, s5, t0
  378. mtlo s7, $ac0
  379. maddu $ac0, s2, t5
  380. maddu $ac0, s1, t4
  381. extr.w t2, $ac1, 16
  382. maddu $ac0, s0, t3
  383. mtlo s7, $ac1
  384. maddu $ac1, s2, s5
  385. maddu $ac1, s1, s4
  386. maddu $ac1, s0, s3
  387. extr.w t5, $ac0, 16
  388. sb t6, 0(t1)
  389. sb t2, 1(t1)
  390. extr.w t3, $ac1, 16
  391. addiu t1, 4
  392. sb t5, -2(t1)
  393. sb t3, -1(t1)
  394. bne t1, t8, 1b
  395. nop
  396. 2:
  397. beqz t7, 4f
  398. nop
  399. 3:
  400. DO_RGB_TO_GRAY t3, t4, t5, t0
  401. mtlo s7, $ac0
  402. maddu $ac0, s2, t5
  403. maddu $ac0, s1, t4
  404. maddu $ac0, s0, t3
  405. extr.w t6, $ac0, 16
  406. sb t6, 0(t1)
  407. addiu t1, 1
  408. bne t1, t9, 3b
  409. nop
  410. 4:
  411. bgtz s6, 0b
  412. addiu a1, 4
  413. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  414. j ra
  415. nop
  416. END(jsimd_\colorid\()_gray_convert_mips_dspr2)
  417. .purgem DO_RGB_TO_GRAY
  418. .endm
  419. /*------------------------------------------id -- pix R G B */
  420. GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
  421. GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
  422. GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
  423. GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
  424. GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
  425. GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
/*****************************************************************************/
/*
 * jsimd_h2v2_merged_upsample_mips_dspr2
 * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
 * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
 * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
 * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
 * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
 * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
 *
 * Merged h2v2 upsample routines
 *
 * Macro parameters: output colorspace id, output pixel size in half-bytes
 * (6 = 3 bytes/pixel, 8 = 4 bytes/pixel), and the byte offsets of the
 * R/G/B/A components of the two horizontally adjacent output pixels.
 */
.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \
                                                pixel_size, \
                                                r1_offs, \
                                                g1_offs, \
                                                b1_offs, \
                                                a1_offs, \
                                                r2_offs, \
                                                g2_offs, \
                                                b2_offs, \
                                                a2_offs

/* Store two output pixels (R,G,B from the six scratch registers) at \outptr
 * and advance \outptr.  For 4-byte pixels the alpha bytes are forced to 0xFF
 * (note: \scratch0 is clobbered by the alpha constant in that case). */
.macro STORE_H2V2_2_PIXELS scratch0 \
                           scratch1 \
                           scratch2 \
                           scratch3 \
                           scratch4 \
                           scratch5 \
                           outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)
    sb    \scratch3, \r2_offs(\outptr)
    sb    \scratch4, \g2_offs(\outptr)
    sb    \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li    \scratch0, 0xFF
    sb    \scratch0, \a1_offs(\outptr)
    sb    \scratch0, \a2_offs(\outptr)
.endif
    addiu \outptr, \pixel_size
.endm

/* Store a single trailing output pixel (odd output width).  Does not
 * advance \outptr; clobbers t0 when an alpha byte is written. */
.macro STORE_H2V2_1_PIXEL scratch0 \
                          scratch1 \
                          scratch2 \
                          outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li    t0, 0xFF
    sb    t0, \a1_offs(\outptr)
.endif
.endm

LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
/*
 * a0     - cinfo->output_width
 * a1     - input_buf
 * a2     - in_row_group_ctr
 * a3     - output_buf
 * 16(sp) - cinfo->sample_range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    lw    t9, 56(sp)            // cinfo->sample_range_limit
    lw    v0, 0(a1)             // v0 = input_buf[0] (Y plane rows)
    lw    v1, 4(a1)             // v1 = input_buf[1] (Cb plane rows)
    lw    t0, 8(a1)             // t0 = input_buf[2] (Cr plane rows)
    sll   t1, a2, 3
    addiu t2, t1, 4
    sll   t3, a2, 2
    lw    t4, 0(a3)             // t4 = output_buf[0]
    lwx   t1, t1(v0)            // t1 = input_buf[0][in_row_group_ctr*2]
    lwx   t2, t2(v0)            // t2 = input_buf[0][in_row_group_ctr*2 + 1]
    lwx   t5, t3(v1)            // t5 = input_buf[1][in_row_group_ctr]
    lwx   t6, t3(t0)            // t6 = input_buf[2][in_row_group_ctr]
    lw    t7, 4(a3)             // t7 = output_buf[1]
    // Build the fixed-point color-conversion constants.
    li    s1, 0xe6ea
    addiu t8, s1, 0x7fff        // t8 = 0x166e9 [FIX(1.40200)]
    addiu s0, t8, 0x5eb9        // s0 = 0x1c5a2 [FIX(1.77200)]
    addiu s1, zero, 0xa7e6      // s1 = 0xffffa7e6 [-FIX(0.34414)]
    xori  s2, s1, 0xeec8        // s2 = 0xffff492e [-FIX(0.71414)]
    srl   t3, a0, 1             // t3 = output_width / 2 (column pairs)
    blez  t3, 2f
    addu  t0, t5, t3            // t0 = end address (delay slot)
1:  // Per column pair: one Cb/Cr sample is shared by a 2x2 block of Y samples.
    lbu   t3, 0(t5)
    lbu   s3, 0(t6)
    addiu t5, t5, 1
    addiu t3, t3, -128          // (cb - 128)
    addiu s3, s3, -128          // (cr - 128)
    mult  $ac1, s1, t3
    madd  $ac1, s2, s3          // ac1 = green chroma term
    sll   s3, s3, 15
    sll   t3, t3, 15
    mulq_rs.w s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    extr_r.w  s5, $ac1, 16
    mulq_rs.w s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    lbu   v0, 0(t1)
    addiu t6, t6, 1
    addiu t1, t1, 2
    // Row 0, pixel 0
    addu  t3, v0, s4            // y+cred
    addu  s3, v0, s5            // y+cgreen
    addu  v1, v0, s6            // y+cblue
    addu  t3, t9, t3            // range_limit[y+cred]
    addu  s3, t9, s3            // range_limit[y+cgreen]
    addu  v1, t9, v1            // range_limit[y+cblue]
    lbu   AT, 0(t3)
    lbu   s7, 0(s3)
    lbu   ra, 0(v1)
    lbu   v0, -1(t1)
    // Row 0, pixel 1
    addu  t3, v0, s4            // y+cred
    addu  s3, v0, s5            // y+cgreen
    addu  v1, v0, s6            // y+cblue
    addu  t3, t9, t3
    addu  s3, t9, s3
    addu  v1, t9, v1
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)
    lbu   v0, 0(t2)
    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
    // Row 1, pixel 0
    addu  t3, v0, s4            // y+cred
    addu  s3, v0, s5            // y+cgreen
    addu  v1, v0, s6            // y+cblue
    addu  t3, t9, t3
    addu  s3, t9, s3
    addu  v1, t9, v1
    lbu   AT, 0(t3)
    lbu   s7, 0(s3)
    lbu   ra, 0(v1)
    lbu   v0, 1(t2)
    addiu t2, t2, 2
    // Row 1, pixel 1
    addu  t3, v0, s4            // y+cred
    addu  s3, v0, s5            // y+cgreen
    addu  v1, v0, s6            // y+cblue
    addu  t3, t9, t3
    addu  s3, t9, s3
    addu  v1, t9, v1
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)
    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
    bne   t0, t5, 1b
    nop
2:  // Handle the last column when output_width is odd.
    andi  t0, a0, 1
    beqz  t0, 4f
    lbu   t3, 0(t5)             // delay slot (harmless when branch taken)
    lbu   s3, 0(t6)
    addiu t3, t3, -128          // (cb - 128)
    addiu s3, s3, -128          // (cr - 128)
    mult  $ac1, s1, t3
    madd  $ac1, s2, s3
    sll   s3, s3, 15
    sll   t3, t3, 15
    lbu   v0, 0(t1)
    extr_r.w  s5, $ac1, 16
    mulq_rs.w s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    addu  t3, v0, s4            // y+cred
    addu  s3, v0, s5            // y+cgreen
    addu  v1, v0, s6            // y+cblue
    addu  t3, t9, t3
    addu  s3, t9, s3
    addu  v1, t9, v1
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)
    lbu   v0, 0(t2)
    STORE_H2V2_1_PIXEL t3, s3, v1, t4
    addu  t3, v0, s4            // y+cred
    addu  s3, v0, s5            // y+cgreen
    addu  v1, v0, s6            // y+cblue
    addu  t3, t9, t3
    addu  s3, t9, s3
    addu  v1, t9, v1
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)
    STORE_H2V2_1_PIXEL t3, s3, v1, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
    j     ra
    nop
END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)

.purgem STORE_H2V2_1_PIXEL
.purgem STORE_H2V2_2_PIXELS
.endm

/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
/*****************************************************************************/
/*
 * jsimd_h2v1_merged_upsample_mips_dspr2
 * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
 * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
 * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
 * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
 * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
 * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
 *
 * Merged h2v1 upsample routines
 *
 * Same parameterization as the h2v2 variant above, but only one output row
 * is produced per iteration (each Cb/Cr sample covers two horizontal Y's).
 */
.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \
                                                pixel_size, \
                                                r1_offs, \
                                                g1_offs, \
                                                b1_offs, \
                                                a1_offs, \
                                                r2_offs, \
                                                g2_offs, \
                                                b2_offs, \
                                                a2_offs

/* Store two output pixels at \outptr and advance it; alpha forced to 0xFF
 * for 4-byte pixels (clobbers t0 in that case). */
.macro STORE_H2V1_2_PIXELS scratch0 \
                           scratch1 \
                           scratch2 \
                           scratch3 \
                           scratch4 \
                           scratch5 \
                           outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)
    sb    \scratch3, \r2_offs(\outptr)
    sb    \scratch4, \g2_offs(\outptr)
    sb    \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li    t0, 0xFF
    sb    t0, \a1_offs(\outptr)
    sb    t0, \a2_offs(\outptr)
.endif
    addiu \outptr, \pixel_size
.endm

/* Store a single trailing output pixel (odd output width); clobbers t0 when
 * an alpha byte is written. */
.macro STORE_H2V1_1_PIXEL scratch0 \
                          scratch1 \
                          scratch2 \
                          outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li    t0, 0xFF
    sb    t0, \a1_offs(\outptr)
.endif
.endm

LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
/*
 * a0     - cinfo->output_width
 * a1     - input_buf
 * a2     - in_row_group_ctr
 * a3     - output_buf
 * 16(sp) - range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    li    t0, 0xe6ea
    lw    t1, 0(a1)             // t1 = input_buf[0]
    lw    t2, 4(a1)             // t2 = input_buf[1]
    lw    t3, 8(a1)             // t3 = input_buf[2]
    lw    t8, 56(sp)            // t8 = range_limit
    // Fixed-point conversion constants (addiu immediates are sign-extended).
    addiu s1, t0, 0x7fff        // s1 = 0x166e9 [FIX(1.40200)]
    addiu s2, s1, 0x5eb9        // s2 = 0x1c5a2 [FIX(1.77200)]
    addiu s0, t0, 0x9916        // s0 = 0x8000 (ONE_HALF)
    addiu s4, zero, 0xa7e6      // s4 = 0xffffa7e6 [-FIX(0.34414)]
    xori  s3, s4, 0xeec8        // s3 = 0xffff492e [-FIX(0.71414)]
    srl   t0, a0, 1             // t0 = output_width / 2 (column pairs)
    sll   t4, a2, 2
    lwx   s5, t4(t1)            // s5 = inptr0 (Y row)
    lwx   s6, t4(t2)            // s6 = inptr1 (Cb row)
    lwx   s7, t4(t3)            // s7 = inptr2 (Cr row)
    lw    t7, 0(a3)             // t7 = outptr
    blez  t0, 2f
    addu  t9, s6, t0            // t9 = end address (delay slot)
1:  // Per column pair: one Cb/Cr sample shared by two consecutive Y samples.
    lbu   t2, 0(s6)             // t2 = cb
    lbu   t0, 0(s7)             // t0 = cr
    lbu   t1, 0(s5)             // t1 = y
    addiu t2, t2, -128          // t2 = cb - 128
    addiu t0, t0, -128          // t0 = cr - 128
    mult  $ac1, s4, t2
    madd  $ac1, s3, t0          // ac1 = green chroma term
    sll   t0, t0, 15
    sll   t2, t2, 15
    mulq_rs.w t0, s1, t0        // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
    extr_r.w  t5, $ac1, 16
    mulq_rs.w t6, s2, t2        // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
    addiu s7, s7, 1
    addiu s6, s6, 1
    addu  t2, t1, t0            // t2 = y + cred
    addu  t3, t1, t5            // t3 = y + cgreen
    addu  t4, t1, t6            // t4 = y + cblue
    addu  t2, t8, t2
    addu  t3, t8, t3
    addu  t4, t8, t4
    lbu   t1, 1(s5)             // t1 = second y of the pair
    lbu   v0, 0(t2)
    lbu   v1, 0(t3)
    lbu   ra, 0(t4)
    addu  t2, t1, t0
    addu  t3, t1, t5
    addu  t4, t1, t6
    addu  t2, t8, t2
    addu  t3, t8, t3
    addu  t4, t8, t4
    lbu   t2, 0(t2)
    lbu   t3, 0(t3)
    lbu   t4, 0(t4)
    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
    bne   t9, s6, 1b
    addiu s5, s5, 2             // delay slot: advance Y pointer
2:  // Handle the last column when output_width is odd.
    andi  t0, a0, 1
    beqz  t0, 4f
    nop
3:
    lbu   t2, 0(s6)
    lbu   t0, 0(s7)
    lbu   t1, 0(s5)
    addiu t2, t2, -128          // (cb - 128)
    addiu t0, t0, -128          // (cr - 128)
    // Scalar path: green term computed with mul + explicit rounding add.
    mul   t3, s4, t2
    mul   t4, s3, t0
    sll   t0, t0, 15
    sll   t2, t2, 15
    mulq_rs.w t0, s1, t0        // (C1*cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w t6, s2, t2        // (C2*cb + ONE_HALF)>> SCALEBITS
    addu  t3, t3, s0            // + ONE_HALF
    addu  t3, t4, t3
    sra   t5, t3, 16            // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
    addu  t2, t1, t0            // y + cred
    addu  t3, t1, t5            // y + cgreen
    addu  t4, t1, t6            // y + cblue
    addu  t2, t8, t2
    addu  t3, t8, t3
    addu  t4, t8, t4
    lbu   t2, 0(t2)
    lbu   t3, 0(t3)
    lbu   t4, 0(t4)
    STORE_H2V1_1_PIXEL t2, t3, t4, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
    j     ra
    nop
END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)

.purgem STORE_H2V1_1_PIXEL
.purgem STORE_H2V1_2_PIXELS
.endm

/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
/*****************************************************************************/
/*
 * jsimd_h2v2_fancy_upsample_mips_dspr2
 *
 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
 * Each output sample is a weighted average (3:1) of the nearest input
 * column/row sums; t7/t6 carry this/next column sums across iterations.
 */
LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - downsampled_width
 * a2 - input_data
 * a3 - output_data_ptr
 */
    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    li    s4, 0                 // s4 = output row byte offset
    lw    s2, 0(a3)             // s2 = *output_data_ptr
0:
    li    t9, 2                 // two output rows per input row
    lw    s1, -4(a2)            // s1 = inptr1 (row above)
1:
    lw    s0, 0(a2)             // s0 = inptr0 (current row)
    lwx   s3, s4(s2)            // s3 = outptr
    addiu s5, a1, -2            // s5 = downsampled_width - 2
    srl   t4, s5, 1
    sll   t4, t4, 1             // t4 = even part of (width - 2)
    // Special first-column case: column -1 treated same as column 0.
    lbu   t0, 0(s0)
    lbu   t1, 1(s0)
    lbu   t2, 0(s1)
    lbu   t3, 1(s1)
    addiu s0, 2
    addiu s1, 2
    addu  t8, s0, t4            // t8 = end address
    andi  s5, s5, 1             // s5 = residual
    sll   t4, t0, 1
    sll   t6, t1, 1
    addu  t0, t0, t4            // t0 = (*inptr0++) * 3
    addu  t1, t1, t6            // t1 = (*inptr0++) * 3
    addu  t7, t0, t2            // t7 = thiscolsum
    addu  t6, t1, t3            // t6 = nextcolsum
    sll   t0, t7, 2             // t0 = thiscolsum * 4
    subu  t1, t0, t7            // t1 = thiscolsum * 3
    shra_r.w t0, t0, 4
    addiu t1, 7
    addu  t1, t1, t6
    srl   t1, t1, 4
    sb    t0, 0(s3)
    sb    t1, 1(s3)
    beq   t8, s0, 22f           // skip to final iteration if width == 3
    addiu s3, 2                 // delay slot
2:  // Main loop: two input columns -> four output samples per iteration.
    lh    t0, 0(s0)             // t0 = A3|A2
    lh    t2, 0(s1)             // t2 = B3|B2
    addiu s0, 2
    addiu s1, 2
    preceu.ph.qbr t0, t0        // t0 = 0|A3|0|A2
    preceu.ph.qbr t2, t2        // t2 = 0|B3|0|B2
    shll.ph t1, t0, 1
    sll   t3, t6, 1
    addu.ph t0, t1, t0          // t0 = A3*3|A2*3
    addu  t3, t3, t6            // t3 = this * 3
    addu.ph t0, t0, t2          // t0 = next2|next1
    addu  t1, t3, t7
    andi  t7, t0, 0xFFFF        // t7 = next1
    sll   t2, t7, 1
    addu  t2, t7, t2            // t2 = next1*3
    addu  t4, t2, t6
    srl   t6, t0, 16            // t6 = next2
    shra_r.w t1, t1, 4          // t1 = (this*3 + last + 8) >> 4
    addu  t0, t3, t7
    addiu t0, 7
    srl   t0, t0, 4             // t0 = (this*3 + next1 + 7) >> 4
    shra_r.w t4, t4, 4          // t4 = (next1*3 + this + 8) >> 4
    addu  t2, t2, t6
    addiu t2, 7
    srl   t2, t2, 4             // t2 = (next1*3 + next2 + 7) >> 4
    sb    t1, 0(s3)
    sb    t0, 1(s3)
    sb    t4, 2(s3)
    sb    t2, 3(s3)
    bne   t8, s0, 2b
    addiu s3, 4                 // delay slot
22: // One leftover input column when (width - 2) is odd.
    beqz  s5, 4f
    addu  t8, s0, s5            // delay slot
3:
    lbu   t0, 0(s0)
    lbu   t2, 0(s1)
    addiu s0, 1
    addiu s1, 1
    sll   t3, t6, 1
    sll   t1, t0, 1
    addu  t1, t0, t1            // t1 = inptr0 * 3
    addu  t3, t3, t6            // t3 = thiscolsum * 3
    addu  t5, t1, t2
    addu  t1, t3, t7
    shra_r.w t1, t1, 4
    addu  t0, t3, t5
    addiu t0, 7
    srl   t0, t0, 4
    sb    t1, 0(s3)
    sb    t0, 1(s3)
    addiu s3, 2
    move  t7, t6
    bne   t8, s0, 3b
    move  t6, t5                // delay slot
4:  // Special last-column case: column width treated same as width-1.
    sll   t0, t6, 2             // t0 = thiscolsum * 4
    subu  t1, t0, t6            // t1 = thiscolsum * 3
    addu  t1, t1, t7
    addiu s4, 4
    shra_r.w t1, t1, 4
    addiu t0, 7
    srl   t0, t0, 4
    sb    t1, 0(s3)
    sb    t0, 1(s3)
    addiu t9, -1
    addiu s3, 2
    bnez  t9, 1b
    lw    s1, 4(a2)             // delay slot: inptr1 = row below for pass 2
    srl   t0, s4, 2
    subu  t0, a0, t0
    bgtz  t0, 0b
    addiu a2, 4                 // delay slot: next input row
    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
    j     ra
    nop
END(jsimd_h2v2_fancy_upsample_mips_dspr2)
/*****************************************************************************/
/*
 * jsimd_h2v1_fancy_upsample_mips_dspr2
 *
 * Fancy 2:1 horizontal upsampling: each output pair is (3*this + neighbor)
 * with +1/+2 rounding, processed four input samples at a time with DSPr2
 * paired-halfword arithmetic.
 */
LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - downsampled_width
 * a2 - input_data
 * a3 - output_data_ptr
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    .set at
    beqz  a0, 3f
    sll   t0, a0, 2             // delay slot
    lw    s1, 0(a3)             // s1 = output row pointer array
    li    s3, 0x10001           // paired-halfword rounding constant (+1,+1)
    addu  s0, s1, t0            // s0 = end of row pointer array
0:
    addiu t8, a1, -2
    srl   t9, t8, 2             // t9 = number of 4-sample groups
    lw    t7, 0(a2)             // t7 = inptr
    lw    s2, 0(s1)             // s2 = outptr
    // First column: pretend column -1 equals column 0.
    lbu   t0, 0(t7)
    lbu   t1, 1(t7)             // t1 = inptr[1]
    sll   t2, t0, 1
    addu  t2, t2, t0            // t2 = invalue*3
    addu  t2, t2, t1
    shra_r.w t2, t2, 2
    sb    t0, 0(s2)
    sb    t2, 1(s2)
    beqz  t9, 11f
    addiu s2, 2                 // delay slot
1:  // Vector loop: 4 input samples -> 8 output samples per iteration.
    ulw   t0, 0(t7)             // t0 = |P3|P2|P1|P0|
    ulw   t1, 1(t7)
    ulh   t2, 4(t7)             // t2 = |0|0|P5|P4|
    preceu.ph.qbl t3, t0        // t3 = |0|P3|0|P2|
    preceu.ph.qbr t0, t0        // t0 = |0|P1|0|P0|
    preceu.ph.qbr t2, t2        // t2 = |0|P5|0|P4|
    preceu.ph.qbl t4, t1        // t4 = |0|P4|0|P3|
    preceu.ph.qbr t1, t1        // t1 = |0|P2|0|P1|
    shll.ph t5, t4, 1
    shll.ph t6, t1, 1
    addu.ph t5, t5, t4          // t5 = |P4*3|P3*3|
    addu.ph t6, t6, t1          // t6 = |P2*3|P1*3|
    addu.ph t4, t3, s3          // +1 rounding
    addu.ph t0, t0, s3          // +1 rounding
    addu.ph t4, t4, t5
    addu.ph t0, t0, t6
    shrl.ph t4, t4, 2           // t4 = |0|P3|0|P2| (right-neighbor outputs)
    shrl.ph t0, t0, 2           // t0 = |0|P1|0|P0|
    addu.ph t2, t2, t5
    addu.ph t3, t3, t6
    shra_r.ph t2, t2, 2         // t2 = |0|P5|0|P4| (left-neighbor outputs, +2 rounding)
    shra_r.ph t3, t3, 2         // t3 = |0|P3|0|P2|
    shll.ph t2, t2, 8
    shll.ph t3, t3, 8
    or    t2, t4, t2            // interleave into output byte order
    or    t3, t3, t0
    addiu t9, -1
    usw   t3, 0(s2)
    usw   t2, 4(s2)
    addiu s2, 8
    bgtz  t9, 1b
    addiu t7, 4                 // delay slot
11: // Scalar loop over the remaining (width - 2) % 4 samples.
    andi  t8, 3
    beqz  t8, 22f
    addiu t7, 1                 // delay slot
2:
    lbu   t0, 0(t7)
    addiu t7, 1
    sll   t1, t0, 1
    addu  t2, t0, t1            // t2 = invalue*3
    lbu   t3, -2(t7)
    lbu   t4, 0(t7)
    addiu t3, 1                 // left neighbor + 1 (rounding)
    addiu t4, 2                 // right neighbor + 2 (rounding)
    addu  t3, t3, t2
    addu  t4, t4, t2
    srl   t3, 2
    srl   t4, 2
    sb    t3, 0(s2)
    sb    t4, 1(s2)
    addiu t8, -1
    bgtz  t8, 2b
    addiu s2, 2                 // delay slot
22: // Last column: pretend column width equals column width-1.
    lbu   t0, 0(t7)
    lbu   t2, -1(t7)
    sll   t1, t0, 1
    addu  t1, t1, t0            // t1 = invalue * 3
    addu  t1, t1, t2
    addiu t1, 1
    srl   t1, t1, 2
    sb    t1, 0(s2)
    sb    t0, 1(s2)
    addiu s1, 4
    bne   s1, s0, 0b
    addiu a2, 4                 // delay slot: next input row
3:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
    j     ra
    nop
END(jsimd_h2v1_fancy_upsample_mips_dspr2)
/*****************************************************************************/
/*
 * jsimd_h2v1_downsample_mips_dspr2
 *
 * 2:1 horizontal downsampling: each output byte is the rounded average of a
 * horizontal pair, with alternating-bias rounding on the scalar tail and
 * edge expansion out to width_in_blocks * DCTSIZE.
 */
LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - cinfo->max_v_samp_factor
 * a2     - compptr->v_samp_factor
 * a3     - compptr->width_in_blocks
 * 16(sp) - input_data
 * 20(sp) - output_data
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4

    beqz  a2, 7f
    lw    s1, 44(sp)            // s1 = output_data (delay slot)
    lw    s0, 40(sp)            // s0 = input_data
    // s2 = number of padding output pairs past the downsampled width.
    srl   s2, a0, 2
    andi  t9, a0, 2
    srl   t7, t9, 1
    addu  s2, t7, s2
    sll   t0, a3, 3             // t0 = width_in_blocks*DCT
    srl   t7, t0, 1
    subu  s2, t7, s2
0:  // Per output row.
    andi  t6, a0, 1             // t6 = temp_index
    addiu t6, -1                // offset of last valid input byte
    lw    t4, 0(s1)             // t4 = outptr
    lw    t5, 0(s0)             // t5 = inptr0
    li    s3, 0                 // s3 = bias
    srl   t7, a0, 1             // t7 = image_width1
    srl   s4, t7, 2             // s4 = number of 4-output groups
    andi  t8, t7, 3             // t8 = scalar residual count
1:  // Four output bytes per iteration; bias alternates 0,1,0,1 via the
    // shra.ph / shra_r.ph (truncate vs round) pairing.
    ulhu  t0, 0(t5)
    ulhu  t1, 2(t5)
    ulhu  t2, 4(t5)
    ulhu  t3, 6(t5)
    raddu.w.qb t0, t0           // sum of the byte pair
    raddu.w.qb t1, t1
    raddu.w.qb t2, t2
    raddu.w.qb t3, t3
    shra.ph   t0, t0, 1
    shra_r.ph t1, t1, 1
    shra.ph   t2, t2, 1
    shra_r.ph t3, t3, 1
    sb    t0, 0(t4)
    sb    t1, 1(t4)
    sb    t2, 2(t4)
    sb    t3, 3(t4)
    addiu s4, -1
    addiu t4, 4
    bgtz  s4, 1b
    addiu t5, 8                 // delay slot
    beqz  t8, 3f
    addu  s4, t4, t8            // delay slot: s4 = scalar-loop end address
2:  // Scalar tail with explicit alternating bias in s3.
    ulhu  t0, 0(t5)
    raddu.w.qb t0, t0
    addqh.w t0, t0, s3          // (sum + bias) / 2
    xori  s3, s3, 1
    sb    t0, 0(t4)
    addiu t4, 1
    bne   t4, s4, 2b
    addiu t5, 2                 // delay slot
3:  // Edge expansion: replicate the last sample pair out to the block width.
    lbux  t1, t6(t5)            // last valid input byte
    sll   t1, 1
    addqh.w t2, t1, s3          // t2 = pixval1
    xori  s3, s3, 1
    addqh.w t3, t1, s3          // t3 = pixval2
    blez  s2, 5f
    append t3, t2, 8            // delay slot: pack the two bytes
    addu  t5, t4, s2            // t5 = loop_end2
4:
    ush   t3, 0(t4)
    addiu s2, -1
    bgtz  s2, 4b
    addiu t4, 2                 // delay slot
5:
    beqz  t9, 6f
    nop
    sb    t2, 0(t4)             // odd trailing output byte
6:
    addiu s1, 4
    addiu a2, -1
    bnez  a2, 0b
    addiu s0, 4                 // delay slot: next input row
7:
    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
    j     ra
    nop
END(jsimd_h2v1_downsample_mips_dspr2)
/*****************************************************************************/
/*
 * jsimd_h2v2_downsample_mips_dspr2
 *
 * 2:1 horizontal + 2:1 vertical downsampling: each output byte averages a
 * 2x2 input block with alternating 1/2 rounding bias, followed by edge
 * expansion out to width_in_blocks * DCTSIZE.
 *
 * NOTE(review): s4 (4-output group count) and t8 (scalar residual) are
 * computed once before the row loop at 0:, but s4 is decremented and t8 is
 * turned into an end address inside it — unlike jsimd_h2v1_downsample above,
 * which reinitializes them per row.  Verify behavior for v_samp_factor > 1
 * against the C reference.
 */
LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - cinfo->max_v_samp_factor
 * a2     - compptr->v_samp_factor
 * a3     - compptr->width_in_blocks
 * 16(sp) - input_data
 * 20(sp) - output_data
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    beqz  a2, 8f
    lw    s1, 52(sp)            // s1 = output_data (delay slot)
    lw    s0, 48(sp)            // s0 = input_data
    andi  t6, a0, 1             // t6 = temp_index
    addiu t6, -1                // offset of last valid input byte
    srl   t7, a0, 1             // t7 = image_width1
    srl   s4, t7, 2             // s4 = number of 4-output groups
    andi  t8, t7, 3             // t8 = scalar residual count
    // s2 = number of padding output pairs past the downsampled width.
    andi  t9, a0, 2
    srl   s2, a0, 2
    srl   t7, t9, 1
    addu  s2, t7, s2
    sll   t0, a3, 3             // t0 = width_in_blocks*DCT
    srl   t7, t0, 1
    subu  s2, t7, s2
0:  // Per output row (consumes two input rows).
    lw    t4, 0(s1)             // t4 = outptr
    lw    t5, 0(s0)             // t5 = inptr0
    lw    s7, 4(s0)             // s7 = inptr1
    li    s6, 1                 // s6 = bias (alternates 1 <-> 2)
2:  // Four output bytes per iteration: average 2x2 blocks.
    ulw   t0, 0(t5)             // t0 = |P3|P2|P1|P0|
    ulw   t1, 0(s7)             // t1 = |Q3|Q2|Q1|Q0|
    ulw   t2, 4(t5)
    ulw   t3, 4(s7)
    precrq.ph.w t7, t0, t1      // t7 = |P3|P2|Q3|Q2|
    ins   t0, t1, 16, 16        // t0 = |Q1|Q0|P1|P0|
    raddu.w.qb t1, t7           // 2x2 block sum
    raddu.w.qb t0, t0
    shra_r.w t1, t1, 2          // +2 rounding
    addiu t0, 1                 // +1 rounding
    srl   t0, 2
    precrq.ph.w t7, t2, t3
    ins   t2, t3, 16, 16
    raddu.w.qb t7, t7
    raddu.w.qb t2, t2
    shra_r.w t7, t7, 2
    addiu t2, 1
    srl   t2, 2
    sb    t0, 0(t4)
    sb    t1, 1(t4)
    sb    t2, 2(t4)
    sb    t7, 3(t4)
    addiu t4, 4
    addiu t5, 8
    addiu s4, s4, -1
    bgtz  s4, 2b
    addiu s7, 8                 // delay slot
    beqz  t8, 4f
    addu  t8, t4, t8            // delay slot: t8 = scalar-loop end address
3:  // Scalar tail with explicit alternating bias in s6.
    ulhu  t0, 0(t5)
    ulhu  t1, 0(s7)
    ins   t0, t1, 16, 16
    raddu.w.qb t0, t0
    addu  t0, t0, s6            // (sum + bias)
    srl   t0, 2
    xori  s6, s6, 3             // toggle bias 1 <-> 2
    sb    t0, 0(t4)
    addiu t5, 2
    addiu t4, 1
    bne   t8, t4, 3b
    addiu s7, 2                 // delay slot
4:  // Edge expansion: replicate the last sample column out to block width.
    lbux  t1, t6(t5)
    sll   t1, 1
    lbux  t0, t6(s7)
    sll   t0, 1
    addu  t1, t1, t0
    addu  t3, t1, s6
    srl   t0, t3, 2             // t0 = pixval1
    xori  s6, s6, 3
    addu  t2, t1, s6
    srl   t1, t2, 2             // t1 = pixval2
    blez  s2, 6f
    append t1, t0, 8            // delay slot: pack the two bytes
5:
    ush   t1, 0(t4)
    addiu s2, -1
    bgtz  s2, 5b
    addiu t4, 2                 // delay slot
6:
    beqz  t9, 7f
    nop
    sb    t0, 0(t4)             // odd trailing output byte
7:
    addiu s1, 4
    addiu a2, -1
    bnez  a2, 0b
    addiu s0, 8                 // delay slot: skip two input rows
8:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
    j     ra
    nop
END(jsimd_h2v2_downsample_mips_dspr2)
/*****************************************************************************/
/*
 * jsimd_h2v2_smooth_downsample_mips_dspr2
 *
 * 2x2 downsampling with inter-block smoothing: each output sample blends the
 * 2x2 block sum (weight t6 = 16384 - smoothing_factor*80) with the sum of
 * the surrounding neighbor samples (weight t7 = smoothing_factor*16), using
 * the $ac1 accumulator with rounding extraction.  Columns are first padded
 * on the right so that 2*output_cols input samples exist per row.
 *
 * NOTE(review): per output row, t5 (input row index used as word offset via
 * t5*4) is advanced by 4 in total — addiu t5,t5,2 both before and inside the
 * branch delay slot at the loop end.  The C reference advances inrow by 2;
 * verify the intended row stride.
 */
LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
/*
 * a0     - input_data
 * a1     - output_data
 * a2     - compptr->v_samp_factor
 * a3     - cinfo->max_v_samp_factor
 * 16(sp) - cinfo->smoothing_factor
 * 20(sp) - compptr->width_in_blocks
 * 24(sp) - cinfo->image_width
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw    s7, 52(sp)            // compptr->width_in_blocks
    lw    s0, 56(sp)            // cinfo->image_width
    lw    s6, 48(sp)            // cinfo->smoothing_factor
    sll   s7, 3                 // output_cols = width_in_blocks * DCTSIZE
    sll   v0, s7, 1
    subu  v0, v0, s0            // v0 = padding bytes needed per input row
    blez  v0, 2f
    move  v1, zero              // delay slot: v1 = row counter
    addiu t0, a3, 2             // t0 = cinfo->max_v_samp_factor + 2
0:  // Pad each input row (incl. one row above/below) by replicating the
    // last valid sample.
    addiu t1, a0, -4
    sll   t2, v1, 2
    lwx   t1, t2(t1)            // row pointer (starting from input_data[-1])
    move  t3, v0
    addu  t1, t1, s0
    lbu   t2, -1(t1)            // last valid sample
1:
    addiu t3, t3, -1
    sb    t2, 0(t1)
    bgtz  t3, 1b
    addiu t1, t1, 1             // delay slot
    addiu v1, v1, 1
    bne   v1, t0, 0b
    nop
2:  // Precompute blend weights.
    li    v0, 80
    mul   v0, s6, v0
    li    v1, 16384
    move  t4, zero              // t4 = output row counter
    move  t5, zero              // t5 = input row index
    subu  t6, v1, v0            // t6 = 16384 - tmp_smoot_f * 80
    sll   t7, s6, 4             // t7 = tmp_smoot_f * 16
3:
    /* Special case for first column: pretend column -1 is same as column 0 */
    sll   v0, t4, 2
    lwx   t8, v0(a1)            // outptr = output_data[outrow]
    sll   v1, t5, 2
    addiu t9, v1, 4
    addiu s0, v1, -4
    addiu s1, v1, 8
    lwx   s2, v1(a0)            // inptr0 = input_data[inrow]
    lwx   t9, t9(a0)            // inptr1 = input_data[inrow+1]
    lwx   s0, s0(a0)            // above_ptr = input_data[inrow-1]
    lwx   s1, s1(a0)            // below_ptr = input_data[inrow+2]
    lh    v0, 0(s2)
    lh    v1, 0(t9)
    lh    t0, 0(s0)
    lh    t1, 0(s1)
    ins   v0, v1, 16, 16
    ins   t0, t1, 16, 16
    raddu.w.qb t2, v0           // t2 = 2x2 block sum (membersum)
    raddu.w.qb s3, t0           // s3 = above+below pair sum
    lbu   v0, 0(s2)             // column -1 mirrored to column 0
    lbu   v1, 2(s2)
    lbu   t0, 0(t9)
    lbu   t1, 2(t9)
    addu  v0, v0, v1
    mult  $ac1, t2, t6          // membersum * (16384 - sf*80)
    addu  t0, t0, t1
    lbu   t2, 2(s0)
    addu  t0, t0, v0
    lbu   t3, 2(s1)
    addu  s3, t0, s3
    lbu   v0, 0(s0)
    lbu   t0, 0(s1)
    sll   s3, s3, 1
    addu  v0, v0, t2
    addu  t0, t0, t3
    addu  t0, t0, v0
    addu  s3, t0, s3            // s3 = neighbor sum
    madd  $ac1, s3, t7          // + neighsum * sf*16
    extr_r.w v0, $ac1, 16
    addiu t8, t8, 1
    addiu s2, s2, 2
    addiu t9, t9, 2
    addiu s0, s0, 2
    addiu s1, s1, 2
    sb    v0, -1(t8)
    // Peel (output_cols - 2) % 4 columns before the 4x-unrolled loop.
    addiu s4, s7, -2
    and   s4, s4, 3
    addu  s5, s4, t8            // end address
4:  // Scalar middle columns.
    lh    v0, 0(s2)
    lh    v1, 0(t9)
    lh    t0, 0(s0)
    lh    t1, 0(s1)
    ins   v0, v1, 16, 16
    ins   t0, t1, 16, 16
    raddu.w.qb t2, v0
    raddu.w.qb s3, t0
    lbu   v0, -1(s2)
    lbu   v1, 2(s2)
    lbu   t0, -1(t9)
    lbu   t1, 2(t9)
    addu  v0, v0, v1
    mult  $ac1, t2, t6
    addu  t0, t0, t1
    lbu   t2, 2(s0)
    addu  t0, t0, v0
    lbu   t3, 2(s1)
    addu  s3, t0, s3
    lbu   v0, -1(s0)
    lbu   t0, -1(s1)
    sll   s3, s3, 1
    addu  v0, v0, t2
    addu  t0, t0, t3
    addu  t0, t0, v0
    addu  s3, t0, s3
    madd  $ac1, s3, t7
    extr_r.w t2, $ac1, 16
    addiu t8, t8, 1
    addiu s2, s2, 2
    addiu t9, t9, 2
    addiu s0, s0, 2
    sb    t2, -1(t8)
    bne   s5, t8, 4b
    addiu s1, s1, 2             // delay slot
    addiu s5, s7, -2
    subu  s5, s5, s4
    addu  s5, s5, t8            // end address
5:  // Main loop, 4 output columns per iteration (software pipelined).
    lh    v0, 0(s2)
    lh    v1, 0(t9)
    lh    t0, 0(s0)
    lh    t1, 0(s1)
    ins   v0, v1, 16, 16
    ins   t0, t1, 16, 16
    raddu.w.qb t2, v0
    raddu.w.qb s3, t0
    lbu   v0, -1(s2)
    lbu   v1, 2(s2)
    lbu   t0, -1(t9)
    lbu   t1, 2(t9)
    addu  v0, v0, v1
    mult  $ac1, t2, t6
    addu  t0, t0, t1
    lbu   t2, 2(s0)
    addu  t0, t0, v0
    lbu   t3, 2(s1)
    addu  s3, t0, s3
    lbu   v0, -1(s0)
    lbu   t0, -1(s1)
    sll   s3, s3, 1
    addu  v0, v0, t2
    addu  t0, t0, t3
    lh    v1, 2(t9)
    addu  t0, t0, v0
    lh    v0, 2(s2)
    addu  s3, t0, s3
    lh    t0, 2(s0)
    lh    t1, 2(s1)
    madd  $ac1, s3, t7
    extr_r.w t2, $ac1, 16
    ins   t0, t1, 16, 16
    ins   v0, v1, 16, 16
    raddu.w.qb s3, t0
    lbu   v1, 4(s2)
    lbu   t0, 1(t9)
    lbu   t1, 4(t9)
    sb    t2, 0(t8)
    raddu.w.qb t3, v0
    lbu   v0, 1(s2)
    addu  t0, t0, t1
    mult  $ac1, t3, t6
    addu  v0, v0, v1
    lbu   t2, 4(s0)
    addu  t0, t0, v0
    lbu   v0, 1(s0)
    addu  s3, t0, s3
    lbu   t0, 1(s1)
    lbu   t3, 4(s1)
    addu  v0, v0, t2
    sll   s3, s3, 1
    addu  t0, t0, t3
    lh    v1, 4(t9)
    addu  t0, t0, v0
    lh    v0, 4(s2)
    addu  s3, t0, s3
    lh    t0, 4(s0)
    lh    t1, 4(s1)
    madd  $ac1, s3, t7
    extr_r.w t2, $ac1, 16
    ins   t0, t1, 16, 16
    ins   v0, v1, 16, 16
    raddu.w.qb s3, t0
    lbu   v1, 6(s2)
    lbu   t0, 3(t9)
    lbu   t1, 6(t9)
    sb    t2, 1(t8)
    raddu.w.qb t3, v0
    lbu   v0, 3(s2)
    addu  t0, t0, t1
    mult  $ac1, t3, t6
    addu  v0, v0, v1
    lbu   t2, 6(s0)
    addu  t0, t0, v0
    lbu   v0, 3(s0)
    addu  s3, t0, s3
    lbu   t0, 3(s1)
    lbu   t3, 6(s1)
    addu  v0, v0, t2
    sll   s3, s3, 1
    addu  t0, t0, t3
    lh    v1, 6(t9)
    addu  t0, t0, v0
    lh    v0, 6(s2)
    addu  s3, t0, s3
    lh    t0, 6(s0)
    lh    t1, 6(s1)
    madd  $ac1, s3, t7
    extr_r.w t3, $ac1, 16
    ins   t0, t1, 16, 16
    ins   v0, v1, 16, 16
    raddu.w.qb s3, t0
    lbu   v1, 8(s2)
    lbu   t0, 5(t9)
    lbu   t1, 8(t9)
    sb    t3, 2(t8)
    raddu.w.qb t2, v0
    lbu   v0, 5(s2)
    addu  t0, t0, t1
    mult  $ac1, t2, t6
    addu  v0, v0, v1
    lbu   t2, 8(s0)
    addu  t0, t0, v0
    lbu   v0, 5(s0)
    addu  s3, t0, s3
    lbu   t0, 5(s1)
    lbu   t3, 8(s1)
    addu  v0, v0, t2
    sll   s3, s3, 1
    addu  t0, t0, t3
    addiu t8, t8, 4
    addu  t0, t0, v0
    addiu s2, s2, 8
    addu  s3, t0, s3
    addiu t9, t9, 8
    madd  $ac1, s3, t7
    extr_r.w t1, $ac1, 16
    addiu s0, s0, 8
    addiu s1, s1, 8
    bne   s5, t8, 5b
    sb    t1, -1(t8)            // delay slot
    /* Special case for last column */
    lh    v0, 0(s2)
    lh    v1, 0(t9)
    lh    t0, 0(s0)
    lh    t1, 0(s1)
    ins   v0, v1, 16, 16
    ins   t0, t1, 16, 16
    raddu.w.qb t2, v0
    raddu.w.qb s3, t0
    lbu   v0, -1(s2)
    lbu   v1, 1(s2)             // column width mirrored to width-1
    lbu   t0, -1(t9)
    lbu   t1, 1(t9)
    addu  v0, v0, v1
    mult  $ac1, t2, t6
    addu  t0, t0, t1
    lbu   t2, 1(s0)
    addu  t0, t0, v0
    lbu   t3, 1(s1)
    addu  s3, t0, s3
    lbu   v0, -1(s0)
    lbu   t0, -1(s1)
    sll   s3, s3, 1
    addu  v0, v0, t2
    addu  t0, t0, t3
    addu  t0, t0, v0
    addu  s3, t0, s3
    madd  $ac1, s3, t7
    extr_r.w t0, $ac1, 16
    addiu t5, t5, 2
    sb    t0, 0(t8)
    addiu t4, t4, 1
    bne   t4, a2, 3b
    addiu t5, t5, 2             // delay slot (see NOTE above: t5 += 4 total)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
    j     ra
    nop
END(jsimd_h2v2_smooth_downsample_mips_dspr2)
/*****************************************************************************/
/*
 * jsimd_int_upsample_mips_dspr2
 *
 * Generic integer upsampling: each input sample is replicated h_expand times
 * horizontally, then the first output row is copied v_expand-1 times.
 *
 * NOTE(review): the row indices t6 (inrow) and s3 (outrow) are added to the
 * row-pointer arrays a2 / s0 without scaling by the pointer size (no sll by
 * 2 before the addu) — verify the indexing against the C int_upsample
 * reference.
 */
LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
/*
 * a0     - upsample->h_expand[compptr->component_index]
 * a1     - upsample->v_expand[compptr->component_index]
 * a2     - input_data
 * a3     - output_data_ptr
 * 16(sp) - cinfo->output_width
 * 20(sp) - cinfo->max_v_samp_factor
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    lw    s0, 0(a3)             // s0 = output_data
    lw    s1, 32(sp)            // s1 = cinfo->output_width
    lw    s2, 36(sp)            // s2 = cinfo->max_v_samp_factor
    li    t6, 0                 // t6 = inrow
    beqz  s2, 10f
    li    s3, 0                 // delay slot: s3 = outrow
0:
    addu  t0, a2, t6
    addu  t7, s0, s3
    lw    t3, 0(t0)             // t3 = inptr
    lw    t8, 0(t7)             // t8 = outptr
    beqz  s1, 4f
    addu  t5, t8, s1            // delay slot: t5 = outend
1:  // Replicate each input sample h_expand times.
    lb    t2, 0(t3)             // t2 = invalue = *inptr++
    addiu t3, 1
    beqz  a0, 3f
    move  t0, a0                // delay slot: t0 = h_expand
2:
    sb    t2, 0(t8)
    addiu t0, -1
    bgtz  t0, 2b
    addiu t8, 1                 // delay slot
3:
    bgt   t5, t8, 1b
    nop
4:  // Duplicate the generated row v_expand-1 times.
    addiu t9, a1, -1            // t9 = v_expand - 1
    blez  t9, 9f
    nop
5:
    lw    t3, 0(s0)             // source row
    lw    t4, 4(s0)             // destination row
    subu  t0, s1, 0xF
    blez  t0, 7f                // narrow rows: byte copy only
    addu  t5, t3, s1            // delay slot: t5 = end address
    andi  t7, s1, 0xF           // t7 = residual
    subu  t8, t5, t7
6:  // 16-byte unaligned block copy.
    ulw   t0, 0(t3)
    ulw   t1, 4(t3)
    ulw   t2, 8(t3)
    usw   t0, 0(t4)
    ulw   t0, 12(t3)
    usw   t1, 4(t4)
    usw   t2, 8(t4)
    usw   t0, 12(t4)
    addiu t3, 16
    bne   t3, t8, 6b
    addiu t4, 16                // delay slot
    beqz  t7, 8f
    nop
7:  // Byte-copy residual.
    lbu   t0, 0(t3)
    sb    t0, 0(t4)
    addiu t3, 1
    bne   t3, t5, 7b
    addiu t4, 1                 // delay slot
8:
    addiu t9, -1
    bgtz  t9, 5b
    addiu s0, 8                 // delay slot: advance source/dest row pair
9:
    addu  s3, s3, a1            // outrow += v_expand
    bne   s3, s2, 0b
    addiu t6, 1                 // delay slot: inrow++
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
    j     ra
    nop
END(jsimd_int_upsample_mips_dspr2)
/*****************************************************************************/
/*
 * jsimd_h2v1_upsample_mips_dspr2
 *
 * Simple 1:2 horizontal upsampling: each input byte is duplicated, eight
 * input samples per iteration via unaligned word loads and DSPr2 ins
 * byte-doubling; the trailing (output_width & 0xf) bytes go scalar.
 */
LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - cinfo->output_width
 * a2 - input_data
 * a3 - output_data_ptr
 */
    lw    t7, 0(a3)             // t7 = output_data
    andi  t8, a1, 0xf           // t8 = residual
    sll   t0, a0, 2
    blez  a0, 4f
    addu  t9, t7, t0            // delay slot: t9 = output_data end address
0:  // Per row.
    lw    t5, 0(t7)             // t5 = outptr
    lw    t6, 0(a2)             // t6 = inptr
    addu  t3, t5, a1            // t3 = outptr + output_width (end address)
    subu  t3, t8                // t3 = end address - residual
    beq   t5, t3, 2f
    move  t4, t8                // delay slot
1:  // 8 input bytes -> 16 output bytes per iteration.
    ulw   t0, 0(t6)             // t0 = |P3|P2|P1|P0|
    ulw   t2, 4(t6)             // t2 = |P7|P6|P5|P4|
    srl   t1, t0, 16            // t1 = |X|X|P3|P2|
    ins   t0, t0, 16, 16        // t0 = |P1|P0|P1|P0|
    ins   t1, t1, 16, 16        // t1 = |P3|P2|P3|P2|
    ins   t0, t0, 8, 16         // t0 = |P1|P1|P0|P0|
    ins   t1, t1, 8, 16         // t1 = |P3|P3|P2|P2|
    usw   t0, 0(t5)
    usw   t1, 4(t5)
    srl   t0, t2, 16            // t0 = |X|X|P7|P6|
    ins   t2, t2, 16, 16        // t2 = |P5|P4|P5|P4|
    ins   t0, t0, 16, 16        // t0 = |P7|P6|P7|P6|
    ins   t2, t2, 8, 16         // t2 = |P5|P5|P4|P4|
    ins   t0, t0, 8, 16         // t0 = |P7|P7|P6|P6|
    usw   t2, 8(t5)
    usw   t0, 12(t5)
    addiu t5, 16
    bne   t5, t3, 1b
    addiu t6, 8                 // delay slot
    beqz  t8, 3f
    move  t4, t8                // delay slot
2:  // Scalar tail: duplicate one byte at a time.
    lbu   t1, 0(t6)
    sb    t1, 0(t5)
    sb    t1, 1(t5)
    addiu t4, -2
    addiu t6, 1
    bgtz  t4, 2b
    addiu t5, 2                 // delay slot
3:
    addiu t7, 4
    bne   t9, t7, 0b
    addiu a2, 4                 // delay slot: next input row
4:
    j     ra
    nop
END(jsimd_h2v1_upsample_mips_dspr2)
  1646. /*****************************************************************************/
1647. LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
1648. /*
1649. * a0 - cinfo->max_v_samp_factor
1650. * a1 - cinfo->output_width
1651. * a2 - input_data
1652. * a3 - output_data_ptr
 *
 * 2x2 upsampler: each input byte is duplicated horizontally into
 * output row N (same expansion scheme as h2v1 above), then output row
 * N is copied verbatim into output row N+1 (vertical duplication).
 * The outer loop therefore consumes one input row and produces two
 * output rows per iteration (a0 is decremented by 2).
1653. */
1654. lw t7, 0(a3) // t7 = output_data (array of row pointers)
1655. blez a0, 7f // no rows -> return
1656. andi t9, a1, 0xf // (delay slot) t9 = residual output bytes
// 0: outer loop - one input row -> two output rows
1657. 0:
1658. lw t6, 0(a2) // t6 = inptr
1659. lw t5, 0(t7) // t5 = outptr
1660. addu t8, t5, a1 // t8 = outptr end address
1661. subu t8, t9 // t8 = end address - residual
1662. beq t5, t8, 2f // width < 16: only the residual loop runs
1663. move t4, t9 // (delay slot) t4 = residual byte count
// 1: fast horizontal-doubling loop - 8 input bytes -> 16 output bytes
1664. 1:
1665. ulw t0, 0(t6)
1666. srl t1, t0, 16
1667. ins t0, t0, 16, 16
1668. ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0|
1669. ins t1, t1, 16, 16
1670. ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2|
1671. ulw t2, 4(t6)
1672. usw t0, 0(t5)
1673. usw t1, 4(t5)
1674. srl t3, t2, 16
1675. ins t2, t2, 16, 16
1676. ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4|
1677. ins t3, t3, 16, 16
1678. ins t3, t3, 8, 16 // t3 = |P7|P7|P6|P6|
1679. usw t2, 8(t5)
1680. usw t3, 12(t5)
1681. addiu t5, 16
1682. bne t5, t8, 1b
1683. addiu t6, 8 // (delay slot) advance inptr
1684. beqz t9, 3f // no residual -> row copy
1685. move t4, t9 // (delay slot) t4 = residual byte count
// 2: residual loop - one input byte -> two output bytes
1686. 2:
1687. lbu t0, 0(t6)
1688. sb t0, 0(t5)
1689. sb t0, 1(t5)
1690. addiu t4, -2 // residual counted in OUTPUT bytes
1691. addiu t6, 1
1692. bgtz t4, 2b
1693. addiu t5, 2 // (delay slot) advance outptr
// 3: vertical duplication - copy outptr[0] row into outptr[1] row
1694. 3:
1695. lw t6, 0(t7) // t6 = outptr[0]
1696. lw t5, 4(t7) // t5 = outptr[1]
1697. addu t4, t6, a1 // t4 = new end address
1698. beq a1, t9, 5f // width < 16: byte-wise copy only
1699. subu t8, t4, t9 // (delay slot) t8 = end of 16-byte-aligned part
// 4: fast copy loop - 16 bytes per iteration
1700. 4:
1701. ulw t0, 0(t6)
1702. ulw t1, 4(t6)
1703. ulw t2, 8(t6)
1704. usw t0, 0(t5)
1705. ulw t0, 12(t6)
1706. usw t1, 4(t5)
1707. usw t2, 8(t5)
1708. usw t0, 12(t5)
1709. addiu t6, 16
1710. bne t6, t8, 4b
1711. addiu t5, 16 // (delay slot) advance destination
1712. beqz t9, 6f // no residual -> next row pair
1713. nop
// 5: residual byte-wise copy
1714. 5:
1715. lbu t0, 0(t6)
1716. sb t0, 0(t5)
1717. addiu t6, 1
1718. bne t6, t4, 5b
1719. addiu t5, 1 // (delay slot) advance destination
// 6: advance to next pair of output rows
1720. 6:
1721. addiu t7, 8 // skip the two row pointers just written
1722. addiu a0, -2 // two output rows produced per input row
1723. bgtz a0, 0b
1724. addiu a2, 4 // (delay slot) next input row pointer
1725. 7:
1726. j ra
1727. nop
1728. END(jsimd_h2v2_upsample_mips_dspr2)
  1729. /*****************************************************************************/
1730. LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
1731. /*
1732. * a0 - coef_block
1733. * a1 - compptr->dcttable
1734. * a2 - output
1735. * a3 - range_limit
 *
 * Accurate (slow) integer 8x8 inverse DCT, the Loeffler/Ligtenberg/
 * Moshovitz algorithm used by jpeg_idct_islow, CONST_BITS=13.
 * Pass 1 (label 1) processes one column per iteration, dequantizing
 * and writing 32-bit intermediates to a 256-byte workspace carved off
 * the stack.  Pass 2 (label 4) processes one workspace row per
 * iteration and emits 8 clamped pixels through the range_limit table
 * (indexed with the DSPr2 'lbux' instruction).  Both passes short-cut
 * rows/columns whose seven AC terms are all zero.
 * NOTE(review): a2 appears to hold per-row output pointers (loaded
 * with 'lw s1, 0(a2)', advanced by 4) -- confirm against the caller.
1736. */
1737. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1738. addiu sp, sp, -256 // 8x8 x 4-byte workspace
1739. move v0, sp // v0 = wsptr
1740. addiu v1, zero, 8 // v1 = DCTSIZE = 8
// 1: pass 1, column loop (8 iterations, one column each)
1741. 1:
1742. lh s4, 32(a0) // s4 = inptr[16]
1743. lh s5, 64(a0) // s5 = inptr[32]
1744. lh s6, 96(a0) // s6 = inptr[48]
1745. lh t1, 112(a0) // t1 = inptr[56]
1746. lh t7, 16(a0) // t7 = inptr[8]
1747. lh t5, 80(a0) // t5 = inptr[40]
1748. lh t3, 48(a0) // t3 = inptr[24]
// OR together all seven AC coefficients of this column
1749. or s4, s4, t1
1750. or s4, s4, t3
1751. or s4, s4, t5
1752. or s4, s4, t7
1753. or s4, s4, s5
1754. or s4, s4, s6
1755. bnez s4, 2f // any nonzero AC -> full column IDCT
1756. addiu v1, v1, -1 // (delay slot) column counter, runs either way
// AC terms all zero: column output is the scaled DC replicated
1757. lh s5, 0(a1) // quantptr[DCTSIZE*0]
1758. lh s6, 0(a0) // inptr[DCTSIZE*0]
1759. mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
1760. sll s5, s5, 2 // << (CONST_BITS - PASS1_BITS) = 2
1761. sw s5, 0(v0)
1762. sw s5, 32(v0)
1763. sw s5, 64(v0)
1764. sw s5, 96(v0)
1765. sw s5, 128(v0)
1766. sw s5, 160(v0)
1767. sw s5, 192(v0)
1768. b 3f
1769. sw s5, 224(v0) // (delay slot) last of the 8 replicated stores
// 2: full column IDCT (odd part first, then even part)
1770. 2:
1771. lh t0, 112(a1)
1772. lh t2, 48(a1)
1773. lh t4, 80(a1)
1774. lh t6, 16(a1)
1775. mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
1776. mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
1777. mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
1778. mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
1779. lh t4, 32(a1)
1780. lh t5, 32(a0)
1781. lh t6, 96(a1)
1782. lh t7, 96(a0)
1783. addu s0, t0, t1 // z3 = tmp0 + tmp2
1784. addu s1, t1, t2 // z2 = tmp1 + tmp2
1785. addu s2, t2, t3 // z4 = tmp1 + tmp3
1786. addu s3, s0, s2 // z3 + z4
1787. addiu t9, zero, 9633 // FIX_1_175875602
1788. mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
1789. addu t8, t0, t3 // z1 = tmp0 + tmp3
1790. addiu t9, zero, 2446 // FIX_0_298631336
1791. mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
1792. addiu t9, zero, 16819 // FIX_2_053119869
1793. mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
1794. addiu t9, zero, 25172 // FIX_3_072711026
1795. mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
1796. addiu t9, zero, 12299 // FIX_1_501321110
1797. mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
1798. addiu t9, zero, 16069 // FIX_1_961570560
1799. mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
1800. addiu t9, zero, 3196 // FIX_0_390180644
1801. mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
1802. addiu t9, zero, 7373 // FIX_0_899976223
1803. mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
1804. addiu t9, zero, 20995 // FIX_2_562915447
1805. mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
// negative products are applied by subtraction below
1806. subu s0, s3, s0 // z3 += z5
1807. addu t0, t0, s0 // tmp0 += z3
1808. addu t1, t1, s0 // tmp2 += z3
1809. subu s2, s3, s2 // z4 += z5
1810. addu t2, t2, s2 // tmp1 += z4
1811. addu t3, t3, s2 // tmp3 += z4
1812. subu t0, t0, t8 // tmp0 += z1
1813. subu t1, t1, s1 // tmp2 += z2
1814. subu t2, t2, s1 // tmp1 += z2
1815. subu t3, t3, t8 // tmp3 += z1
// even part
1816. mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
1817. addiu t9, zero, 6270 // FIX_0_765366865
1818. mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
1819. lh t4, 0(a1)
1820. lh t5, 0(a0)
1821. lh t6, 64(a1)
1822. lh t7, 64(a0)
1823. mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
1824. mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
1825. mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
1826. addiu t9, zero, 4433 // FIX_0_541196100
1827. addu s3, s0, s1 // z2 + z3
1828. mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
1829. addiu t9, zero, 15137 // FIX_1_847759065
1830. mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
1831. addu t4, t5, t6
1832. subu t5, t5, t6
1833. sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
1834. sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
1835. addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
1836. subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
1837. addu s0, t4, t7 // tmp10
1838. subu s1, t4, t7 // tmp13
1839. addu s2, t5, t6 // tmp11
1840. subu s3, t5, t6 // tmp12
// combine even and odd parts into the 8 column outputs
1841. addu t4, s0, t3
1842. subu s0, s0, t3
1843. addu t3, s2, t1
1844. subu s2, s2, t1
1845. addu t1, s3, t2
1846. subu s3, s3, t2
1847. addu t2, s1, t0
1848. subu s1, s1, t0
// descale: rounded shift by CONST_BITS - PASS1_BITS = 11
1849. shra_r.w t4, t4, 11
1850. shra_r.w t3, t3, 11
1851. shra_r.w t1, t1, 11
1852. shra_r.w t2, t2, 11
1853. shra_r.w s1, s1, 11
1854. shra_r.w s3, s3, 11
1855. shra_r.w s2, s2, 11
1856. shra_r.w s0, s0, 11
1857. sw t4, 0(v0)
1858. sw t3, 32(v0)
1859. sw t1, 64(v0)
1860. sw t2, 96(v0)
1861. sw s1, 128(v0)
1862. sw s3, 160(v0)
1863. sw s2, 192(v0)
1864. sw s0, 224(v0)
// 3: advance to the next column
1865. 3:
1866. addiu a1, a1, 2 // next quant column (JCOEF = 2 bytes)
1867. addiu a0, a0, 2 // next coef column
1868. bgtz v1, 1b
1869. addiu v0, v0, 4 // (delay slot) next workspace column (4-byte ints)
// pass 2 setup: rewind workspace, 8 rows to do
1870. move v0, sp
1871. addiu v1, zero, 8
// 4: pass 2, row loop (8 iterations, one workspace row each)
1872. 4:
1873. lw t0, 8(v0) // z2 = (JLONG) wsptr[2]
1874. lw t1, 24(v0) // z3 = (JLONG) wsptr[6]
1875. lw t2, 0(v0) // (JLONG) wsptr[0]
1876. lw t3, 16(v0) // (JLONG) wsptr[4]
1877. lw s4, 4(v0) // (JLONG) wsptr[1]
1878. lw s5, 12(v0) // (JLONG) wsptr[3]
1879. lw s6, 20(v0) // (JLONG) wsptr[5]
1880. lw s7, 28(v0) // (JLONG) wsptr[7]
// OR together all seven AC terms of this row
1881. or s4, s4, t0
1882. or s4, s4, t1
1883. or s4, s4, t3
1884. or s4, s4, s7
1885. or s4, s4, s5
1886. or s4, s4, s6
1887. bnez s4, 5f // any nonzero AC -> full row IDCT
1888. addiu v1, v1, -1 // (delay slot) row counter, runs either way
// AC terms all zero: all 8 pixels equal the clamped DC
1889. shra_r.w s5, t2, 5 // descale by PASS1_BITS+3 = 5
1890. andi s5, s5, 0x3ff // mask into range_limit table span
1891. lbux s5, s5(a3) // clamp via range_limit[]
1892. lw s1, 0(a2) // s1 = this row's output pointer
1893. replv.qb s5, s5 // replicate pixel into all 4 byte lanes
1894. usw s5, 0(s1)
1895. usw s5, 4(s1)
1896. b 6f
1897. nop
// 5: full row IDCT (even part, then odd part)
1898. 5:
1899. addu t4, t0, t1 // z2 + z3
1900. addiu t8, zero, 4433 // FIX_0_541196100
1901. mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
1902. addiu t8, zero, 15137 // FIX_1_847759065
1903. mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
1904. addiu t8, zero, 6270 // FIX_0_765366865
1905. mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
1906. addu t4, t2, t3 // (JLONG) wsptr[0] + (JLONG) wsptr[4]
1907. subu t2, t2, t3 // (JLONG) wsptr[0] - (JLONG) wsptr[4]
1908. sll t4, t4, 13 // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
1909. sll t2, t2, 13 // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
1910. subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
1911. subu t3, t2, t1 // tmp12 = tmp1 - tmp2
1912. addu t2, t2, t1 // tmp11 = tmp1 + tmp2
1913. addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
1914. subu t1, t4, t5 // tmp13 = tmp0 - tmp3
1915. addu t0, t4, t5 // tmp10 = tmp0 + tmp3
// odd part
1916. lw t4, 28(v0) // tmp0 = (JLONG) wsptr[7]
1917. lw t6, 12(v0) // tmp2 = (JLONG) wsptr[3]
1918. lw t5, 20(v0) // tmp1 = (JLONG) wsptr[5]
1919. lw t7, 4(v0) // tmp3 = (JLONG) wsptr[1]
1920. addu s0, t4, t6 // z3 = tmp0 + tmp2
1921. addiu t8, zero, 9633 // FIX_1_175875602
1922. addu s1, t5, t7 // z4 = tmp1 + tmp3
1923. addu s2, s0, s1 // z3 + z4
1924. mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
1925. addu s3, t4, t7 // z1 = tmp0 + tmp3
1926. addu t9, t5, t6 // z2 = tmp1 + tmp2
1927. addiu t8, zero, 16069 // FIX_1_961570560
1928. mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
1929. addiu t8, zero, 3196 // FIX_0_390180644
1930. mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
1931. addiu t8, zero, 2446 // FIX_0_298631336
1932. mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
1933. addiu t8, zero, 7373 // FIX_0_899976223
1934. mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
1935. addiu t8, zero, 16819 // FIX_2_053119869
1936. mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
1937. addiu t8, zero, 20995 // FIX_2_562915447
1938. mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
1939. addiu t8, zero, 25172 // FIX_3_072711026
1940. mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
1941. addiu t8, zero, 12299 // FIX_1_501321110
1942. mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
1943. subu s0, s2, s0 // z3 += z5
1944. subu s1, s2, s1 // z4 += z5
1945. addu t4, t4, s0
1946. subu t4, t4, s3 // tmp0
1947. addu t5, t5, s1
1948. subu t5, t5, t9 // tmp1
1949. addu t6, t6, s0
1950. subu t6, t6, t9 // tmp2
1951. addu t7, t7, s1
1952. subu t7, t7, s3 // tmp3
// combine even and odd parts into the 8 row outputs
1953. addu s0, t0, t7
1954. subu t0, t0, t7
1955. addu t7, t2, t6
1956. subu t2, t2, t6
1957. addu t6, t3, t5
1958. subu t3, t3, t5
1959. addu t5, t1, t4
1960. subu t1, t1, t4
// final descale: rounded shift by CONST_BITS+PASS1_BITS+3 = 18
1961. shra_r.w s0, s0, 18
1962. shra_r.w t7, t7, 18
1963. shra_r.w t6, t6, 18
1964. shra_r.w t5, t5, 18
1965. shra_r.w t1, t1, 18
1966. shra_r.w t3, t3, 18
1967. shra_r.w t2, t2, 18
1968. shra_r.w t0, t0, 18
// mask into the range_limit table's index span
1969. andi s0, s0, 0x3ff
1970. andi t7, t7, 0x3ff
1971. andi t6, t6, 0x3ff
1972. andi t5, t5, 0x3ff
1973. andi t1, t1, 0x3ff
1974. andi t3, t3, 0x3ff
1975. andi t2, t2, 0x3ff
1976. andi t0, t0, 0x3ff
1977. lw s1, 0(a2) // s1 = this row's output pointer
// clamp all 8 results through range_limit[] (lbux = indexed byte load)
1978. lbux s0, s0(a3)
1979. lbux t7, t7(a3)
1980. lbux t6, t6(a3)
1981. lbux t5, t5(a3)
1982. lbux t1, t1(a3)
1983. lbux t3, t3(a3)
1984. lbux t2, t2(a3)
1985. lbux t0, t0(a3)
1986. sb s0, 0(s1)
1987. sb t7, 1(s1)
1988. sb t6, 2(s1)
1989. sb t5, 3(s1)
1990. sb t1, 4(s1)
1991. sb t3, 5(s1)
1992. sb t2, 6(s1)
1993. sb t0, 7(s1)
// 6: advance to the next row
1994. 6:
1995. addiu v0, v0, 32 // next workspace row (8 x 4 bytes)
1996. bgtz v1, 4b
1997. addiu a2, a2, 4 // (delay slot) next output row pointer
1998. addiu sp, sp, 256 // release workspace
1999. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2000. j ra
2001. nop
2002. END(jsimd_idct_islow_mips_dspr2)
  2003. /*****************************************************************************/
2004. LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
2005. /*
2006. * a0 - inptr
2007. * a1 - quantptr
2008. * a2 - wsptr
2009. * a3 - mips_idct_ifast_coefs
 *
 * Column pass of the fast (AAN) 8x8 inverse DCT.  Two adjacent
 * columns are processed per iteration: each 32-bit load holds a pair
 * of 16-bit coefficients, and the paired SIMD ops (addq.ph/subq.ph/
 * mulq_s.ph) act on both columns at once.  Dequantization uses
 * muleq_s.w.phl/.phr (Q15 multiply on the high/low halfwords), and
 * the four fixed-point constants are fetched from the
 * mips_idct_ifast_coefs table at offsets 0/4/8/12.  A pair of columns
 * whose AC terms are all zero is short-cut to a DC fill.
2010. */
2011. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2012. addiu t9, a0, 16 // end address (8 columns, 2 per 4-byte step)
2013. or AT, a3, zero // AT = coefficient table base
// 0: loop over column pairs
2014. 0:
2015. lw s0, 0(a1) // quantptr[DCTSIZE*0]
2016. lw t0, 0(a0) // inptr[DCTSIZE*0]
2017. lw t1, 16(a0) // inptr[DCTSIZE*1]
2018. muleq_s.w.phl v0, t0, s0 // tmp0 ...
2019. lw t2, 32(a0) // inptr[DCTSIZE*2]
2020. lw t3, 48(a0) // inptr[DCTSIZE*3]
2021. lw t4, 64(a0) // inptr[DCTSIZE*4]
2022. lw t5, 80(a0) // inptr[DCTSIZE*5]
2023. muleq_s.w.phr t0, t0, s0 // ... tmp0 ...
2024. lw t6, 96(a0) // inptr[DCTSIZE*6]
2025. lw t7, 112(a0) // inptr[DCTSIZE*7]
// zero-AC test for both columns at once
2026. or s4, t1, t2
2027. or s5, t3, t4
2028. bnez s4, 1f
2029. ins t0, v0, 16, 16 // (delay slot) ... tmp0
2030. bnez s5, 1f
2031. or s6, t5, t6 // (delay slot) continue OR-ing AC terms
2032. or s6, s6, t7
2033. bnez s6, 1f
2034. sw t0, 0(a2) // (delay slot) wsptr[DCTSIZE*0]
// all AC zero: replicate the dequantized DC pair down both columns
2035. sw t0, 16(a2) // wsptr[DCTSIZE*1]
2036. sw t0, 32(a2) // wsptr[DCTSIZE*2]
2037. sw t0, 48(a2) // wsptr[DCTSIZE*3]
2038. sw t0, 64(a2) // wsptr[DCTSIZE*4]
2039. sw t0, 80(a2) // wsptr[DCTSIZE*5]
2040. sw t0, 96(a2) // wsptr[DCTSIZE*6]
2041. sw t0, 112(a2) // wsptr[DCTSIZE*7]
2042. addiu a0, a0, 4
2043. b 2f
2044. addiu a1, a1, 4 // (delay slot) advance quantptr
// 1: full IDCT on the column pair (even part interleaved with odd)
2045. 1:
2046. lw s1, 32(a1) // quantptr[DCTSIZE*2]
2047. lw s2, 64(a1) // quantptr[DCTSIZE*4]
2048. muleq_s.w.phl v0, t2, s1 // tmp1 ...
2049. muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
2050. lw s0, 16(a1) // quantptr[DCTSIZE*1]
2051. lw s1, 48(a1) // quantptr[DCTSIZE*3]
2052. lw s3, 96(a1) // quantptr[DCTSIZE*6]
2053. muleq_s.w.phl v1, t4, s2 // tmp2 ...
2054. muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
2055. lw s2, 80(a1) // quantptr[DCTSIZE*5]
2056. lw t8, 4(AT) // FIX(1.414213562)
2057. ins t2, v0, 16, 16 // ... tmp1
2058. muleq_s.w.phl v0, t6, s3 // tmp3 ...
2059. muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
2060. ins t4, v1, 16, 16 // ... tmp2
2061. addq.ph s4, t0, t4 // tmp10
2062. subq.ph s5, t0, t4 // tmp11
2063. ins t6, v0, 16, 16 // ... tmp3
2064. subq.ph s6, t2, t6 // tmp12 ...
2065. addq.ph s7, t2, t6 // tmp13
2066. mulq_s.ph s6, s6, t8 // ... tmp12 ...
2067. addq.ph t0, s4, s7 // tmp0
2068. subq.ph t6, s4, s7 // tmp3
2069. muleq_s.w.phl v0, t1, s0 // tmp4 ...
2070. muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
2071. shll_s.ph s6, s6, 1 // x2 (restore scale after Q15 multiply)
2072. lw s3, 112(a1) // quantptr[DCTSIZE*7]
2073. subq.ph s6, s6, s7 // ... tmp12
2074. muleq_s.w.phl v1, t7, s3 // tmp7 ...
2075. muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
2076. ins t1, v0, 16, 16 // ... tmp4
2077. addq.ph t2, s5, s6 // tmp1
2078. subq.ph t4, s5, s6 // tmp2
2079. muleq_s.w.phl v0, t5, s2 // tmp6 ...
2080. muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
2081. ins t7, v1, 16, 16 // ... tmp7
2082. addq.ph s5, t1, t7 // z11
2083. subq.ph s6, t1, t7 // z12
2084. muleq_s.w.phl v1, t3, s1 // tmp5 ...
2085. muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
2086. ins t5, v0, 16, 16 // ... tmp6
2087. ins t3, v1, 16, 16 // ... tmp5
2088. addq.ph s7, t5, t3 // z13
2089. subq.ph v0, t5, t3 // z10
2090. addq.ph t7, s5, s7 // tmp7
2091. subq.ph s5, s5, s7 // tmp11 ...
2092. addq.ph v1, v0, s6 // z5 ...
2093. mulq_s.ph s5, s5, t8 // ... tmp11
2094. lw t8, 8(AT) // FIX(1.847759065)
2095. lw s4, 0(AT) // FIX(1.082392200)
2096. addq.ph s0, t0, t7 // wsptr row 0 result
2097. subq.ph s1, t0, t7 // wsptr row 7 result
2098. mulq_s.ph v1, v1, t8 // ... z5
2099. shll_s.ph s5, s5, 1 // x2
2100. lw t8, 12(AT) // FIX(-2.613125930)
2101. sw s0, 0(a2) // wsptr[DCTSIZE*0]
2102. shll_s.ph v0, v0, 1 // x4 (first of two doublings)
2103. mulq_s.ph v0, v0, t8 // tmp12 ...
2104. mulq_s.ph s4, s6, s4 // tmp10 ...
2105. shll_s.ph v1, v1, 1 // x2
2106. addiu a0, a0, 4
2107. addiu a1, a1, 4
2108. sw s1, 112(a2) // wsptr[DCTSIZE*7]
2109. shll_s.ph s6, v0, 1 // x4 (second doubling)
2110. shll_s.ph s4, s4, 1 // x2
2111. addq.ph s6, s6, v1 // ... tmp12
2112. subq.ph t5, s6, t7 // tmp6
2113. subq.ph s4, s4, v1 // ... tmp10
2114. subq.ph t3, s5, t5 // tmp5
2115. addq.ph s2, t2, t5 // wsptr row 1 result
2116. addq.ph t1, s4, t3 // tmp4
2117. subq.ph s3, t2, t5 // wsptr row 6 result
2118. sw s2, 16(a2) // wsptr[DCTSIZE*1]
2119. sw s3, 96(a2) // wsptr[DCTSIZE*6]
2120. addq.ph v0, t4, t3 // wsptr row 2 result
2121. subq.ph v1, t4, t3 // wsptr row 5 result
2122. sw v0, 32(a2) // wsptr[DCTSIZE*2]
2123. sw v1, 80(a2) // wsptr[DCTSIZE*5]
2124. addq.ph v0, t6, t1 // wsptr row 4 result
2125. subq.ph v1, t6, t1 // wsptr row 3 result
2126. sw v0, 64(a2) // wsptr[DCTSIZE*4]
2127. sw v1, 48(a2) // wsptr[DCTSIZE*3]
// 2: advance to the next column pair
2128. 2:
2129. bne a0, t9, 0b
2130. addiu a2, a2, 4 // (delay slot) advance wsptr
2131. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2132. j ra
2133. nop
2134. END(jsimd_idct_ifast_cols_mips_dspr2)
  2135. /*****************************************************************************/
2136. LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
2137. /*
2138. * a0 - wsptr
2139. * a1 - output_buf
2140. * a2 - output_col
2141. * a3 - mips_idct_ifast_coefs
 *
 * Row pass of the fast (AAN) 8x8 inverse DCT.  Two adjacent workspace
 * rows are processed per iteration using paired-halfword SIMD, then
 * results are packed to bytes with precrq.qb.ph and level-shifted by
 * adding 0x80 to every byte lane (s8 = 0x80808080, byte addition
 * wraps, which converts the signed result to unsigned samples).  Two
 * 8-pixel output rows are written per iteration via the output_buf
 * row pointers.  a3 is saved on the stack and reloaded into AT each
 * iteration because a3 itself is reused as an output pointer.
2142. */
2143. SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2144. addiu t9, a0, 128 // end address (8 rows x 16 bytes each)
2145. lui s8, 0x8080
2146. ori s8, s8, 0x8080 // s8 = per-byte +128 level shift
// 0: loop over row pairs
2147. 0:
2148. lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
2149. lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
2150. lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
2151. lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
2152. lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
2153. lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
2154. lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
2155. lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
2156. lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
// transpose the two rows into per-coefficient pairs
2157. precrq.ph.w t1, s0, t0 // B b
2158. ins t0, s0, 16, 16 // A a
// zero-AC test: t1 holds coefficient 1 of both rows
2159. bnez t1, 1f
2160. or s0, t2, s2 // (delay slot) OR remaining AC terms
2161. bnez s0, 1f
2162. or s0, t4, s4
2163. bnez s0, 1f
2164. or s0, t6, s6
2165. bnez s0, 1f
2166. shll_s.ph s0, t0, 2 // (delay slot) descale DC pair: A a
// all AC zero: both rows are a flat fill of their (shifted) DC
2167. lw a3, 0(a1) // row pointer for row N
2168. lw AT, 4(a1) // row pointer for row N+1
2169. precrq.ph.w t0, s0, s0 // A A
2170. ins s0, s0, 16, 16 // a a
2171. addu a3, a3, a2 // + output_col
2172. addu AT, AT, a2
2173. precrq.qb.ph t0, t0, t0 // A A A A
2174. precrq.qb.ph s0, s0, s0 // a a a a
2175. addu.qb s0, s0, s8 // +128 level shift, per byte
2176. addu.qb t0, t0, s8
2177. sw s0, 0(a3)
2178. sw s0, 4(a3)
2179. sw t0, 0(AT)
2180. sw t0, 4(AT)
2181. addiu a0, a0, 32 // skip the two consumed workspace rows
2182. bne a0, t9, 0b
2183. addiu a1, a1, 8 // (delay slot) skip the two row pointers
2184. b 2f
2185. nop
// 1: full IDCT on the row pair
2186. 1:
// finish the 2-row transpose for coefficients 2..7
2187. precrq.ph.w t3, s2, t2
2188. ins t2, s2, 16, 16
2189. precrq.ph.w t5, s4, t4
2190. ins t4, s4, 16, 16
2191. precrq.ph.w t7, s6, t6
2192. ins t6, s6, 16, 16
2193. lw t8, 4(AT) // FIX(1.414213562)
// even part
2194. addq.ph s4, t0, t4 // tmp10
2195. subq.ph s5, t0, t4 // tmp11
2196. subq.ph s6, t2, t6 // tmp12 ...
2197. addq.ph s7, t2, t6 // tmp13
2198. mulq_s.ph s6, s6, t8 // ... tmp12 ...
2199. addq.ph t0, s4, s7 // tmp0
2200. subq.ph t6, s4, s7 // tmp3
2201. shll_s.ph s6, s6, 1 // x2 (restore scale after Q15 multiply)
2202. subq.ph s6, s6, s7 // ... tmp12
2203. addq.ph t2, s5, s6 // tmp1
2204. subq.ph t4, s5, s6 // tmp2
// odd part
2205. addq.ph s5, t1, t7 // z11
2206. subq.ph s6, t1, t7 // z12
2207. addq.ph s7, t5, t3 // z13
2208. subq.ph v0, t5, t3 // z10
2209. addq.ph t7, s5, s7 // tmp7
2210. subq.ph s5, s5, s7 // tmp11 ...
2211. addq.ph v1, v0, s6 // z5 ...
2212. mulq_s.ph s5, s5, t8 // ... tmp11
2213. lw t8, 8(AT) // FIX(1.847759065)
2214. lw s4, 0(AT) // FIX(1.082392200)
2215. addq.ph s0, t0, t7 // tmp0 + tmp7
2216. subq.ph s7, t0, t7 // tmp0 - tmp7
2217. mulq_s.ph v1, v1, t8 // ... z5
2218. lw a3, 0(a1) // row pointer for row N
2219. lw t8, 12(AT) // FIX(-2.613125930)
2220. shll_s.ph s5, s5, 1 // x2
2221. addu a3, a3, a2 // + output_col
2222. shll_s.ph v0, v0, 1 // x4 (first of two doublings)
2223. mulq_s.ph v0, v0, t8 // tmp12 ...
2224. mulq_s.ph s4, s6, s4 // tmp10 ...
2225. shll_s.ph v1, v1, 1 // x2
2226. addiu a0, a0, 32 // consume the two workspace rows
2227. addiu a1, a1, 8 // consume the two row pointers
2228. shll_s.ph s6, v0, 1 // x4 (second doubling)
2229. shll_s.ph s4, s4, 1 // x2
2230. addq.ph s6, s6, v1 // ... tmp12
2231. shll_s.ph s0, s0, 2 // final descale (saturating)
2232. subq.ph t5, s6, t7 // tmp6
2233. subq.ph s4, s4, v1 // ... tmp10
2234. subq.ph t3, s5, t5 // tmp5
2235. shll_s.ph s7, s7, 2 // final descale
2236. addq.ph t1, s4, t3 // tmp4
2237. addq.ph s1, t2, t5 // tmp1 + tmp6
2238. subq.ph s6, t2, t5 // tmp1 - tmp6
2239. addq.ph s2, t4, t3 // tmp2 + tmp5
2240. subq.ph s5, t4, t3 // tmp2 - tmp5
2241. addq.ph s4, t6, t1 // tmp3 + tmp4
2242. subq.ph s3, t6, t1 // tmp3 - tmp4
// final descale of the remaining six outputs
2243. shll_s.ph s1, s1, 2
2244. shll_s.ph s2, s2, 2
2245. shll_s.ph s3, s3, 2
2246. shll_s.ph s4, s4, 2
2247. shll_s.ph s5, s5, 2
2248. shll_s.ph s6, s6, 2
// transpose back: split the pairs into the two pixel rows
2249. precrq.ph.w t0, s1, s0 // B A
2250. ins s0, s1, 16, 16 // b a
2251. precrq.ph.w t2, s3, s2 // D C
2252. ins s2, s3, 16, 16 // d c
2253. precrq.ph.w t4, s5, s4 // F E
2254. ins s4, s5, 16, 16 // f e
2255. precrq.ph.w t6, s7, s6 // H G
2256. ins s6, s7, 16, 16 // h g
// pack halfwords to bytes
2257. precrq.qb.ph t0, t2, t0 // D C B A
2258. precrq.qb.ph s0, s2, s0 // d c b a
2259. precrq.qb.ph t4, t6, t4 // H G F E
2260. precrq.qb.ph s4, s6, s4 // h g f e
// +128 level shift and store row N
2261. addu.qb s0, s0, s8
2262. addu.qb s4, s4, s8
2263. sw s0, 0(a3) // outptr[0/1/2/3] d c b a
2264. sw s4, 4(a3) // outptr[4/5/6/7] h g f e
2265. lw a3, -4(a1) // row pointer for row N+1 (a1 already advanced)
2266. addu.qb t0, t0, s8
2267. addu a3, a3, a2 // + output_col
2268. addu.qb t4, t4, s8
2269. sw t0, 0(a3) // outptr[0/1/2/3] D C B A
2270. bne a0, t9, 0b
2271. sw t4, 4(a3) // (delay slot) outptr[4/5/6/7] H G F E
2272. 2:
2273. RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2274. j ra
2275. nop
2276. END(jsimd_idct_ifast_rows_mips_dspr2)
  2277. /*****************************************************************************/
2278. LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
2279. /*
2280. * a0 - data
 *
 * Accurate (slow) integer 8x8 forward DCT, in place on 'data'
 * (8x8 array of 16-bit samples).
 * Pass 1 (label 1): one row per iteration.  Packed-halfword constants
 * in t0..t9 feed the DSP accumulators via dpa.w.ph, so each odd-part
 * output is one dual multiply-accumulate chain; results are descaled
 * with extr_r.w (rounded extract).
 * Pass 2 (label 2): one column per iteration, using scalar
 * mult/madd/msub chains with the constants reloaded as plain
 * halfword values (c0..c10 in t0..t9 and a1).
2281. */
2282. SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
// packed constant pairs for the row pass (hi halfword | lo halfword)
2283. lui t0, 6437
2284. ori t0, 2260 // t0 = 6437 | 2260
2285. lui t1, 9633
2286. ori t1, 11363 // t1 = 9633 | 11363
2287. lui t2, 0xd39e
2288. ori t2, 0xe6dc // t2 = -11362 | -6436
2289. lui t3, 0xf72d
2290. ori t3, 9633 // t3 = -2259 | 9633
2291. lui t4, 2261
2292. ori t4, 9633 // t4 = 2261 | 9633
2293. lui t5, 0xd39e
2294. ori t5, 6437 // t5 = -11362 | 6437
2295. lui t6, 9633
2296. ori t6, 0xd39d // t6 = 9633 | -11363
2297. lui t7, 0xe6dc
2298. ori t7, 2260 // t7 = -6436 | 2260
2299. lui t8, 4433
2300. ori t8, 10703 // t8 = 4433 | 10703
2301. lui t9, 0xd630
2302. ori t9, 4433 // t9 = -10704 | 4433
2303. li s8, 8 // row counter
2304. move a1, a0 // a1 = row cursor
// 1: pass 1, row loop
2305. 1:
2306. lw s0, 0(a1) // tmp0 = 1|0
2307. lw s1, 4(a1) // tmp1 = 3|2
2308. lw s2, 8(a1) // tmp2 = 5|4
2309. lw s3, 12(a1) // tmp3 = 7|6
2310. packrl.ph s1, s1, s1 // tmp1 = 2|3
2311. packrl.ph s3, s3, s3 // tmp3 = 6|7
2312. subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
2313. subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
// odd part: four dual MAC chains, one per odd output
2314. mult $0, $0 // ac0 = 0
2315. dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
2316. dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
2317. mult $ac1, $0, $0 // ac1 = 0
2318. dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
2319. dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
2320. mult $ac2, $0, $0 // ac2 = 0
2321. dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
2322. dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
2323. mult $ac3, $0, $0 // ac3 = 0
2324. dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
2325. dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
// even part
2326. addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
2327. addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
2328. extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2329. extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2330. extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
2331. extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
2332. addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
2333. subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
// store odd outputs (data[1], data[3], data[5], data[7])
2334. sh s0, 2(a1)
2335. sh s1, 6(a1)
2336. sh s2, 10(a1)
2337. sh s3, 14(a1)
2338. mult $0, $0 // ac0 = 0
2339. dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
2340. mult $ac1, $0, $0 // ac1 = 0
2341. dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
2342. sra s4, s5, 16 // tmp4 = t11
2343. addiu a1, a1, 16 // next row
2344. addiu s8, s8, -1
2345. extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2346. extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2347. addu s2, s5, s4 // tmp2 = t10 + t11
2348. subu s3, s5, s4 // tmp3 = t10 - t11
2349. sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
2350. sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
// store even outputs (offsets relative to the advanced cursor)
2351. sh s2, -16(a1) // data[0]
2352. sh s3, -8(a1) // data[4]
2353. sh s0, -12(a1) // data[2]
2354. bgtz s8, 1b
2355. sh s1, -4(a1) // (delay slot) data[6]
// scalar constants for the column pass
2356. li t0, 2260 // c0
2357. li t1, 11363 // c1
2358. li t2, 9633 // c2
2359. li t3, 6436 // c3
2360. li t4, 6437 // c4
2361. li t5, 2261 // c5
2362. li t6, 11362 // c6
2363. li t7, 2259 // c7
2364. li t8, 4433 // c8
2365. li t9, 10703 // c9
2366. li a1, 10704 // c10
2367. li s8, 8 // column counter
// 2: pass 2, column loop
2368. 2:
2369. lh a2, 0(a0) // 0
2370. lh a3, 16(a0) // 8
2371. lh v0, 32(a0) // 16
2372. lh v1, 48(a0) // 24
2373. lh s4, 64(a0) // 32
2374. lh s5, 80(a0) // 40
2375. lh s6, 96(a0) // 48
2376. lh s7, 112(a0) // 56
2377. addu s2, v0, s5 // tmp2 = 16 + 40
2378. subu s5, v0, s5 // tmp5 = 16 - 40
2379. addu s3, v1, s4 // tmp3 = 24 + 32
2380. subu s4, v1, s4 // tmp4 = 24 - 32
2381. addu s0, a2, s7 // tmp0 = 0 + 56
2382. subu s7, a2, s7 // tmp7 = 0 - 56
2383. addu s1, a3, s6 // tmp1 = 8 + 48
2384. subu s6, a3, s6 // tmp6 = 8 - 48
2385. addu a2, s0, s3 // tmp10 = tmp0 + tmp3
2386. subu v1, s0, s3 // tmp13 = tmp0 - tmp3
2387. addu a3, s1, s2 // tmp11 = tmp1 + tmp2
2388. subu v0, s1, s2 // tmp12 = tmp1 - tmp2
// odd part: four scalar MAC chains on ac0..ac3
2389. mult s7, t1 // ac0 = tmp7 * c1
2390. madd s4, t0 // ac0 += tmp4 * c0
2391. madd s5, t4 // ac0 += tmp5 * c4
2392. madd s6, t2 // ac0 += tmp6 * c2
2393. mult $ac1, s7, t2 // ac1 = tmp7 * c2
2394. msub $ac1, s4, t3 // ac1 -= tmp4 * c3
2395. msub $ac1, s5, t6 // ac1 -= tmp5 * c6
2396. msub $ac1, s6, t7 // ac1 -= tmp6 * c7
2397. mult $ac2, s7, t4 // ac2 = tmp7 * c4
2398. madd $ac2, s4, t2 // ac2 += tmp4 * c2
2399. madd $ac2, s5, t5 // ac2 += tmp5 * c5
2400. msub $ac2, s6, t6 // ac2 -= tmp6 * c6
2401. mult $ac3, s7, t0 // ac3 = tmp7 * c0
2402. msub $ac3, s4, t1 // ac3 -= tmp4 * c1
2403. madd $ac3, s5, t2 // ac3 += tmp5 * c2
2404. msub $ac3, s6, t3 // ac3 -= tmp6 * c3
2405. extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
2406. extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
2407. extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
2408. extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
2409. addiu s8, s8, -1
2410. addu s4, a2, a3 // tmp4 = tmp10 + tmp11
2411. subu s5, a2, a3 // tmp5 = tmp10 - tmp11
// store odd outputs (rows 1, 3, 5, 7 of this column)
2412. sh s0, 16(a0)
2413. sh s1, 48(a0)
2414. sh s2, 80(a0)
2415. sh s3, 112(a0)
2416. mult v0, t8 // ac0 = tmp12 * c8
2417. madd v1, t9 // ac0 += tmp13 * c9
2418. mult $ac1, v1, t8 // ac1 = tmp13 * c8
2419. msub $ac1, v0, a1 // ac1 -= tmp12 * c10
2420. addiu a0, a0, 2 // next column
2421. extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
2422. extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
2423. shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
2424. shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
// store even outputs (offsets relative to the advanced cursor)
2425. sh s4, -2(a0) // row 0
2426. sh s5, 62(a0) // row 4
2427. sh s6, 30(a0) // row 2
2428. bgtz s8, 2b
2429. sh s7, 94(a0) // (delay slot) row 6
2430. RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2431. jr ra
2432. nop
2433. END(jsimd_fdct_islow_mips_dspr2)
  2434. /*****************************************************************************/
2435. LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
2436. /*
2437. * a0 - data
 *
 * Fast (AAN) integer 8x8 forward DCT, in place on 'data'.
 * The four AAN constants are kept as packed halfword pairs in
 * a1/a2/a3/s1 (value replicated in both halves) so they can drive
 * dual multiply-accumulates (dpa.w.ph) in the row pass; the column
 * pass extracts the low halfword with 'andi' for scalar 'mul's.
 * Pass 0 (label 0): one row per iteration, 16 bytes per step.
 * Pass 1 (label 1): one column per iteration, 2 bytes per step.
2438. */
2439. .set at
2440. SAVE_REGS_ON_STACK 8, s0, s1
2441. li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
2442. li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
2443. li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
2444. li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
2445. move v0, a0 // v0 = row cursor
2446. addiu v1, v0, 128 // end address (8 rows x 16 bytes)
// 0: row pass loop
2447. 0:
2448. lw t0, 0(v0) // tmp0 = 1|0
2449. lw t1, 4(v0) // tmp1 = 3|2
2450. lw t2, 8(v0) // tmp2 = 5|4
2451. lw t3, 12(v0) // tmp3 = 7|6
2452. packrl.ph t1, t1, t1 // tmp1 = 2|3
2453. packrl.ph t3, t3, t3 // tmp3 = 6|7
2454. subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4
2455. subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7
2456. addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3
2457. addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0
2458. addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10
2459. subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13
2460. sra t4, t8, 16 // tmp4 = t11
// four MAC chains feed the four DSP accumulators in parallel
2461. mult $0, $0 // ac0 = 0
2462. dpa.w.ph $ac0, t9, s1 // ac0 += t12*181 + t13*181
2463. mult $ac1, $0, $0 // ac1 = 0
2464. dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98
2465. dpsx.w.ph $ac1, t5, a3 // ac1 += t6*98 + t7*98
2466. mult $ac2, $0, $0 // ac2 = 0
2467. dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139
2468. mult $ac3, $0, $0 // ac3 = 0
2469. dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334
2470. precrq.ph.w t0, t5, t7 // t0 = t5|t6
2471. addq.ph t2, t8, t4 // tmp2 = t10 + t11
2472. subq.ph t3, t8, t4 // tmp3 = t10 - t11
2473. extr.w t4, $ac0, 8 // z1 (truncating extract, Q8 descale)
2474. mult $0, $0 // ac0 = 0
2475. dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181
2476. extr.w t0, $ac1, 8 // t0 = z5
2477. extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139)
2478. extr.w t7, $ac3, 8 // t2 = MULTIPLY(tmp12, 334)
2479. extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181)
2480. add t6, t1, t0 // t6 = z2
2481. add t7, t7, t0 // t7 = z4
2482. subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3
2483. addq.ph t8, t5, t8 // t9 = z11 = tmp7 + z3
2484. addq.ph t1, t0, t6 // t1 = z13 + z2
2485. subq.ph t6, t0, t6 // t6 = z13 - z2
2486. addq.ph t0, t8, t7 // t0 = z11 + z4
2487. subq.ph t7, t8, t7 // t7 = z11 - z4
2488. addq.ph t5, t4, t9 // tmp13 + z1
2489. subq.ph t4, t9, t4 // tmp13 - z1
// store the 8 row outputs
2490. sh t2, 0(v0) // data[0]
2491. sh t5, 4(v0) // data[2]
2492. sh t3, 8(v0) // data[4]
2493. sh t4, 12(v0) // data[6]
2494. sh t1, 10(v0) // data[5]
2495. sh t6, 6(v0) // data[3]
2496. sh t0, 2(v0) // data[1]
2497. sh t7, 14(v0) // data[7]
2498. addiu v0, 16 // next row
2499. bne v1, v0, 0b
2500. nop
// column pass setup
2501. move v0, a0 // v0 = column cursor
2502. addiu v1, v0, 16 // end address (8 columns x 2 bytes)
// 1: column pass loop
2503. 1:
2504. lh t0, 0(v0) // 0
2505. lh t1, 16(v0) // 8
2506. lh t2, 32(v0) // 16
2507. lh t3, 48(v0) // 24
2508. lh t4, 64(v0) // 32
2509. lh t5, 80(v0) // 40
2510. lh t6, 96(v0) // 48
2511. lh t7, 112(v0) // 56
2512. add t8, t0, t7 // t8 = tmp0
2513. sub t7, t0, t7 // t7 = tmp7
2514. add t0, t1, t6 // t0 = tmp1
2515. sub t1, t1, t6 // t1 = tmp6
2516. add t6, t2, t5 // t6 = tmp2
2517. sub t5, t2, t5 // t5 = tmp5
2518. add t2, t3, t4 // t2 = tmp3
2519. sub t3, t3, t4 // t3 = tmp4
// even part
2520. add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3
2521. sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3
2522. sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2
2523. ins t8, s0, 16, 16 // t8 = tmp12|tmp13
2524. add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2
2525. mult $0, $0 // ac0 = 0
2526. dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181
2527. add s0, t4, t2 // t8 = tmp10+tmp11
2528. sub t4, t4, t2 // t4 = tmp10-tmp11
2529. sh s0, 0(v0) // data[0]
2530. sh t4, 64(v0) // data[32]
2531. extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
2532. addq.ph t4, t8, t2 // t9 = tmp13 + z1
2533. subq.ph t8, t8, t2 // t2 = tmp13 - z1
2534. sh t4, 32(v0) // data[16]
2535. sh t8, 96(v0) // data[48]
// odd part
2536. add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5
2537. add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6
2538. add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7
2539. andi t4, a1, 0xffff // low halfword of packed constant = 334
2540. mul s0, t1, t4
2541. sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
2542. ins t1, t3, 16, 16 // t1 = tmp10|tmp12
2543. mult $0, $0 // ac0 = 0
2544. mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98
2545. extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
2546. add t2, t7, t8 // t2 = tmp7 + z5
2547. sub t7, t7, t8 // t7 = tmp7 - z5
2548. andi t4, a2, 0xffff // = 139
2549. mul t8, t3, t4
2550. sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
2551. andi t4, s1, 0xffff // = 181
2552. mul t6, t0, t4
2553. sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
2554. add t0, t6, t8 // t0 = z3 + z2
2555. sub t1, t6, t8 // t1 = z3 - z2
2556. add t3, t6, s0 // t3 = z3 + z4
2557. sub t4, t6, s0 // t4 = z3 - z4
2558. sub t5, t2, t1 // t5 = dataptr[5]
2559. sub t6, t7, t0 // t6 = dataptr[3]
2560. add t3, t2, t3 // t3 = dataptr[1]
2561. add t4, t7, t4 // t4 = dataptr[7]
// store the odd column outputs
2562. sh t5, 80(v0)
2563. sh t6, 48(v0)
2564. sh t3, 16(v0)
2565. sh t4, 112(v0)
2566. addiu v0, 2 // next column
2567. bne v0, v1, 1b
2568. nop
2569. RESTORE_REGS_FROM_STACK 8, s0, s1
2570. j ra
2571. nop
2572. END(jsimd_fdct_ifast_mips_dspr2)
  2573. /*****************************************************************************/
2574. LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
2575. /*
2576. * a0 - coef_block
2577. * a1 - divisors
2578. * a2 - workspace
2579. */
/*
 * Quantizes the 64 integer DCT coefficients in the workspace and stores
 * the signed results into coef_block, two coefficients per iteration.
 * The loop is software-pipelined: loads for the next pair overlap the
 * stores for the current pair, and the final pair is handled by the
 * drained copy of the loop body after the branch.
 *
 * Per coefficient:
 *   sign = (value >> 15) * 2 + 1            // -1 if negative, else +1
 *   temp = |value| + correction
 *   temp = (temp * reciprocal) >> (shift + 16)
 *   out  = temp * sign
 *
 * NOTE(review): the divisors-table layout (reciprocals at byte offset 0,
 * corrections at +128, shift counts at +384) is inferred from the load
 * offsets below - confirm against the C code that builds this table.
 */
2580. .set at
2581. SAVE_REGS_ON_STACK 16, s0, s1, s2
2582. addiu v0, a2, 124 // v0 = workspace_end
2583. lh t0, 0(a2) // t0 = workspace[0]
2584. lh t1, 0(a1) // t1 = reciprocal[0]
2585. lh t2, 128(a1) // t2 = correction[0]
2586. sra t3, t0, 15 // t3 = 0 or -1 (sign bits of t0)
2587. sll t3, t3, 1
2588. addiu t3, t3, 1 // t3 = +1 or -1
2589. mul t0, t0, t3 // t0 = |workspace[0]|
2590. lh t4, 384(a1) // t4 = shift[0]
2591. lh t5, 130(a1) // t5 = correction[1]
2592. lh t6, 2(a2) // t6 = workspace[1]
2593. lh t7, 2(a1) // t7 = reciprocal[1]
2594. lh t8, 386(a1) // t8 = shift[1]
2595. 1:
2596. andi t1, 0xffff
2597. add t9, t0, t2 // t9 = |value| + correction
2598. andi t9, 0xffff
2599. mul v1, t9, t1 // v1 = temp * reciprocal
2600. sra s0, t6, 15 // s0 = sign mask of workspace[1]
2601. sll s0, s0, 1
2602. addiu s0, s0, 1 // s0 = +1 or -1
2603. addiu t9, t4, 16 // t9 = shift + 16
2604. srav v1, v1, t9
2605. mul v1, v1, t3 // reapply sign of first coefficient
2606. mul t6, t6, s0 // t6 = |workspace[1]|
2607. andi t7, 0xffff
2608. addiu a2, a2, 4
2609. addiu a1, a1, 4
2610. add s1, t6, t5 // second coefficient: add correction
2611. andi s1, 0xffff
2612. sh v1, 0(a0)
2613. mul s2, s1, t7
2614. addiu s1, t8, 16
2615. srav s2, s2, s1
2616. mul s2,s2, s0 // reapply sign of second coefficient
// preload the next pair of coefficients and their divisors
2617. lh t0, 0(a2)
2618. lh t1, 0(a1)
2619. sra t3, t0, 15
2620. sll t3, t3, 1
2621. addiu t3, t3, 1
2622. mul t0, t0, t3
2623. lh t2, 128(a1)
2624. lh t4, 384(a1)
2625. lh t5, 130(a1)
2626. lh t8, 386(a1)
2627. lh t6, 2(a2)
2628. lh t7, 2(a1)
2629. sh s2, 2(a0)
2630. lh t0, 0(a2)
2631. sra t3, t0, 15
2632. sll t3, t3, 1
2633. addiu t3, t3, 1
2634. mul t0, t0,t3
2635. bne a2, v0, 1b
2636. addiu a0, a0, 4 // (branch delay slot) advance output pointer
// drained loop body: quantize the final pair of coefficients
2637. andi t1, 0xffff
2638. add t9, t0, t2
2639. andi t9, 0xffff
2640. mul v1, t9, t1
2641. sra s0, t6, 15
2642. sll s0, s0, 1
2643. addiu s0, s0, 1
2644. addiu t9, t4, 16
2645. srav v1, v1, t9
2646. mul v1, v1, t3
2647. mul t6, t6, s0
2648. andi t7, 0xffff
2649. sh v1, 0(a0)
2650. add s1, t6, t5
2651. andi s1, 0xffff
2652. mul s2, s1, t7
2653. addiu s1, t8, 16
2654. addiu a2, a2, 4
2655. addiu a1, a1, 4
2656. srav s2, s2, s1
2657. mul s2, s2, s0
2658. sh s2, 2(a0)
2659. RESTORE_REGS_FROM_STACK 16, s0, s1, s2
2660. j ra
2661. nop
2662. END(jsimd_quantize_mips_dspr2)
  2663. /*****************************************************************************/
2664. LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
2665. /*
2666. * a0 - coef_block
2667. * a1 - divisors
2668. * a2 - workspace
2669. */
/*
 * Quantizes 64 single-precision DCT coefficients: each workspace value
 * is multiplied by its float divisor, rounded, and stored as a signed
 * 16-bit coefficient.  Rounding uses a bias trick: f0 holds 16384.5, so
 * trunc(16384.5 + v*d) - 16384 rounds v*d to the nearest integer
 * without changing the FPU rounding mode (valid for the coefficient
 * magnitudes produced by the FDCT).  Eight coefficients are processed
 * per iteration; t0 counts 63 down by 8 and the loop runs while >= 0.
 */
2670. .set at
2671. li t1, 0x46800100 //integer representation 16384.5
2672. mtc1 t1, f0
2673. li t0, 63 // loop counter: 64 coefficients, 8 per iteration
2674. 0:
2675. lwc1 f2, 0(a2)
2676. lwc1 f10, 0(a1)
2677. lwc1 f4, 4(a2)
2678. lwc1 f12, 4(a1)
2679. lwc1 f6, 8(a2)
2680. lwc1 f14, 8(a1)
2681. lwc1 f8, 12(a2)
2682. lwc1 f16, 12(a1)
2683. madd.s f2, f0, f2, f10 // f2 = 16384.5 + value*divisor
2684. madd.s f4, f0, f4, f12
2685. madd.s f6, f0, f6, f14
2686. madd.s f8, f0, f8, f16
2687. lwc1 f10, 16(a1)
2688. lwc1 f12, 20(a1)
2689. trunc.w.s f2, f2 // truncate biased result toward zero
2690. trunc.w.s f4, f4
2691. trunc.w.s f6, f6
2692. trunc.w.s f8, f8
2693. lwc1 f14, 24(a1)
2694. lwc1 f16, 28(a1)
2695. mfc1 t1, f2
2696. mfc1 t2, f4
2697. mfc1 t3, f6
2698. mfc1 t4, f8
2699. lwc1 f2, 16(a2)
2700. lwc1 f4, 20(a2)
2701. lwc1 f6, 24(a2)
2702. lwc1 f8, 28(a2)
2703. madd.s f2, f0, f2, f10
2704. madd.s f4, f0, f4, f12
2705. madd.s f6, f0, f6, f14
2706. madd.s f8, f0, f8, f16
2707. addiu t1, t1, -16384 // remove the rounding bias
2708. addiu t2, t2, -16384
2709. addiu t3, t3, -16384
2710. addiu t4, t4, -16384
2711. trunc.w.s f2, f2
2712. trunc.w.s f4, f4
2713. trunc.w.s f6, f6
2714. trunc.w.s f8, f8
2715. sh t1, 0(a0)
2716. sh t2, 2(a0)
2717. sh t3, 4(a0)
2718. sh t4, 6(a0)
2719. mfc1 t1, f2
2720. mfc1 t2, f4
2721. mfc1 t3, f6
2722. mfc1 t4, f8
2723. addiu t0, t0, -8
2724. addiu a2, a2, 32
2725. addiu a1, a1, 32
2726. addiu t1, t1, -16384
2727. addiu t2, t2, -16384
2728. addiu t3, t3, -16384
2729. addiu t4, t4, -16384
2730. sh t1, 8(a0)
2731. sh t2, 10(a0)
2732. sh t3, 12(a0)
2733. sh t4, 14(a0)
2734. bgez t0, 0b
2735. addiu a0, a0, 16 // (branch delay slot) advance output pointer
2736. j ra
2737. nop
2738. END(jsimd_quantize_float_mips_dspr2)
  2739. /*****************************************************************************/
2740. LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
2741. /*
2742. * a0 - compptr->dct_table
2743. * a1 - coef_block
2744. * a2 - output_buf
2745. * a3 - output_col
2746. */
/*
 * Reduced-size inverse DCT: computes a 2x2 output block from an 8x8
 * coefficient block.  Only coefficient rows 0, 1, 3, 5, 7 are loaded
 * (the even rows 2/4/6 and the unread columns do not contribute to a
 * 2x2 output).  Pass 1 dequantizes and processes five coefficient
 * columns into a 40-byte stack workspace; pass 2 combines workspace
 * values with the multipliers in s2..s5, descales with rounding
 * (shra_r.w), saturates to 8 signed bits (shll_s.w/sra by 24), biases
 * by +128 and stores two bytes to each of the two output rows.
 */
2747. .set at
2748. SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
2749. addiu sp, sp, -40 // 10-word column workspace
2750. move v0, sp
// pass-2 fixed-point multiplier constants
2751. addiu s2, zero, 29692
2752. addiu s3, zero, -10426
2753. addiu s4, zero, 6967
2754. addiu s5, zero, -5906
// Pass 1: columns 0, 1, 3, 5, 7 (software-pipelined; each column's
// odd terms are accumulated into ac0 via packed dpa.w.ph dot products)
2755. lh t0, 0(a1) // t0 = inptr[DCTSIZE*0]
2756. lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0]
2757. lh t1, 48(a1) // t1 = inptr[DCTSIZE*3]
2758. lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3]
2759. mul t4, t5, t0
2760. lh t0, 16(a1) // t0 = inptr[DCTSIZE*1]
2761. lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1]
2762. mul t6, t6, t1
2763. mul t5, t5, t0
2764. lh t2, 80(a1) // t2 = inptr[DCTSIZE*5]
2765. lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5]
2766. lh t3, 112(a1) // t3 = inptr[DCTSIZE*7]
2767. lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7]
2768. mul t7, t7, t2
2769. mult zero, zero // clear ac0
2770. mul t8, t8, t3
2771. li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff)
2772. li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff)
2773. ins t6, t5, 16, 16 // t6 = t5|t6
2774. sll t4, t4, 15 // DC term scaled up
2775. dpa.w.ph $ac0, t6, s0
2776. lh t1, 2(a1)
2777. lh t6, 2(a0)
2778. ins t8, t7, 16, 16 // t8 = t7|t8
2779. dpa.w.ph $ac0, t8, s1
2780. mflo t0, $ac0
2781. mul t5, t6, t1
2782. lh t1, 18(a1)
2783. lh t6, 18(a0)
2784. lh t2, 50(a1)
2785. lh t7, 50(a0)
2786. mul t6, t6, t1
2787. subu t8, t4, t0
2788. mul t7, t7, t2
2789. addu t0, t4, t0
2790. shra_r.w t0, t0, 13 // descale with rounding
2791. lh t1, 82(a1)
2792. lh t2, 82(a0)
2793. lh t3, 114(a1)
2794. lh t4, 114(a0)
2795. shra_r.w t8, t8, 13
2796. mul t1, t1, t2
2797. mul t3, t3, t4
2798. sw t0, 0(v0)
2799. sw t8, 20(v0)
// column 1
2800. sll t4, t5, 15
2801. ins t7, t6, 16, 16
2802. mult zero, zero
2803. dpa.w.ph $ac0, t7, s0
2804. ins t3, t1, 16, 16
2805. lh t1, 6(a1)
2806. lh t6, 6(a0)
2807. dpa.w.ph $ac0, t3, s1
2808. mflo t0, $ac0
2809. mul t5, t6, t1
2810. lh t1, 22(a1)
2811. lh t6, 22(a0)
2812. lh t2, 54(a1)
2813. lh t7, 54(a0)
2814. mul t6, t6, t1
2815. subu t8, t4, t0
2816. mul t7, t7, t2
2817. addu t0, t4, t0
2818. shra_r.w t0, t0, 13
2819. lh t1, 86(a1)
2820. lh t2, 86(a0)
2821. lh t3, 118(a1)
2822. lh t4, 118(a0)
2823. shra_r.w t8, t8, 13
2824. mul t1, t1, t2
2825. mul t3, t3, t4
2826. sw t0, 4(v0)
2827. sw t8, 24(v0)
// column 3
2828. sll t4, t5, 15
2829. ins t7, t6, 16, 16
2830. mult zero, zero
2831. dpa.w.ph $ac0, t7, s0
2832. ins t3, t1, 16, 16
2833. lh t1, 10(a1)
2834. lh t6, 10(a0)
2835. dpa.w.ph $ac0, t3, s1
2836. mflo t0, $ac0
2837. mul t5, t6, t1
2838. lh t1, 26(a1)
2839. lh t6, 26(a0)
2840. lh t2, 58(a1)
2841. lh t7, 58(a0)
2842. mul t6, t6, t1
2843. subu t8, t4, t0
2844. mul t7, t7, t2
2845. addu t0, t4, t0
2846. shra_r.w t0, t0, 13
2847. lh t1, 90(a1)
2848. lh t2, 90(a0)
2849. lh t3, 122(a1)
2850. lh t4, 122(a0)
2851. shra_r.w t8, t8, 13
2852. mul t1, t1, t2
2853. mul t3, t3, t4
2854. sw t0, 8(v0)
2855. sw t8, 28(v0)
// column 5
2856. sll t4, t5, 15
2857. ins t7, t6, 16, 16
2858. mult zero, zero
2859. dpa.w.ph $ac0, t7, s0
2860. ins t3, t1, 16, 16
2861. lh t1, 14(a1)
2862. lh t6, 14(a0)
2863. dpa.w.ph $ac0, t3, s1
2864. mflo t0, $ac0
2865. mul t5, t6, t1
2866. lh t1, 30(a1)
2867. lh t6, 30(a0)
2868. lh t2, 62(a1)
2869. lh t7, 62(a0)
2870. mul t6, t6, t1
2871. subu t8, t4, t0
2872. mul t7, t7, t2
2873. addu t0, t4, t0
2874. shra_r.w t0, t0, 13
2875. lh t1, 94(a1)
2876. lh t2, 94(a0)
2877. lh t3, 126(a1)
2878. lh t4, 126(a0)
2879. shra_r.w t8, t8, 13
2880. mul t1, t1, t2
2881. mul t3, t3, t4
2882. sw t0, 12(v0)
2883. sw t8, 32(v0)
// column 7
2884. sll t4, t5, 15
2885. ins t7, t6, 16, 16
2886. mult zero, zero
2887. dpa.w.ph $ac0, t7, s0
2888. ins t3, t1, 16, 16
2889. dpa.w.ph $ac0, t3, s1
2890. mflo t0, $ac0
// Pass 2: combine workspace values into the two output rows
2891. lw t9, 0(a2)
2892. lw t3, 0(v0)
2893. lw t7, 4(v0)
2894. lw t1, 8(v0)
2895. addu t9, t9, a3 // outptr = output_buf[0] + output_col
2896. sll t3, t3, 15
2897. subu t8, t4, t0
2898. addu t0, t4, t0
2899. shra_r.w t0, t0, 13
2900. shra_r.w t8, t8, 13
2901. sw t0, 16(v0)
2902. sw t8, 36(v0)
2903. lw t5, 12(v0)
2904. lw t6, 16(v0)
2905. mult t7, s2 // row 0 odd-part accumulation in ac0
2906. madd t1, s3
2907. madd t5, s4
2908. madd t6, s5
2909. lw t5, 24(v0)
2910. lw t7, 28(v0)
2911. mflo t0, $ac0
2912. lw t8, 32(v0)
2913. lw t2, 36(v0)
2914. mult $ac1, t5, s2 // row 1 odd-part accumulation in ac1
2915. madd $ac1, t7, s3
2916. madd $ac1, t8, s4
2917. madd $ac1, t2, s5
2918. addu t1, t3, t0
2919. subu t6, t3, t0
2920. shra_r.w t1, t1, 20
2921. shra_r.w t6, t6, 20
2922. mflo t4, $ac1
2923. shll_s.w t1, t1, 24 // saturate to 8 signed bits ...
2924. shll_s.w t6, t6, 24
2925. sra t1, t1, 24 // ... then shift back down
2926. sra t6, t6, 24
2927. addiu t1, t1, 128 // level shift to unsigned sample range
2928. addiu t6, t6, 128
2929. lw t0, 20(v0)
2930. sb t1, 0(t9)
2931. sb t6, 1(t9)
2932. sll t0, t0, 15
2933. lw t9, 4(a2)
2934. addu t1, t0, t4
2935. subu t6, t0, t4
2936. addu t9, t9, a3 // outptr = output_buf[1] + output_col
2937. shra_r.w t1, t1, 20
2938. shra_r.w t6, t6, 20
2939. shll_s.w t1, t1, 24
2940. shll_s.w t6, t6, 24
2941. sra t1, t1, 24
2942. sra t6, t6, 24
2943. addiu t1, t1, 128
2944. addiu t6, t6, 128
2945. sb t1, 0(t9)
2946. sb t6, 1(t9)
2947. addiu sp, sp, 40
2948. RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
2949. j ra
2950. nop
2951. END(jsimd_idct_2x2_mips_dspr2)
  2952. /*****************************************************************************/
2953. LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
2954. /*
2955. * a0 - compptr->dct_table
2956. * a1 - coef_block
2957. * a2 - output_buf
2958. * a3 - output_col
2959. * 16(sp) - workspace[DCTSIZE*4]; // buffers data between passes
2960. */
/*
 * Reduced-size inverse DCT: computes a 4x4 output block from an 8x8
 * coefficient block.  Coefficient row 4 (inptr[DCTSIZE*4]) is never
 * loaded - it does not contribute to a 4x4 output.  Pass 1 dequantizes
 * and processes the coefficient columns into the caller-provided
 * workspace in two software-pipelined loops (counters t9 = 4 and 3);
 * pass 2 then computes the four output rows, descaling with rounding,
 * saturating each sample to 8 signed bits and biasing by +128.
 * s0..s3 hold packed halfword multiplier pairs for the odd-part
 * dpa.w.ph dot products.
 */
2961. .set at
2962. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2963. lw v1, 48(sp) // v1 = workspace (stack argument)
2964. move t0, a1
2965. move t1, v1
2966. li t9, 4 // first pass-1 loop counter
2967. li s0, 0x2e75f93e
2968. li s1, 0x21f9ba79
2969. li s2, 0xecc2efb0
2970. li s3, 0x52031ccd
2971. 0:
2972. lh s6, 32(t0) // inptr[DCTSIZE*2]
2973. lh t6, 32(a0) // quantptr[DCTSIZE*2]
2974. lh s7, 96(t0) // inptr[DCTSIZE*6]
2975. lh t7, 96(a0) // quantptr[DCTSIZE*6]
2976. mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
2977. lh s4, 0(t0) // inptr[DCTSIZE*0]
2978. mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
2979. lh s5, 0(a0) // quantptr[0]
2980. li s6, 15137 // FIX_1_847759065
2981. li s7, 6270 // FIX_0_765366865
2982. mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
2983. mul t6, s6, t6 // MULTIPLY(z2, FIX_1_847759065)
2984. lh t5, 112(t0) // inptr[DCTSIZE*7]
2985. mul t7, s7, t7 // MULTIPLY(z3, FIX_0_765366865)
2986. lh s4, 112(a0) // quantptr[DCTSIZE*7]
2987. lh v0, 80(t0) // inptr[DCTSIZE*5]
2988. lh s5, 80(a0) // quantptr[DCTSIZE*5]
2989. lh s6, 48(a0) // quantptr[DCTSIZE*3]
2990. sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
2991. lh s7, 16(a0) // quantptr[DCTSIZE*1]
2992. lh t8, 16(t0) // inptr[DCTSIZE*1]
2993. subu t6, t6, t7 // tmp2 = MULTIPLY(z2, FIX_1_847759065) - MULTIPLY(z3, FIX_0_765366865)
2994. lh t7, 48(t0) // inptr[DCTSIZE*3]
2995. mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
2996. mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
2997. mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
2998. mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
2999. addu t3, t2, t6 // tmp10 = tmp0 + tmp2
3000. subu t4, t2, t6 // tmp12 = tmp0 - tmp2
3001. mult $ac0, zero, zero
3002. mult $ac1, zero, zero
3003. ins t5, v0, 16, 16
3004. ins t7, t8, 16, 16
3005. addiu t9, t9, -1
3006. dpa.w.ph $ac0, t5, s0
3007. dpa.w.ph $ac0, t7, s1
3008. dpa.w.ph $ac1, t5, s2
3009. dpa.w.ph $ac1, t7, s3
3010. mflo s4, $ac0
3011. mflo s5, $ac1
3012. addiu a0, a0, 2
3013. addiu t1, t1, 4
3014. addiu t0, t0, 2
3015. addu t6, t4, s4
3016. subu t5, t4, s4
3017. addu s6, t3, s5
3018. subu s7, t3, s5
3019. shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12)
3020. shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12)
3021. shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3022. shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3023. sw t6, 28(t1)
3024. sw t5, 60(t1)
3025. sw s6, -4(t1)
3026. bgtz t9, 0b
3027. sw s7, 92(t1)
3028. // pass 1, second loop: remaining three columns
3029. li t9, 3
3030. 1:
3031. lh s6, 34(t0) // inptr[DCTSIZE*2]
3032. lh t6, 34(a0) // quantptr[DCTSIZE*2]
3033. lh s7, 98(t0) // inptr[DCTSIZE*6]
3034. lh t7, 98(a0) // quantptr[DCTSIZE*6]
3035. mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3036. lh s4, 2(t0) // inptr[DCTSIZE*0]
3037. mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3038. lh s5, 2(a0) // quantptr[DCTSIZE*0]
3039. li s6, 15137 // FIX_1_847759065
3040. li s7, 6270 // FIX_0_765366865
3041. mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3042. mul v0, s6, t6 // MULTIPLY(z2, FIX_1_847759065)
3043. lh t5, 114(t0) // inptr[DCTSIZE*7]
3044. mul t7, s7, t7 // MULTIPLY(z3, FIX_0_765366865)
3045. lh s4, 114(a0) // quantptr[DCTSIZE*7]
3046. lh s5, 82(a0) // quantptr[DCTSIZE*5]
3047. lh t6, 82(t0) // inptr[DCTSIZE*5]
3048. sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3049. lh s6, 50(a0) // quantptr[DCTSIZE*3]
3050. lh t8, 18(t0) // inptr[DCTSIZE*1]
3051. subu v0, v0, t7 // tmp2 = MULTIPLY(z2, FIX_1_847759065) - MULTIPLY(z3, FIX_0_765366865)
3052. lh t7, 50(t0) // inptr[DCTSIZE*3]
3053. lh s7, 18(a0) // quantptr[DCTSIZE*1]
3054. mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3055. mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3056. mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3057. mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3058. addu t3, t2, v0 // tmp10 = tmp0 + tmp2
3059. subu t4, t2, v0 // tmp12 = tmp0 - tmp2
3060. mult $ac0, zero, zero
3061. mult $ac1, zero, zero
3062. ins t5, t6, 16, 16
3063. ins t7, t8, 16, 16
3064. dpa.w.ph $ac0, t5, s0
3065. dpa.w.ph $ac0, t7, s1
3066. dpa.w.ph $ac1, t5, s2
3067. dpa.w.ph $ac1, t7, s3
3068. mflo t5, $ac0
3069. mflo t6, $ac1
3070. addiu t9, t9, -1
3071. addiu t0, t0, 2
3072. addiu a0, a0, 2
3073. addiu t1, t1, 4
3074. addu s5, t4, t5
3075. subu s4, t4, t5
3076. addu s6, t3, t6
3077. subu s7, t3, t6
3078. shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12)
3079. shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12)
3080. shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3081. shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3082. sw s5, 32(t1)
3083. sw s4, 64(t1)
3084. sw s6, 0(t1)
3085. bgtz t9, 1b
3086. sw s7, 96(t1)
// Pass 2: row 0
3087. move t1, v1
3088. li s4, 15137
3089. lw s6, 8(t1) // wsptr[2]
3090. li s5, 6270
3091. lw s7, 24(t1) // wsptr[6]
3092. mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
3093. lw t2, 0(t1) // wsptr[0]
3094. mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
3095. lh t5, 28(t1) // wsptr[7]
3096. lh t6, 20(t1) // wsptr[5]
3097. lh t7, 12(t1) // wsptr[3]
3098. lh t8, 4(t1) // wsptr[1]
3099. ins t5, t6, 16, 16
3100. ins t7, t8, 16, 16
3101. mult $ac0, zero, zero
3102. dpa.w.ph $ac0, t5, s0
3103. dpa.w.ph $ac0, t7, s1
3104. mult $ac1, zero, zero
3105. dpa.w.ph $ac1, t5, s2
3106. dpa.w.ph $ac1, t7, s3
3107. sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
3108. mflo s6, $ac0
3109. // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3110. subu s4, s4, s5
3111. addu t3, t2, s4 // tmp10 = tmp0 + z2
3112. mflo s7, $ac1
3113. subu t4, t2, s4 // tmp12 = tmp0 - z2
3114. addu t7, t4, s6
3115. subu t8, t4, s6
3116. addu t5, t3, s7
3117. subu t6, t3, s7
3118. shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3119. shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3120. shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3121. shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3122. sll s4, t9, 2 // NOTE(review): s4 is overwritten before use - looks dead
3123. lw v0, 0(a2) // output_buf[ctr]
3124. shll_s.w t5, t5, 24 // saturate to 8 signed bits
3125. shll_s.w t6, t6, 24
3126. shll_s.w t7, t7, 24
3127. shll_s.w t8, t8, 24
3128. sra t5, t5, 24
3129. sra t6, t6, 24
3130. sra t7, t7, 24
3131. sra t8, t8, 24
3132. addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3133. addiu t5, t5, 128 // level shift to unsigned sample range
3134. addiu t6, t6, 128
3135. addiu t7, t7, 128
3136. addiu t8, t8, 128
3137. sb t5, 0(v0)
3138. sb t7, 1(v0)
3139. sb t8, 2(v0)
3140. sb t6, 3(v0)
3141. // row 2
3142. li s4, 15137
3143. lw s6, 40(t1) // wsptr[2]
3144. li s5, 6270
3145. lw s7, 56(t1) // wsptr[6]
3146. mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
3147. lw t2, 32(t1) // wsptr[0]
3148. mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
3149. lh t5, 60(t1) // wsptr[7]
3150. lh t6, 52(t1) // wsptr[5]
3151. lh t7, 44(t1) // wsptr[3]
3152. lh t8, 36(t1) // wsptr[1]
3153. ins t5, t6, 16, 16
3154. ins t7, t8, 16, 16
3155. mult $ac0, zero, zero
3156. dpa.w.ph $ac0, t5, s0
3157. dpa.w.ph $ac0, t7, s1
3158. mult $ac1, zero, zero
3159. dpa.w.ph $ac1, t5, s2
3160. dpa.w.ph $ac1, t7, s3
3161. sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
3162. mflo s6, $ac0
3163. // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3164. subu s4, s4, s5
3165. addu t3, t2, s4 // tmp10 = tmp0 + z2
3166. mflo s7, $ac1
3167. subu t4, t2, s4 // tmp12 = tmp0 - z2
3168. addu t7, t4, s6
3169. subu t8, t4, s6
3170. addu t5, t3, s7
3171. subu t6, t3, s7
3172. shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
3173. shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
3174. shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
3175. shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
3176. sll s4, t9, 2 // NOTE(review): s4 is overwritten before use - looks dead
3177. lw v0, 4(a2) // output_buf[ctr]
3178. shll_s.w t5, t5, 24
3179. shll_s.w t6, t6, 24
3180. shll_s.w t7, t7, 24
3181. shll_s.w t8, t8, 24
3182. sra t5, t5, 24
3183. sra t6, t6, 24
3184. sra t7, t7, 24
3185. sra t8, t8, 24
3186. addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3187. addiu t5, t5, 128
3188. addiu t6, t6, 128
3189. addiu t7, t7, 128
3190. addiu t8, t8, 128
3191. sb t5, 0(v0)
3192. sb t7, 1(v0)
3193. sb t8, 2(v0)
3194. sb t6, 3(v0)
3195. // row 3
3196. li s4, 15137
3197. lw s6, 72(t1) // wsptr[2]
3198. li s5, 6270
3199. lw s7, 88(t1) // wsptr[6]
3200. mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
3201. lw t2, 64(t1) // wsptr[0]
3202. mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
3203. lh t5, 92(t1) // wsptr[7]
3204. lh t6, 84(t1) // wsptr[5]
3205. lh t7, 76(t1) // wsptr[3]
3206. lh t8, 68(t1) // wsptr[1]
3207. ins t5, t6, 16, 16
3208. ins t7, t8, 16, 16
3209. mult $ac0, zero, zero
3210. dpa.w.ph $ac0, t5, s0
3211. dpa.w.ph $ac0, t7, s1
3212. mult $ac1, zero, zero
3213. dpa.w.ph $ac1, t5, s2
3214. dpa.w.ph $ac1, t7, s3
3215. sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
3216. mflo s6, $ac0
3217. // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3218. subu s4, s4, s5
3219. addu t3, t2, s4 // tmp10 = tmp0 + z2
3220. mflo s7, $ac1
3221. subu t4, t2, s4 // tmp12 = tmp0 - z2
3222. addu t7, t4, s6
3223. subu t8, t4, s6
3224. addu t5, t3, s7
3225. subu t6, t3, s7
3226. shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3227. shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3228. shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3229. shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3230. sll s4, t9, 2 // NOTE(review): s4 is overwritten before use - looks dead
3231. lw v0, 8(a2) // output_buf[ctr]
3232. shll_s.w t5, t5, 24
3233. shll_s.w t6, t6, 24
3234. shll_s.w t7, t7, 24
3235. shll_s.w t8, t8, 24
3236. sra t5, t5, 24
3237. sra t6, t6, 24
3238. sra t7, t7, 24
3239. sra t8, t8, 24
3240. addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3241. addiu t5, t5, 128
3242. addiu t6, t6, 128
3243. addiu t7, t7, 128
3244. addiu t8, t8, 128
3245. sb t5, 0(v0)
3246. sb t7, 1(v0)
3247. sb t8, 2(v0)
3248. sb t6, 3(v0)
// row 4 (last)
3249. li s4, 15137
3250. lw s6, 104(t1) // wsptr[2]
3251. li s5, 6270
3252. lw s7, 120(t1) // wsptr[6]
3253. mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
3254. lw t2, 96(t1) // wsptr[0]
3255. mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865)
3256. lh t5, 124(t1) // wsptr[7]
3257. lh t6, 116(t1) // wsptr[5]
3258. lh t7, 108(t1) // wsptr[3]
3259. lh t8, 100(t1) // wsptr[1]
3260. ins t5, t6, 16, 16
3261. ins t7, t8, 16, 16
3262. mult $ac0, zero, zero
3263. dpa.w.ph $ac0, t5, s0
3264. dpa.w.ph $ac0, t7, s1
3265. mult $ac1, zero, zero
3266. dpa.w.ph $ac1, t5, s2
3267. dpa.w.ph $ac1, t7, s3
3268. sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
3269. mflo s6, $ac0
3270. // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3271. subu s4, s4, s5
3272. addu t3, t2, s4 // tmp10 = tmp0 + z2;
3273. mflo s7, $ac1
3274. subu t4, t2, s4 // tmp12 = tmp0 - z2;
3275. addu t7, t4, s6
3276. subu t8, t4, s6
3277. addu t5, t3, s7
3278. subu t6, t3, s7
3279. shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3280. shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3281. shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3282. shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3283. sll s4, t9, 2 // NOTE(review): s4 is overwritten before use - looks dead
3284. lw v0, 12(a2) // output_buf[ctr]
3285. shll_s.w t5, t5, 24
3286. shll_s.w t6, t6, 24
3287. shll_s.w t7, t7, 24
3288. shll_s.w t8, t8, 24
3289. sra t5, t5, 24
3290. sra t6, t6, 24
3291. sra t7, t7, 24
3292. sra t8, t8, 24
3293. addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3294. addiu t5, t5, 128
3295. addiu t6, t6, 128
3296. addiu t7, t7, 128
3297. addiu t8, t8, 128
3298. sb t5, 0(v0)
3299. sb t7, 1(v0)
3300. sb t8, 2(v0)
3301. sb t6, 3(v0)
3302. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3303. j ra
3304. nop
3305. END(jsimd_idct_4x4_mips_dspr2)
  3306. /*****************************************************************************/
3307. LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
3308. /*
3309. * a0 - compptr->dct_table
3310. * a1 - coef_block
3311. * a2 - output_buf
3312. * a3 - output_col
3313. */
/*
 * Reduced-size inverse DCT: computes a 6x6 output block from an 8x8
 * coefficient block.  Pass 1 dequantizes and processes 6 coefficient
 * columns (rows 0..5 of each column) into a 144-byte stack workspace;
 * pass 2 processes the 6 workspace rows, saturating each sample to 8
 * signed bits and biasing by +128 before the byte stores.  t9/s0/s1
 * hold the fixed-point multipliers 5793, 10033 and 2998 used in both
 * passes.
 */
3314. .set at
3315. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3316. addiu sp, sp, -144 // 36-word workspace (6 columns x 6 rows)
3317. move v0, sp
3318. addiu v1, v0, 24 // pass-1 loop bound: 6 columns x 4 bytes
3319. addiu t9, zero, 5793
3320. addiu s0, zero, 10033
3321. addiu s1, zero, 2998
3322. 1:
3323. lh s2, 0(a0) // q0 = quantptr[ 0]
3324. lh s3, 32(a0) // q1 = quantptr[16]
3325. lh s4, 64(a0) // q2 = quantptr[32]
3326. lh t2, 64(a1) // tmp2 = inptr[32]
3327. lh t1, 32(a1) // tmp1 = inptr[16]
3328. lh t0, 0(a1) // tmp0 = inptr[ 0]
3329. mul t2, t2, s4 // tmp2 = tmp2 * q2
3330. mul t1, t1, s3 // tmp1 = tmp1 * q1
3331. mul t0, t0, s2 // tmp0 = tmp0 * q0
3332. lh t6, 16(a1) // z1 = inptr[ 8]
3333. lh t8, 80(a1) // z3 = inptr[40]
3334. lh t7, 48(a1) // z2 = inptr[24]
3335. lh s2, 16(a0) // q0 = quantptr[ 8]
3336. lh s4, 80(a0) // q2 = quantptr[40]
3337. lh s3, 48(a0) // q1 = quantptr[24]
3338. mul t2, t2, t9 // tmp2 = tmp2 * 5793
3339. mul t1, t1, s0 // tmp1 = tmp1 * 10033
3340. sll t0, t0, 13 // tmp0 = tmp0 << 13
3341. mul t6, t6, s2 // z1 = z1 * q0
3342. mul t8, t8, s4 // z3 = z3 * q2
3343. mul t7, t7, s3 // z2 = z2 * q1
3344. addu t3, t0, t2 // tmp10 = tmp0 + tmp2
3345. sll t2, t2, 1 // tmp2 = tmp2 << 1
3346. subu t4, t0, t2 // tmp11 = tmp0 - tmp2;
3347. subu t5, t3, t1 // tmp12 = tmp10 - tmp1
3348. addu t3, t3, t1 // tmp10 = tmp10 + tmp1
3349. addu t1, t6, t8 // tmp1 = z1 + z3
3350. mul t1, t1, s1 // tmp1 = tmp1 * 2998
3351. shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3352. subu t2, t6, t8 // tmp2 = z1 - z3
3353. subu t2, t2, t7 // tmp2 = tmp2 - z2
3354. sll t2, t2, 2 // tmp2 = tmp2 << 2
3355. addu t0, t6, t7 // tmp0 = z1 + z2
3356. sll t0, t0, 13 // tmp0 = tmp0 << 13
3357. subu s2, t8, t7 // q0 = z3 - z2
3358. sll s2, s2, 13 // q0 = q0 << 13
3359. addu t0, t0, t1 // tmp0 = tmp0 + tmp1
3360. addu t1, s2, t1 // tmp1 = q0 + tmp1
3361. addu s2, t4, t2 // q0 = tmp11 + tmp2
3362. subu s3, t4, t2 // q1 = tmp11 - tmp2
3363. addu t6, t3, t0 // z1 = tmp10 + tmp0
3364. subu t7, t3, t0 // z2 = tmp10 - tmp0
3365. addu t4, t5, t1 // tmp11 = tmp12 + tmp1
3366. subu t5, t5, t1 // tmp12 = tmp12 - tmp1
3367. shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11
3368. shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11
3369. shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3370. shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11
3371. sw s2, 24(v0)
3372. sw s3, 96(v0)
3373. sw t6, 0(v0)
3374. sw t7, 120(v0)
3375. sw t4, 48(v0)
3376. sw t5, 72(v0)
3377. addiu v0, v0, 4
3378. addiu a1, a1, 2
3379. bne v0, v1, 1b
3380. addiu a0, a0, 2 // (branch delay slot) advance quantptr
3381. /* Pass 2: process 6 rows from work array, store into output array. */
3382. move v0, sp
3383. addiu v1, v0, 144
3384. 2:
3385. lw t0, 0(v0)
3386. lw t2, 16(v0)
3387. lw s5, 0(a2)
3388. addiu t0, t0, 16 // +16 = rounding bias (2^17 after << 13, half the final 2^18 descale)
3389. sll t0, t0, 13
3390. mul t3, t2, t9
3391. lw t6, 4(v0)
3392. lw t8, 20(v0)
3393. lw t7, 12(v0)
3394. addu s5, s5, a3 // outptr = output_buf[row] + output_col
3395. addu s6, t6, t8
3396. mul s6, s6, s1
3397. addu t1, t0, t3
3398. subu t4, t0, t3
3399. subu t4, t4, t3
3400. lw t3, 8(v0)
3401. mul t0, t3, s0
3402. addu s7, t6, t7
3403. sll s7, s7, 13
3404. addu s7, s6, s7
3405. subu t2, t8, t7
3406. sll t2, t2, 13
3407. addu t2, s6, t2
3408. subu s6, t6, t7
3409. subu s6, s6, t8
3410. sll s6, s6, 13
3411. addu t3, t1, t0
3412. subu t5, t1, t0
3413. addu t6, t3, s7
3414. subu t3, t3, s7
3415. addu t7, t4, s6
3416. subu t4, t4, s6
3417. addu t8, t5, t2
3418. subu t5, t5, t2
3419. shll_s.w t6, t6, 6 // saturating shift: clamps to 8 signed bits after >> 24
3420. shll_s.w t3, t3, 6
3421. shll_s.w t7, t7, 6
3422. shll_s.w t4, t4, 6
3423. shll_s.w t8, t8, 6
3424. shll_s.w t5, t5, 6
3425. sra t6, t6, 24
3426. addiu t6, t6, 128 // level shift to unsigned sample range
3427. sra t3, t3, 24
3428. addiu t3, t3, 128
3429. sb t6, 0(s5)
3430. sra t7, t7, 24
3431. addiu t7, t7, 128
3432. sb t3, 5(s5)
3433. sra t4, t4, 24
3434. addiu t4, t4, 128
3435. sb t7, 1(s5)
3436. sra t8, t8, 24
3437. addiu t8, t8, 128
3438. sb t4, 4(s5)
3439. addiu v0, v0, 24
3440. sra t5, t5, 24
3441. addiu t5, t5, 128
3442. sb t8, 2(s5)
3443. addiu a2, a2, 4
3444. bne v0, v1, 2b
3445. sb t5, 3(s5)
3446. addiu sp, sp, 144
3447. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3448. j ra
3449. nop
3450. END(jsimd_idct_6x6_mips_dspr2)
  3451. /*****************************************************************************/
3452. LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
3453. /*
3454. * a0 - compptr->dct_table
3455. * a1 - coef_block
3456. * a2 - workspace
3457. */
/*
 * Pass 1 of the 12x12 scaled inverse DCT: processes the 8 coefficient
 * columns (a3 counts 8 down to 0), producing 12 intermediate values per
 * column into the workspace (row stride = 32 bytes).  The odd part is
 * computed first, the even part is interleaved with its tail, and each
 * result is descaled by >> 11 before the store.  Pass 2 (the companion
 * routine below) turns the workspace rows into output samples.
 */
3458. SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3459. li a3, 8 // column counter
3460. 1:
3461. // odd part
3462. lh t0, 48(a1)
3463. lh t1, 48(a0)
3464. lh t2, 16(a1)
3465. lh t3, 16(a0)
3466. lh t4, 80(a1)
3467. lh t5, 80(a0)
3468. lh t6, 112(a1)
3469. lh t7, 112(a0)
3470. mul t0, t0, t1 // z2
3471. mul t1, t2, t3 // z1
3472. mul t2, t4, t5 // z3
3473. mul t3, t6, t7 // z4
3474. li t4, 10703 // FIX(1.306562965)
3475. li t5, 4433 // FIX_0_541196100
3476. li t6, 7053 // FIX(0.860918669)
3477. mul t4, t0,t4 // tmp11
3478. mul t5, t0,t5 // -tmp14
3479. addu t7, t1,t2 // tmp10
3480. addu t8, t7,t3 // tmp10 + z4
3481. mul t6, t6, t8 // tmp15
3482. li t8, 2139 // FIX(0.261052384)
3483. mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384))
3484. li t7, 2295 // FIX(0.280143716)
3485. mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716))
3486. addu t9, t2, t3 // z3 + z4
3487. li s0, 8565 // FIX(1.045510580)
3488. mul t9, t9, s0 // -tmp13
3489. li s0, 12112 // FIX(1.478575242)
3490. mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3491. li s1, 12998 // FIX(1.586706681)
3492. mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3493. li s2, 5540 // FIX(0.676326758)
3494. mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3495. li s3, 16244 // FIX(1.982889723)
3496. mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3497. subu t1, t1, t3 // z1-=z4
3498. subu t0, t0, t2 // z2-=z3
3499. addu t2, t0, t1 // z1+z2
3500. li t3, 4433 // FIX_0_541196100
3501. mul t2, t2, t3 // z3
3502. li t3, 6270 // FIX_0_765366865
3503. mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3504. li t3, 15137 // FIX_1_847759065
3505. mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3506. addu t8, t6, t8 // tmp12
3507. addu t3, t8, t4 // tmp12 + tmp11
3508. addu t3, t3, t7 // tmp10
3509. subu t8, t8, t9 // tmp12 + tmp13
3510. addu s0, t5, s0
3511. subu t8, t8, s0 // tmp12
3512. subu t9, t6, t9
3513. subu s1, s1, t4
3514. addu t9, t9, s1 // tmp13
3515. subu t6, t6, t5
3516. subu t6, t6, s2
3517. subu t6, t6, s3 // tmp15
3518. // even part start
3519. lh t4, 64(a1)
3520. lh t5, 64(a0)
3521. lh t7, 32(a1)
3522. lh s0, 32(a0)
3523. lh s1, 0(a1)
3524. lh s2, 0(a0)
3525. lh s3, 96(a1)
3526. lh v0, 96(a0)
3527. mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
3528. mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
3529. mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
3530. mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
3531. // odd part end
3532. addu t1, t2, t1 // tmp11
3533. subu t0, t2, t0 // tmp14
3534. // update counter and pointers
3535. addiu a3, a3, -1
3536. addiu a0, a0, 2
3537. addiu a1, a1, 2
3538. // even part rest
3539. li s1, 10033
3540. li s2, 11190
3541. mul t4, t4, s1 // z4
3542. mul s1, t5, s2 // z4
3543. sll t5, t5, 13 // z1
3544. sll t7, t7, 13
3545. addiu t7, t7, 1024 // z3 (includes +1024 rounding bias for the final >> 11)
3546. sll s0, s0, 13 // z2
3547. addu s2, t7, t4 // tmp10
3548. subu t4, t7, t4 // tmp11
3549. subu s3, t5, s0 // tmp12
3550. addu t2, t7, s3 // tmp21
3551. subu s3, t7, s3 // tmp24
3552. addu t7, s1, s0 // tmp12
3553. addu v0, s2, t7 // tmp20
3554. subu s2, s2, t7 // tmp25
3555. subu s1, s1, t5 // z4 - z1
3556. subu s1, s1, s0 // tmp12
3557. addu s0, t4, s1 // tmp22
3558. subu t4, t4, s1 // tmp23
3559. // final output stage
3560. addu t5, v0, t3
3561. subu v0, v0, t3
3562. addu t3, t2, t1
3563. subu t2, t2, t1
3564. addu t1, s0, t8
3565. subu s0, s0, t8
3566. addu t8, t4, t9
3567. subu t4, t4, t9
3568. addu t9, s3, t0
3569. subu s3, s3, t0
3570. addu t0, s2, t6
3571. subu s2, s2, t6
3572. sra t5, t5, 11 // descale all 12 results by >> 11
3573. sra t3, t3, 11
3574. sra t1, t1, 11
3575. sra t8, t8, 11
3576. sra t9, t9, 11
3577. sra t0, t0, 11
3578. sra s2, s2, 11
3579. sra s3, s3, 11
3580. sra t4, t4, 11
3581. sra s0, s0, 11
3582. sra t2, t2, 11
3583. sra v0, v0, 11
3584. sw t5, 0(a2)
3585. sw t3, 32(a2)
3586. sw t1, 64(a2)
3587. sw t8, 96(a2)
3588. sw t9, 128(a2)
3589. sw t0, 160(a2)
3590. sw s2, 192(a2)
3591. sw s3, 224(a2)
3592. sw t4, 256(a2)
3593. sw s0, 288(a2)
3594. sw t2, 320(a2)
3595. sw v0, 352(a2)
3596. bgtz a3, 1b
3597. addiu a2, a2, 4 // (branch delay slot) advance workspace column
3598. RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3599. j ra
3600. nop
3601. END(jsimd_idct_12x12_pass1_mips_dspr2)
  3602. /*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
/*
 * Second pass of the 12x12 inverse DCT.
 *
 * a0 - workspace (32-bit intermediate coefficients from pass 1;
 *      8 words = 32 bytes per row)
 * a1 - output    (array of row pointers; one pointer is loaded from
 *                 0(a1) per iteration and 12 bytes stored through it)
 *
 * For each of the 12 workspace rows: compute the 1-D IDCT, descale
 * with signed saturation, level-shift by +0x80, and store 12 output
 * bytes.  s0-s3 are saved/restored on the stack; also clobbers
 * t0-t9 and v0.
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li        a3, 12                  // a3 = row counter (12 iterations)
1:
    // Odd part: z1/z2/z3/z4 from workspace words 1, 3, 5, 7
    lw        t0, 12(a0)              // t0 = z2
    lw        t1, 4(a0)               // t1 = z1
    lw        t2, 20(a0)              // t2 = z3
    lw        t3, 28(a0)              // t3 = z4
    li        t4, 10703               // FIX(1.306562965)
    li        t5, 4433                // FIX_0_541196100
    mul       t4, t0, t4              // tmp11
    mul       t5, t0, t5              // -tmp14
    addu      t6, t1, t2              // tmp10
    li        t7, 2139                // FIX(0.261052384)
    mul       t7, t6, t7              // MULTIPLY(tmp10, FIX(0.261052384))
    addu      t6, t6, t3              // tmp10 + z4
    li        t8, 7053                // FIX(0.860918669)
    mul       t6, t6, t8              // tmp15
    li        t8, 2295                // FIX(0.280143716)
    mul       t8, t1, t8              // MULTIPLY(z1, FIX(0.280143716))
    addu      t9, t2, t3              // z3 + z4
    li        s0, 8565                // FIX(1.045510580)
    mul       t9, t9, s0              // -tmp13
    li        s0, 12112               // FIX(1.478575242)
    mul       s0, t2, s0              // MULTIPLY(z3, FIX(1.478575242))
    li        s1, 12998               // FIX(1.586706681)
    mul       s1, t3, s1              // MULTIPLY(z4, FIX(1.586706681))
    li        s2, 5540                // FIX(0.676326758)
    mul       s2, t1, s2              // MULTIPLY(z1, FIX(0.676326758))
    li        s3, 16244               // FIX(1.982889723)
    mul       s3, t3, s3              // MULTIPLY(z4, FIX(1.982889723))
    subu      t1, t1, t3              // z1 -= z4
    subu      t0, t0, t2              // z2 -= z3
    addu      t2, t1, t0              // z1 + z2
    li        t3, 4433                // FIX_0_541196100
    mul       t2, t2, t3              // z3
    li        t3, 6270                // FIX_0_765366865
    mul       t1, t1, t3              // MULTIPLY(z1, FIX_0_765366865)
    li        t3, 15137               // FIX_1_847759065
    mul       t0, t0, t3              // MULTIPLY(z2, FIX_1_847759065)
    // combine the odd-part products into tmp10..tmp15
    addu      t3, t6, t7              // tmp12
    addu      t7, t3, t4
    addu      t7, t7, t8              // tmp10
    subu      t3, t3, t9
    subu      t3, t3, t5
    subu      t3, t3, s0              // tmp12
    subu      t9, t6, t9
    subu      t9, t9, t4
    addu      t9, t9, s1              // tmp13
    subu      t6, t6, t5
    subu      t6, t6, s2
    subu      t6, t6, s3              // tmp15
    addu      t1, t2, t1              // tmp11
    subu      t0, t2, t0              // tmp14
    // Even part: z1/z2/z3/z4 from workspace words 0, 2, 4, 6
    lw        t2, 16(a0)              // z4
    lw        t4, 8(a0)               // z1
    lw        t5, 0(a0)               // z3
    lw        t8, 24(a0)              // z2
    li        s0, 10033               // FIX(1.224744871)
    li        s1, 11190               // FIX(1.366025404)
    mul       t2, t2, s0              // z4
    mul       s0, t4, s1              // z4
    addiu     t5, t5, 0x10            // rounding bias, applied before <<13 so
                                      // it rounds the final descale shift
    sll       t5, t5, 13              // z3
    sll       t4, t4, 13              // z1
    sll       t8, t8, 13              // z2
    subu      s1, t4, t8              // tmp12
    addu      s2, t5, t2              // tmp10
    subu      t2, t5, t2              // tmp11
    addu      s3, t5, s1              // tmp21
    subu      s1, t5, s1              // tmp24
    addu      t5, s0, t8              // tmp12
    addu      v0, s2, t5              // tmp20
    subu      t5, s2, t5              // tmp25
    subu      t4, s0, t4
    subu      t4, t4, t8              // tmp12
    addu      t8, t2, t4              // tmp22
    subu      t2, t2, t4              // tmp23
    // increment counter and pointers (next 32-byte workspace row)
    addiu     a3, a3, -1
    addiu     a0, a0, 32
    // Final stage: out[k] = tmp2k + tmp1k, out[11-k] = tmp2k - tmp1k
    addu      t4, v0, t7              // out0  = tmp20 + tmp10
    subu      v0, v0, t7              // out11 = tmp20 - tmp10
    addu      t7, s3, t1              // out1  = tmp21 + tmp11
    subu      s3, s3, t1              // out10 = tmp21 - tmp11
    addu      t1, t8, t3              // out2  = tmp22 + tmp12
    subu      t8, t8, t3              // out9  = tmp22 - tmp12
    addu      t3, t2, t9              // out3  = tmp23 + tmp13
    subu      t2, t2, t9              // out8  = tmp23 - tmp13
    addu      t9, s1, t0              // out4  = tmp24 + tmp14
    subu      s1, s1, t0              // out7  = tmp24 - tmp14
    addu      t0, t5, t6              // out5  = tmp25 + tmp15
    subu      t5, t5, t6              // out6  = tmp25 - tmp15
    // Descale: <<4 followed by a saturating <<2 (shll_s.w clamps to
    // the signed 32-bit range), then srl 24 keeps the high byte
    sll       t4, t4, 4
    sll       t7, t7, 4
    sll       t1, t1, 4
    sll       t3, t3, 4
    sll       t9, t9, 4
    sll       t0, t0, 4
    sll       t5, t5, 4
    sll       s1, s1, 4
    sll       t2, t2, 4
    sll       t8, t8, 4
    sll       s3, s3, 4
    sll       v0, v0, 4
    shll_s.w  t4, t4, 2
    shll_s.w  t7, t7, 2
    shll_s.w  t1, t1, 2
    shll_s.w  t3, t3, 2
    shll_s.w  t9, t9, 2
    shll_s.w  t0, t0, 2
    shll_s.w  t5, t5, 2
    shll_s.w  s1, s1, 2
    shll_s.w  t2, t2, 2
    shll_s.w  t8, t8, 2
    shll_s.w  s3, s3, 2
    shll_s.w  v0, v0, 2
    srl       t4, t4, 24
    srl       t7, t7, 24
    srl       t1, t1, 24
    srl       t3, t3, 24
    srl       t9, t9, 24
    srl       t0, t0, 24
    srl       t5, t5, 24
    srl       s1, s1, 24
    srl       t2, t2, 24
    srl       t8, t8, 24
    srl       s3, s3, 24
    srl       v0, v0, 24
    lw        t6, 0(a1)               // t6 = current output row pointer
    // level shift: +0x80 recenters samples to the unsigned byte range
    addiu     t4, t4, 0x80
    addiu     t7, t7, 0x80
    addiu     t1, t1, 0x80
    addiu     t3, t3, 0x80
    addiu     t9, t9, 0x80
    addiu     t0, t0, 0x80
    addiu     t5, t5, 0x80
    addiu     s1, s1, 0x80
    addiu     t2, t2, 0x80
    addiu     t8, t8, 0x80
    addiu     s3, s3, 0x80
    addiu     v0, v0, 0x80
    sb        t4, 0(t6)
    sb        t7, 1(t6)
    sb        t1, 2(t6)
    sb        t3, 3(t6)
    sb        t9, 4(t6)
    sb        t0, 5(t6)
    sb        t5, 6(t6)
    sb        s1, 7(t6)
    sb        t2, 8(t6)
    sb        t8, 9(t6)
    sb        s3, 10(t6)
    sb        v0, 11(t6)
    bgtz      a3, 1b                  // loop until all 12 rows are done
    addiu     a1, a1, 4               // (delay slot) next output row pointer

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    jr        ra
    nop
END(jsimd_idct_12x12_pass2_mips_dspr2)
  3770. /*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
/*
 * Convert one 8x8 block of unsigned 8-bit samples to centered
 * 16-bit values (sample - 128), two samples per operation.
 *
 * a0 - sample_data (array of 8 row pointers)
 * a1 - start_col   (byte offset added to each row pointer)
 * a2 - workspace   (output: 64 x 16-bit values, 16 bytes per row)
 *
 * Each row: load 8 bytes (ulw/usw tolerate unaligned addresses),
 * zero-extend byte pairs to halfwords (preceu.ph.qbr/qbl), then add
 * the packed constant 0xff80 (= -128) to both halfwords at once
 * with addu.ph.  The rows are software-pipelined: loads for row N+1
 * are interleaved with the stores for row N.  Clobbers t0-t7.
 */
    lw        t0, 0(a0)               // t0 = row 0 pointer
    li        t7, 0xff80ff80          // two packed halfwords of -128
    addu      t0, t0, a1              // advance to start_col
    ulw       t1, 0(t0)               // row 0, bytes 0..3 (unaligned-safe)
    ulw       t2, 4(t0)               // row 0, bytes 4..7
    preceu.ph.qbr t3, t1              // zero-extend low  byte pair -> halfwords
    preceu.ph.qbl t4, t1              // zero-extend high byte pair -> halfwords
    lw        t0, 4(a0)               // t0 = row 1 pointer (pipelined)
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7              // subtract 128 from both halfwords
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)               // row 1, bytes 0..3
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 0(a2)               // store row 0
    usw       t4, 4(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw       t5, 8(a2)
    usw       t6, 12(a2)
    lw        t0, 8(a0)               // t0 = row 2 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)               // row 2, bytes 0..3
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 16(a2)              // store row 1
    usw       t4, 20(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw       t5, 24(a2)
    usw       t6, 28(a2)
    lw        t0, 12(a0)              // t0 = row 3 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)               // row 3, bytes 0..3
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 32(a2)              // store row 2
    usw       t4, 36(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw       t5, 40(a2)
    usw       t6, 44(a2)
    lw        t0, 16(a0)              // t0 = row 4 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)               // row 4, bytes 0..3
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 48(a2)              // store row 3
    usw       t4, 52(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw       t5, 56(a2)
    usw       t6, 60(a2)
    lw        t0, 20(a0)              // t0 = row 5 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)               // row 5, bytes 0..3
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 64(a2)              // store row 4
    usw       t4, 68(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw       t5, 72(a2)
    usw       t6, 76(a2)
    lw        t0, 24(a0)              // t0 = row 6 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)               // row 6, bytes 0..3
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 80(a2)              // store row 5
    usw       t4, 84(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw       t5, 88(a2)
    usw       t6, 92(a2)
    lw        t0, 28(a0)              // t0 = row 7 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)               // row 7, bytes 0..3
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 96(a2)              // store row 6
    usw       t4, 100(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw       t5, 104(a2)
    usw       t6, 108(a2)
    preceu.ph.qbr t5, t2              // row 7: no further row to pipeline
    preceu.ph.qbl t6, t2
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 112(a2)             // store row 7
    usw       t4, 116(a2)
    usw       t5, 120(a2)
    usw       t6, 124(a2)

    j         ra
    nop
END(jsimd_convsamp_mips_dspr2)
  3909. /*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
/*
 * Convert one 8x8 block of unsigned 8-bit samples to centered
 * single-precision floats (sample - 128).
 *
 * a0 - sample_data (array of 8 row pointers)
 * a1 - start_col   (byte offset added to each row pointer)
 * a2 - workspace   (output: 64 floats, 32 bytes per row)
 *
 * Fully unrolled, one group per row.  Each group: load 8 bytes,
 * subtract 128 in integer registers, move to the FPU (mtc1),
 * convert to float (cvt.s.w), and store (swc1).  The next row's
 * pointer load/add is hoisted between the converts and the stores.
 * Clobbers t0-t8 and f2-f16 (even FP registers only).
 */
    .set at                           // allow $at for assembler macro expansion

    // row 0: load 8 samples, center, convert, store
    lw        t0, 0(a0)               // t0 = row 0 pointer
    addu      t0, t0, a1              // advance to start_col
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128            // center: sample - 128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2                  // move ints to FPU registers
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2                  // int32 -> float32
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 4(a0)               // t0 = row 1 pointer (pipelined)
    swc1      f2, 0(a2)
    swc1      f4, 4(a2)
    swc1      f6, 8(a2)
    addu      t0, t0, a1
    swc1      f8, 12(a2)
    swc1      f10, 16(a2)
    swc1      f12, 20(a2)
    swc1      f14, 24(a2)
    swc1      f16, 28(a2)
    // row 1
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 8(a0)               // t0 = row 2 pointer
    swc1      f2, 32(a2)
    swc1      f4, 36(a2)
    swc1      f6, 40(a2)
    addu      t0, t0, a1
    swc1      f8, 44(a2)
    swc1      f10, 48(a2)
    swc1      f12, 52(a2)
    swc1      f14, 56(a2)
    swc1      f16, 60(a2)
    // row 2
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 12(a0)              // t0 = row 3 pointer
    swc1      f2, 64(a2)
    swc1      f4, 68(a2)
    swc1      f6, 72(a2)
    addu      t0, t0, a1
    swc1      f8, 76(a2)
    swc1      f10, 80(a2)
    swc1      f12, 84(a2)
    swc1      f14, 88(a2)
    swc1      f16, 92(a2)
    // row 3
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 16(a0)              // t0 = row 4 pointer
    swc1      f2, 96(a2)
    swc1      f4, 100(a2)
    swc1      f6, 104(a2)
    addu      t0, t0, a1
    swc1      f8, 108(a2)
    swc1      f10, 112(a2)
    swc1      f12, 116(a2)
    swc1      f14, 120(a2)
    swc1      f16, 124(a2)
    // row 4
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 20(a0)              // t0 = row 5 pointer
    swc1      f2, 128(a2)
    swc1      f4, 132(a2)
    swc1      f6, 136(a2)
    addu      t0, t0, a1
    swc1      f8, 140(a2)
    swc1      f10, 144(a2)
    swc1      f12, 148(a2)
    swc1      f14, 152(a2)
    swc1      f16, 156(a2)
    // row 5
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 24(a0)              // t0 = row 6 pointer
    swc1      f2, 160(a2)
    swc1      f4, 164(a2)
    swc1      f6, 168(a2)
    addu      t0, t0, a1
    swc1      f8, 172(a2)
    swc1      f10, 176(a2)
    swc1      f12, 180(a2)
    swc1      f14, 184(a2)
    swc1      f16, 188(a2)
    // row 6
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 28(a0)              // t0 = row 7 pointer
    swc1      f2, 192(a2)
    swc1      f4, 196(a2)
    swc1      f6, 200(a2)
    addu      t0, t0, a1
    swc1      f8, 204(a2)
    swc1      f10, 208(a2)
    swc1      f12, 212(a2)
    swc1      f14, 216(a2)
    swc1      f16, 220(a2)
    // row 7 (last; no further pointer to prefetch)
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    swc1      f2, 224(a2)
    swc1      f4, 228(a2)
    swc1      f6, 232(a2)
    swc1      f8, 236(a2)
    swc1      f10, 240(a2)
    swc1      f12, 244(a2)
    swc1      f14, 248(a2)
    swc1      f16, 252(a2)

    j         ra
    nop
END(jsimd_convsamp_float_mips_dspr2)
  4263. /*****************************************************************************/